Commit 82e1ba7f authored by Tom Lane's avatar Tom Lane

Make ANALYZE compute basic statistics even for types with no "=" operator.

Previously, ANALYZE simply ignored columns of datatypes that have neither
a btree nor hash opclass (which means they have no recognized equality
operator).  Without a notion of equality, we can't identify most-common
values nor estimate the number of distinct values.  But we can still
count nulls and compute the average physical column width, and those
stats might be of value.  Moreover there are some tools out there that
don't work so well if rows are missing from pg_statistic.  So let's
add suitable logic for this case.

While this is arguably a bug fix, it also has the potential to change
query plans, and the gain seems not worth taking a risk of that in
stable branches.  So back-patch into 9.5 but not further.

Oleksandr Shulgin, rewritten a bit by me.
parent a0d9f6e4
......@@ -1689,7 +1689,11 @@ typedef struct
} CompareScalarsContext;
static void compute_minimal_stats(VacAttrStatsP stats,
static void compute_trivial_stats(VacAttrStatsP stats,
AnalyzeAttrFetchFunc fetchfunc,
int samplerows,
double totalrows);
static void compute_distinct_stats(VacAttrStatsP stats,
AnalyzeAttrFetchFunc fetchfunc,
int samplerows,
double totalrows);
......@@ -1723,21 +1727,17 @@ std_typanalyze(VacAttrStats *stats)
&ltopr, &eqopr, NULL,
NULL);
/* If column has no "=" operator, we can't do much of anything */
if (!OidIsValid(eqopr))
return false;
/* Save the operator info for compute_stats routines */
mystats = (StdAnalyzeData *) palloc(sizeof(StdAnalyzeData));
mystats->eqopr = eqopr;
mystats->eqfunc = get_opcode(eqopr);
mystats->eqfunc = OidIsValid(eqopr) ? get_opcode(eqopr) : InvalidOid;
mystats->ltopr = ltopr;
stats->extra_data = mystats;
/*
* Determine which standard statistics algorithm to use
*/
if (OidIsValid(ltopr))
if (OidIsValid(eqopr) && OidIsValid(ltopr))
{
/* Seems to be a scalar datatype */
stats->compute_stats = compute_scalar_stats;
......@@ -1762,10 +1762,17 @@ std_typanalyze(VacAttrStats *stats)
*/
stats->minrows = 300 * attr->attstattarget;
}
else if (OidIsValid(eqopr))
{
/* We can still recognize distinct values */
stats->compute_stats = compute_distinct_stats;
/* Might as well use the same minrows as above */
stats->minrows = 300 * attr->attstattarget;
}
else
{
/* Can't do much but the minimal stuff */
stats->compute_stats = compute_minimal_stats;
/* Can't do much but the trivial stuff */
stats->compute_stats = compute_trivial_stats;
/* Might as well use the same minrows as above */
stats->minrows = 300 * attr->attstattarget;
}
......@@ -1773,8 +1780,91 @@ std_typanalyze(VacAttrStats *stats)
return true;
}
/*
* compute_trivial_stats() -- compute very basic column statistics
*
* We use this when we cannot find a hash "=" operator for the datatype.
*
* We determine the fraction of non-null rows and the average datum width.
*/
static void
compute_trivial_stats(VacAttrStatsP stats,
AnalyzeAttrFetchFunc fetchfunc,
int samplerows,
double totalrows)
{
int i;
int null_cnt = 0;
int nonnull_cnt = 0;
double total_width = 0;
bool is_varlena = (!stats->attrtype->typbyval &&
stats->attrtype->typlen == -1);
bool is_varwidth = (!stats->attrtype->typbyval &&
stats->attrtype->typlen < 0);
for (i = 0; i < samplerows; i++)
{
Datum value;
bool isnull;
vacuum_delay_point();
value = fetchfunc(stats, i, &isnull);
/* Check for null/nonnull */
if (isnull)
{
null_cnt++;
continue;
}
nonnull_cnt++;
/*
* If it's a variable-width field, add up widths for average width
* calculation. Note that if the value is toasted, we use the toasted
* width. We don't bother with this calculation if it's a fixed-width
* type.
*/
if (is_varlena)
{
total_width += VARSIZE_ANY(DatumGetPointer(value));
}
else if (is_varwidth)
{
/* must be cstring */
total_width += strlen(DatumGetCString(value)) + 1;
}
}
/* We can only compute average width if we found some non-null values. */
if (nonnull_cnt > 0)
{
stats->stats_valid = true;
/* Do the simple null-frac and width stats */
stats->stanullfrac = (double) null_cnt / (double) samplerows;
if (is_varwidth)
stats->stawidth = total_width / (double) nonnull_cnt;
else
stats->stawidth = stats->attrtype->typlen;
stats->stadistinct = 0.0; /* "unknown" */
}
else if (null_cnt > 0)
{
/* We found only nulls; assume the column is entirely null */
stats->stats_valid = true;
stats->stanullfrac = 1.0;
if (is_varwidth)
stats->stawidth = 0; /* "unknown" */
else
stats->stawidth = stats->attrtype->typlen;
stats->stadistinct = 0.0; /* "unknown" */
}
}
/*
* compute_minimal_stats() -- compute minimal column statistics
* compute_distinct_stats() -- compute column statistics including ndistinct
*
* We use this when we can find only an "=" operator for the datatype.
*
......@@ -1789,7 +1879,7 @@ std_typanalyze(VacAttrStats *stats)
* depend mainly on the length of the list we are willing to keep.
*/
static void
compute_minimal_stats(VacAttrStatsP stats,
compute_distinct_stats(VacAttrStatsP stats,
AnalyzeAttrFetchFunc fetchfunc,
int samplerows,
double totalrows)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment