Commit bfd1ffa9 authored by Tom Lane

Change patternsel (LIKE/regex selectivity estimation) so that if there
is a large enough histogram, it will use the number of matches in the
histogram to derive a selectivity estimate, rather than the admittedly
pretty bogus heuristics involving examining the pattern contents.  I set
'large enough' at 100, but perhaps we should change that later.  Also
apply the same technique in contrib/ltree's <@ and @> estimator.  Per
discussion with Stefan Kaltenbrunner and Matteo Beccati.
parent 06b33f0e
/* /*
* op function for ltree * op function for ltree
* Teodor Sigaev <teodor@stack.net> * Teodor Sigaev <teodor@stack.net>
* $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.12 2006/05/30 22:12:13 tgl Exp $ * $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.13 2006/09/20 19:50:21 tgl Exp $
*/ */
#include "ltree.h" #include "ltree.h"
#include <ctype.h> #include <ctype.h>
#include "catalog/pg_statistic.h"
#include "utils/lsyscache.h" #include "utils/lsyscache.h"
#include "utils/selfuncs.h" #include "utils/selfuncs.h"
#include "utils/syscache.h" #include "utils/syscache.h"
...@@ -606,6 +607,7 @@ ltreeparentsel(PG_FUNCTION_ARGS) ...@@ -606,6 +607,7 @@ ltreeparentsel(PG_FUNCTION_ARGS)
FmgrInfo contproc; FmgrInfo contproc;
double mcvsum; double mcvsum;
double mcvsel; double mcvsel;
double nullfrac;
fmgr_info(get_opcode(operator), &contproc); fmgr_info(get_opcode(operator), &contproc);
...@@ -616,10 +618,40 @@ ltreeparentsel(PG_FUNCTION_ARGS) ...@@ -616,10 +618,40 @@ ltreeparentsel(PG_FUNCTION_ARGS)
&mcvsum); &mcvsum);
/* /*
* We have the exact selectivity for values appearing in the MCV list; * If the histogram is large enough, see what fraction of it the
* use the default selectivity for the rest of the population. * constant is "<@" to, and assume that's representative of the
* non-MCV population. Otherwise use the default selectivity for
* the non-MCV population.
*/ */
selec = mcvsel + DEFAULT_PARENT_SEL * (1.0 - mcvsum); selec = histogram_selectivity(&vardata, &contproc,
constval, varonleft,
100, 1);
if (selec < 0)
{
/* Nope, fall back on default */
selec = DEFAULT_PARENT_SEL;
}
else
{
/* Yes, but don't believe extremely small or large estimates. */
if (selec < 0.0001)
selec = 0.0001;
else if (selec > 0.9999)
selec = 0.9999;
}
if (HeapTupleIsValid(vardata.statsTuple))
nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
else
nullfrac = 0.0;
/*
* Now merge the results from the MCV and histogram calculations,
* realizing that the histogram covers only the non-null values that
* are not listed in MCV.
*/
selec *= 1.0 - nullfrac - mcvsum;
selec += mcvsel;
} }
else else
selec = DEFAULT_PARENT_SEL; selec = DEFAULT_PARENT_SEL;
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.212 2006/09/19 22:49:53 tgl Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.213 2006/09/20 19:50:21 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -235,7 +235,7 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -235,7 +235,7 @@ eqsel(PG_FUNCTION_ARGS)
{ {
/* /*
* Constant is "=" to this common value. We know selectivity * Constant is "=" to this common value. We know selectivity
* exactly (or as exactly as VACUUM could calculate it, * exactly (or as exactly as ANALYZE could calculate it,
* anyway). * anyway).
*/ */
selec = numbers[i]; selec = numbers[i];
...@@ -315,7 +315,7 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -315,7 +315,7 @@ eqsel(PG_FUNCTION_ARGS)
else else
{ {
/* /*
* No VACUUM ANALYZE stats available, so make a guess using estimated * No ANALYZE stats available, so make a guess using estimated
* number of distinct values and assuming they are equally common. * number of distinct values and assuming they are equally common.
* (The guess is unlikely to be very good, but we do know a few * (The guess is unlikely to be very good, but we do know a few
* special cases.) * special cases.)
...@@ -446,7 +446,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, ...@@ -446,7 +446,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
} }
/* /*
* mcv_selectivity - Examine the MCV list for scalarineqsel * mcv_selectivity - Examine the MCV list for selectivity estimates
* *
* Determine the fraction of the variable's MCV population that satisfies * Determine the fraction of the variable's MCV population that satisfies
* the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft. Also * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft. Also
...@@ -500,6 +500,80 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, ...@@ -500,6 +500,80 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
return mcv_selec; return mcv_selec;
} }
/*
 * histogram_selectivity - estimate selectivity by sampling the histogram
 *
 * Applies the predicate (VAR OP CONST), or (CONST OP VAR) when varonleft
 * is false, to the stored histogram entries of the variable and reports
 * the fraction of the examined entries for which it returns true.
 *
 * The operator need not be related to the histogram's sort operator: the
 * histogram is treated purely as a representative sample of the column,
 * so any boolean-returning predicate operator can be used.  A small sample
 * is a poor sample, though, so the caller states the smallest histogram it
 * is willing to trust (min_hist_size) and must be ready to fall back on
 * some other estimation method when this routine declines.
 *
 * n_skip entries at each end of the histogram are left out of the count,
 * on the theory that the extremes are outliers and not representative.
 * When in doubt, min_hist_size = 100 and n_skip = 1 are reasonable.
 *
 * Returns the selectivity, or -1 when there is no histogram or it holds
 * fewer than min_hist_size entries.
 *
 * The result says nothing about the most-common-values list or about
 * nulls; the caller has to fold in statistics for those parts of the
 * column population itself.  Callers may also want to clamp the result,
 * i.e. not take an exact 0 or 1 at face value.
 */
double
histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
					  Datum constval, bool varonleft,
					  int min_hist_size, int n_skip)
{
	Datum	   *hist_values;
	int			hist_count;
	double		selec = -1;		/* "no usable histogram" until proven otherwise */

	/*
	 * Caller contract: the minimum size must leave at least one entry once
	 * n_skip entries are dropped from each end.
	 */
	Assert(n_skip >= 0);
	Assert(min_hist_size > 2 * n_skip);

	/* Without a stats tuple there is nothing to sample. */
	if (!HeapTupleIsValid(vardata->statsTuple))
		return -1;

	/* Likewise if the stats tuple carries no histogram slot. */
	if (!get_attstatsslot(vardata->statsTuple,
						  vardata->atttype, vardata->atttypmod,
						  STATISTIC_KIND_HISTOGRAM, InvalidOid,
						  &hist_values, &hist_count,
						  NULL, NULL))
		return -1;

	if (hist_count >= min_hist_size)
	{
		int			stop = hist_count - n_skip;	/* one past last entry used */
		int			matches = 0;
		int			idx;

		for (idx = n_skip; idx < stop; idx++)
		{
			Datum		verdict;

			/* Argument order follows which side the variable is on. */
			if (varonleft)
				verdict = FunctionCall2(opproc, hist_values[idx], constval);
			else
				verdict = FunctionCall2(opproc, constval, hist_values[idx]);

			if (DatumGetBool(verdict))
				matches++;
		}

		/* Denominator is the number of entries actually examined. */
		selec = ((double) matches) / ((double) (hist_count - 2 * n_skip));
	}

	/* The slot was fetched successfully, so hand it back on every path. */
	free_attstatsslot(vardata->atttype, hist_values, hist_count, NULL, 0);

	return selec;
}
/* /*
* ineq_histogram_selectivity - Examine the histogram for scalarineqsel * ineq_histogram_selectivity - Examine the histogram for scalarineqsel
* *
...@@ -521,12 +595,11 @@ ineq_histogram_selectivity(VariableStatData *vardata, ...@@ -521,12 +595,11 @@ ineq_histogram_selectivity(VariableStatData *vardata,
double hist_selec; double hist_selec;
Datum *values; Datum *values;
int nvalues; int nvalues;
int i;
hist_selec = 0.0; hist_selec = 0.0;
/* /*
* Someday, VACUUM might store more than one histogram per rel/att, * Someday, ANALYZE might store more than one histogram per rel/att,
* corresponding to more than one possible sort ordering defined for the * corresponding to more than one possible sort ordering defined for the
* column type. However, to make that work we will need to figure out * column type. However, to make that work we will need to figure out
* which staop to search for --- it's not necessarily the one we have at * which staop to search for --- it's not necessarily the one we have at
...@@ -544,43 +617,46 @@ ineq_histogram_selectivity(VariableStatData *vardata, ...@@ -544,43 +617,46 @@ ineq_histogram_selectivity(VariableStatData *vardata,
{ {
if (nvalues > 1) if (nvalues > 1)
{ {
/*
* Use binary search to find proper location, ie, the first
* slot at which the comparison fails. (If the given operator
* isn't actually sort-compatible with the histogram, you'll
* get garbage results ... but probably not any more garbage-y
* than you would from the old linear search.)
*/
double histfrac; double histfrac;
int lobound = 0; /* first possible slot to search */
int hibound = nvalues; /* last+1 slot to search */
while (lobound < hibound)
{
int probe = (lobound + hibound) / 2;
bool ltcmp; bool ltcmp;
ltcmp = DatumGetBool(FunctionCall2(opproc, ltcmp = DatumGetBool(FunctionCall2(opproc,
values[0], values[probe],
constval)); constval));
if (isgt) if (isgt)
ltcmp = !ltcmp; ltcmp = !ltcmp;
if (!ltcmp) if (ltcmp)
lobound = probe + 1;
else
hibound = probe;
}
if (lobound <= 0)
{ {
/* Constant is below lower histogram boundary. */ /* Constant is below lower histogram boundary. */
histfrac = 0.0; histfrac = 0.0;
} }
else else if (lobound >= nvalues)
{
/*
* Scan to find proper location. This could be made faster by
* using a binary-search method, but it's probably not worth
* the trouble for typical histogram sizes.
*/
for (i = 1; i < nvalues; i++)
{
ltcmp = DatumGetBool(FunctionCall2(opproc,
values[i],
constval));
if (isgt)
ltcmp = !ltcmp;
if (!ltcmp)
break;
}
if (i >= nvalues)
{ {
/* Constant is above upper histogram boundary. */ /* Constant is above upper histogram boundary. */
histfrac = 1.0; histfrac = 1.0;
} }
else else
{ {
int i = lobound;
double val, double val,
high, high,
low; low;
...@@ -643,7 +719,6 @@ ineq_histogram_selectivity(VariableStatData *vardata, ...@@ -643,7 +719,6 @@ ineq_histogram_selectivity(VariableStatData *vardata,
histfrac = (double) (i - 1) + binfrac; histfrac = (double) (i - 1) + binfrac;
histfrac /= (double) (nvalues - 1); histfrac /= (double) (nvalues - 1);
} }
}
/* /*
* Now histfrac = fraction of histogram entries below the * Now histfrac = fraction of histogram entries below the
...@@ -970,28 +1045,34 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) ...@@ -970,28 +1045,34 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
else else
{ {
/* /*
* Not exact-match pattern. We estimate selectivity of the fixed * Not exact-match pattern. If we have a sufficiently large
* prefix and remainder of pattern separately, then combine the two * histogram, estimate selectivity for the histogram part of the
* to get an estimate of the selectivity for the part of the column * population by counting matches in the histogram. If not, estimate
* population represented by the histogram. We then add up data for * selectivity of the fixed prefix and remainder of pattern
* any most-common-values values; these are not in the histogram * separately, then combine the two to get an estimate of the
* population, and we can get exact answers for them by applying * selectivity for the part of the column population represented by
* the pattern operator, so there's no reason to approximate. * the histogram. We then add up data for any most-common-values
* (If the MCVs cover a significant part of the total population, * values; these are not in the histogram population, and we can get
* this gives us a big leg up in accuracy.) * exact answers for them by applying the pattern operator, so there's
* no reason to approximate. (If the MCVs cover a significant part of
* the total population, this gives us a big leg up in accuracy.)
*/ */
Selectivity prefixsel;
Selectivity restsel;
Selectivity selec; Selectivity selec;
FmgrInfo opproc; FmgrInfo opproc;
double nullfrac, double nullfrac,
mcv_selec, mcv_selec,
sumcommon; sumcommon;
if (HeapTupleIsValid(vardata.statsTuple)) /* Try to use the histogram entries to get selectivity */
nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac; fmgr_info(get_opcode(operator), &opproc);
else
nullfrac = 0.0; selec = histogram_selectivity(&vardata, &opproc, constval, true,
100, 1);
if (selec < 0)
{
/* Nope, so fake it with the heuristic method */
Selectivity prefixsel;
Selectivity restsel;
if (pstatus == Pattern_Prefix_Partial) if (pstatus == Pattern_Prefix_Partial)
prefixsel = prefix_selectivity(&vardata, opclass, prefix); prefixsel = prefix_selectivity(&vardata, opclass, prefix);
...@@ -999,6 +1080,15 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) ...@@ -999,6 +1080,15 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
prefixsel = 1.0; prefixsel = 1.0;
restsel = pattern_selectivity(rest, ptype); restsel = pattern_selectivity(rest, ptype);
selec = prefixsel * restsel; selec = prefixsel * restsel;
}
else
{
/* Yes, but don't believe extremely small or large estimates. */
if (selec < 0.0001)
selec = 0.0001;
else if (selec > 0.9999)
selec = 0.9999;
}
/* /*
* If we have most-common-values info, add up the fractions of the MCV * If we have most-common-values info, add up the fractions of the MCV
...@@ -1006,10 +1096,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) ...@@ -1006,10 +1096,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
* directly to the result selectivity. Also add up the total fraction * directly to the result selectivity. Also add up the total fraction
* represented by MCV entries. * represented by MCV entries.
*/ */
fmgr_info(get_opcode(operator), &opproc);
mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true, mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
&sumcommon); &sumcommon);
if (HeapTupleIsValid(vardata.statsTuple))
nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
else
nullfrac = 0.0;
/* /*
* Now merge the results from the MCV and histogram calculations, * Now merge the results from the MCV and histogram calculations,
* realizing that the histogram covers only the non-null values that * realizing that the histogram covers only the non-null values that
...@@ -1332,7 +1426,7 @@ nulltestsel(PlannerInfo *root, NullTestType nulltesttype, ...@@ -1332,7 +1426,7 @@ nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
else else
{ {
/* /*
* No VACUUM ANALYZE stats available, so make a guess * No ANALYZE stats available, so make a guess
*/ */
switch (nulltesttype) switch (nulltesttype)
{ {
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.34 2006/07/01 22:07:23 tgl Exp $ * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.35 2006/09/20 19:50:21 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -110,6 +110,9 @@ extern double get_variable_numdistinct(VariableStatData *vardata); ...@@ -110,6 +110,9 @@ extern double get_variable_numdistinct(VariableStatData *vardata);
extern double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, extern double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
Datum constval, bool varonleft, Datum constval, bool varonleft,
double *sumcommonp); double *sumcommonp);
extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
Datum constval, bool varonleft,
int min_hist_size, int n_skip);
extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt, extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
Pattern_Type ptype, Pattern_Type ptype,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment