Commit ce8fd39e authored by Tom Lane

Improve patternsel() by applying the operator itself to each value
listed in the column's most-common-values statistics entry.  This gives
us an exact selectivity result for the portion of the column population
represented by the MCV list, which can be a big leg up in accuracy if
that's a large fraction of the population.  The heuristics involving
pattern contents and prefix are applied only to the part of the population
not included in the MCV list.
parent ad24b8e6
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.194 2005/11/25 19:47:49 tgl Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.195 2006/01/10 17:35:52 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -131,6 +131,11 @@ typedef struct ...@@ -131,6 +131,11 @@ typedef struct
} while(0) } while(0)
static double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
Datum constval, double *sumcommonp);
static double ineq_histogram_selectivity(VariableStatData *vardata,
FmgrInfo *opproc, bool isgt,
Datum constval, Oid consttype);
static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue, static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
Datum lobound, Datum hibound, Oid boundstypid, Datum lobound, Datum hibound, Oid boundstypid,
double *scaledlobound, double *scaledhibound); double *scaledlobound, double *scaledhibound);
...@@ -164,8 +169,8 @@ static void examine_variable(PlannerInfo *root, Node *node, int varRelid, ...@@ -164,8 +169,8 @@ static void examine_variable(PlannerInfo *root, Node *node, int varRelid,
static double get_variable_numdistinct(VariableStatData *vardata); static double get_variable_numdistinct(VariableStatData *vardata);
static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata, static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
Oid sortop, Datum *max); Oid sortop, Datum *max);
static Selectivity prefix_selectivity(PlannerInfo *root, Node *variable, static Selectivity prefix_selectivity(VariableStatData *vardata,
Oid opclass, Const *prefix); Oid opclass, Const *prefixcon);
static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype); static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
static Datum string_to_datum(const char *str, Oid datatype); static Datum string_to_datum(const char *str, Oid datatype);
static Const *string_to_const(const char *str, Oid datatype); static Const *string_to_const(const char *str, Oid datatype);
...@@ -426,15 +431,10 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, ...@@ -426,15 +431,10 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
{ {
Form_pg_statistic stats; Form_pg_statistic stats;
FmgrInfo opproc; FmgrInfo opproc;
Datum *values;
int nvalues;
float4 *numbers;
int nnumbers;
double mcv_selec, double mcv_selec,
hist_selec, hist_selec,
sumcommon; sumcommon;
double selec; double selec;
int i;
if (!HeapTupleIsValid(vardata->statsTuple)) if (!HeapTupleIsValid(vardata->statsTuple))
{ {
...@@ -451,10 +451,76 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, ...@@ -451,10 +451,76 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
* to the result selectivity. Also add up the total fraction represented * to the result selectivity. Also add up the total fraction represented
* by MCV entries. * by MCV entries.
*/ */
mcv_selec = mcv_selectivity(vardata, &opproc, constval,
&sumcommon);
/*
* If there is a histogram, determine which bin the constant falls in, and
* compute the resulting contribution to selectivity.
*/
hist_selec = ineq_histogram_selectivity(vardata, &opproc, isgt,
constval, consttype);
/*
* Now merge the results from the MCV and histogram calculations,
* realizing that the histogram covers only the non-null values that are
* not listed in MCV.
*/
selec = 1.0 - stats->stanullfrac - sumcommon;
if (hist_selec > 0.0)
selec *= hist_selec;
else
{
/*
* If no histogram but there are values not accounted for by MCV,
* arbitrarily assume half of them will match.
*/
selec *= 0.5;
}
selec += mcv_selec;
/* result should be in range, but make sure... */
CLAMP_PROBABILITY(selec);
return selec;
}
/*
* mcv_selectivity - Examine the MCV list for scalarineqsel
*
* Determine the fraction of the variable's MCV population that satisfies
* the predicate (VAR OP CONST), as well as the fraction of the total column
* population represented by the MCV list. This code will work for any
* boolean-returning predicate operator.
*
* The function result is the MCV selectivity, and the fraction of the
* total population is returned into *sumcommonp. Zeroes are returned
* if there is no MCV list.
*/
static double
mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Datum constval,
double *sumcommonp)
{
double mcv_selec,
sumcommon;
Datum *values;
int nvalues;
float4 *numbers;
int nnumbers;
int i;
mcv_selec = 0.0; mcv_selec = 0.0;
sumcommon = 0.0; sumcommon = 0.0;
if (get_attstatsslot(vardata->statsTuple, /*
* If we have most-common-values info, add up the fractions of the MCV
* entries that satisfy MCV OP CONST. Also add up the total fraction
* represented by MCV entries.
*/
if (HeapTupleIsValid(vardata->statsTuple) &&
get_attstatsslot(vardata->statsTuple,
vardata->atttype, vardata->atttypmod, vardata->atttype, vardata->atttypmod,
STATISTIC_KIND_MCV, InvalidOid, STATISTIC_KIND_MCV, InvalidOid,
&values, &nvalues, &values, &nvalues,
...@@ -462,7 +528,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, ...@@ -462,7 +528,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
{ {
for (i = 0; i < nvalues; i++) for (i = 0; i < nvalues; i++)
{ {
if (DatumGetBool(FunctionCall2(&opproc, if (DatumGetBool(FunctionCall2(opproc,
values[i], values[i],
constval))) constval)))
mcv_selec += numbers[i]; mcv_selec += numbers[i];
...@@ -472,10 +538,36 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, ...@@ -472,10 +538,36 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
numbers, nnumbers); numbers, nnumbers);
} }
*sumcommonp = sumcommon;
return mcv_selec;
}
/*
* ineq_histogram_selectivity - Examine the histogram for scalarineqsel
*
* Determine the fraction of the variable's histogram population that
* satisfies the inequality condition, ie, VAR < CONST or VAR > CONST.
*
* Returns zero if there is no histogram (valid results will always be
* greater than zero).
*
* Note that the result disregards both the most-common-values (if any) and
* null entries. The caller is expected to combine this result with
* statistics for those portions of the column population.
*/
static double
ineq_histogram_selectivity(VariableStatData *vardata,
FmgrInfo *opproc, bool isgt,
Datum constval, Oid consttype)
{
double hist_selec;
Datum *values;
int nvalues;
int i;
hist_selec = 0.0;
/* /*
* If there is a histogram, determine which bin the constant falls in, and
* compute the resulting contribution to selectivity.
*
* Someday, VACUUM might store more than one histogram per rel/att, * Someday, VACUUM might store more than one histogram per rel/att,
* corresponding to more than one possible sort ordering defined for the * corresponding to more than one possible sort ordering defined for the
* column type. However, to make that work we will need to figure out * column type. However, to make that work we will need to figure out
...@@ -485,9 +577,8 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, ...@@ -485,9 +577,8 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
* appears in pg_statistic is sorted the same way our operator sorts, or * appears in pg_statistic is sorted the same way our operator sorts, or
* the reverse way if isgt is TRUE. * the reverse way if isgt is TRUE.
*/ */
hist_selec = 0.0; if (HeapTupleIsValid(vardata->statsTuple) &&
get_attstatsslot(vardata->statsTuple,
if (get_attstatsslot(vardata->statsTuple,
vardata->atttype, vardata->atttypmod, vardata->atttype, vardata->atttypmod,
STATISTIC_KIND_HISTOGRAM, InvalidOid, STATISTIC_KIND_HISTOGRAM, InvalidOid,
&values, &nvalues, &values, &nvalues,
...@@ -498,7 +589,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, ...@@ -498,7 +589,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
double histfrac; double histfrac;
bool ltcmp; bool ltcmp;
ltcmp = DatumGetBool(FunctionCall2(&opproc, ltcmp = DatumGetBool(FunctionCall2(opproc,
values[0], values[0],
constval)); constval));
if (isgt) if (isgt)
...@@ -517,7 +608,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, ...@@ -517,7 +608,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
*/ */
for (i = 1; i < nvalues; i++) for (i = 1; i < nvalues; i++)
{ {
ltcmp = DatumGetBool(FunctionCall2(&opproc, ltcmp = DatumGetBool(FunctionCall2(opproc,
values[i], values[i],
constval)); constval));
if (isgt) if (isgt)
...@@ -618,30 +709,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, ...@@ -618,30 +709,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0); free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
} }
/* return hist_selec;
* Now merge the results from the MCV and histogram calculations,
* realizing that the histogram covers only the non-null values that are
* not listed in MCV.
*/
selec = 1.0 - stats->stanullfrac - sumcommon;
if (hist_selec > 0.0)
selec *= hist_selec;
else
{
/*
* If no histogram but there are values not accounted for by MCV,
* arbitrarily assume half of them will match.
*/
selec *= 0.5;
}
selec += mcv_selec;
/* result should be in range, but make sure... */
CLAMP_PROBABILITY(selec);
return selec;
} }
/* /*
...@@ -801,10 +869,7 @@ static double ...@@ -801,10 +869,7 @@ static double
patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
{ {
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
#ifdef NOT_USED
Oid operator = PG_GETARG_OID(1); Oid operator = PG_GETARG_OID(1);
#endif
List *args = (List *) PG_GETARG_POINTER(2); List *args = (List *) PG_GETARG_POINTER(2);
int varRelid = PG_GETARG_INT32(3); int varRelid = PG_GETARG_INT32(3);
VariableStatData vardata; VariableStatData vardata;
...@@ -948,18 +1013,53 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) ...@@ -948,18 +1013,53 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
{ {
/* /*
* Not exact-match pattern. We estimate selectivity of the fixed * Not exact-match pattern. We estimate selectivity of the fixed
* prefix and remainder of pattern separately, then combine the two. * prefix and remainder of pattern separately, then combine the two
* to get an estimate of the selectivity for the part of the column
* population represented by the histogram. We then add up data for
* any most-common-values entries; these are not in the histogram
* population, and we can get exact answers for them by applying
* the pattern operator, so there's no reason to approximate.
* (If the MCVs cover a significant part of the total population,
* this gives us a big leg up in accuracy.)
*/ */
Selectivity prefixsel; Selectivity prefixsel;
Selectivity restsel; Selectivity restsel;
Selectivity selec; Selectivity selec;
FmgrInfo opproc;
double nullfrac,
mcv_selec,
sumcommon;
if (HeapTupleIsValid(vardata.statsTuple))
nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
else
nullfrac = 0.0;
if (pstatus == Pattern_Prefix_Partial) if (pstatus == Pattern_Prefix_Partial)
prefixsel = prefix_selectivity(root, variable, opclass, prefix); prefixsel = prefix_selectivity(&vardata, opclass, prefix);
else else
prefixsel = 1.0; prefixsel = 1.0;
restsel = pattern_selectivity(rest, ptype); restsel = pattern_selectivity(rest, ptype);
selec = prefixsel * restsel; selec = prefixsel * restsel;
/*
* If we have most-common-values info, add up the fractions of the MCV
* entries that satisfy MCV OP PATTERN. These fractions contribute
* directly to the result selectivity. Also add up the total fraction
* represented by MCV entries.
*/
fmgr_info(get_opcode(operator), &opproc);
mcv_selec = mcv_selectivity(&vardata, &opproc, constval,
&sumcommon);
/*
* Now merge the results from the MCV and histogram calculations,
* realizing that the histogram covers only the non-null values that
* are not listed in MCV.
*/
selec *= 1.0 - nullfrac - sumcommon;
selec += mcv_selec;
/* result should be in range, but make sure... */ /* result should be in range, but make sure... */
CLAMP_PROBABILITY(selec); CLAMP_PROBABILITY(selec);
result = selec; result = selec;
...@@ -2427,7 +2527,7 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets) ...@@ -2427,7 +2527,7 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
/* /*
* convert_to_scalar * convert_to_scalar
* Convert non-NULL values of the indicated types to the comparison * Convert non-NULL values of the indicated types to the comparison
* scale needed by scalarltsel()/scalargtsel(). * scale needed by scalarineqsel().
* Returns "true" if successful. * Returns "true" if successful.
* *
* XXX this routine is a hack: ideally we should look up the conversion * XXX this routine is a hack: ideally we should look up the conversion
...@@ -3841,6 +3941,10 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, ...@@ -3841,6 +3941,10 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
* A fixed prefix "foo" is estimated as the selectivity of the expression * A fixed prefix "foo" is estimated as the selectivity of the expression
* "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c). * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
* *
* The selectivity estimate is with respect to the portion of the column
* population represented by the histogram --- the caller must fold this
* together with info about MCVs and NULLs.
*
* We use the >= and < operators from the specified btree opclass to do the * We use the >= and < operators from the specified btree opclass to do the
* estimation. The given variable and Const must be of the associated * estimation. The given variable and Const must be of the associated
* datatype. * datatype.
...@@ -3851,25 +3955,28 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, ...@@ -3851,25 +3955,28 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
* more useful to use the upper-bound code than not. * more useful to use the upper-bound code than not.
*/ */
static Selectivity static Selectivity
prefix_selectivity(PlannerInfo *root, Node *variable, prefix_selectivity(VariableStatData *vardata, Oid opclass, Const *prefixcon)
Oid opclass, Const *prefixcon)
{ {
Selectivity prefixsel; Selectivity prefixsel;
Oid cmpopr; Oid cmpopr;
List *cmpargs; FmgrInfo opproc;
Const *greaterstrcon; Const *greaterstrcon;
cmpopr = get_opclass_member(opclass, InvalidOid, cmpopr = get_opclass_member(opclass, InvalidOid,
BTGreaterEqualStrategyNumber); BTGreaterEqualStrategyNumber);
if (cmpopr == InvalidOid) if (cmpopr == InvalidOid)
elog(ERROR, "no >= operator for opclass %u", opclass); elog(ERROR, "no >= operator for opclass %u", opclass);
cmpargs = list_make2(variable, prefixcon); fmgr_info(get_opcode(cmpopr), &opproc);
/* Assume scalargtsel is appropriate for all supported types */
prefixsel = DatumGetFloat8(DirectFunctionCall4(scalargtsel, prefixsel = ineq_histogram_selectivity(vardata, &opproc, true,
PointerGetDatum(root), prefixcon->constvalue,
ObjectIdGetDatum(cmpopr), prefixcon->consttype);
PointerGetDatum(cmpargs),
Int32GetDatum(0))); if (prefixsel <= 0.0)
{
/* No histogram is present ... return a suitable default estimate */
return 0.005;
}
/*------- /*-------
* If we can create a string larger than the prefix, say * If we can create a string larger than the prefix, say
...@@ -3885,49 +3992,30 @@ prefix_selectivity(PlannerInfo *root, Node *variable, ...@@ -3885,49 +3992,30 @@ prefix_selectivity(PlannerInfo *root, Node *variable,
BTLessStrategyNumber); BTLessStrategyNumber);
if (cmpopr == InvalidOid) if (cmpopr == InvalidOid)
elog(ERROR, "no < operator for opclass %u", opclass); elog(ERROR, "no < operator for opclass %u", opclass);
cmpargs = list_make2(variable, greaterstrcon); fmgr_info(get_opcode(cmpopr), &opproc);
/* Assume scalarltsel is appropriate for all supported types */
topsel = DatumGetFloat8(DirectFunctionCall4(scalarltsel, topsel = ineq_histogram_selectivity(vardata, &opproc, false,
PointerGetDatum(root), greaterstrcon->constvalue,
ObjectIdGetDatum(cmpopr), greaterstrcon->consttype);
PointerGetDatum(cmpargs),
Int32GetDatum(0))); /* ineq_histogram_selectivity worked before, it shouldn't fail now */
Assert(topsel > 0.0);
/* /*
* Merge the two selectivities in the same way as for a range query * Merge the two selectivities in the same way as for a range query
* (see clauselist_selectivity()). * (see clauselist_selectivity()). Note that we don't need to worry
* about double-exclusion of nulls, since ineq_histogram_selectivity
* doesn't count those anyway.
*/ */
prefixsel = topsel + prefixsel - 1.0; prefixsel = topsel + prefixsel - 1.0;
/* Adjust for double-exclusion of NULLs */
prefixsel += nulltestsel(root, IS_NULL, variable, 0);
/* /*
* A zero or slightly negative prefixsel should be converted into a * A zero or negative prefixsel should be converted into a small
* small positive value; we probably are dealing with a very tight * positive value; we probably are dealing with a very tight range
* range and got a bogus result due to roundoff errors. However, if * and got a bogus result due to roundoff errors.
* prefixsel is very negative, then we probably have default
* selectivity estimates on one or both sides of the range. In that
* case, insert a not-so-wildly-optimistic default estimate.
*/ */
if (prefixsel <= 0.0) if (prefixsel <= 0.0)
{ prefixsel = 1.0e-10;
if (prefixsel < -0.01)
{
/*
* No data available --- use a default estimate that is small,
* but not real small.
*/
prefixsel = 0.005;
}
else
{
/*
* It's just roundoff error; use a small positive value
*/
prefixsel = 1.0e-10;
}
}
} }
return prefixsel; return prefixsel;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment