Commit 422495d0 authored by Tom Lane

Modify prefix_selectivity() so that it will never estimate the selectivity
of the generated range condition var >= 'foo' AND var < 'fop' as being less
than what eqsel() would estimate for var = 'foo'.  This is intuitively
reasonable and it gets rid of the need for some entirely ad-hoc coding we
formerly used to reject bogus estimates.  The basic problem here is that
if the prefix is more than a few characters long, the two boundary values
are too close together to be distinguishable by comparison to the column
histogram, resulting in a selectivity estimate of zero, which is often
not very sane.  Change motivated by an example from Peter Eisentraut.

Arguably this is a bug fix, but I'll refrain from back-patching it
for the moment.
parent 6f10eb21
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.243 2008/01/01 19:45:52 momjian Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.244 2008/03/08 22:41:38 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -103,6 +103,12 @@ ...@@ -103,6 +103,12 @@
#include "utils/syscache.h" #include "utils/syscache.h"
static double var_eq_const(VariableStatData *vardata, Oid operator,
Datum constval, bool constisnull,
bool varonleft);
static double var_eq_non_const(VariableStatData *vardata, Oid operator,
Node *other,
bool varonleft);
static double ineq_histogram_selectivity(VariableStatData *vardata, static double ineq_histogram_selectivity(VariableStatData *vardata,
FmgrInfo *opproc, bool isgt, FmgrInfo *opproc, bool isgt,
Datum constval, Oid consttype); Datum constval, Oid consttype);
...@@ -156,10 +162,6 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -156,10 +162,6 @@ eqsel(PG_FUNCTION_ARGS)
VariableStatData vardata; VariableStatData vardata;
Node *other; Node *other;
bool varonleft; bool varonleft;
Datum *values;
int nvalues;
float4 *numbers;
int nnumbers;
double selec; double selec;
/* /*
...@@ -171,29 +173,55 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -171,29 +173,55 @@ eqsel(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
/* /*
* If the something is a NULL constant, assume operator is strict and * We can do a lot better if the something is a constant. (Note: the
* return zero, ie, operator will never return TRUE. * Const might result from estimation rather than being a simple constant
* in the query.)
*/ */
if (IsA(other, Const) && if (IsA(other, Const))
((Const *) other)->constisnull) selec = var_eq_const(&vardata, operator,
{ ((Const *) other)->constvalue,
((Const *) other)->constisnull,
varonleft);
else
selec = var_eq_non_const(&vardata, operator, other,
varonleft);
ReleaseVariableStats(vardata); ReleaseVariableStats(vardata);
PG_RETURN_FLOAT8(0.0);
}
if (HeapTupleIsValid(vardata.statsTuple)) PG_RETURN_FLOAT8((float8) selec);
{ }
Form_pg_statistic stats;
stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); /*
* var_eq_const --- eqsel for var = const case
*
* This is split out so that some other estimation functions can use it.
*/
static double
var_eq_const(VariableStatData *vardata, Oid operator,
Datum constval, bool constisnull,
bool varonleft)
{
double selec;
if (IsA(other, Const)) /*
* If the constant is NULL, assume operator is strict and
* return zero, ie, operator will never return TRUE.
*/
if (constisnull)
return 0.0;
if (HeapTupleIsValid(vardata->statsTuple))
{ {
/* Variable is being compared to a known non-null constant */ Form_pg_statistic stats;
Datum constval = ((Const *) other)->constvalue; Datum *values;
int nvalues;
float4 *numbers;
int nnumbers;
bool match = false; bool match = false;
int i; int i;
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
/* /*
* Is the constant "=" to any of the column's most common values? * Is the constant "=" to any of the column's most common values?
* (Although the given operator may not really be "=", we will * (Although the given operator may not really be "=", we will
...@@ -201,8 +229,8 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -201,8 +229,8 @@ eqsel(PG_FUNCTION_ARGS)
* test. If you don't like this, maybe you shouldn't be using * test. If you don't like this, maybe you shouldn't be using
* eqsel for your operator...) * eqsel for your operator...)
*/ */
if (get_attstatsslot(vardata.statsTuple, if (get_attstatsslot(vardata->statsTuple,
vardata.atttype, vardata.atttypmod, vardata->atttype, vardata->atttypmod,
STATISTIC_KIND_MCV, InvalidOid, STATISTIC_KIND_MCV, InvalidOid,
&values, &nvalues, &values, &nvalues,
&numbers, &nnumbers)) &numbers, &nnumbers))
...@@ -264,8 +292,7 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -264,8 +292,7 @@ eqsel(PG_FUNCTION_ARGS)
* fraction equally, so we divide by the number of other * fraction equally, so we divide by the number of other
* distinct values. * distinct values.
*/ */
otherdistinct = get_variable_numdistinct(&vardata) otherdistinct = get_variable_numdistinct(vardata) - nnumbers;
- nnumbers;
if (otherdistinct > 1) if (otherdistinct > 1)
selec /= otherdistinct; selec /= otherdistinct;
...@@ -277,12 +304,43 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -277,12 +304,43 @@ eqsel(PG_FUNCTION_ARGS)
selec = numbers[nnumbers - 1]; selec = numbers[nnumbers - 1];
} }
free_attstatsslot(vardata.atttype, values, nvalues, free_attstatsslot(vardata->atttype, values, nvalues,
numbers, nnumbers); numbers, nnumbers);
} }
else else
{ {
/*
* No ANALYZE stats available, so make a guess using estimated number
* of distinct values and assuming they are equally common. (The guess
* is unlikely to be very good, but we do know a few special cases.)
*/
selec = 1.0 / get_variable_numdistinct(vardata);
}
/* result should be in range, but make sure... */
CLAMP_PROBABILITY(selec);
return selec;
}
/*
* var_eq_non_const --- eqsel for var = something-other-than-const case
*/
static double
var_eq_non_const(VariableStatData *vardata, Oid operator,
Node *other,
bool varonleft)
{
double selec;
if (HeapTupleIsValid(vardata->statsTuple))
{
Form_pg_statistic stats;
double ndistinct; double ndistinct;
float4 *numbers;
int nnumbers;
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
/* /*
* Search is for a value that we do not know a priori, but we will * Search is for a value that we do not know a priori, but we will
...@@ -295,7 +353,7 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -295,7 +353,7 @@ eqsel(PG_FUNCTION_ARGS)
* good idea?) * good idea?)
*/ */
selec = 1.0 - stats->stanullfrac; selec = 1.0 - stats->stanullfrac;
ndistinct = get_variable_numdistinct(&vardata); ndistinct = get_variable_numdistinct(vardata);
if (ndistinct > 1) if (ndistinct > 1)
selec /= ndistinct; selec /= ndistinct;
...@@ -303,16 +361,15 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -303,16 +361,15 @@ eqsel(PG_FUNCTION_ARGS)
* Cross-check: selectivity should never be estimated as more than * Cross-check: selectivity should never be estimated as more than
* the most common value's. * the most common value's.
*/ */
if (get_attstatsslot(vardata.statsTuple, if (get_attstatsslot(vardata->statsTuple,
vardata.atttype, vardata.atttypmod, vardata->atttype, vardata->atttypmod,
STATISTIC_KIND_MCV, InvalidOid, STATISTIC_KIND_MCV, InvalidOid,
NULL, NULL, NULL, NULL,
&numbers, &nnumbers)) &numbers, &nnumbers))
{ {
if (nnumbers > 0 && selec > numbers[0]) if (nnumbers > 0 && selec > numbers[0])
selec = numbers[0]; selec = numbers[0];
free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers); free_attstatsslot(vardata->atttype, NULL, 0, numbers, nnumbers);
}
} }
} }
else else
...@@ -322,15 +379,13 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -322,15 +379,13 @@ eqsel(PG_FUNCTION_ARGS)
* of distinct values and assuming they are equally common. (The guess * of distinct values and assuming they are equally common. (The guess
* is unlikely to be very good, but we do know a few special cases.) * is unlikely to be very good, but we do know a few special cases.)
*/ */
selec = 1.0 / get_variable_numdistinct(&vardata); selec = 1.0 / get_variable_numdistinct(vardata);
} }
ReleaseVariableStats(vardata);
/* result should be in range, but make sure... */ /* result should be in range, but make sure... */
CLAMP_PROBABILITY(selec); CLAMP_PROBABILITY(selec);
PG_RETURN_FLOAT8((float8) selec); return selec;
} }
/* /*
...@@ -1047,16 +1102,11 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) ...@@ -1047,16 +1102,11 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
*/ */
Oid eqopr = get_opfamily_member(opfamily, vartype, vartype, Oid eqopr = get_opfamily_member(opfamily, vartype, vartype,
BTEqualStrategyNumber); BTEqualStrategyNumber);
List *eqargs;
if (eqopr == InvalidOid) if (eqopr == InvalidOid)
elog(ERROR, "no = operator for opfamily %u", opfamily); elog(ERROR, "no = operator for opfamily %u", opfamily);
eqargs = list_make2(variable, prefix); result = var_eq_const(&vardata, eqopr, prefix->constvalue,
result = DatumGetFloat8(DirectFunctionCall4(eqsel, false, true);
PointerGetDatum(root),
ObjectIdGetDatum(eqopr),
PointerGetDatum(eqargs),
Int32GetDatum(varRelid)));
} }
else else
{ {
...@@ -4430,6 +4480,7 @@ prefix_selectivity(VariableStatData *vardata, ...@@ -4430,6 +4480,7 @@ prefix_selectivity(VariableStatData *vardata,
Oid cmpopr; Oid cmpopr;
FmgrInfo opproc; FmgrInfo opproc;
Const *greaterstrcon; Const *greaterstrcon;
Selectivity eq_sel;
cmpopr = get_opfamily_member(opfamily, vartype, vartype, cmpopr = get_opfamily_member(opfamily, vartype, vartype,
BTGreaterEqualStrategyNumber); BTGreaterEqualStrategyNumber);
...@@ -4444,7 +4495,7 @@ prefix_selectivity(VariableStatData *vardata, ...@@ -4444,7 +4495,7 @@ prefix_selectivity(VariableStatData *vardata,
if (prefixsel <= 0.0) if (prefixsel <= 0.0)
{ {
/* No histogram is present ... return a suitable default estimate */ /* No histogram is present ... return a suitable default estimate */
return 0.005; return DEFAULT_MATCH_SEL;
} }
/*------- /*-------
...@@ -4452,17 +4503,17 @@ prefix_selectivity(VariableStatData *vardata, ...@@ -4452,17 +4503,17 @@ prefix_selectivity(VariableStatData *vardata,
* "x < greaterstr". * "x < greaterstr".
*------- *-------
*/ */
greaterstrcon = make_greater_string(prefixcon, &opproc);
if (greaterstrcon)
{
Selectivity topsel;
cmpopr = get_opfamily_member(opfamily, vartype, vartype, cmpopr = get_opfamily_member(opfamily, vartype, vartype,
BTLessStrategyNumber); BTLessStrategyNumber);
if (cmpopr == InvalidOid) if (cmpopr == InvalidOid)
elog(ERROR, "no < operator for opfamily %u", opfamily); elog(ERROR, "no < operator for opfamily %u", opfamily);
fmgr_info(get_opcode(cmpopr), &opproc); fmgr_info(get_opcode(cmpopr), &opproc);
greaterstrcon = make_greater_string(prefixcon, &opproc);
if (greaterstrcon)
{
Selectivity topsel;
topsel = ineq_histogram_selectivity(vardata, &opproc, false, topsel = ineq_histogram_selectivity(vardata, &opproc, false,
greaterstrcon->constvalue, greaterstrcon->constvalue,
greaterstrcon->consttype); greaterstrcon->consttype);
...@@ -4477,15 +4528,29 @@ prefix_selectivity(VariableStatData *vardata, ...@@ -4477,15 +4528,29 @@ prefix_selectivity(VariableStatData *vardata,
* doesn't count those anyway. * doesn't count those anyway.
*/ */
prefixsel = topsel + prefixsel - 1.0; prefixsel = topsel + prefixsel - 1.0;
}
/* /*
* A zero or negative prefixsel should be converted into a small * If the prefix is long then the two bounding values might be too
* positive value; we probably are dealing with a very tight range and * close together for the histogram to distinguish them usefully,
* got a bogus result due to roundoff errors. * resulting in a zero estimate (plus or minus roundoff error).
* To avoid returning a ridiculously small estimate, compute the
* estimated selectivity for "variable = 'foo'", and clamp to that.
* (Obviously, the resultant estimate should be at least that.)
*
* We apply this even if we couldn't make a greater string. That case
* suggests that the prefix is near the maximum possible, and thus
* probably off the end of the histogram, and thus we probably got a
* very small estimate from the >= condition; so we still need to clamp.
*/ */
if (prefixsel <= 0.0) cmpopr = get_opfamily_member(opfamily, vartype, vartype,
prefixsel = 1.0e-10; BTEqualStrategyNumber);
} if (cmpopr == InvalidOid)
elog(ERROR, "no = operator for opfamily %u", opfamily);
eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue,
false, true);
prefixsel = Max(prefixsel, eq_sel);
return prefixsel; return prefixsel;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment