Commit 512c7356 authored by Tom Lane's avatar Tom Lane

Fix <> and pattern-NOT-match estimators to handle nulls correctly.

These estimators returned 1 minus the corresponding equality/match
estimate, which is incorrect: we need to subtract off the fraction
of nulls in the column, since those are neither equal nor not equal
to the comparison value.  The error only becomes obvious if the
nullfrac is large, but it could be very bad in a mostly-nulls
column, as reported in bug #14676 from Marko Tiikkaja.

To fix the <> case, refactor eqsel() and neqsel() to call a common
support routine, which can be made to account for nullfrac correctly.
The pattern-match cases were already factored that way, and it was
simply an oversight that patternsel() wasn't subtracting off nullfrac.

neqjoinsel() has a similar problem, but since we're elsewhere discussing
changing its behavior entirely, I left it alone for now.

This is a very longstanding bug, but I'm hesitant to back-patch a fix for
it.  Given the lack of prior complaints, such cases must not come up often,
so it's probably not worth the risk of destabilizing plans in stable
branches.

Discussion: https://postgr.es/m/20170529153847.4275.95416@wrigleys.postgresql.org
parent 23886581
...@@ -154,12 +154,13 @@ ...@@ -154,12 +154,13 @@
get_relation_stats_hook_type get_relation_stats_hook = NULL; get_relation_stats_hook_type get_relation_stats_hook = NULL;
get_index_stats_hook_type get_index_stats_hook = NULL; get_index_stats_hook_type get_index_stats_hook = NULL;
static double eqsel_internal(PG_FUNCTION_ARGS, bool negate);
static double var_eq_const(VariableStatData *vardata, Oid operator, static double var_eq_const(VariableStatData *vardata, Oid operator,
Datum constval, bool constisnull, Datum constval, bool constisnull,
bool varonleft); bool varonleft, bool negate);
static double var_eq_non_const(VariableStatData *vardata, Oid operator, static double var_eq_non_const(VariableStatData *vardata, Oid operator,
Node *other, Node *other,
bool varonleft); bool varonleft, bool negate);
static double ineq_histogram_selectivity(PlannerInfo *root, static double ineq_histogram_selectivity(PlannerInfo *root,
VariableStatData *vardata, VariableStatData *vardata,
FmgrInfo *opproc, bool isgt, FmgrInfo *opproc, bool isgt,
...@@ -226,6 +227,15 @@ static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals); ...@@ -226,6 +227,15 @@ static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals);
*/ */
Datum Datum
eqsel(PG_FUNCTION_ARGS) eqsel(PG_FUNCTION_ARGS)
{
PG_RETURN_FLOAT8((float8) eqsel_internal(fcinfo, false));
}
/*
* Common code for eqsel() and neqsel()
*/
static double
eqsel_internal(PG_FUNCTION_ARGS, bool negate)
{ {
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
Oid operator = PG_GETARG_OID(1); Oid operator = PG_GETARG_OID(1);
...@@ -236,13 +246,27 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -236,13 +246,27 @@ eqsel(PG_FUNCTION_ARGS)
bool varonleft; bool varonleft;
double selec; double selec;
/*
* When asked about <>, we do the estimation using the corresponding =
* operator, then convert to <> via "1.0 - eq_selectivity - nullfrac".
*/
if (negate)
{
operator = get_negator(operator);
if (!OidIsValid(operator))
{
/* Use default selectivity (should we raise an error instead?) */
return 1.0 - DEFAULT_EQ_SEL;
}
}
/* /*
* If expression is not variable = something or something = variable, then * If expression is not variable = something or something = variable, then
* punt and return a default estimate. * punt and return a default estimate.
*/ */
if (!get_restriction_variable(root, args, varRelid, if (!get_restriction_variable(root, args, varRelid,
&vardata, &other, &varonleft)) &vardata, &other, &varonleft))
PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); return negate ? (1.0 - DEFAULT_EQ_SEL) : DEFAULT_EQ_SEL;
/* /*
* We can do a lot better if the something is a constant. (Note: the * We can do a lot better if the something is a constant. (Note: the
...@@ -253,14 +277,14 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -253,14 +277,14 @@ eqsel(PG_FUNCTION_ARGS)
selec = var_eq_const(&vardata, operator, selec = var_eq_const(&vardata, operator,
((Const *) other)->constvalue, ((Const *) other)->constvalue,
((Const *) other)->constisnull, ((Const *) other)->constisnull,
varonleft); varonleft, negate);
else else
selec = var_eq_non_const(&vardata, operator, other, selec = var_eq_non_const(&vardata, operator, other,
varonleft); varonleft, negate);
ReleaseVariableStats(vardata); ReleaseVariableStats(vardata);
PG_RETURN_FLOAT8((float8) selec); return selec;
} }
/* /*
...@@ -271,19 +295,32 @@ eqsel(PG_FUNCTION_ARGS) ...@@ -271,19 +295,32 @@ eqsel(PG_FUNCTION_ARGS)
static double static double
var_eq_const(VariableStatData *vardata, Oid operator, var_eq_const(VariableStatData *vardata, Oid operator,
Datum constval, bool constisnull, Datum constval, bool constisnull,
bool varonleft) bool varonleft, bool negate)
{ {
double selec; double selec;
double nullfrac = 0.0;
bool isdefault; bool isdefault;
Oid opfuncoid; Oid opfuncoid;
/* /*
* If the constant is NULL, assume operator is strict and return zero, ie, * If the constant is NULL, assume operator is strict and return zero, ie,
* operator will never return TRUE. * operator will never return TRUE. (It's zero even for a negator op.)
*/ */
if (constisnull) if (constisnull)
return 0.0; return 0.0;
/*
* Grab the nullfrac for use below. Note we allow use of nullfrac
* regardless of security check.
*/
if (HeapTupleIsValid(vardata->statsTuple))
{
Form_pg_statistic stats;
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
nullfrac = stats->stanullfrac;
}
/* /*
* If we matched the var to a unique index or DISTINCT clause, assume * If we matched the var to a unique index or DISTINCT clause, assume
* there is exactly one match regardless of anything else. (This is * there is exactly one match regardless of anything else. (This is
...@@ -292,11 +329,12 @@ var_eq_const(VariableStatData *vardata, Oid operator, ...@@ -292,11 +329,12 @@ var_eq_const(VariableStatData *vardata, Oid operator,
* ignoring the information.) * ignoring the information.)
*/ */
if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0)
return 1.0 / vardata->rel->tuples; {
selec = 1.0 / vardata->rel->tuples;
if (HeapTupleIsValid(vardata->statsTuple) && }
statistic_proc_security_check(vardata, else if (HeapTupleIsValid(vardata->statsTuple) &&
(opfuncoid = get_opcode(operator)))) statistic_proc_security_check(vardata,
(opfuncoid = get_opcode(operator))))
{ {
Form_pg_statistic stats; Form_pg_statistic stats;
AttStatsSlot sslot; AttStatsSlot sslot;
...@@ -363,7 +401,7 @@ var_eq_const(VariableStatData *vardata, Oid operator, ...@@ -363,7 +401,7 @@ var_eq_const(VariableStatData *vardata, Oid operator,
for (i = 0; i < sslot.nnumbers; i++) for (i = 0; i < sslot.nnumbers; i++)
sumcommon += sslot.numbers[i]; sumcommon += sslot.numbers[i];
selec = 1.0 - sumcommon - stats->stanullfrac; selec = 1.0 - sumcommon - nullfrac;
CLAMP_PROBABILITY(selec); CLAMP_PROBABILITY(selec);
/* /*
...@@ -396,6 +434,10 @@ var_eq_const(VariableStatData *vardata, Oid operator, ...@@ -396,6 +434,10 @@ var_eq_const(VariableStatData *vardata, Oid operator,
selec = 1.0 / get_variable_numdistinct(vardata, &isdefault); selec = 1.0 / get_variable_numdistinct(vardata, &isdefault);
} }
/* now adjust if we wanted <> rather than = */
if (negate)
selec = 1.0 - selec - nullfrac;
/* result should be in range, but make sure... */ /* result should be in range, but make sure... */
CLAMP_PROBABILITY(selec); CLAMP_PROBABILITY(selec);
...@@ -408,11 +450,23 @@ var_eq_const(VariableStatData *vardata, Oid operator, ...@@ -408,11 +450,23 @@ var_eq_const(VariableStatData *vardata, Oid operator,
static double static double
var_eq_non_const(VariableStatData *vardata, Oid operator, var_eq_non_const(VariableStatData *vardata, Oid operator,
Node *other, Node *other,
bool varonleft) bool varonleft, bool negate)
{ {
double selec; double selec;
double nullfrac = 0.0;
bool isdefault; bool isdefault;
/*
* Grab the nullfrac for use below.
*/
if (HeapTupleIsValid(vardata->statsTuple))
{
Form_pg_statistic stats;
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
nullfrac = stats->stanullfrac;
}
/* /*
* If we matched the var to a unique index or DISTINCT clause, assume * If we matched the var to a unique index or DISTINCT clause, assume
* there is exactly one match regardless of anything else. (This is * there is exactly one match regardless of anything else. (This is
...@@ -421,9 +475,10 @@ var_eq_non_const(VariableStatData *vardata, Oid operator, ...@@ -421,9 +475,10 @@ var_eq_non_const(VariableStatData *vardata, Oid operator,
* ignoring the information.) * ignoring the information.)
*/ */
if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0)
return 1.0 / vardata->rel->tuples; {
selec = 1.0 / vardata->rel->tuples;
if (HeapTupleIsValid(vardata->statsTuple)) }
else if (HeapTupleIsValid(vardata->statsTuple))
{ {
Form_pg_statistic stats; Form_pg_statistic stats;
double ndistinct; double ndistinct;
...@@ -441,7 +496,7 @@ var_eq_non_const(VariableStatData *vardata, Oid operator, ...@@ -441,7 +496,7 @@ var_eq_non_const(VariableStatData *vardata, Oid operator,
* values, regardless of their frequency in the table. Is that a good * values, regardless of their frequency in the table. Is that a good
* idea?) * idea?)
*/ */
selec = 1.0 - stats->stanullfrac; selec = 1.0 - nullfrac;
ndistinct = get_variable_numdistinct(vardata, &isdefault); ndistinct = get_variable_numdistinct(vardata, &isdefault);
if (ndistinct > 1) if (ndistinct > 1)
selec /= ndistinct; selec /= ndistinct;
...@@ -469,6 +524,10 @@ var_eq_non_const(VariableStatData *vardata, Oid operator, ...@@ -469,6 +524,10 @@ var_eq_non_const(VariableStatData *vardata, Oid operator,
selec = 1.0 / get_variable_numdistinct(vardata, &isdefault); selec = 1.0 / get_variable_numdistinct(vardata, &isdefault);
} }
/* now adjust if we wanted <> rather than = */
if (negate)
selec = 1.0 - selec - nullfrac;
/* result should be in range, but make sure... */ /* result should be in range, but make sure... */
CLAMP_PROBABILITY(selec); CLAMP_PROBABILITY(selec);
...@@ -485,33 +544,7 @@ var_eq_non_const(VariableStatData *vardata, Oid operator, ...@@ -485,33 +544,7 @@ var_eq_non_const(VariableStatData *vardata, Oid operator,
Datum Datum
neqsel(PG_FUNCTION_ARGS) neqsel(PG_FUNCTION_ARGS)
{ {
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); PG_RETURN_FLOAT8((float8) eqsel_internal(fcinfo, true));
Oid operator = PG_GETARG_OID(1);
List *args = (List *) PG_GETARG_POINTER(2);
int varRelid = PG_GETARG_INT32(3);
Oid eqop;
float8 result;
/*
* We want 1 - eqsel() where the equality operator is the one associated
* with this != operator, that is, its negator.
*/
eqop = get_negator(operator);
if (eqop)
{
result = DatumGetFloat8(DirectFunctionCall4(eqsel,
PointerGetDatum(root),
ObjectIdGetDatum(eqop),
PointerGetDatum(args),
Int32GetDatum(varRelid)));
}
else
{
/* Use default selectivity (should we raise an error instead?) */
result = DEFAULT_EQ_SEL;
}
result = 1.0 - result;
PG_RETURN_FLOAT8(result);
} }
/* /*
...@@ -1114,6 +1147,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) ...@@ -1114,6 +1147,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
Const *patt; Const *patt;
Const *prefix = NULL; Const *prefix = NULL;
Selectivity rest_selec = 0; Selectivity rest_selec = 0;
double nullfrac = 0.0;
double result; double result;
/* /*
...@@ -1202,6 +1236,17 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) ...@@ -1202,6 +1236,17 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
return result; return result;
} }
/*
* Grab the nullfrac for use below.
*/
if (HeapTupleIsValid(vardata.statsTuple))
{
Form_pg_statistic stats;
stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
nullfrac = stats->stanullfrac;
}
/* /*
* Pull out any fixed prefix implied by the pattern, and estimate the * Pull out any fixed prefix implied by the pattern, and estimate the
* fractional selectivity of the remainder of the pattern. Unlike many of * fractional selectivity of the remainder of the pattern. Unlike many of
...@@ -1252,7 +1297,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) ...@@ -1252,7 +1297,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
if (eqopr == InvalidOid) if (eqopr == InvalidOid)
elog(ERROR, "no = operator for opfamily %u", opfamily); elog(ERROR, "no = operator for opfamily %u", opfamily);
result = var_eq_const(&vardata, eqopr, prefix->constvalue, result = var_eq_const(&vardata, eqopr, prefix->constvalue,
false, true); false, true, false);
} }
else else
{ {
...@@ -1275,8 +1320,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) ...@@ -1275,8 +1320,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
Selectivity selec; Selectivity selec;
int hist_size; int hist_size;
FmgrInfo opproc; FmgrInfo opproc;
double nullfrac, double mcv_selec,
mcv_selec,
sumcommon; sumcommon;
/* Try to use the histogram entries to get selectivity */ /* Try to use the histogram entries to get selectivity */
...@@ -1328,11 +1372,6 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) ...@@ -1328,11 +1372,6 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true, mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
&sumcommon); &sumcommon);
if (HeapTupleIsValid(vardata.statsTuple))
nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
else
nullfrac = 0.0;
/* /*
* Now merge the results from the MCV and histogram calculations, * Now merge the results from the MCV and histogram calculations,
* realizing that the histogram covers only the non-null values that * realizing that the histogram covers only the non-null values that
...@@ -1340,12 +1379,16 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) ...@@ -1340,12 +1379,16 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
*/ */
selec *= 1.0 - nullfrac - sumcommon; selec *= 1.0 - nullfrac - sumcommon;
selec += mcv_selec; selec += mcv_selec;
/* result should be in range, but make sure... */
CLAMP_PROBABILITY(selec);
result = selec; result = selec;
} }
/* now adjust if we wanted not-match rather than match */
if (negate)
result = 1.0 - result - nullfrac;
/* result should be in range, but make sure... */
CLAMP_PROBABILITY(result);
if (prefix) if (prefix)
{ {
pfree(DatumGetPointer(prefix->constvalue)); pfree(DatumGetPointer(prefix->constvalue));
...@@ -1354,7 +1397,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) ...@@ -1354,7 +1397,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
ReleaseVariableStats(vardata); ReleaseVariableStats(vardata);
return negate ? (1.0 - result) : result; return result;
} }
/* /*
...@@ -1451,7 +1494,7 @@ boolvarsel(PlannerInfo *root, Node *arg, int varRelid) ...@@ -1451,7 +1494,7 @@ boolvarsel(PlannerInfo *root, Node *arg, int varRelid)
* compute the selectivity as if that is what we have. * compute the selectivity as if that is what we have.
*/ */
selec = var_eq_const(&vardata, BooleanEqualOperator, selec = var_eq_const(&vardata, BooleanEqualOperator,
BoolGetDatum(true), false, true); BoolGetDatum(true), false, true, false);
} }
else if (is_funcclause(arg)) else if (is_funcclause(arg))
{ {
...@@ -5788,7 +5831,7 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata, ...@@ -5788,7 +5831,7 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
if (cmpopr == InvalidOid) if (cmpopr == InvalidOid)
elog(ERROR, "no = operator for opfamily %u", opfamily); elog(ERROR, "no = operator for opfamily %u", opfamily);
eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue, eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue,
false, true); false, true, false);
prefixsel = Max(prefixsel, eq_sel); prefixsel = Max(prefixsel, eq_sel);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment