Commit 0d3b231e authored by Tom Lane

Further repair of eqjoinsel ndistinct-clamping logic.

Examination of examples provided by Mark Kirkwood and others has convinced
me that actually commit 7f3eba30 was quite
a few bricks shy of a load.  The useful part of that patch was clamping
ndistinct for the inner side of a semi or anti join, and the reason why
that's needed is that it's the only way that restriction clauses
eliminating rows from the inner relation can affect the estimated size of
the join result.  I had not clearly understood why the clamping was
appropriate, and so mis-extrapolated to conclude that we should clamp
ndistinct for the outer side too, as well as for both sides of regular
joins.  These latter actions were all wrong, and are reverted with this
patch.  In addition, the clamping logic is now made to affect the behavior
of both paths in eqjoinsel_semi, with or without MCV lists to compare.
When we have MCVs, we suppose that the most common values are the ones
that are most likely to survive the decimation resulting from a lower
restriction clause, so we think of the clamping as eliminating non-MCV
values, or potentially even the least-common MCVs for the inner relation.

Back-patch to 8.4, same as previous fixes in this area.
parent 7971a57f
...@@ -142,11 +142,10 @@ static double ineq_histogram_selectivity(PlannerInfo *root, ...@@ -142,11 +142,10 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
FmgrInfo *opproc, bool isgt, FmgrInfo *opproc, bool isgt,
Datum constval, Oid consttype); Datum constval, Oid consttype);
static double eqjoinsel_inner(Oid operator, static double eqjoinsel_inner(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2, VariableStatData *vardata1, VariableStatData *vardata2);
RelOptInfo *rel1, RelOptInfo *rel2);
static double eqjoinsel_semi(Oid operator, static double eqjoinsel_semi(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2, VariableStatData *vardata1, VariableStatData *vardata2,
RelOptInfo *rel1, RelOptInfo *rel2); RelOptInfo *inner_rel);
static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue, static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
Datum lobound, Datum hibound, Oid boundstypid, Datum lobound, Datum hibound, Oid boundstypid,
double *scaledlobound, double *scaledhibound); double *scaledlobound, double *scaledhibound);
...@@ -2011,47 +2010,35 @@ eqjoinsel(PG_FUNCTION_ARGS) ...@@ -2011,47 +2010,35 @@ eqjoinsel(PG_FUNCTION_ARGS)
VariableStatData vardata1; VariableStatData vardata1;
VariableStatData vardata2; VariableStatData vardata2;
bool join_is_reversed; bool join_is_reversed;
RelOptInfo *rel1; RelOptInfo *inner_rel;
RelOptInfo *rel2;
get_join_variables(root, args, sjinfo, get_join_variables(root, args, sjinfo,
&vardata1, &vardata2, &join_is_reversed); &vardata1, &vardata2, &join_is_reversed);
/*
* Identify the join's direct input relations. We use the min lefthand
* and min righthand as the inputs, even though the join might actually
* get done with larger input relations. The min inputs are guaranteed to
* have been formed by now, though, and always using them ensures
* consistency of estimates.
*/
if (!join_is_reversed)
{
rel1 = find_join_input_rel(root, sjinfo->min_lefthand);
rel2 = find_join_input_rel(root, sjinfo->min_righthand);
}
else
{
rel1 = find_join_input_rel(root, sjinfo->min_righthand);
rel2 = find_join_input_rel(root, sjinfo->min_lefthand);
}
switch (sjinfo->jointype) switch (sjinfo->jointype)
{ {
case JOIN_INNER: case JOIN_INNER:
case JOIN_LEFT: case JOIN_LEFT:
case JOIN_FULL: case JOIN_FULL:
selec = eqjoinsel_inner(operator, &vardata1, &vardata2, selec = eqjoinsel_inner(operator, &vardata1, &vardata2);
rel1, rel2);
break; break;
case JOIN_SEMI: case JOIN_SEMI:
case JOIN_ANTI: case JOIN_ANTI:
/*
* Look up the join's inner relation. min_righthand is sufficient
* information because neither SEMI nor ANTI joins permit any
* reassociation into or out of their RHS, so the righthand will
* always be exactly that set of rels.
*/
inner_rel = find_join_input_rel(root, sjinfo->min_righthand);
if (!join_is_reversed) if (!join_is_reversed)
selec = eqjoinsel_semi(operator, &vardata1, &vardata2, selec = eqjoinsel_semi(operator, &vardata1, &vardata2,
rel1, rel2); inner_rel);
else else
selec = eqjoinsel_semi(get_commutator(operator), selec = eqjoinsel_semi(get_commutator(operator),
&vardata2, &vardata1, &vardata2, &vardata1,
rel2, rel1); inner_rel);
break; break;
default: default:
/* other values not expected here */ /* other values not expected here */
...@@ -2077,8 +2064,7 @@ eqjoinsel(PG_FUNCTION_ARGS) ...@@ -2077,8 +2064,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
*/ */
static double static double
eqjoinsel_inner(Oid operator, eqjoinsel_inner(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2, VariableStatData *vardata1, VariableStatData *vardata2)
RelOptInfo *rel1, RelOptInfo *rel2)
{ {
double selec; double selec;
double nd1; double nd1;
...@@ -2273,26 +2259,10 @@ eqjoinsel_inner(Oid operator, ...@@ -2273,26 +2259,10 @@ eqjoinsel_inner(Oid operator,
* XXX Can we be smarter if we have an MCV list for just one side? It * XXX Can we be smarter if we have an MCV list for just one side? It
* seems that if we assume equal distribution for the other side, we * seems that if we assume equal distribution for the other side, we
* end up with the same answer anyway. * end up with the same answer anyway.
*
* An additional hack we use here is to clamp the nd1 and nd2 values
* to not more than what we are estimating the input relation sizes to
* be, providing a crude correction for the selectivity of restriction
* clauses on those relations. (We don't do that in the other path
* since there we are comparing the nd values to stats for the whole
* relations.) We can apply this clamp both with respect to the base
* relations from which the join variables come, and to the immediate
* input relations of the current join.
*/ */
double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0; double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
double nullfrac2 = stats2 ? stats2->stanullfrac : 0.0; double nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
if (vardata1->rel)
nd1 = Min(nd1, vardata1->rel->rows);
nd1 = Min(nd1, rel1->rows);
if (vardata2->rel)
nd2 = Min(nd2, vardata2->rel->rows);
nd2 = Min(nd2, rel2->rows);
selec = (1.0 - nullfrac1) * (1.0 - nullfrac2); selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
if (nd1 > nd2) if (nd1 > nd2)
selec /= nd1; selec /= nd1;
...@@ -2319,7 +2289,7 @@ eqjoinsel_inner(Oid operator, ...@@ -2319,7 +2289,7 @@ eqjoinsel_inner(Oid operator,
static double static double
eqjoinsel_semi(Oid operator, eqjoinsel_semi(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2, VariableStatData *vardata1, VariableStatData *vardata2,
RelOptInfo *rel1, RelOptInfo *rel2) RelOptInfo *inner_rel)
{ {
double selec; double selec;
double nd1; double nd1;
...@@ -2339,6 +2309,25 @@ eqjoinsel_semi(Oid operator, ...@@ -2339,6 +2309,25 @@ eqjoinsel_semi(Oid operator,
nd1 = get_variable_numdistinct(vardata1); nd1 = get_variable_numdistinct(vardata1);
nd2 = get_variable_numdistinct(vardata2); nd2 = get_variable_numdistinct(vardata2);
/*
* We clamp nd2 to be not more than what we estimate the inner relation's
* size to be. This is intuitively somewhat reasonable since obviously
* there can't be more than that many distinct values coming from the
* inner rel. The reason for the asymmetry (ie, that we don't clamp nd1
* likewise) is that this is the only pathway by which restriction clauses
* applied to the inner rel will affect the join result size estimate,
* since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by
* only the outer rel's size. If we clamped nd1 we'd be double-counting
* the selectivity of outer-rel restrictions.
*
* We can apply this clamping both with respect to the base relation from
* which the join variable comes (if there is just one), and to the
* immediate inner input relation of the current join.
*/
if (vardata2->rel)
nd2 = Min(nd2, vardata2->rel->rows);
nd2 = Min(nd2, inner_rel->rows);
if (HeapTupleIsValid(vardata1->statsTuple)) if (HeapTupleIsValid(vardata1->statsTuple))
{ {
stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple); stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
...@@ -2382,11 +2371,21 @@ eqjoinsel_semi(Oid operator, ...@@ -2382,11 +2371,21 @@ eqjoinsel_semi(Oid operator,
uncertainfrac, uncertainfrac,
uncertain; uncertain;
int i, int i,
nmatches; nmatches,
clamped_nvalues2;
/*
* The clamping above could have resulted in nd2 being less than
* nvalues2; in which case, we assume that precisely the nd2 most
* common values in the relation will appear in the join input, and so
* compare to only the first nd2 members of the MCV list. Of course
* this is frequently wrong, but it's the best bet we can make.
*/
clamped_nvalues2 = Min(nvalues2, nd2);
fmgr_info(get_opcode(operator), &eqproc); fmgr_info(get_opcode(operator), &eqproc);
hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool)); hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool)); hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool));
/* /*
* Note we assume that each MCV will match at most one member of the * Note we assume that each MCV will match at most one member of the
...@@ -2399,7 +2398,7 @@ eqjoinsel_semi(Oid operator, ...@@ -2399,7 +2398,7 @@ eqjoinsel_semi(Oid operator,
{ {
int j; int j;
for (j = 0; j < nvalues2; j++) for (j = 0; j < clamped_nvalues2; j++)
{ {
if (hasmatch2[j]) if (hasmatch2[j])
continue; continue;
...@@ -2444,7 +2443,7 @@ eqjoinsel_semi(Oid operator, ...@@ -2444,7 +2443,7 @@ eqjoinsel_semi(Oid operator,
{ {
nd1 -= nmatches; nd1 -= nmatches;
nd2 -= nmatches; nd2 -= nmatches;
if (nd1 <= nd2 || nd2 <= 0) if (nd1 <= nd2 || nd2 < 0)
uncertainfrac = 1.0; uncertainfrac = 1.0;
else else
uncertainfrac = nd2 / nd1; uncertainfrac = nd2 / nd1;
...@@ -2465,14 +2464,7 @@ eqjoinsel_semi(Oid operator, ...@@ -2465,14 +2464,7 @@ eqjoinsel_semi(Oid operator,
if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT) if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT)
{ {
if (vardata1->rel) if (nd1 <= nd2 || nd2 < 0)
nd1 = Min(nd1, vardata1->rel->rows);
nd1 = Min(nd1, rel1->rows);
if (vardata2->rel)
nd2 = Min(nd2, vardata2->rel->rows);
nd2 = Min(nd2, rel2->rows);
if (nd1 <= nd2 || nd2 <= 0)
selec = 1.0 - nullfrac1; selec = 1.0 - nullfrac1;
else else
selec = (nd2 / nd1) * (1.0 - nullfrac1); selec = (nd2 / nd1) * (1.0 - nullfrac1);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment