Commit 875b0c62 authored by Tom Lane's avatar Tom Lane

When dealing with multiple grouping columns coming from the same table,

clamp the estimated number of groups to table row count over 10, instead
of table row count; this reflects a heuristic that people probably won't
group over a near-unique set of columns, and the knowledge that we don't
currently have any way to estimate the correlation of the columns better
than guessing.  This change creates a trivial plan change in one of the
regression tests.
parent 0bf2587d
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.169 2004/12/31 22:01:22 pgsql Exp $ * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.170 2005/01/28 20:34:25 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -1937,10 +1937,13 @@ add_unique_group_var(Query *root, List *varinfos, ...@@ -1937,10 +1937,13 @@ add_unique_group_var(Query *root, List *varinfos,
* if we considered ones of the same rel, we'd be double-counting the * if we considered ones of the same rel, we'd be double-counting the
* restriction selectivity of the equality in the next step. * restriction selectivity of the equality in the next step.
* 3. For Vars within a single source rel, we multiply together the numbers * 3. For Vars within a single source rel, we multiply together the numbers
* of values, clamp to the number of rows in the rel, and then multiply * of values, clamp to the number of rows in the rel (divided by 10 if
* by the selectivity of the restriction clauses for that rel. The * more than one Var), and then multiply by the selectivity of the
* initial product is probably too high (it's the worst case) but since * restriction clauses for that rel. When there's more than one Var,
* we can clamp to the rel's rows it won't be hugely bad. Multiplying * the initial product is probably too high (it's the worst case) but
* clamping to a fraction of the rel's rows seems to be a helpful
* heuristic for not letting the estimate get out of hand. (The factor
* of 10 is derived from pre-Postgres-7.4 practice.) Multiplying
* by the restriction selectivity is effectively assuming that the * by the restriction selectivity is effectively assuming that the
* restriction clauses are independent of the grouping, which is a crummy * restriction clauses are independent of the grouping, which is a crummy
* assumption, but it's hard to do better. * assumption, but it's hard to do better.
...@@ -2040,6 +2043,7 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows) ...@@ -2040,6 +2043,7 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos); GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
RelOptInfo *rel = varinfo1->rel; RelOptInfo *rel = varinfo1->rel;
double reldistinct = varinfo1->ndistinct; double reldistinct = varinfo1->ndistinct;
int relvarcount = 1;
List *newvarinfos = NIL; List *newvarinfos = NIL;
/* /*
...@@ -2051,7 +2055,10 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows) ...@@ -2051,7 +2055,10 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l); GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
if (varinfo2->rel == varinfo1->rel) if (varinfo2->rel == varinfo1->rel)
{
reldistinct *= varinfo2->ndistinct; reldistinct *= varinfo2->ndistinct;
relvarcount++;
}
else else
{ {
/* not time to process varinfo2 yet */ /* not time to process varinfo2 yet */
...@@ -2066,10 +2073,20 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows) ...@@ -2066,10 +2073,20 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
if (rel->tuples > 0) if (rel->tuples > 0)
{ {
/* /*
* Clamp to size of rel, multiply by restriction selectivity. * Clamp to size of rel, or size of rel / 10 if multiple Vars.
* The fudge factor is because the Vars are probably correlated
* but we don't know by how much.
*/
double clamp = rel->tuples;
if (relvarcount > 1)
clamp *= 0.1;
if (reldistinct > clamp)
reldistinct = clamp;
/*
* Multiply by restriction selectivity.
*/ */
if (reldistinct > rel->tuples)
reldistinct = rel->tuples;
reldistinct *= rel->rows / rel->tuples; reldistinct *= rel->rows / rel->tuples;
/* /*
......
...@@ -134,11 +134,11 @@ SELECT '' AS five, f1 AS "Correlated Field" ...@@ -134,11 +134,11 @@ SELECT '' AS five, f1 AS "Correlated Field"
WHERE f3 IS NOT NULL); WHERE f3 IS NOT NULL);
five | Correlated Field five | Correlated Field
------+------------------ ------+------------------
| 2
| 3 | 3
| 1 | 1
| 2
| 3 | 3
| 2
| 2
(5 rows) (5 rows)
-- --
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment