Commit d4c3a156 authored by Tom Lane's avatar Tom Lane

Remove GROUP BY columns that are functionally dependent on other columns.

If a GROUP BY clause includes all columns of a non-deferred primary key,
as well as other columns of the same relation, those other columns are
redundant and can be dropped from the grouping; the pkey is enough to
ensure that each row of the table corresponds to a separate group.
Getting rid of the excess columns will reduce the cost of the sorting or
hashing needed to implement GROUP BY, and can indeed remove the need for
a sort step altogether.

This seems worth testing for since many query authors are not aware of
the GROUP-BY-primary-key exception to the rule about queries not being
allowed to reference non-grouped-by columns in their targetlists or
HAVING clauses.  Thus, redundant GROUP BY items are not uncommon.  Also,
we can make the test pretty cheap in most queries where it won't help
by not looking up a rel's primary key until we've found that at least
two of its columns are in GROUP BY.

David Rowley, reviewed by Julien Rouhaud
parent 72eee410
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "access/genam.h" #include "access/genam.h"
#include "access/heapam.h" #include "access/heapam.h"
#include "access/htup_details.h" #include "access/htup_details.h"
#include "access/sysattr.h"
#include "catalog/dependency.h" #include "catalog/dependency.h"
#include "catalog/indexing.h" #include "catalog/indexing.h"
#include "catalog/objectaccess.h" #include "catalog/objectaccess.h"
...@@ -871,6 +872,99 @@ get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok) ...@@ -871,6 +872,99 @@ get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok)
return conOid; return conOid;
} }
/*
* get_primary_key_attnos
* Identify the columns in a relation's primary key, if any.
*
* Returns a Bitmapset of the column attnos of the primary key's columns,
* with attnos being offset by FirstLowInvalidHeapAttributeNumber so that
* system columns can be represented.
*
* If there is no primary key, return NULL. We also return NULL if the pkey
* constraint is deferrable and deferrableOk is false.
*
* *constraintOid is set to the OID of the pkey constraint, or InvalidOid
* on failure.
*/
Bitmapset *
get_primary_key_attnos(Oid relid, bool deferrableOk, Oid *constraintOid)
{
Bitmapset *pkattnos = NULL;
Relation pg_constraint;
HeapTuple tuple;
SysScanDesc scan;
ScanKeyData skey[1];
/* Set *constraintOid, to avoid complaints about uninitialized vars */
*constraintOid = InvalidOid;
/* Scan pg_constraint for constraints of the target rel */
pg_constraint = heap_open(ConstraintRelationId, AccessShareLock);
ScanKeyInit(&skey[0],
Anum_pg_constraint_conrelid,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(relid));
scan = systable_beginscan(pg_constraint, ConstraintRelidIndexId, true,
NULL, 1, skey);
while (HeapTupleIsValid(tuple = systable_getnext(scan)))
{
Form_pg_constraint con = (Form_pg_constraint) GETSTRUCT(tuple);
Datum adatum;
bool isNull;
ArrayType *arr;
int16 *attnums;
int numkeys;
int i;
/* Skip constraints that are not PRIMARY KEYs */
if (con->contype != CONSTRAINT_PRIMARY)
continue;
/*
* If the primary key is deferrable, but we've been instructed to
* ignore deferrable constraints, then we might as well give up
* searching, since there can only be a single primary key on a table.
*/
if (con->condeferrable && !deferrableOk)
break;
/* Extract the conkey array, ie, attnums of PK's columns */
adatum = heap_getattr(tuple, Anum_pg_constraint_conkey,
RelationGetDescr(pg_constraint), &isNull);
if (isNull)
elog(ERROR, "null conkey for constraint %u",
HeapTupleGetOid(tuple));
arr = DatumGetArrayTypeP(adatum); /* ensure not toasted */
numkeys = ARR_DIMS(arr)[0];
if (ARR_NDIM(arr) != 1 ||
numkeys < 0 ||
ARR_HASNULL(arr) ||
ARR_ELEMTYPE(arr) != INT2OID)
elog(ERROR, "conkey is not a 1-D smallint array");
attnums = (int16 *) ARR_DATA_PTR(arr);
/* Construct the result value */
for (i = 0; i < numkeys; i++)
{
pkattnos = bms_add_member(pkattnos,
attnums[i] - FirstLowInvalidHeapAttributeNumber);
}
*constraintOid = HeapTupleGetOid(tuple);
/* No need to search further */
break;
}
systable_endscan(scan);
heap_close(pg_constraint, AccessShareLock);
return pkattnos;
}
/* /*
* Determine whether a relation can be proven functionally dependent on * Determine whether a relation can be proven functionally dependent on
* a set of grouping columns. If so, return TRUE and add the pg_constraint * a set of grouping columns. If so, return TRUE and add the pg_constraint
......
...@@ -20,7 +20,9 @@ ...@@ -20,7 +20,9 @@
#include "access/htup_details.h" #include "access/htup_details.h"
#include "access/parallel.h" #include "access/parallel.h"
#include "access/sysattr.h"
#include "access/xact.h" #include "access/xact.h"
#include "catalog/pg_constraint_fn.h"
#include "executor/executor.h" #include "executor/executor.h"
#include "executor/nodeAgg.h" #include "executor/nodeAgg.h"
#include "foreign/fdwapi.h" #include "foreign/fdwapi.h"
...@@ -89,6 +91,7 @@ static double preprocess_limit(PlannerInfo *root, ...@@ -89,6 +91,7 @@ static double preprocess_limit(PlannerInfo *root,
double tuple_fraction, double tuple_fraction,
int64 *offset_est, int64 *count_est); int64 *offset_est, int64 *count_est);
static bool limit_needed(Query *parse); static bool limit_needed(Query *parse);
static void remove_useless_groupby_columns(PlannerInfo *root);
static List *preprocess_groupclause(PlannerInfo *root, List *force); static List *preprocess_groupclause(PlannerInfo *root, List *force);
static List *extract_rollup_sets(List *groupingSets); static List *extract_rollup_sets(List *groupingSets);
static List *reorder_grouping_sets(List *groupingSets, List *sortclause); static List *reorder_grouping_sets(List *groupingSets, List *sortclause);
...@@ -710,6 +713,9 @@ subquery_planner(PlannerGlobal *glob, Query *parse, ...@@ -710,6 +713,9 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
} }
parse->havingQual = (Node *) newHaving; parse->havingQual = (Node *) newHaving;
/* Remove any redundant GROUP BY columns */
remove_useless_groupby_columns(root);
/* /*
* If we have any outer joins, try to reduce them to plain inner joins. * If we have any outer joins, try to reduce them to plain inner joins.
* This step is most easily done after we've done expression * This step is most easily done after we've done expression
...@@ -3207,6 +3213,159 @@ limit_needed(Query *parse) ...@@ -3207,6 +3213,159 @@ limit_needed(Query *parse)
} }
/*
* remove_useless_groupby_columns
* Remove any columns in the GROUP BY clause that are redundant due to
* being functionally dependent on other GROUP BY columns.
*
* Since some other DBMSes do not allow references to ungrouped columns, it's
* not unusual to find all columns listed in GROUP BY even though listing the
* primary-key columns would be sufficient. Deleting such excess columns
* avoids redundant sorting work, so it's worth doing. When we do this, we
* must mark the plan as dependent on the pkey constraint (compare the
* parser's check_ungrouped_columns() and check_functional_grouping()).
*
* In principle, we could treat any NOT-NULL columns appearing in a UNIQUE
* index as the determining columns. But as with check_functional_grouping(),
* there's currently no way to represent dependency on a NOT NULL constraint,
* so we consider only the pkey for now.
*/
static void
remove_useless_groupby_columns(PlannerInfo *root)
{
Query *parse = root->parse;
Bitmapset **groupbyattnos;
Bitmapset **surplusvars;
ListCell *lc;
int relid;
/* No chance to do anything if there are less than two GROUP BY items */
if (list_length(parse->groupClause) < 2)
return;
/* Don't fiddle with the GROUP BY clause if the query has grouping sets */
if (parse->groupingSets)
return;
/*
* Scan the GROUP BY clause to find GROUP BY items that are simple Vars.
* Fill groupbyattnos[k] with a bitmapset of the column attnos of RTE k
* that are GROUP BY items.
*/
groupbyattnos = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
(list_length(parse->rtable) + 1));
foreach(lc, parse->groupClause)
{
SortGroupClause *sgc = (SortGroupClause *) lfirst(lc);
TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList);
Var *var = (Var *) tle->expr;
/*
* Ignore non-Vars and Vars from other query levels.
*
* XXX in principle, stable expressions containing Vars could also be
* removed, if all the Vars are functionally dependent on other GROUP
* BY items. But it's not clear that such cases occur often enough to
* be worth troubling over.
*/
if (!IsA(var, Var) ||
var->varlevelsup > 0)
continue;
/* OK, remember we have this Var */
relid = var->varno;
Assert(relid <= list_length(parse->rtable));
groupbyattnos[relid] = bms_add_member(groupbyattnos[relid],
var->varattno - FirstLowInvalidHeapAttributeNumber);
}
/*
* Consider each relation and see if it is possible to remove some of its
* Vars from GROUP BY. For simplicity and speed, we do the actual removal
* in a separate pass. Here, we just fill surplusvars[k] with a bitmapset
* of the column attnos of RTE k that are removable GROUP BY items.
*/
surplusvars = NULL; /* don't allocate array unless required */
relid = 0;
foreach(lc, parse->rtable)
{
RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc);
Bitmapset *relattnos;
Bitmapset *pkattnos;
Oid constraintOid;
relid++;
/* Only plain relations could have primary-key constraints */
if (rte->rtekind != RTE_RELATION)
continue;
/* Nothing to do unless this rel has multiple Vars in GROUP BY */
relattnos = groupbyattnos[relid];
if (bms_membership(relattnos) != BMS_MULTIPLE)
continue;
/*
* Can't remove any columns for this rel if there is no suitable
* (i.e., nondeferrable) primary key constraint.
*/
pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid);
if (pkattnos == NULL)
continue;
/*
* If the primary key is a proper subset of relattnos then we have
* some items in the GROUP BY that can be removed.
*/
if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
{
/*
* To easily remember whether we've found anything to do, we don't
* allocate the surplusvars[] array until we find something.
*/
if (surplusvars == NULL)
surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
(list_length(parse->rtable) + 1));
/* Remember the attnos of the removable columns */
surplusvars[relid] = bms_difference(relattnos, pkattnos);
/* Also, mark the resulting plan as dependent on this constraint */
parse->constraintDeps = lappend_oid(parse->constraintDeps,
constraintOid);
}
}
/*
* If we found any surplus Vars, build a new GROUP BY clause without them.
* (Note: this may leave some TLEs with unreferenced ressortgroupref
* markings, but that's harmless.)
*/
if (surplusvars != NULL)
{
List *new_groupby = NIL;
foreach(lc, parse->groupClause)
{
SortGroupClause *sgc = (SortGroupClause *) lfirst(lc);
TargetEntry *tle = get_sortgroupclause_tle(sgc, parse->targetList);
Var *var = (Var *) tle->expr;
/*
* New list must include non-Vars, outer Vars, and anything not
* marked as surplus.
*/
if (!IsA(var, Var) ||
var->varlevelsup > 0 ||
!bms_is_member(var->varattno - FirstLowInvalidHeapAttributeNumber,
surplusvars[var->varno]))
new_groupby = lappend(new_groupby, sgc);
}
parse->groupClause = new_groupby;
}
}
/* /*
* preprocess_groupclause - do preparatory work on GROUP BY clause * preprocess_groupclause - do preparatory work on GROUP BY clause
* *
......
...@@ -71,6 +71,9 @@ extern void AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, ...@@ -71,6 +71,9 @@ extern void AlterConstraintNamespaces(Oid ownerId, Oid oldNspId,
extern Oid get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok); extern Oid get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok);
extern Oid get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok); extern Oid get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok);
extern Bitmapset *get_primary_key_attnos(Oid relid, bool deferrableOk,
Oid *constraintOid);
extern bool check_functional_grouping(Oid relid, extern bool check_functional_grouping(Oid relid,
Index varno, Index varlevelsup, Index varno, Index varlevelsup,
List *grouping_columns, List *grouping_columns,
......
...@@ -856,6 +856,70 @@ ERROR: aggregate function calls cannot be nested ...@@ -856,6 +856,70 @@ ERROR: aggregate function calls cannot be nested
LINE 1: select (select max(min(unique1)) from int8_tbl) from tenk1; LINE 1: select (select max(min(unique1)) from int8_tbl) from tenk1;
^ ^
-- --
-- Test removal of redundant GROUP BY columns
--
create temp table t1 (a int, b int, c int, d int, primary key (a, b));
create temp table t2 (x int, y int, z int, primary key (x, y));
create temp table t3 (a int, b int, c int, primary key(a, b) deferrable);
-- Non-primary-key columns can be removed from GROUP BY
explain (costs off) select * from t1 group by a,b,c,d;
QUERY PLAN
----------------------
HashAggregate
Group Key: a, b
-> Seq Scan on t1
(3 rows)
-- No removal can happen if the complete PK is not present in GROUP BY
explain (costs off) select a,c from t1 group by a,c,d;
QUERY PLAN
----------------------
HashAggregate
Group Key: a, c, d
-> Seq Scan on t1
(3 rows)
-- Test removal across multiple relations
explain (costs off) select *
from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z;
QUERY PLAN
-------------------------------------------------------
Group
Group Key: t1.a, t1.b, t2.x, t2.y
-> Merge Join
Merge Cond: ((t1.a = t2.x) AND (t1.b = t2.y))
-> Index Scan using t1_pkey on t1
-> Index Scan using t2_pkey on t2
(6 rows)
-- Test case where t1 can be optimized but not t2
explain (costs off) select t1.*,t2.x,t2.z
from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z;
QUERY PLAN
-------------------------------------------------------
HashAggregate
Group Key: t1.a, t1.b, t2.x, t2.z
-> Merge Join
Merge Cond: ((t1.a = t2.x) AND (t1.b = t2.y))
-> Index Scan using t1_pkey on t1
-> Index Scan using t2_pkey on t2
(6 rows)
-- Cannot optimize when PK is deferrable
explain (costs off) select * from t3 group by a,b,c;
QUERY PLAN
----------------------
HashAggregate
Group Key: a, b, c
-> Seq Scan on t3
(3 rows)
drop table t1;
drop table t2;
drop table t3;
--
-- Test combinations of DISTINCT and/or ORDER BY -- Test combinations of DISTINCT and/or ORDER BY
-- --
select array_agg(a order by b) select array_agg(a order by b)
......
...@@ -3943,12 +3943,14 @@ select d.* from d left join (select distinct * from b) s ...@@ -3943,12 +3943,14 @@ select d.* from d left join (select distinct * from b) s
(1 row) (1 row)
-- join removal is not possible when the GROUP BY contains a column that is -- join removal is not possible when the GROUP BY contains a column that is
-- not in the join condition -- not in the join condition. (Note: as of 9.6, we notice that b.id is a
-- primary key and so drop b.c_id from the GROUP BY of the resulting plan;
-- but this happens too late for join removal in the outer plan level.)
explain (costs off) explain (costs off)
select d.* from d left join (select * from b group by b.id, b.c_id) s select d.* from d left join (select * from b group by b.id, b.c_id) s
on d.a = s.id; on d.a = s.id;
QUERY PLAN QUERY PLAN
--------------------------------------------- ---------------------------------------
Merge Left Join Merge Left Join
Merge Cond: (d.a = s.id) Merge Cond: (d.a = s.id)
-> Sort -> Sort
...@@ -3958,7 +3960,7 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s ...@@ -3958,7 +3960,7 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s
Sort Key: s.id Sort Key: s.id
-> Subquery Scan on s -> Subquery Scan on s
-> HashAggregate -> HashAggregate
Group Key: b.id, b.c_id Group Key: b.id
-> Seq Scan on b -> Seq Scan on b
(11 rows) (11 rows)
......
...@@ -300,6 +300,37 @@ drop table minmaxtest cascade; ...@@ -300,6 +300,37 @@ drop table minmaxtest cascade;
select max(min(unique1)) from tenk1; select max(min(unique1)) from tenk1;
select (select max(min(unique1)) from int8_tbl) from tenk1; select (select max(min(unique1)) from int8_tbl) from tenk1;
--
-- Test removal of redundant GROUP BY columns
--
create temp table t1 (a int, b int, c int, d int, primary key (a, b));
create temp table t2 (x int, y int, z int, primary key (x, y));
create temp table t3 (a int, b int, c int, primary key(a, b) deferrable);
-- Non-primary-key columns can be removed from GROUP BY
explain (costs off) select * from t1 group by a,b,c,d;
-- No removal can happen if the complete PK is not present in GROUP BY
explain (costs off) select a,c from t1 group by a,c,d;
-- Test removal across multiple relations
explain (costs off) select *
from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z;
-- Test case where t1 can be optimized but not t2
explain (costs off) select t1.*,t2.x,t2.z
from t1 inner join t2 on t1.a = t2.x and t1.b = t2.y
group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z;
-- Cannot optimize when PK is deferrable
explain (costs off) select * from t3 group by a,b,c;
drop table t1;
drop table t2;
drop table t3;
-- --
-- Test combinations of DISTINCT and/or ORDER BY -- Test combinations of DISTINCT and/or ORDER BY
-- --
......
...@@ -1279,7 +1279,9 @@ select d.* from d left join (select distinct * from b) s ...@@ -1279,7 +1279,9 @@ select d.* from d left join (select distinct * from b) s
on d.a = s.id and d.b = s.c_id; on d.a = s.id and d.b = s.c_id;
-- join removal is not possible when the GROUP BY contains a column that is -- join removal is not possible when the GROUP BY contains a column that is
-- not in the join condition -- not in the join condition. (Note: as of 9.6, we notice that b.id is a
-- primary key and so drop b.c_id from the GROUP BY of the resulting plan;
-- but this happens too late for join removal in the outer plan level.)
explain (costs off) explain (costs off)
select d.* from d left join (select * from b group by b.id, b.c_id) s select d.* from d left join (select * from b group by b.id, b.c_id) s
on d.a = s.id; on d.a = s.id;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment