Commit b5635948 authored by Andrew Gierth's avatar Andrew Gierth

Support hashed aggregation with grouping sets.

This extends the Aggregate node with two new features: HashAggregate
can now run multiple hashtables concurrently, and a new strategy
MixedAggregate populates hashtables while doing sorted grouping.

The planner will now attempt to save as many sorts as possible when
planning grouping sets queries, while not exceeding work_mem for the
estimated combined sizes of all hashtables used.  No SQL-level changes
are required.  There should be no user-visible impact other than the
new EXPLAIN output and possible changes to result ordering when ORDER
BY was not used (which affected a few regression tests).  The
enable_hashagg option is respected.

Author: Andrew Gierth
Reviewers: Mark Dilger, Andres Freund
Discussion: https://postgr.es/m/87vatszyhj.fsf@news-spur.riddles.org.uk
parent f0a6046b
......@@ -3276,16 +3276,19 @@ select sum(q.a), count(q.b) from ft4 left join (select 13, avg(ft1.c1), sum(ft2.
-- Grouping sets
explain (verbose, costs off)
select c2, sum(c1) from ft1 where c2 < 3 group by rollup(c2) order by 1 nulls last;
QUERY PLAN
---------------------------------------------------------------------------------------------------
GroupAggregate
Output: c2, sum(c1)
Group Key: ft1.c2
Group Key: ()
-> Foreign Scan on public.ft1
Output: c2, c1
Remote SQL: SELECT "C 1", c2 FROM "S 1"."T 1" WHERE ((c2 < 3)) ORDER BY c2 ASC NULLS LAST
(7 rows)
QUERY PLAN
------------------------------------------------------------------------------
Sort
Output: c2, (sum(c1))
Sort Key: ft1.c2
-> MixedAggregate
Output: c2, sum(c1)
Hash Key: ft1.c2
Group Key: ()
-> Foreign Scan on public.ft1
Output: c2, c1
Remote SQL: SELECT "C 1", c2 FROM "S 1"."T 1" WHERE ((c2 < 3))
(10 rows)
select c2, sum(c1) from ft1 where c2 < 3 group by rollup(c2) order by 1 nulls last;
c2 | sum
......@@ -3298,16 +3301,19 @@ select c2, sum(c1) from ft1 where c2 < 3 group by rollup(c2) order by 1 nulls la
explain (verbose, costs off)
select c2, sum(c1) from ft1 where c2 < 3 group by cube(c2) order by 1 nulls last;
QUERY PLAN
---------------------------------------------------------------------------------------------------
GroupAggregate
Output: c2, sum(c1)
Group Key: ft1.c2
Group Key: ()
-> Foreign Scan on public.ft1
Output: c2, c1
Remote SQL: SELECT "C 1", c2 FROM "S 1"."T 1" WHERE ((c2 < 3)) ORDER BY c2 ASC NULLS LAST
(7 rows)
QUERY PLAN
------------------------------------------------------------------------------
Sort
Output: c2, (sum(c1))
Sort Key: ft1.c2
-> MixedAggregate
Output: c2, sum(c1)
Hash Key: ft1.c2
Group Key: ()
-> Foreign Scan on public.ft1
Output: c2, c1
Remote SQL: SELECT "C 1", c2 FROM "S 1"."T 1" WHERE ((c2 < 3))
(10 rows)
select c2, sum(c1) from ft1 where c2 < 3 group by cube(c2) order by 1 nulls last;
c2 | sum
......@@ -3320,20 +3326,19 @@ select c2, sum(c1) from ft1 where c2 < 3 group by cube(c2) order by 1 nulls last
explain (verbose, costs off)
select c2, c6, sum(c1) from ft1 where c2 < 3 group by grouping sets(c2, c6) order by 1 nulls last, 2 nulls last;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------
QUERY PLAN
----------------------------------------------------------------------------------
Sort
Output: c2, c6, (sum(c1))
Sort Key: ft1.c2, ft1.c6
-> GroupAggregate
-> HashAggregate
Output: c2, c6, sum(c1)
Group Key: ft1.c2
Sort Key: ft1.c6
Group Key: ft1.c6
Hash Key: ft1.c2
Hash Key: ft1.c6
-> Foreign Scan on public.ft1
Output: c2, c6, c1
Remote SQL: SELECT "C 1", c2, c6 FROM "S 1"."T 1" WHERE ((c2 < 3)) ORDER BY c2 ASC NULLS LAST
(11 rows)
Remote SQL: SELECT "C 1", c2, c6 FROM "S 1"."T 1" WHERE ((c2 < 3))
(10 rows)
select c2, c6, sum(c1) from ft1 where c2 < 3 group by grouping sets(c2, c6) order by 1 nulls last, 2 nulls last;
c2 | c6 | sum
......
......@@ -1015,6 +1015,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
pname = "HashAggregate";
strategy = "Hashed";
break;
case AGG_MIXED:
pname = "MixedAggregate";
strategy = "Mixed";
break;
default:
pname = "Aggregate ???";
strategy = "???";
......@@ -1978,6 +1982,19 @@ show_grouping_set_keys(PlanState *planstate,
ListCell *lc;
List *gsets = aggnode->groupingSets;
AttrNumber *keycols = aggnode->grpColIdx;
const char *keyname;
const char *keysetname;
if (aggnode->aggstrategy == AGG_HASHED || aggnode->aggstrategy == AGG_MIXED)
{
keyname = "Hash Key";
keysetname = "Hash Keys";
}
else
{
keyname = "Group Key";
keysetname = "Group Keys";
}
ExplainOpenGroup("Grouping Set", NULL, true, es);
......@@ -1992,7 +2009,7 @@ show_grouping_set_keys(PlanState *planstate,
es->indent++;
}
ExplainOpenGroup("Group Keys", "Group Keys", false, es);
ExplainOpenGroup(keysetname, keysetname, false, es);
foreach(lc, gsets)
{
......@@ -2016,12 +2033,12 @@ show_grouping_set_keys(PlanState *planstate,
}
if (!result && es->format == EXPLAIN_FORMAT_TEXT)
ExplainPropertyText("Group Key", "()", es);
ExplainPropertyText(keyname, "()", es);
else
ExplainPropertyListNested("Group Key", result, es);
ExplainPropertyListNested(keyname, result, es);
}
ExplainCloseGroup("Group Keys", "Group Keys", false, es);
ExplainCloseGroup(keysetname, keysetname, false, es);
if (sortnode && es->format == EXPLAIN_FORMAT_TEXT)
es->indent--;
......
This diff is collapsed.
......@@ -12,7 +12,7 @@ subdir = src/backend/lib
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
OBJS = binaryheap.o bipartite_match.o hyperloglog.o ilist.o pairingheap.o \
rbtree.o stringinfo.o
OBJS = binaryheap.o bipartite_match.o hyperloglog.o ilist.o knapsack.o \
pairingheap.o rbtree.o stringinfo.o
include $(top_srcdir)/src/backend/common.mk
/*-------------------------------------------------------------------------
*
* knapsack.c
* Knapsack problem solver
*
* Given input vectors of integral item weights (must be >= 0) and values
* (double >= 0), compute the set of items which produces the greatest total
* value without exceeding a specified total weight; each item is included at
* most once (this is the 0/1 knapsack problem). Weight 0 items will always be
* included.
*
* The performance of this algorithm is pseudo-polynomial, O(nW) where W is the
* weight limit. To use with non-integral weights or approximate solutions,
* the caller should pre-scale the input weights to a suitable range. This
* allows approximate solutions in polynomial time (the general case of the
* exact problem is NP-hard).
*
* Copyright (c) 2017, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/lib/knapsack.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <math.h>
#include <limits.h>
#include "lib/knapsack.h"
#include "miscadmin.h"
#include "nodes/bitmapset.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/palloc.h"
/*
* DiscreteKnapsack
*
* The item_values input is optional; if omitted, all the items are assumed to
* have value 1.
*
* Returns a Bitmapset of the 0..(n-1) indexes of the items chosen for
* inclusion in the solution.
*
* This uses the usual dynamic-programming algorithm, adapted to reuse the
* memory on each pass (by working from larger weights to smaller). At the
* start of pass number i, the values[w] array contains the largest value
* computed with total weight <= w, using only items with indices < i; and
* sets[w] contains the bitmap of items actually used for that value. (The
* bitmapsets are all pre-initialized with an unused high bit so that memory
* allocation is done only once.)
*/
Bitmapset *
DiscreteKnapsack(int max_weight, int num_items,
int *item_weights, double *item_values)
{
MemoryContext local_ctx = AllocSetContextCreate(CurrentMemoryContext,
"Knapsack",
ALLOCSET_SMALL_MINSIZE,
ALLOCSET_SMALL_INITSIZE,
ALLOCSET_SMALL_MAXSIZE);
MemoryContext oldctx = MemoryContextSwitchTo(local_ctx);
double *values;
Bitmapset **sets;
Bitmapset *result;
int i,
j;
Assert(max_weight >= 0);
Assert(num_items > 0 && item_weights);
values = palloc((1 + max_weight) * sizeof(double));
sets = palloc((1 + max_weight) * sizeof(Bitmapset *));
for (i = 0; i <= max_weight; ++i)
{
values[i] = 0;
sets[i] = bms_make_singleton(num_items);
}
for (i = 0; i < num_items; ++i)
{
int iw = item_weights[i];
double iv = item_values ? item_values[i] : 1;
for (j = max_weight; j >= iw; --j)
{
int ow = j - iw;
if (values[j] <= values[ow] + iv)
{
/* copy sets[ow] to sets[j] without realloc */
if (j != ow)
{
sets[j] = bms_del_members(sets[j], sets[j]);
sets[j] = bms_add_members(sets[j], sets[ow]);
}
sets[j] = bms_add_member(sets[j], i);
values[j] = values[ow] + iv;
}
}
}
MemoryContextSwitchTo(oldctx);
result = bms_del_member(bms_copy(sets[max_weight]), num_items);
MemoryContextDelete(local_ctx);
return result;
}
......@@ -21,6 +21,7 @@
#include "postgres.h"
#include "access/hash.h"
#include "nodes/pg_list.h"
#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
......@@ -457,6 +458,35 @@ bms_overlap(const Bitmapset *a, const Bitmapset *b)
return false;
}
/*
* bms_overlap_list - does a set overlap an integer list?
*/
bool
bms_overlap_list(const Bitmapset *a, const List *b)
{
ListCell *lc;
int wordnum,
bitnum;
if (a == NULL || b == NIL)
return false;
foreach(lc, b)
{
int x = lfirst_int(lc);
if (x < 0)
elog(ERROR, "negative bitmapset member not allowed");
wordnum = WORDNUM(x);
bitnum = BITNUM(x);
if (wordnum < a->nwords)
if ((a->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
return true;
}
return false;
}
/*
* bms_nonempty_difference - do sets have a nonempty difference?
*/
......
......@@ -1941,6 +1941,28 @@ _outAggPath(StringInfo str, const AggPath *node)
WRITE_NODE_FIELD(qual);
}
static void
_outRollupData(StringInfo str, const RollupData *node)
{
WRITE_NODE_TYPE("ROLLUP");
WRITE_NODE_FIELD(groupClause);
WRITE_NODE_FIELD(gsets);
WRITE_NODE_FIELD(gsets_data);
WRITE_FLOAT_FIELD(numGroups, "%.0f");
WRITE_BOOL_FIELD(hashable);
WRITE_BOOL_FIELD(is_hashed);
}
static void
_outGroupingSetData(StringInfo str, const GroupingSetData *node)
{
WRITE_NODE_TYPE("GSDATA");
WRITE_NODE_FIELD(set);
WRITE_FLOAT_FIELD(numGroups, "%.0f");
}
static void
_outGroupingSetsPath(StringInfo str, const GroupingSetsPath *node)
{
......@@ -1949,8 +1971,8 @@ _outGroupingSetsPath(StringInfo str, const GroupingSetsPath *node)
_outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath);
WRITE_NODE_FIELD(rollup_groupclauses);
WRITE_NODE_FIELD(rollup_lists);
WRITE_ENUM_FIELD(aggstrategy, AggStrategy);
WRITE_NODE_FIELD(rollups);
WRITE_NODE_FIELD(qual);
}
......@@ -3961,14 +3983,18 @@ outNode(StringInfo str, const void *obj)
case T_PlannerParamItem:
_outPlannerParamItem(str, obj);
break;
case T_RollupData:
_outRollupData(str, obj);
break;
case T_GroupingSetData:
_outGroupingSetData(str, obj);
break;
case T_StatisticExtInfo:
_outStatisticExtInfo(str, obj);
break;
case T_ExtensibleNode:
_outExtensibleNode(str, obj);
break;
case T_CreateStmt:
_outCreateStmt(str, obj);
break;
......
......@@ -1884,11 +1884,16 @@ cost_agg(Path *path, PlannerInfo *root,
total_cost = startup_cost + cpu_tuple_cost;
output_tuples = 1;
}
else if (aggstrategy == AGG_SORTED)
else if (aggstrategy == AGG_SORTED || aggstrategy == AGG_MIXED)
{
/* Here we are able to deliver output on-the-fly */
startup_cost = input_startup_cost;
total_cost = input_total_cost;
if (aggstrategy == AGG_MIXED && !enable_hashagg)
{
startup_cost += disable_cost;
total_cost += disable_cost;
}
/* calcs phrased this way to match HASHED case, see note above */
total_cost += aggcosts->transCost.startup;
total_cost += aggcosts->transCost.per_tuple * input_tuples;
......
......@@ -1783,18 +1783,15 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
{
Agg *plan;
Plan *subplan;
List *rollup_groupclauses = best_path->rollup_groupclauses;
List *rollup_lists = best_path->rollup_lists;
List *rollups = best_path->rollups;
AttrNumber *grouping_map;
int maxref;
List *chain;
ListCell *lc,
*lc2;
ListCell *lc;
/* Shouldn't get here without grouping sets */
Assert(root->parse->groupingSets);
Assert(rollup_lists != NIL);
Assert(list_length(rollup_lists) == list_length(rollup_groupclauses));
Assert(rollups != NIL);
/*
* Agg can project, so no need to be terribly picky about child tlist, but
......@@ -1846,72 +1843,86 @@ create_groupingsets_plan(PlannerInfo *root, GroupingSetsPath *best_path)
* costs will be shown by EXPLAIN.
*/
chain = NIL;
if (list_length(rollup_groupclauses) > 1)
if (list_length(rollups) > 1)
{
forboth(lc, rollup_groupclauses, lc2, rollup_lists)
ListCell *lc2 = lnext(list_head(rollups));
bool is_first_sort = ((RollupData *) linitial(rollups))->is_hashed;
for_each_cell(lc, lc2)
{
List *groupClause = (List *) lfirst(lc);
List *gsets = (List *) lfirst(lc2);
RollupData *rollup = lfirst(lc);
AttrNumber *new_grpColIdx;
Plan *sort_plan;
Plan *sort_plan = NULL;
Plan *agg_plan;
AggStrategy strat;
/* We want to iterate over all but the last rollup list elements */
if (lnext(lc) == NULL)
break;
new_grpColIdx = remap_groupColIdx(root, rollup->groupClause);
if (!rollup->is_hashed && !is_first_sort)
{
sort_plan = (Plan *)
make_sort_from_groupcols(rollup->groupClause,
new_grpColIdx,
subplan);
}
new_grpColIdx = remap_groupColIdx(root, groupClause);
if (!rollup->is_hashed)
is_first_sort = false;
sort_plan = (Plan *)
make_sort_from_groupcols(groupClause,
new_grpColIdx,
subplan);
if (rollup->is_hashed)
strat = AGG_HASHED;
else if (list_length(linitial(rollup->gsets)) == 0)
strat = AGG_PLAIN;
else
strat = AGG_SORTED;
agg_plan = (Plan *) make_agg(NIL,
NIL,
AGG_SORTED,
strat,
AGGSPLIT_SIMPLE,
list_length((List *) linitial(gsets)),
list_length((List *) linitial(rollup->gsets)),
new_grpColIdx,
extract_grouping_ops(groupClause),
gsets,
extract_grouping_ops(rollup->groupClause),
rollup->gsets,
NIL,
0, /* numGroups not needed */
rollup->numGroups,
sort_plan);
/*
* Nuke stuff we don't need to avoid bloating debug output.
* Remove stuff we don't need to avoid bloating debug output.
*/
sort_plan->targetlist = NIL;
sort_plan->lefttree = NULL;
if (sort_plan)
{
sort_plan->targetlist = NIL;
sort_plan->lefttree = NULL;
}
chain = lappend(chain, agg_plan);
}
}
/*
* Now make the final Agg node
* Now make the real Agg node
*/
{
List *groupClause = (List *) llast(rollup_groupclauses);
List *gsets = (List *) llast(rollup_lists);
RollupData *rollup = linitial(rollups);
AttrNumber *top_grpColIdx;
int numGroupCols;
top_grpColIdx = remap_groupColIdx(root, groupClause);
top_grpColIdx = remap_groupColIdx(root, rollup->groupClause);
numGroupCols = list_length((List *) linitial(gsets));
numGroupCols = list_length((List *) linitial(rollup->gsets));
plan = make_agg(build_path_tlist(root, &best_path->path),
best_path->qual,
(numGroupCols > 0) ? AGG_SORTED : AGG_PLAIN,
best_path->aggstrategy,
AGGSPLIT_SIMPLE,
numGroupCols,
top_grpColIdx,
extract_grouping_ops(groupClause),
gsets,
extract_grouping_ops(rollup->groupClause),
rollup->gsets,
chain,
0, /* numGroups not needed */
rollup->numGroups,
subplan);
/* Copy cost data from Path to Plan */
......
This diff is collapsed.
......@@ -2697,10 +2697,9 @@ create_agg_path(PlannerInfo *root,
* 'subpath' is the path representing the source of data
* 'target' is the PathTarget to be computed
* 'having_qual' is the HAVING quals if any
* 'rollup_lists' is a list of grouping sets
* 'rollup_groupclauses' is a list of grouping clauses for grouping sets
* 'rollups' is a list of RollupData nodes
* 'agg_costs' contains cost info about the aggregate functions to be computed
* 'numGroups' is the estimated number of groups
* 'numGroups' is the estimated total number of groups
*/
GroupingSetsPath *
create_groupingsets_path(PlannerInfo *root,
......@@ -2708,13 +2707,15 @@ create_groupingsets_path(PlannerInfo *root,
Path *subpath,
PathTarget *target,
List *having_qual,
List *rollup_lists,
List *rollup_groupclauses,
AggStrategy aggstrategy,
List *rollups,
const AggClauseCosts *agg_costs,
double numGroups)
{
GroupingSetsPath *pathnode = makeNode(GroupingSetsPath);
int numGroupCols;
ListCell *lc;
bool is_first = true;
bool is_first_sort = true;
/* The topmost generated Plan node will be an Agg */
pathnode->path.pathtype = T_Agg;
......@@ -2727,75 +2728,110 @@ create_groupingsets_path(PlannerInfo *root,
pathnode->path.parallel_workers = subpath->parallel_workers;
pathnode->subpath = subpath;
/*
* Simplify callers by downgrading AGG_SORTED to AGG_PLAIN, and AGG_MIXED
* to AGG_HASHED, here if possible.
*/
if (aggstrategy == AGG_SORTED &&
list_length(rollups) == 1 &&
((RollupData *) linitial(rollups))->groupClause == NIL)
aggstrategy = AGG_PLAIN;
if (aggstrategy == AGG_MIXED &&
list_length(rollups) == 1)
aggstrategy = AGG_HASHED;
/*
* Output will be in sorted order by group_pathkeys if, and only if, there
* is a single rollup operation on a non-empty list of grouping
* expressions.
*/
if (list_length(rollup_groupclauses) == 1 &&
((List *) linitial(rollup_groupclauses)) != NIL)
if (aggstrategy == AGG_SORTED && list_length(rollups) == 1)
pathnode->path.pathkeys = root->group_pathkeys;
else
pathnode->path.pathkeys = NIL;
pathnode->rollup_groupclauses = rollup_groupclauses;
pathnode->rollup_lists = rollup_lists;
pathnode->aggstrategy = aggstrategy;
pathnode->rollups = rollups;
pathnode->qual = having_qual;
Assert(rollup_lists != NIL);
Assert(list_length(rollup_lists) == list_length(rollup_groupclauses));
/* Account for cost of the topmost Agg node */
numGroupCols = list_length((List *) linitial((List *) llast(rollup_lists)));
cost_agg(&pathnode->path, root,
(numGroupCols > 0) ? AGG_SORTED : AGG_PLAIN,
agg_costs,
numGroupCols,
numGroups,
subpath->startup_cost,
subpath->total_cost,
subpath->rows);
Assert(rollups != NIL);
Assert(aggstrategy != AGG_PLAIN || list_length(rollups) == 1);
Assert(aggstrategy != AGG_MIXED || list_length(rollups) > 1);
/*
* Add in the costs and output rows of the additional sorting/aggregation
* steps, if any. Only total costs count, since the extra sorts aren't
* run on startup.
*/
if (list_length(rollup_lists) > 1)
foreach(lc, rollups)
{
ListCell *lc;
RollupData *rollup = lfirst(lc);
List *gsets = rollup->gsets;
int numGroupCols = list_length(linitial(gsets));
foreach(lc, rollup_lists)
/*
* In AGG_SORTED or AGG_PLAIN mode, the first rollup takes the
* (already-sorted) input, and following ones do their own sort.
*
* In AGG_HASHED mode, there is one rollup for each grouping set.
*
* In AGG_MIXED mode, the first rollups are hashed, the first
* non-hashed one takes the (already-sorted) input, and following ones
* do their own sort.
*/
if (is_first)
{
cost_agg(&pathnode->path, root,
aggstrategy,
agg_costs,
numGroupCols,
rollup->numGroups,
subpath->startup_cost,
subpath->total_cost,
subpath->rows);
is_first = false;
if (!rollup->is_hashed)
is_first_sort = false;
}
else
{
List *gsets = (List *) lfirst(lc);
Path sort_path; /* dummy for result of cost_sort */
Path agg_path; /* dummy for result of cost_agg */
/* We must iterate over all but the last rollup_lists element */
if (lnext(lc) == NULL)
break;
/* Account for cost of sort, but don't charge input cost again */
cost_sort(&sort_path, root, NIL,
0.0,
subpath->rows,
subpath->pathtarget->width,
0.0,
work_mem,
-1.0);
/* Account for cost of aggregation */
numGroupCols = list_length((List *) linitial(gsets));
cost_agg(&agg_path, root,
AGG_SORTED,
agg_costs,
numGroupCols,
numGroups, /* XXX surely not right for all steps? */
sort_path.startup_cost,
sort_path.total_cost,
sort_path.rows);
if (rollup->is_hashed || is_first_sort)
{
/*
* Account for cost of aggregation, but don't charge input
* cost again
*/
cost_agg(&agg_path, root,
rollup->is_hashed ? AGG_HASHED : AGG_SORTED,
agg_costs,
numGroupCols,
rollup->numGroups,
0.0, 0.0,
subpath->rows);
if (!rollup->is_hashed)
is_first_sort = false;
}
else
{
/* Account for cost of sort, but don't charge input cost again */
cost_sort(&sort_path, root, NIL,
0.0,
subpath->rows,
subpath->pathtarget->width,
0.0,
work_mem,
-1.0);
/* Account for cost of aggregation */
cost_agg(&agg_path, root,
AGG_SORTED,
agg_costs,
numGroupCols,
rollup->numGroups,
sort_path.startup_cost,
sort_path.total_cost,
sort_path.rows);
}
pathnode->path.total_cost += agg_path.total_cost;
pathnode->path.rows += agg_path.rows;
......
/*
* knapsack.h
*
* Copyright (c) 2017, PostgreSQL Global Development Group
*
* src/include/lib/knapsack.h
*/
#ifndef KNAPSACK_H
#define KNAPSACK_H
#include "postgres.h"
#include "nodes/bitmapset.h"
extern Bitmapset *DiscreteKnapsack(int max_weight, int num_items,
int *item_weights, double *item_values);
#endif /* KNAPSACK_H */
......@@ -20,6 +20,11 @@
#ifndef BITMAPSET_H
#define BITMAPSET_H
/*
* Forward decl to save including pg_list.h
*/
struct List;
/*
* Data representation
*/
......@@ -70,6 +75,7 @@ extern bool bms_is_subset(const Bitmapset *a, const Bitmapset *b);
extern BMS_Comparison bms_subset_compare(const Bitmapset *a, const Bitmapset *b);
extern bool bms_is_member(int x, const Bitmapset *a);
extern bool bms_overlap(const Bitmapset *a, const Bitmapset *b);
extern bool bms_overlap_list(const Bitmapset *a, const struct List *b);
extern bool bms_nonempty_difference(const Bitmapset *a, const Bitmapset *b);
extern int bms_singleton_member(const Bitmapset *a);
extern bool bms_get_singleton_member(const Bitmapset *a, int *member);
......
......@@ -1699,6 +1699,7 @@ typedef struct AggStatePerAggData *AggStatePerAgg;
typedef struct AggStatePerTransData *AggStatePerTrans;
typedef struct AggStatePerGroupData *AggStatePerGroup;
typedef struct AggStatePerPhaseData *AggStatePerPhase;
typedef struct AggStatePerHashData *AggStatePerHash;
typedef struct AggState
{
......@@ -1706,15 +1707,17 @@ typedef struct AggState
List *aggs; /* all Aggref nodes in targetlist & quals */
int numaggs; /* length of list (could be zero!) */
int numtrans; /* number of pertrans items */
AggStrategy aggstrategy; /* strategy mode */
AggSplit aggsplit; /* agg-splitting mode, see nodes.h */
AggStatePerPhase phase; /* pointer to current phase data */
int numphases; /* number of phases */
int numphases; /* number of phases (including phase 0) */
int current_phase; /* current phase number */
FmgrInfo *hashfunctions; /* per-grouping-field hash fns */
AggStatePerAgg peragg; /* per-Aggref information */
AggStatePerTrans pertrans; /* per-Trans state information */
ExprContext *hashcontext; /* econtexts for long-lived data (hashtable) */
ExprContext **aggcontexts; /* econtexts for long-lived data (per GS) */
ExprContext *tmpcontext; /* econtext for input expressions */
ExprContext *curaggcontext; /* currently active aggcontext */
AggStatePerTrans curpertrans; /* currently active trans state */
bool input_done; /* indicates end of input */
bool agg_done; /* indicates completion of Agg scan */
......@@ -1726,21 +1729,17 @@ typedef struct AggState
/* These fields are for grouping set phase data */
int maxsets; /* The max number of sets in any phase */
AggStatePerPhase phases; /* array of all phases */
Tuplesortstate *sort_in; /* sorted input to phases > 0 */
Tuplesortstate *sort_in; /* sorted input to phases > 1 */
Tuplesortstate *sort_out; /* input is copied here for next phase */
TupleTableSlot *sort_slot; /* slot for sort results */
/* these fields are used in AGG_PLAIN and AGG_SORTED modes: */
AggStatePerGroup pergroup; /* per-Aggref-per-group working state */
HeapTuple grp_firstTuple; /* copy of first tuple of current group */
/* these fields are used in AGG_HASHED mode: */
TupleHashTable hashtable; /* hash table with one entry per group */
TupleTableSlot *hashslot; /* slot for loading hash table */
int numhashGrpCols; /* number of columns in hash table */
int largestGrpColIdx; /* largest column required for hashing */
AttrNumber *hashGrpColIdxInput; /* and their indices in input slot */
AttrNumber *hashGrpColIdxHash; /* indices for execGrouping in hashtbl */
/* these fields are used in AGG_HASHED and AGG_MIXED modes: */
bool table_filled; /* hash table filled yet? */
TupleHashIterator hashiter; /* for iterating through hash table */
int num_hashes;
AggStatePerHash perhash;
AggStatePerGroup *hash_pergroup; /* array of per-group pointers */
/* support for evaluation of agg inputs */
TupleTableSlot *evalslot; /* slot for agg inputs */
ProjectionInfo *evalproj; /* projection machinery */
......
......@@ -261,6 +261,8 @@ typedef enum NodeTag
T_PlaceHolderInfo,
T_MinMaxAggInfo,
T_PlannerParamItem,
T_RollupData,
T_GroupingSetData,
T_StatisticExtInfo,
/*
......@@ -724,7 +726,8 @@ typedef enum AggStrategy
{
AGG_PLAIN, /* simple agg across all input rows */
AGG_SORTED, /* grouped agg, input must be sorted */
AGG_HASHED /* grouped agg, use internal hashtable */
AGG_HASHED, /* grouped agg, use internal hashtable */
AGG_MIXED /* grouped agg, hash and sort both used */
} AggStrategy;
/*
......
......@@ -758,7 +758,7 @@ typedef struct Agg
Oid *grpOperators; /* equality operators to compare with */
long numGroups; /* estimated number of groups in input */
Bitmapset *aggParams; /* IDs of Params used in Aggref inputs */
/* Note: planner provides numGroups & aggParams only in AGG_HASHED case */
/* Note: planner provides numGroups & aggParams only in HASHED/MIXED case */
List *groupingSets; /* grouping sets to use */
List *chain; /* chained Agg/Sort nodes */
} Agg;
......
......@@ -1417,18 +1417,38 @@ typedef struct AggPath
List *qual; /* quals (HAVING quals), if any */
} AggPath;
/*
* Various annotations used for grouping sets in the planner.
*/
typedef struct GroupingSetData
{
NodeTag type;
List *set; /* grouping set as list of sortgrouprefs */
double numGroups; /* est. number of result groups */
} GroupingSetData;
typedef struct RollupData
{
NodeTag type;
List *groupClause; /* applicable subset of parse->groupClause */
List *gsets; /* lists of integer indexes into groupClause */
List *gsets_data; /* list of GroupingSetData */
double numGroups; /* est. number of result groups */
bool hashable; /* can be hashed */
bool is_hashed; /* to be implemented as a hashagg */
} RollupData;
/*
* GroupingSetsPath represents a GROUPING SETS aggregation
*
* Currently we only support this in sorted not hashed form, so the input
* must always be appropriately presorted.
*/
typedef struct GroupingSetsPath
{
Path path;
Path *subpath; /* path representing input source */
List *rollup_groupclauses; /* list of lists of SortGroupClause's */
List *rollup_lists; /* parallel list of lists of grouping sets */
AggStrategy aggstrategy; /* basic strategy */
List *rollups; /* list of RollupData */
List *qual; /* quals (HAVING quals), if any */
} GroupingSetsPath;
......
......@@ -195,8 +195,8 @@ extern GroupingSetsPath *create_groupingsets_path(PlannerInfo *root,
Path *subpath,
PathTarget *target,
List *having_qual,
List *rollup_lists,
List *rollup_groupclauses,
AggStrategy aggstrategy,
List *rollups,
const AggClauseCosts *agg_costs,
double numGroups);
extern MinMaxAggPath *create_minmaxagg_path(PlannerInfo *root,
......
This diff is collapsed.
......@@ -233,6 +233,7 @@ SELECT few.dataa, count(*), min(id), max(id), generate_series(1,3) FROM few GROU
(6 rows)
-- grouping sets are a bit special, they produce NULLs in columns not actually NULL
set enable_hashagg = false;
SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab);
dataa | b | g | count
-------+-----+---+-------
......@@ -311,46 +312,46 @@ SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(d
b | bar | | 2
b | | | 2
| | | 6
a | | 1 | 2
b | | 1 | 1
| | 1 | 3
a | | 2 | 2
b | | 2 | 1
| | 2 | 3
| bar | 1 | 2
| bar | 2 | 2
| bar | | 4
| foo | 1 | 1
| foo | 2 | 1
| foo | | 2
a | | 1 | 2
b | | 1 | 1
| | 1 | 3
a | | 2 | 2
b | | 2 | 1
| | 2 | 3
(24 rows)
SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab, g) ORDER BY dataa;
dataa | b | g | count
-------+-----+---+-------
a | foo | | 2
a | | | 4
a | | 2 | 2
a | bar | 1 | 1
a | bar | 2 | 1
a | bar | | 2
a | foo | 1 | 1
a | foo | 2 | 1
a | foo | | 2
a | | | 4
a | | 1 | 2
a | | 2 | 2
b | bar | 2 | 1
b | bar | 1 | 1
b | | | 2
b | | 1 | 1
b | | 2 | 1
b | bar | 1 | 1
b | bar | 2 | 1
b | bar | | 2
| foo | | 2
| foo | 1 | 1
b | | 2 | 1
| | 2 | 3
| | | 6
| bar | 1 | 2
| bar | 2 | 2
| | | 6
| foo | 2 | 1
| bar | | 4
| foo | 1 | 1
| foo | 2 | 1
| foo | | 2
| | 1 | 3
(24 rows)
......@@ -360,29 +361,30 @@ SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(d
a | bar | 1 | 1
a | foo | 1 | 1
b | bar | 1 | 1
| bar | 1 | 2
| foo | 1 | 1
a | | 1 | 2
b | | 1 | 1
| | 1 | 3
| bar | 1 | 2
| foo | 1 | 1
| foo | 2 | 1
| bar | 2 | 2
a | | 2 | 2
b | | 2 | 1
a | bar | 2 | 1
| bar | 2 | 2
| | 2 | 3
| foo | 2 | 1
a | bar | 2 | 1
a | foo | 2 | 1
b | bar | 2 | 1
a | foo | | 2
a | | | 4
b | bar | | 2
b | | | 2
| | | 6
a | | | 4
a | foo | | 2
a | bar | | 2
| bar | | 4
| foo | | 2
a | bar | | 2
(24 rows)
reset enable_hashagg;
-- data modification
CREATE TABLE fewmore AS SELECT generate_series(1,3) AS data;
INSERT INTO fewmore VALUES(generate_series(4,5));
......
......@@ -31,6 +31,14 @@ copy gstest3 from stdin;
\.
alter table gstest3 add primary key (a);
create temp table gstest4(id integer, v integer,
unhashable_col bit(4), unsortable_col xid);
insert into gstest4
values (1,1,b'0000','1'), (2,2,b'0001','1'),
(3,4,b'0010','2'), (4,8,b'0011','2'),
(5,16,b'0000','2'), (6,32,b'0001','2'),
(7,64,b'0010','1'), (8,128,b'0011','1');
create temp table gstest_empty (a integer, b integer, v integer);
create function gstest_data(v integer, out a integer, out b integer)
......@@ -43,8 +51,11 @@ create function gstest_data(v integer, out a integer, out b integer)
-- basic functionality
set enable_hashagg = false; -- test hashing explicitly later
-- simple rollup with multiple plain aggregates, with and without ordering
-- (and with ordering differing from grouping)
select a, b, grouping(a,b), sum(v), count(*), max(v)
from gstest1 group by rollup (a,b);
select a, b, grouping(a,b), sum(v), count(*), max(v)
......@@ -161,7 +172,7 @@ select a, b from (values (1,2),(2,3)) v(a,b) group by a,b, grouping sets(a);
-- Tests for chained aggregates
select a, b, grouping(a,b), sum(v), count(*), max(v)
from gstest1 group by grouping sets ((a,b),(a+1,b+1),(a+2,b+2));
from gstest1 group by grouping sets ((a,b),(a+1,b+1),(a+2,b+2)) order by 3,6;
select(select (select grouping(a,b) from (values (1)) v2(c)) from (values (1,2)) v1(a,b) group by (a,b)) from (values(6,7)) v3(e,f) GROUP BY ROLLUP((e+1),(f+1));
select(select (select grouping(a,b) from (values (1)) v2(c)) from (values (1,2)) v1(a,b) group by (a,b)) from (values(6,7)) v3(e,f) GROUP BY CUBE((e+1),(f+1)) ORDER BY (e+1),(f+1);
select a, b, sum(c), sum(sum(c)) over (order by a,b) as rsum
......@@ -224,4 +235,146 @@ select array(select row(v.a,s1.*) from (select two,four, count(*) from onek grou
select sum(ten) from onek group by two, rollup(four::text) order by 1;
select sum(ten) from onek group by rollup(four::text), two order by 1;
-- hashing support
set enable_hashagg = true;
-- failure cases
select count(*) from gstest4 group by rollup(unhashable_col,unsortable_col);
select array_agg(v order by v) from gstest4 group by grouping sets ((id,unsortable_col),(id));
-- simple cases
select a, b, grouping(a,b), sum(v), count(*), max(v)
from gstest1 group by grouping sets ((a),(b)) order by 3,1,2;
explain (costs off) select a, b, grouping(a,b), sum(v), count(*), max(v)
from gstest1 group by grouping sets ((a),(b)) order by 3,1,2;
select a, b, grouping(a,b), sum(v), count(*), max(v)
from gstest1 group by cube(a,b) order by 3,1,2;
explain (costs off) select a, b, grouping(a,b), sum(v), count(*), max(v)
from gstest1 group by cube(a,b) order by 3,1,2;
-- shouldn't try and hash
explain (costs off)
select a, b, grouping(a,b), array_agg(v order by v)
from gstest1 group by cube(a,b);
-- mixed hashable/sortable cases
select unhashable_col, unsortable_col,
grouping(unhashable_col, unsortable_col),
count(*), sum(v)
from gstest4 group by grouping sets ((unhashable_col),(unsortable_col))
order by 3, 5;
explain (costs off)
select unhashable_col, unsortable_col,
grouping(unhashable_col, unsortable_col),
count(*), sum(v)
from gstest4 group by grouping sets ((unhashable_col),(unsortable_col))
order by 3,5;
select unhashable_col, unsortable_col,
grouping(unhashable_col, unsortable_col),
count(*), sum(v)
from gstest4 group by grouping sets ((v,unhashable_col),(v,unsortable_col))
order by 3,5;
explain (costs off)
select unhashable_col, unsortable_col,
grouping(unhashable_col, unsortable_col),
count(*), sum(v)
from gstest4 group by grouping sets ((v,unhashable_col),(v,unsortable_col))
order by 3,5;
-- empty input: first is 0 rows, second 1, third 3 etc.
select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),a);
explain (costs off)
select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),a);
select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),());
select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),(),(),());
explain (costs off)
select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),(),(),());
select sum(v), count(*) from gstest_empty group by grouping sets ((),(),());
explain (costs off)
select sum(v), count(*) from gstest_empty group by grouping sets ((),(),());
-- check that functionally dependent cols are not nulled
select a, d, grouping(a,b,c)
from gstest3
group by grouping sets ((a,b), (a,c));
explain (costs off)
select a, d, grouping(a,b,c)
from gstest3
group by grouping sets ((a,b), (a,c));
-- simple rescan tests
select a, b, sum(v.x)
from (values (1),(2)) v(x), gstest_data(v.x)
group by grouping sets (a,b);
explain (costs off)
select a, b, sum(v.x)
from (values (1),(2)) v(x), gstest_data(v.x)
group by grouping sets (a,b);
select *
from (values (1),(2)) v(x),
lateral (select a, b, sum(v.x) from gstest_data(v.x) group by grouping sets (a,b)) s;
explain (costs off)
select *
from (values (1),(2)) v(x),
lateral (select a, b, sum(v.x) from gstest_data(v.x) group by grouping sets (a,b)) s;
-- Tests for chained aggregates
select a, b, grouping(a,b), sum(v), count(*), max(v)
from gstest1 group by grouping sets ((a,b),(a+1,b+1),(a+2,b+2)) order by 3,6;
explain (costs off)
select a, b, grouping(a,b), sum(v), count(*), max(v)
from gstest1 group by grouping sets ((a,b),(a+1,b+1),(a+2,b+2)) order by 3,6;
select a, b, sum(c), sum(sum(c)) over (order by a,b) as rsum
from gstest2 group by cube (a,b) order by rsum, a, b;
explain (costs off)
select a, b, sum(c), sum(sum(c)) over (order by a,b) as rsum
from gstest2 group by cube (a,b) order by rsum, a, b;
select a, b, sum(v.x)
from (values (1),(2)) v(x), gstest_data(v.x)
group by cube (a,b) order by a,b;
explain (costs off)
select a, b, sum(v.x)
from (values (1),(2)) v(x), gstest_data(v.x)
group by cube (a,b) order by a,b;
-- More rescan tests
select * from (values (1),(2)) v(a) left join lateral (select v.a, four, ten, count(*) from onek group by cube(four,ten)) s on true order by v.a,four,ten;
select array(select row(v.a,s1.*) from (select two,four, count(*) from onek group by cube(two,four) order by two,four) s1) from (values (1),(2)) v(a);
-- Rescan logic changes when there are no empty grouping sets, so test
-- that too:
select * from (values (1),(2)) v(a) left join lateral (select v.a, four, ten, count(*) from onek group by grouping sets(four,ten)) s on true order by v.a,four,ten;
select array(select row(v.a,s1.*) from (select two,four, count(*) from onek group by grouping sets(two,four) order by two,four) s1) from (values (1),(2)) v(a);
-- test the knapsack
set work_mem = '64kB';
explain (costs off)
select unique1,
count(two), count(four), count(ten),
count(hundred), count(thousand), count(twothousand),
count(*)
from tenk1 group by grouping sets (unique1,twothousand,thousand,hundred,ten,four,two);
explain (costs off)
select unique1,
count(two), count(four), count(ten),
count(hundred), count(thousand), count(twothousand),
count(*)
from tenk1 group by grouping sets (unique1,hundred,ten,four,two);
set work_mem = '384kB';
explain (costs off)
select unique1,
count(two), count(four), count(ten),
count(hundred), count(thousand), count(twothousand),
count(*)
from tenk1 group by grouping sets (unique1,twothousand,thousand,hundred,ten,four,two);
-- end
......@@ -66,12 +66,14 @@ SELECT SUM(count(*)) OVER(PARTITION BY generate_series(1,3) ORDER BY generate_se
SELECT few.dataa, count(*), min(id), max(id), generate_series(1,3) FROM few GROUP BY few.dataa ORDER BY 5, 1;
-- grouping sets are a bit special, they produce NULLs in columns not actually NULL
set enable_hashagg = false;
SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab);
SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab) ORDER BY dataa;
SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab) ORDER BY g;
SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab, g);
SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab, g) ORDER BY dataa;
SELECT dataa, datab b, generate_series(1,2) g, count(*) FROM few GROUP BY CUBE(dataa, datab, g) ORDER BY g;
reset enable_hashagg;
-- data modification
CREATE TABLE fewmore AS SELECT generate_series(1,3) AS data;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment