Commit be3b265c authored by Tom Lane's avatar Tom Lane

Improve SELECT DISTINCT to consider hash aggregation, as well as sort/uniq,

as methods for implementing the DISTINCT step.  This eliminates the former
performance gap between DISTINCT and GROUP BY, and also makes it possible
to do SELECT DISTINCT on datatypes that only support hashing not sorting.

SELECT DISTINCT ON is still always implemented by sorting; it would take
executor changes to support hashing that, and it's not clear it's worth
the trouble.

This is a release-note-worthy incompatibility from previous PG versions,
since SELECT DISTINCT can no longer be counted on to deliver sorted output
without explicitly saying ORDER BY.  (Anyone who can't cope with that
can consider turning off enable_hashagg.)

Several regression test queries needed to have ORDER BY added to preserve
stable output order.  I fixed the ones that manifested here, but there
might be some other cases that show up on other platforms.
parent 4abd7b49
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.329 2008/08/02 21:31:59 tgl Exp $
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.330 2008/08/05 02:43:17 tgl Exp $
*
* NOTES
* Every node type that can appear in stored rules' parsetrees *must*
......@@ -1334,6 +1334,7 @@ _outPlannerInfo(StringInfo str, PlannerInfo *node)
WRITE_NODE_FIELD(append_rel_list);
WRITE_NODE_FIELD(query_pathkeys);
WRITE_NODE_FIELD(group_pathkeys);
WRITE_NODE_FIELD(distinct_pathkeys);
WRITE_NODE_FIELD(sort_pathkeys);
WRITE_FLOAT_FIELD(total_table_pages, "%.0f");
WRITE_FLOAT_FIELD(tuple_fraction, "%.4f");
......
......@@ -14,7 +14,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.108 2008/08/03 19:10:52 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.109 2008/08/05 02:43:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -66,9 +66,9 @@
* PlannerInfo field and not a passed parameter is that the low-level routines
* in indxpath.c need to see it.)
*
* Note: the PlannerInfo node also includes group_pathkeys and sort_pathkeys,
* which like query_pathkeys need to be canonicalized once the info is
* available.
* Note: the PlannerInfo node also includes group_pathkeys, distinct_pathkeys,
* and sort_pathkeys, which like query_pathkeys need to be canonicalized once
* the info is available.
*
* tuple_fraction is interpreted as follows:
* 0: expect all tuples to be retrieved (normal case)
......@@ -120,6 +120,8 @@ query_planner(PlannerInfo *root, List *tlist,
root->query_pathkeys);
root->group_pathkeys = canonicalize_pathkeys(root,
root->group_pathkeys);
root->distinct_pathkeys = canonicalize_pathkeys(root,
root->distinct_pathkeys);
root->sort_pathkeys = canonicalize_pathkeys(root,
root->sort_pathkeys);
return;
......@@ -237,10 +239,12 @@ query_planner(PlannerInfo *root, List *tlist,
/*
* We have completed merging equivalence sets, so it's now possible to
* convert the requested query_pathkeys to canonical form. Also
* canonicalize the groupClause and sortClause pathkeys for use later.
* canonicalize the groupClause, distinctClause and sortClause pathkeys
* for use later.
*/
root->query_pathkeys = canonicalize_pathkeys(root, root->query_pathkeys);
root->group_pathkeys = canonicalize_pathkeys(root, root->group_pathkeys);
root->distinct_pathkeys = canonicalize_pathkeys(root, root->distinct_pathkeys);
root->sort_pathkeys = canonicalize_pathkeys(root, root->sort_pathkeys);
/*
......@@ -286,9 +290,11 @@ query_planner(PlannerInfo *root, List *tlist,
/*
* If both GROUP BY and ORDER BY are specified, we will need two
* levels of sort --- and, therefore, certainly need to read all the
* tuples --- unless ORDER BY is a subset of GROUP BY.
* tuples --- unless ORDER BY is a subset of GROUP BY. Likewise if
* we have both DISTINCT and GROUP BY.
*/
if (!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys))
if (!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys) ||
!pathkeys_contained_in(root->distinct_pathkeys, root->group_pathkeys))
tuple_fraction = 0.0;
}
else if (parse->hasAggs || root->hasHavingQual)
......
This diff is collapsed.
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.173 2008/08/03 19:10:52 tgl Exp $
* $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.174 2008/08/05 02:43:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -1447,9 +1447,6 @@ transformDistinctClause(ParseState *pstate,
/*
* Now add any remaining non-resjunk tlist items, using default
* sort/group semantics for their data types.
*
* XXX for now, the planner requires distinctClause to be sortable,
* so we have to insist on that here.
*/
foreach(tlitem, *targetlist)
{
......@@ -1459,8 +1456,7 @@ transformDistinctClause(ParseState *pstate,
continue; /* ignore junk */
result = addTargetToGroupList(pstate, tle,
result, *targetlist,
true, /* XXX for now */
true);
false, true);
}
return result;
......@@ -1555,8 +1551,7 @@ transformDistinctOnClause(ParseState *pstate, List *distinctlist,
errmsg("SELECT DISTINCT ON expressions must match initial ORDER BY expressions")));
result = addTargetToGroupList(pstate, tle,
result, *targetlist,
true, /* someday allow hash-only? */
true);
false, true);
}
return result;
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.156 2008/04/21 20:54:15 tgl Exp $
* $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.157 2008/08/05 02:43:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -162,8 +162,9 @@ typedef struct PlannerInfo
List *query_pathkeys; /* desired pathkeys for query_planner(), and
* actual pathkeys afterwards */
List *group_pathkeys; /* groupClause pathkeys, if any */
List *sort_pathkeys; /* sortClause pathkeys, if any */
List *group_pathkeys; /* groupClause pathkeys, if any */
List *distinct_pathkeys; /* distinctClause pathkeys, if any */
List *sort_pathkeys; /* sortClause pathkeys, if any */
List *initial_rels; /* RelOptInfos we are now trying to join */
......
......@@ -79,7 +79,7 @@ INSERT INTO TEMP_GROUP
INSERT INTO TEMP_GROUP
SELECT 2, i.f1, f.f1
FROM INT4_TBL i, FLOAT8_TBL f;
SELECT DISTINCT f1 AS two FROM TEMP_GROUP;
SELECT DISTINCT f1 AS two FROM TEMP_GROUP ORDER BY 1;
two
-----
1
......
......@@ -129,7 +129,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.prorettype < p2.prorettype);
(p1.prorettype < p2.prorettype)
ORDER BY 1, 2;
prorettype | prorettype
------------+------------
25 | 1043
......@@ -142,7 +143,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[0] < p2.proargtypes[0]);
(p1.proargtypes[0] < p2.proargtypes[0])
ORDER BY 1, 2;
proargtypes | proargtypes
-------------+-------------
25 | 1042
......@@ -158,7 +160,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[1] < p2.proargtypes[1]);
(p1.proargtypes[1] < p2.proargtypes[1])
ORDER BY 1, 2;
proargtypes | proargtypes
-------------+-------------
23 | 28
......@@ -173,7 +176,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[2] < p2.proargtypes[2]);
(p1.proargtypes[2] < p2.proargtypes[2])
ORDER BY 1, 2;
proargtypes | proargtypes
-------------+-------------
1114 | 1184
......@@ -185,7 +189,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[3] < p2.proargtypes[3]);
(p1.proargtypes[3] < p2.proargtypes[3])
ORDER BY 1, 2;
proargtypes | proargtypes
-------------+-------------
1114 | 1184
......@@ -197,7 +202,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[4] < p2.proargtypes[4]);
(p1.proargtypes[4] < p2.proargtypes[4])
ORDER BY 1, 2;
proargtypes | proargtypes
-------------+-------------
(0 rows)
......@@ -208,7 +214,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[5] < p2.proargtypes[5]);
(p1.proargtypes[5] < p2.proargtypes[5])
ORDER BY 1, 2;
proargtypes | proargtypes
-------------+-------------
(0 rows)
......@@ -219,7 +226,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[6] < p2.proargtypes[6]);
(p1.proargtypes[6] < p2.proargtypes[6])
ORDER BY 1, 2;
proargtypes | proargtypes
-------------+-------------
(0 rows)
......@@ -230,7 +238,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[7] < p2.proargtypes[7]);
(p1.proargtypes[7] < p2.proargtypes[7])
ORDER BY 1, 2;
proargtypes | proargtypes
-------------+-------------
(0 rows)
......
......@@ -14,7 +14,7 @@ SELECT DISTINCT two FROM tmp;
--
-- awk '{print $5;}' onek.data | sort -n | uniq
--
SELECT DISTINCT ten FROM tmp;
SELECT DISTINCT ten FROM tmp ORDER BY 1;
ten
-----
0
......@@ -32,7 +32,7 @@ SELECT DISTINCT ten FROM tmp;
--
-- awk '{print $16;}' onek.data | sort -d | uniq
--
SELECT DISTINCT string4 FROM tmp;
SELECT DISTINCT string4 FROM tmp ORDER BY 1;
string4
---------
AAAAxx
......
......@@ -183,7 +183,8 @@ SELECT p.name, name(p.hobbies) FROM person* p;
-- the next two queries demonstrate how functions generate bogus duplicates.
-- this is a "feature" ..
--
SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r;
SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r
ORDER BY 1,2;
SELECT hobbies_r.name, (hobbies_r.equipment).name FROM hobbies_r;
......
......@@ -469,7 +469,8 @@ SELECT p.name, name(p.hobbies) FROM person* p;
-- the next two queries demonstrate how functions generate bogus duplicates.
-- this is a "feature" ..
--
SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r;
SELECT DISTINCT hobbies_r.name, name(hobbies_r.equipment) FROM hobbies_r
ORDER BY 1,2;
name | name
-------------+---------------
basketball | hightops
......
......@@ -63,7 +63,7 @@ INSERT INTO TEMP_GROUP
SELECT 2, i.f1, f.f1
FROM INT4_TBL i, FLOAT8_TBL f;
SELECT DISTINCT f1 AS two FROM TEMP_GROUP;
SELECT DISTINCT f1 AS two FROM TEMP_GROUP ORDER BY 1;
SELECT f1 AS two, max(f3) AS max_float, min(f3) as min_float
FROM TEMP_GROUP
......
......@@ -121,7 +121,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.prorettype < p2.prorettype);
(p1.prorettype < p2.prorettype)
ORDER BY 1, 2;
SELECT DISTINCT p1.proargtypes[0], p2.proargtypes[0]
FROM pg_proc AS p1, pg_proc AS p2
......@@ -129,7 +130,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[0] < p2.proargtypes[0]);
(p1.proargtypes[0] < p2.proargtypes[0])
ORDER BY 1, 2;
SELECT DISTINCT p1.proargtypes[1], p2.proargtypes[1]
FROM pg_proc AS p1, pg_proc AS p2
......@@ -137,7 +139,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[1] < p2.proargtypes[1]);
(p1.proargtypes[1] < p2.proargtypes[1])
ORDER BY 1, 2;
SELECT DISTINCT p1.proargtypes[2], p2.proargtypes[2]
FROM pg_proc AS p1, pg_proc AS p2
......@@ -145,7 +148,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[2] < p2.proargtypes[2]);
(p1.proargtypes[2] < p2.proargtypes[2])
ORDER BY 1, 2;
SELECT DISTINCT p1.proargtypes[3], p2.proargtypes[3]
FROM pg_proc AS p1, pg_proc AS p2
......@@ -153,7 +157,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[3] < p2.proargtypes[3]);
(p1.proargtypes[3] < p2.proargtypes[3])
ORDER BY 1, 2;
SELECT DISTINCT p1.proargtypes[4], p2.proargtypes[4]
FROM pg_proc AS p1, pg_proc AS p2
......@@ -161,7 +166,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[4] < p2.proargtypes[4]);
(p1.proargtypes[4] < p2.proargtypes[4])
ORDER BY 1, 2;
SELECT DISTINCT p1.proargtypes[5], p2.proargtypes[5]
FROM pg_proc AS p1, pg_proc AS p2
......@@ -169,7 +175,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[5] < p2.proargtypes[5]);
(p1.proargtypes[5] < p2.proargtypes[5])
ORDER BY 1, 2;
SELECT DISTINCT p1.proargtypes[6], p2.proargtypes[6]
FROM pg_proc AS p1, pg_proc AS p2
......@@ -177,7 +184,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[6] < p2.proargtypes[6]);
(p1.proargtypes[6] < p2.proargtypes[6])
ORDER BY 1, 2;
SELECT DISTINCT p1.proargtypes[7], p2.proargtypes[7]
FROM pg_proc AS p1, pg_proc AS p2
......@@ -185,7 +193,8 @@ WHERE p1.oid != p2.oid AND
p1.prosrc = p2.prosrc AND
p1.prolang = 12 AND p2.prolang = 12 AND
NOT p1.proisagg AND NOT p2.proisagg AND
(p1.proargtypes[7] < p2.proargtypes[7]);
(p1.proargtypes[7] < p2.proargtypes[7])
ORDER BY 1, 2;
-- Look for functions that return type "internal" and do not have any
-- "internal" argument. Such a function would be a security hole since
......
......@@ -10,12 +10,12 @@ SELECT DISTINCT two FROM tmp;
--
-- awk '{print $5;}' onek.data | sort -n | uniq
--
SELECT DISTINCT ten FROM tmp;
SELECT DISTINCT ten FROM tmp ORDER BY 1;
--
-- awk '{print $16;}' onek.data | sort -d | uniq
--
SELECT DISTINCT string4 FROM tmp;
SELECT DISTINCT string4 FROM tmp ORDER BY 1;
--
-- awk '{print $3,$16,$5;}' onek.data | sort -d | uniq |
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment