Commit f15821ee authored by Tom Lane's avatar Tom Lane

Allow join removal in some cases involving a left join to a subquery.

We can remove a left join to a relation if the relation's output is
provably distinct for the columns involved in the join clause (considering
only equijoin clauses) and the relation supplies no variables needed above
the join.  Previously, the join removal logic could only prove distinctness
by reference to unique indexes of a table.  This patch extends the logic
to consider subquery relations, wherein distinctness might be proven by
reference to GROUP BY, DISTINCT, etc.

We actually already had some code to check that a subquery's output was
provably distinct, but it was hidden inside pathnode.c; which was a pretty
bad place for it really, since that file is mostly boilerplate Path
construction and comparison.  Move that code to analyzejoins.c, which is
arguably a more appropriate location, and is certainly the site of the
new usage for it.

David Rowley, reviewed by Simon Riggs
parent 5571caf4
This diff is collapsed.
......@@ -22,8 +22,9 @@
#include "optimizer/cost.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/planmain.h"
#include "optimizer/restrictinfo.h"
#include "optimizer/tlist.h"
#include "optimizer/var.h"
#include "parser/parsetree.h"
#include "utils/lsyscache.h"
#include "utils/selfuncs.h"
......@@ -38,8 +39,6 @@ typedef enum
} PathCostComparison;
static List *translate_sub_tlist(List *tlist, int relid);
static bool query_is_distinct_for(Query *query, List *colnos, List *opids);
static Oid distinct_col_search(int colno, List *colnos, List *opids);
/*****************************************************************************
......@@ -1312,25 +1311,29 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
if (rel->rtekind == RTE_SUBQUERY)
{
RangeTblEntry *rte = planner_rt_fetch(rel->relid, root);
List *sub_tlist_colnos;
sub_tlist_colnos = translate_sub_tlist(uniq_exprs, rel->relid);
if (sub_tlist_colnos &&
query_is_distinct_for(rte->subquery,
sub_tlist_colnos, in_operators))
if (query_supports_distinctness(rte->subquery))
{
pathnode->umethod = UNIQUE_PATH_NOOP;
pathnode->path.rows = rel->rows;
pathnode->path.startup_cost = subpath->startup_cost;
pathnode->path.total_cost = subpath->total_cost;
pathnode->path.pathkeys = subpath->pathkeys;
List *sub_tlist_colnos;
sub_tlist_colnos = translate_sub_tlist(uniq_exprs, rel->relid);
if (sub_tlist_colnos &&
query_is_distinct_for(rte->subquery,
sub_tlist_colnos, in_operators))
{
pathnode->umethod = UNIQUE_PATH_NOOP;
pathnode->path.rows = rel->rows;
pathnode->path.startup_cost = subpath->startup_cost;
pathnode->path.total_cost = subpath->total_cost;
pathnode->path.pathkeys = subpath->pathkeys;
rel->cheapest_unique_path = (Path *) pathnode;
rel->cheapest_unique_path = (Path *) pathnode;
MemoryContextSwitchTo(oldcontext);
MemoryContextSwitchTo(oldcontext);
return pathnode;
return pathnode;
}
}
}
......@@ -1450,161 +1453,6 @@ translate_sub_tlist(List *tlist, int relid)
return result;
}
/*
* query_is_distinct_for - does query never return duplicates of the
* specified columns?
*
* colnos is an integer list of output column numbers (resno's). We are
* interested in whether rows consisting of just these columns are certain
* to be distinct. "Distinctness" is defined according to whether the
* corresponding upper-level equality operators listed in opids would think
* the values are distinct. (Note: the opids entries could be cross-type
* operators, and thus not exactly the equality operators that the subquery
* would use itself. We use equality_ops_are_compatible() to check
* compatibility. That looks at btree or hash opfamily membership, and so
* should give trustworthy answers for all operators that we might need
* to deal with here.)
*/
static bool
query_is_distinct_for(Query *query, List *colnos, List *opids)
{
ListCell *l;
Oid opid;
Assert(list_length(colnos) == list_length(opids));
/*
* A set-returning function in the query's targetlist can result in
* returning duplicate rows, if the SRF is evaluated after the
* de-duplication step; so we play it safe and say "no" if there are any
* SRFs. (We could be certain that it's okay if SRFs appear only in the
* specified columns, since those must be evaluated before de-duplication;
* but it doesn't presently seem worth the complication to check that.)
*/
if (expression_returns_set((Node *) query->targetList))
return false;
/*
* DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
* columns in the DISTINCT clause appear in colnos and operator semantics
* match.
*/
if (query->distinctClause)
{
foreach(l, query->distinctClause)
{
SortGroupClause *sgc = (SortGroupClause *) lfirst(l);
TargetEntry *tle = get_sortgroupclause_tle(sgc,
query->targetList);
opid = distinct_col_search(tle->resno, colnos, opids);
if (!OidIsValid(opid) ||
!equality_ops_are_compatible(opid, sgc->eqop))
break; /* exit early if no match */
}
if (l == NULL) /* had matches for all? */
return true;
}
/*
* Similarly, GROUP BY guarantees uniqueness if all the grouped columns
* appear in colnos and operator semantics match.
*/
if (query->groupClause)
{
foreach(l, query->groupClause)
{
SortGroupClause *sgc = (SortGroupClause *) lfirst(l);
TargetEntry *tle = get_sortgroupclause_tle(sgc,
query->targetList);
opid = distinct_col_search(tle->resno, colnos, opids);
if (!OidIsValid(opid) ||
!equality_ops_are_compatible(opid, sgc->eqop))
break; /* exit early if no match */
}
if (l == NULL) /* had matches for all? */
return true;
}
else
{
/*
* If we have no GROUP BY, but do have aggregates or HAVING, then the
* result is at most one row so it's surely unique, for any operators.
*/
if (query->hasAggs || query->havingQual)
return true;
}
/*
* UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row,
* except with ALL.
*/
if (query->setOperations)
{
SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
Assert(IsA(topop, SetOperationStmt));
Assert(topop->op != SETOP_NONE);
if (!topop->all)
{
ListCell *lg;
/* We're good if all the nonjunk output columns are in colnos */
lg = list_head(topop->groupClauses);
foreach(l, query->targetList)
{
TargetEntry *tle = (TargetEntry *) lfirst(l);
SortGroupClause *sgc;
if (tle->resjunk)
continue; /* ignore resjunk columns */
/* non-resjunk columns should have grouping clauses */
Assert(lg != NULL);
sgc = (SortGroupClause *) lfirst(lg);
lg = lnext(lg);
opid = distinct_col_search(tle->resno, colnos, opids);
if (!OidIsValid(opid) ||
!equality_ops_are_compatible(opid, sgc->eqop))
break; /* exit early if no match */
}
if (l == NULL) /* had matches for all? */
return true;
}
}
/*
* XXX Are there any other cases in which we can easily see the result
* must be distinct?
*/
return false;
}
/*
* distinct_col_search - subroutine for query_is_distinct_for
*
* If colno is in colnos, return the corresponding element of opids,
* else return InvalidOid. (We expect colnos does not contain duplicates,
* so the result is well-defined.)
*/
static Oid
distinct_col_search(int colno, List *colnos, List *opids)
{
ListCell *lc1,
*lc2;
forboth(lc1, colnos, lc2, opids)
{
if (colno == lfirst_int(lc1))
return lfirst_oid(lc2);
}
return InvalidOid;
}
/*
* create_subqueryscan_path
* Creates a path corresponding to a sequential scan of a subquery,
......
......@@ -122,6 +122,8 @@ extern RestrictInfo *build_implied_join_equality(Oid opno,
* prototypes for plan/analyzejoins.c
*/
extern List *remove_useless_joins(PlannerInfo *root, List *joinlist);
extern bool query_supports_distinctness(Query *query);
extern bool query_is_distinct_for(Query *query, List *colnos, List *opids);
/*
* prototypes for plan/setrefs.c
......
......@@ -3131,9 +3131,11 @@ begin;
CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int);
CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int);
CREATE TEMP TABLE c (id int PRIMARY KEY);
CREATE TEMP TABLE d (a int, b int);
INSERT INTO a VALUES (0, 0), (1, NULL);
INSERT INTO b VALUES (0, 0), (1, NULL);
INSERT INTO c VALUES (0), (1);
INSERT INTO d VALUES (1,3), (2,2), (3,1);
-- all three cases should be optimizable into a simple seqscan
explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id;
QUERY PLAN
......@@ -3169,6 +3171,83 @@ select id from a where id in (
-> Seq Scan on b
(5 rows)
-- check that join removal works for a left join when joining a subquery
-- that is guaranteed to be unique by its GROUP BY clause
explain (costs off)
select d.* from d left join (select * from b group by b.id, b.c_id) s
on d.a = s.id and d.b = s.c_id;
QUERY PLAN
---------------
Seq Scan on d
(1 row)
-- similarly, but keying off a DISTINCT clause
explain (costs off)
select d.* from d left join (select distinct * from b) s
on d.a = s.id and d.b = s.c_id;
QUERY PLAN
---------------
Seq Scan on d
(1 row)
-- join removal is not possible when the GROUP BY contains a column that is
-- not in the join condition
explain (costs off)
select d.* from d left join (select * from b group by b.id, b.c_id) s
on d.a = s.id;
QUERY PLAN
---------------------------------------------
Merge Left Join
Merge Cond: (d.a = s.id)
-> Sort
Sort Key: d.a
-> Seq Scan on d
-> Sort
Sort Key: s.id
-> Subquery Scan on s
-> HashAggregate
Group Key: b.id, b.c_id
-> Seq Scan on b
(11 rows)
-- similarly, but keying off a DISTINCT clause
explain (costs off)
select d.* from d left join (select distinct * from b) s
on d.a = s.id;
QUERY PLAN
---------------------------------------------
Merge Left Join
Merge Cond: (d.a = s.id)
-> Sort
Sort Key: d.a
-> Seq Scan on d
-> Sort
Sort Key: s.id
-> Subquery Scan on s
-> HashAggregate
Group Key: b.id, b.c_id
-> Seq Scan on b
(11 rows)
-- check join removal works when uniqueness of the join condition is enforced
-- by a UNION
explain (costs off)
select d.* from d left join (select id from a union select id from b) s
on d.a = s.id;
QUERY PLAN
---------------
Seq Scan on d
(1 row)
-- check join removal with a cross-type comparison operator
explain (costs off)
select i8.* from int8_tbl i8 left join (select f1 from int4_tbl group by f1) i4
on i8.q1 = i4.f1;
QUERY PLAN
-------------------------
Seq Scan on int8_tbl i8
(1 row)
rollback;
create temp table parent (k int primary key, pd int);
create temp table child (k int unique, cd int);
......
......@@ -919,9 +919,11 @@ begin;
CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int);
CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int);
CREATE TEMP TABLE c (id int PRIMARY KEY);
CREATE TEMP TABLE d (a int, b int);
INSERT INTO a VALUES (0, 0), (1, NULL);
INSERT INTO b VALUES (0, 0), (1, NULL);
INSERT INTO c VALUES (0), (1);
INSERT INTO d VALUES (1,3), (2,2), (3,1);
-- all three cases should be optimizable into a simple seqscan
explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id;
......@@ -936,6 +938,39 @@ select id from a where id in (
select b.id from b left join c on b.id = c.id
);
-- check that join removal works for a left join when joining a subquery
-- that is guaranteed to be unique by its GROUP BY clause
explain (costs off)
select d.* from d left join (select * from b group by b.id, b.c_id) s
on d.a = s.id and d.b = s.c_id;
-- similarly, but keying off a DISTINCT clause
explain (costs off)
select d.* from d left join (select distinct * from b) s
on d.a = s.id and d.b = s.c_id;
-- join removal is not possible when the GROUP BY contains a column that is
-- not in the join condition
explain (costs off)
select d.* from d left join (select * from b group by b.id, b.c_id) s
on d.a = s.id;
-- similarly, but keying off a DISTINCT clause
explain (costs off)
select d.* from d left join (select distinct * from b) s
on d.a = s.id;
-- check join removal works when uniqueness of the join condition is enforced
-- by a UNION
explain (costs off)
select d.* from d left join (select id from a union select id from b) s
on d.a = s.id;
-- check join removal with a cross-type comparison operator
explain (costs off)
select i8.* from int8_tbl i8 left join (select f1 from int4_tbl group by f1) i4
on i8.q1 = i4.f1;
rollback;
create temp table parent (k int primary key, pd int);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment