Commit 45be99f8 authored by Robert Haas's avatar Robert Haas

Support parallel joins, and make related improvements.

The core innovation of this patch is the introduction of the concept
of a partial path; that is, a path which if executed in parallel will
generate a subset of the output rows in each process.  Gathering a
partial path produces an ordinary (complete) path.  This allows us to
generate paths for parallel joins by joining a partial path for one
side (which at the baserel level is currently always a Partial Seq
Scan) to an ordinary path on the other side.  This is subject to
various restrictions at present, especially that this strategy seems
unlikely to be sensible for merge joins, so only nested loops and
hash join paths are generated.

This also allows an Append node to be pushed below a Gather node in
the case of a partitioned table.

Testing revealed that early versions of this patch made poor decisions
in some cases, which turned out to be caused by the fact that the
original cost model for Parallel Seq Scan wasn't very good.  So this
patch tries to make some modest improvements in that area.

There is much more to be done in the area of generating good parallel
plans in all cases, but this seems like a useful step forward.

Patch by me, reviewed by Dilip Kumar and Amit Kapila.
parent a7de3dc5
...@@ -167,6 +167,8 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) ...@@ -167,6 +167,8 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
e->nnodes++; e->nnodes++;
/* Call estimators for parallel-aware nodes. */ /* Call estimators for parallel-aware nodes. */
if (planstate->plan->parallel_aware)
{
switch (nodeTag(planstate)) switch (nodeTag(planstate))
{ {
case T_SeqScanState: case T_SeqScanState:
...@@ -176,16 +178,14 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) ...@@ -176,16 +178,14 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
default: default:
break; break;
} }
}
return planstate_tree_walker(planstate, ExecParallelEstimate, e); return planstate_tree_walker(planstate, ExecParallelEstimate, e);
} }
/* /*
* Ordinary plan nodes won't do anything here, but parallel-aware plan nodes * Initialize the dynamic shared memory segment that will be used to control
* may need to initialize shared state in the DSM before parallel workers * parallel execution.
* are available. They can allocate the space they previous estimated using
* shm_toc_allocate, and add the keys they previously estimated using
* shm_toc_insert, in each case targeting pcxt->toc.
*/ */
static bool static bool
ExecParallelInitializeDSM(PlanState *planstate, ExecParallelInitializeDSM(PlanState *planstate,
...@@ -202,7 +202,17 @@ ExecParallelInitializeDSM(PlanState *planstate, ...@@ -202,7 +202,17 @@ ExecParallelInitializeDSM(PlanState *planstate,
/* Count this node. */ /* Count this node. */
d->nnodes++; d->nnodes++;
/* Call initializers for parallel-aware plan nodes. */ /*
* Call initializers for parallel-aware plan nodes.
*
* Ordinary plan nodes won't do anything here, but parallel-aware plan
* nodes may need to initialize shared state in the DSM before parallel
* workers are available. They can allocate the space they previously
* estimated using shm_toc_allocate, and add the keys they previously
* estimated using shm_toc_insert, in each case targeting pcxt->toc.
*/
if (planstate->plan->parallel_aware)
{
switch (nodeTag(planstate)) switch (nodeTag(planstate))
{ {
case T_SeqScanState: case T_SeqScanState:
...@@ -212,6 +222,7 @@ ExecParallelInitializeDSM(PlanState *planstate, ...@@ -212,6 +222,7 @@ ExecParallelInitializeDSM(PlanState *planstate,
default: default:
break; break;
} }
}
return planstate_tree_walker(planstate, ExecParallelInitializeDSM, d); return planstate_tree_walker(planstate, ExecParallelInitializeDSM, d);
} }
...@@ -623,6 +634,8 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) ...@@ -623,6 +634,8 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc)
return false; return false;
/* Call initializers for parallel-aware plan nodes. */ /* Call initializers for parallel-aware plan nodes. */
if (planstate->plan->parallel_aware)
{
switch (nodeTag(planstate)) switch (nodeTag(planstate))
{ {
case T_SeqScanState: case T_SeqScanState:
...@@ -631,6 +644,7 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) ...@@ -631,6 +644,7 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc)
default: default:
break; break;
} }
}
return planstate_tree_walker(planstate, ExecParallelInitializeWorker, toc); return planstate_tree_walker(planstate, ExecParallelInitializeWorker, toc);
} }
......
...@@ -1591,6 +1591,8 @@ _outPathInfo(StringInfo str, const Path *node) ...@@ -1591,6 +1591,8 @@ _outPathInfo(StringInfo str, const Path *node)
else else
_outBitmapset(str, NULL); _outBitmapset(str, NULL);
WRITE_BOOL_FIELD(parallel_aware); WRITE_BOOL_FIELD(parallel_aware);
WRITE_BOOL_FIELD(parallel_safe);
WRITE_INT_FIELD(parallel_degree);
WRITE_FLOAT_FIELD(rows, "%.0f"); WRITE_FLOAT_FIELD(rows, "%.0f");
WRITE_FLOAT_FIELD(startup_cost, "%.2f"); WRITE_FLOAT_FIELD(startup_cost, "%.2f");
WRITE_FLOAT_FIELD(total_cost, "%.2f"); WRITE_FLOAT_FIELD(total_cost, "%.2f");
...@@ -1768,7 +1770,6 @@ _outGatherPath(StringInfo str, const GatherPath *node) ...@@ -1768,7 +1770,6 @@ _outGatherPath(StringInfo str, const GatherPath *node)
_outPathInfo(str, (const Path *) node); _outPathInfo(str, (const Path *) node);
WRITE_NODE_FIELD(subpath); WRITE_NODE_FIELD(subpath);
WRITE_INT_FIELD(num_workers);
WRITE_BOOL_FIELD(single_copy); WRITE_BOOL_FIELD(single_copy);
} }
...@@ -1890,6 +1891,7 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node) ...@@ -1890,6 +1891,7 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node)
WRITE_NODE_FIELD(reltargetlist); WRITE_NODE_FIELD(reltargetlist);
WRITE_NODE_FIELD(pathlist); WRITE_NODE_FIELD(pathlist);
WRITE_NODE_FIELD(ppilist); WRITE_NODE_FIELD(ppilist);
WRITE_NODE_FIELD(partial_pathlist);
WRITE_NODE_FIELD(cheapest_startup_path); WRITE_NODE_FIELD(cheapest_startup_path);
WRITE_NODE_FIELD(cheapest_total_path); WRITE_NODE_FIELD(cheapest_total_path);
WRITE_NODE_FIELD(cheapest_unique_path); WRITE_NODE_FIELD(cheapest_unique_path);
......
...@@ -851,4 +851,57 @@ lateral reference. (Perhaps now that that stuff works, we could relax the ...@@ -851,4 +851,57 @@ lateral reference. (Perhaps now that that stuff works, we could relax the
pullup restriction?) pullup restriction?)
-- bjm & tgl Parallel Query and Partial Paths
--------------------------------
Parallel query involves dividing up the work that needs to be performed
either by an entire query or some portion of the query in such a way that
some of that work can be done by one or more worker processes, which are
called parallel workers. Parallel workers are a subtype of dynamic
background workers; see src/backend/access/transam/README.parallel for a
fuller description. Academic literature on parallel query suggests that
parallel execution strategies can be divided into essentially two
categories: pipelined parallelism, where the execution of the query is
divided into multiple stages and each stage is handled by a separate
process; and partitioning parallelism, where the data is split between
multiple processes and each process handles a subset of it. The
literature, however, suggests that gains from pipeline parallelism are
often very limited due to the difficulty of avoiding pipeline stalls.
Consequently, we do not currently attempt to generate query plans that
use this technique.
Instead, we focus on partitioning parallelism, which does not require
that the underlying table be partitioned. It only requires that (1)
there is some method of dividing the data from at least one of the base
tables involved in the relation across multiple processes, (2) allowing
each process to handle its own portion of the data, and then (3)
collecting the results. Requirements (2) and (3) are satisfied by the
executor node Gather, which launches any number of worker processes and
executes its single child plan in all of them (and perhaps in the leader
also, if the children aren't generating enough data to keep the leader
busy). Requirement (1) is handled by the SeqScan node: when invoked
with parallel_aware = true, this node will, in effect, partition the
table on a block by block basis, returning a subset of the tuples from
the relation in each worker where that SeqScan is executed. A similar
scheme could be (and probably should be) implemented for bitmap heap
scans.
Just as we do for non-parallel access methods, we build Paths to
represent access strategies that can be used in a parallel plan. These
are, in essence, the same strategies that are available in the
non-parallel plan, but there is an important difference: a path that
will run beneath a Gather node returns only a subset of the query
results in each worker, not all of them. To form a path that can
actually be executed, the (rather large) cost of the Gather node must be
accounted for. For this reason among others, paths intended to run
beneath a Gather node - which we call "partial" paths since they return
only a subset of the results in each worker - must be kept separate from
ordinary paths (see RelOptInfo's partial_pathlist and the function
add_partial_path).
One of the keys to making parallel query effective is to run as much of
the query in parallel as possible. Therefore, we expect it to generally
be desirable to postpone the Gather stage until as near to the top of the
plan as possible. Expanding the range of cases in which more work can be
pushed below the Gather (and costing them accurately) is likely to keep us
busy for a long time to come.
...@@ -72,6 +72,7 @@ static void set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, ...@@ -72,6 +72,7 @@ static void set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
Index rti, RangeTblEntry *rte); Index rti, RangeTblEntry *rte);
static void set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel, static void set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel,
RangeTblEntry *rte); RangeTblEntry *rte);
static void create_parallel_paths(PlannerInfo *root, RelOptInfo *rel);
static void set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel, static void set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel,
RangeTblEntry *rte); RangeTblEntry *rte);
static bool function_rte_parallel_ok(RangeTblEntry *rte); static bool function_rte_parallel_ok(RangeTblEntry *rte);
...@@ -612,7 +613,6 @@ static void ...@@ -612,7 +613,6 @@ static void
set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
{ {
Relids required_outer; Relids required_outer;
int parallel_threshold = 1000;
/* /*
* We don't support pushing join clauses into the quals of a seqscan, but * We don't support pushing join clauses into the quals of a seqscan, but
...@@ -624,17 +624,41 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) ...@@ -624,17 +624,41 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
/* Consider sequential scan */ /* Consider sequential scan */
add_path(rel, create_seqscan_path(root, rel, required_outer, 0)); add_path(rel, create_seqscan_path(root, rel, required_outer, 0));
/* Consider parallel sequential scan */ /* If appropriate, consider parallel sequential scan */
if (rel->consider_parallel && rel->pages > parallel_threshold && if (rel->consider_parallel && required_outer == NULL)
required_outer == NULL) create_parallel_paths(root, rel);
{
Path *path; /* Consider index scans */
create_index_paths(root, rel);
/* Consider TID scans */
create_tidscan_paths(root, rel);
}
/*
* create_parallel_paths
* Build parallel access paths for a plain relation
*/
static void
create_parallel_paths(PlannerInfo *root, RelOptInfo *rel)
{
int parallel_threshold = 1000;
int parallel_degree = 1; int parallel_degree = 1;
/* /*
* Limit the degree of parallelism logarithmically based on the size * If this relation is too small to be worth a parallel scan, just return
* of the relation. This probably needs to be a good deal more * without doing anything ... unless it's an inheritance child. In that case,
* sophisticated, but we need something here for now. * we want to generate a parallel path here anyway. It might not be worthwhile
* just for this relation, but when combined with all of its inheritance siblings
* it may well pay off.
*/
if (rel->pages < parallel_threshold && rel->reloptkind == RELOPT_BASEREL)
return;
/*
* Limit the degree of parallelism logarithmically based on the size of the
* relation. This probably needs to be a good deal more sophisticated, but we
* need something here for now.
*/ */
while (rel->pages > parallel_threshold * 3 && while (rel->pages > parallel_threshold * 3 &&
parallel_degree < max_parallel_degree) parallel_degree < max_parallel_degree)
...@@ -645,24 +669,18 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) ...@@ -645,24 +669,18 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
break; break;
} }
/* Add an unordered partial path based on a parallel sequential scan. */
add_partial_path(rel, create_seqscan_path(root, rel, NULL, parallel_degree));
/* /*
* Ideally we should consider postponing the gather operation until * If this is a baserel, consider gathering any partial paths we may have
* much later, after we've pushed joins and so on atop the parallel * just created. If we gathered an inheritance child, we could end up
* sequential scan path. But we don't have the infrastructure for * with a very large number of gather nodes, each trying to grab its own
* that yet, so just do this for now. * pool of workers, so don't do this in that case. Instead, we'll
* consider gathering partial paths for the appendrel.
*/ */
path = create_seqscan_path(root, rel, required_outer, parallel_degree); if (rel->reloptkind == RELOPT_BASEREL)
path = (Path *) generate_gather_paths(root, rel);
create_gather_path(root, rel, path, required_outer,
parallel_degree);
add_path(rel, path);
}
/* Consider index scans */
create_index_paths(root, rel);
/* Consider TID scans */
create_tidscan_paths(root, rel);
} }
/* /*
...@@ -1039,6 +1057,8 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, ...@@ -1039,6 +1057,8 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
List *live_childrels = NIL; List *live_childrels = NIL;
List *subpaths = NIL; List *subpaths = NIL;
bool subpaths_valid = true; bool subpaths_valid = true;
List *partial_subpaths = NIL;
bool partial_subpaths_valid = true;
List *all_child_pathkeys = NIL; List *all_child_pathkeys = NIL;
List *all_child_outers = NIL; List *all_child_outers = NIL;
ListCell *l; ListCell *l;
...@@ -1093,6 +1113,13 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, ...@@ -1093,6 +1113,13 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
else else
subpaths_valid = false; subpaths_valid = false;
/* Same idea, but for a partial plan. */
if (childrel->partial_pathlist != NIL)
partial_subpaths = accumulate_append_subpath(partial_subpaths,
linitial(childrel->partial_pathlist));
else
partial_subpaths_valid = false;
/* /*
* Collect lists of all the available path orderings and * Collect lists of all the available path orderings and
* parameterizations for all the children. We use these as a * parameterizations for all the children. We use these as a
...@@ -1164,7 +1191,39 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, ...@@ -1164,7 +1191,39 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
* if we have zero or one live subpath due to constraint exclusion.) * if we have zero or one live subpath due to constraint exclusion.)
*/ */
if (subpaths_valid) if (subpaths_valid)
add_path(rel, (Path *) create_append_path(rel, subpaths, NULL)); add_path(rel, (Path *) create_append_path(rel, subpaths, NULL, 0));
/*
* Consider an append of unordered, unparameterized partial paths.
*/
if (partial_subpaths_valid)
{
AppendPath *appendpath;
ListCell *lc;
int parallel_degree = 0;
/*
* Decide what parallel degree to request for this append path. For
* now, we just use the maximum parallel degree of any member. It
* might be useful to use a higher number if the Append node were
* smart enough to spread out the workers, but it currently isn't.
*/
foreach(lc, partial_subpaths)
{
Path *path = lfirst(lc);
parallel_degree = Max(parallel_degree, path->parallel_degree);
}
Assert(parallel_degree > 0);
/* Generate a partial append path. */
appendpath = create_append_path(rel, partial_subpaths, NULL,
parallel_degree);
add_partial_path(rel, (Path *) appendpath);
/* Consider gathering it. */
generate_gather_paths(root, rel);
}
/* /*
* Also build unparameterized MergeAppend paths based on the collected * Also build unparameterized MergeAppend paths based on the collected
...@@ -1214,7 +1273,7 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, ...@@ -1214,7 +1273,7 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
if (subpaths_valid) if (subpaths_valid)
add_path(rel, (Path *) add_path(rel, (Path *)
create_append_path(rel, subpaths, required_outer)); create_append_path(rel, subpaths, required_outer, 0));
} }
} }
...@@ -1440,8 +1499,9 @@ set_dummy_rel_pathlist(RelOptInfo *rel) ...@@ -1440,8 +1499,9 @@ set_dummy_rel_pathlist(RelOptInfo *rel)
/* Discard any pre-existing paths; no further need for them */ /* Discard any pre-existing paths; no further need for them */
rel->pathlist = NIL; rel->pathlist = NIL;
rel->partial_pathlist = NIL;
add_path(rel, (Path *) create_append_path(rel, NIL, NULL)); add_path(rel, (Path *) create_append_path(rel, NIL, NULL, 0));
/* /*
* We set the cheapest path immediately, to ensure that IS_DUMMY_REL() * We set the cheapest path immediately, to ensure that IS_DUMMY_REL()
...@@ -1843,6 +1903,36 @@ set_worktable_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) ...@@ -1843,6 +1903,36 @@ set_worktable_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
add_path(rel, create_worktablescan_path(root, rel, required_outer)); add_path(rel, create_worktablescan_path(root, rel, required_outer));
} }
/*
 * generate_gather_paths
 *		Produce a complete access path for 'rel' by placing a Gather on top
 *		of one of its partial paths, and hand the result to add_path().
 *
 * 'root' is the planner state passed through to create_gather_path().
 * 'rel' is the relation whose partial_pathlist is examined; if that list is
 * empty this function does nothing.
 */
void
generate_gather_paths(PlannerInfo *root, RelOptInfo *rel)
{
	/*
	 * Gather currently always emits unsorted output, so the only partial
	 * path worth gathering is the cheapest one, which is taken from the head
	 * of partial_pathlist.
	 *
	 * Once a Gather Merge operation exists that can combine several tuple
	 * streams while keeping their ordering, it would make sense to build
	 * such a path for every partial path with non-NIL pathkeys.
	 */
	if (rel->partial_pathlist != NIL)
	{
		Path	   *partial = linitial(rel->partial_pathlist);
		Path	   *gathered;

		gathered = (Path *) create_gather_path(root, rel, partial, NULL);
		add_path(rel, gathered);
	}
}
/* /*
* make_rel_from_joinlist * make_rel_from_joinlist
* Build access paths using a "joinlist" to guide the join path search. * Build access paths using a "joinlist" to guide the join path search.
......
...@@ -187,11 +187,11 @@ clamp_row_est(double nrows) ...@@ -187,11 +187,11 @@ clamp_row_est(double nrows)
*/ */
void void
cost_seqscan(Path *path, PlannerInfo *root, cost_seqscan(Path *path, PlannerInfo *root,
RelOptInfo *baserel, ParamPathInfo *param_info, RelOptInfo *baserel, ParamPathInfo *param_info)
int nworkers)
{ {
Cost startup_cost = 0; Cost startup_cost = 0;
Cost run_cost = 0; Cost cpu_run_cost;
Cost disk_run_cost;
double spc_seq_page_cost; double spc_seq_page_cost;
QualCost qpqual_cost; QualCost qpqual_cost;
Cost cpu_per_tuple; Cost cpu_per_tuple;
...@@ -217,27 +217,58 @@ cost_seqscan(Path *path, PlannerInfo *root, ...@@ -217,27 +217,58 @@ cost_seqscan(Path *path, PlannerInfo *root,
/* /*
* disk costs * disk costs
*/ */
run_cost += spc_seq_page_cost * baserel->pages; disk_run_cost = spc_seq_page_cost * baserel->pages;
/* CPU costs */ /* CPU costs */
get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
startup_cost += qpqual_cost.startup; startup_cost += qpqual_cost.startup;
cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
run_cost += cpu_per_tuple * baserel->tuples; cpu_run_cost = cpu_per_tuple * baserel->tuples;
/* Adjust costing for parallelism, if used. */
if (path->parallel_degree > 0)
{
double parallel_divisor = path->parallel_degree;
double leader_contribution;
/*
* Early experience with parallel query suggests that when there is
* only one worker, the leader often makes a very substantial
* contribution to executing the parallel portion of the plan, but as
* more workers are added, it does less and less, because it's busy
* reading tuples from the workers and doing whatever non-parallel
* post-processing is needed. By the time we reach 4 workers, the
* leader no longer makes a meaningful contribution. Thus, for now,
* estimate that the leader spends 30% of its time servicing each
* worker, and the remainder executing the parallel plan.
*/
leader_contribution = 1.0 - (0.3 * path->parallel_degree);
if (leader_contribution > 0)
parallel_divisor += leader_contribution;
/* /*
* Primitive parallel cost model. Assume the leader will do half as much * In the case of a parallel plan, the row count needs to represent
* work as a regular worker, because it will also need to read the tuples * the number of tuples processed per worker. Otherwise, higher-level
* returned by the workers when they percolate up to the gather node. This * plan nodes that appear below the gather will be costed incorrectly,
* is almost certainly not exactly the right way to model this, so this * because they'll anticipate receiving more rows than any given copy
* will probably need to be changed at some point... * will actually get.
*/ */
if (nworkers > 0) path->rows /= parallel_divisor;
run_cost = run_cost / (nworkers + 0.5);
/* The CPU cost is divided among all the workers. */
cpu_run_cost /= parallel_divisor;
/*
* It may be possible to amortize some of the I/O cost, but probably
* not very much, because most operating systems already do aggressive
* prefetching. For now, we assume that the disk run cost can't be
* amortized at all.
*/
}
path->startup_cost = startup_cost; path->startup_cost = startup_cost;
path->total_cost = startup_cost + run_cost; path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
} }
/* /*
......
...@@ -34,6 +34,12 @@ static void sort_inner_and_outer(PlannerInfo *root, RelOptInfo *joinrel, ...@@ -34,6 +34,12 @@ static void sort_inner_and_outer(PlannerInfo *root, RelOptInfo *joinrel,
static void match_unsorted_outer(PlannerInfo *root, RelOptInfo *joinrel, static void match_unsorted_outer(PlannerInfo *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel, RelOptInfo *outerrel, RelOptInfo *innerrel,
JoinType jointype, JoinPathExtraData *extra); JoinType jointype, JoinPathExtraData *extra);
static void consider_parallel_nestloop(PlannerInfo *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
JoinType jointype,
JoinPathExtraData *extra);
static void hash_inner_and_outer(PlannerInfo *root, RelOptInfo *joinrel, static void hash_inner_and_outer(PlannerInfo *root, RelOptInfo *joinrel,
RelOptInfo *outerrel, RelOptInfo *innerrel, RelOptInfo *outerrel, RelOptInfo *innerrel,
JoinType jointype, JoinPathExtraData *extra); JoinType jointype, JoinPathExtraData *extra);
...@@ -216,7 +222,12 @@ add_paths_to_joinrel(PlannerInfo *root, ...@@ -216,7 +222,12 @@ add_paths_to_joinrel(PlannerInfo *root,
jointype, &extra); jointype, &extra);
/* /*
* 6. Finally, give extensions a chance to manipulate the path list. * 6. Consider gathering partial paths.
*/
generate_gather_paths(root, joinrel);
/*
* 7. Finally, give extensions a chance to manipulate the path list.
*/ */
if (set_join_pathlist_hook) if (set_join_pathlist_hook)
set_join_pathlist_hook(root, joinrel, outerrel, innerrel, set_join_pathlist_hook(root, joinrel, outerrel, innerrel,
...@@ -329,6 +340,62 @@ try_nestloop_path(PlannerInfo *root, ...@@ -329,6 +340,62 @@ try_nestloop_path(PlannerInfo *root,
} }
} }
/*
 * try_partial_nestloop_path
 *	  Evaluate a candidate partial nested-loop join; when it looks
 *	  competitive, add it to the joinrel's partial_pathlist through
 *	  add_partial_path().
 *
 * 'outer_path' and 'inner_path' are the two input paths to be joined.
 * 'pathkeys' is the ordering the resulting join path would have.
 * 'jointype' and 'extra' are forwarded to the costing and path-creation
 * routines.
 */
static void
try_partial_nestloop_path(PlannerInfo *root,
						  RelOptInfo *joinrel,
						  Path *outer_path,
						  Path *inner_path,
						  List *pathkeys,
						  JoinType jointype,
						  JoinPathExtraData *extra)
{
	JoinCostWorkspace workspace;
	Path	   *joinpath;

	/*
	 * Parameterized partial paths are not supported, so the join rel must
	 * have no lateral references; the caller is expected to have checked
	 * that already.
	 */
	Assert(bms_is_empty(joinrel->lateral_relids));

	/*
	 * A parameterized inner path is acceptable only if every rel it needs
	 * is supplied by the proposed outer path; otherwise give up.
	 */
	if (inner_path->param_info != NULL &&
		!bms_is_subset(inner_path->param_info->ppi_req_outer,
					   outer_path->parent->relids))
		return;

	/*
	 * Compute a cheap lower bound on the join's cost first, and abandon the
	 * candidate immediately if the precheck rejects it.
	 */
	initial_cost_nestloop(root, &workspace, jointype,
						  outer_path, inner_path,
						  extra->sjinfo, &extra->semifactors);
	if (!add_partial_path_precheck(joinrel, workspace.total_cost, pathkeys))
		return;

	/* Promising enough: build the real path and offer it to the joinrel. */
	joinpath = (Path *) create_nestloop_path(root,
											 joinrel,
											 jointype,
											 &workspace,
											 extra->sjinfo,
											 &extra->semifactors,
											 outer_path,
											 inner_path,
											 extra->restrictlist,
											 pathkeys,
											 NULL);
	add_partial_path(joinrel, joinpath);
}
/* /*
* try_mergejoin_path * try_mergejoin_path
* Consider a merge join path; if it appears useful, push it into * Consider a merge join path; if it appears useful, push it into
...@@ -471,6 +538,62 @@ try_hashjoin_path(PlannerInfo *root, ...@@ -471,6 +538,62 @@ try_hashjoin_path(PlannerInfo *root,
} }
} }
/*
 * try_partial_hashjoin_path
 *	  Evaluate a candidate partial hash join; when it looks competitive,
 *	  add it to the joinrel's partial_pathlist through add_partial_path().
 *
 * 'outer_path' and 'inner_path' are the two input paths to be joined.
 * 'hashclauses' are the clauses to use as hash keys.
 * 'jointype' and 'extra' are forwarded to the costing and path-creation
 * routines.
 */
static void
try_partial_hashjoin_path(PlannerInfo *root,
						  RelOptInfo *joinrel,
						  Path *outer_path,
						  Path *inner_path,
						  List *hashclauses,
						  JoinType jointype,
						  JoinPathExtraData *extra)
{
	JoinCostWorkspace workspace;
	Path	   *joinpath;

	/*
	 * Parameterized partial paths are not supported, so the join rel must
	 * have no lateral references; the caller is expected to have checked
	 * that already.
	 */
	Assert(bms_is_empty(joinrel->lateral_relids));

	/* Reject any inner path that carries a non-empty parameterization. */
	if (inner_path->param_info != NULL &&
		!bms_is_empty(inner_path->param_info->ppi_req_outer))
		return;

	/*
	 * Compute a cheap lower bound on the join's cost first, and abandon the
	 * candidate immediately if the precheck rejects it.  The precheck is
	 * given NIL pathkeys.
	 */
	initial_cost_hashjoin(root, &workspace, jointype, hashclauses,
						  outer_path, inner_path,
						  extra->sjinfo, &extra->semifactors);
	if (!add_partial_path_precheck(joinrel, workspace.total_cost, NIL))
		return;

	/* Promising enough: build the real path and offer it to the joinrel. */
	joinpath = (Path *) create_hashjoin_path(root,
											 joinrel,
											 jointype,
											 &workspace,
											 extra->sjinfo,
											 &extra->semifactors,
											 outer_path,
											 inner_path,
											 extra->restrictlist,
											 NULL,
											 hashclauses);
	add_partial_path(joinrel, joinpath);
}
/* /*
* clause_sides_match_join * clause_sides_match_join
* Determine whether a join clause is of the right form to use in this join. * Determine whether a join clause is of the right form to use in this join.
...@@ -1063,6 +1186,85 @@ match_unsorted_outer(PlannerInfo *root, ...@@ -1063,6 +1186,85 @@ match_unsorted_outer(PlannerInfo *root,
break; break;
} }
} }
/*
* If the joinrel is parallel-safe and the join type supports nested loops,
* we may be able to consider a partial nestloop plan. However, we can't
* handle JOIN_UNIQUE_OUTER, because the outer path will be partial, and
* therefore we won't be able to properly guarantee uniqueness. Nor can
* we handle extra_lateral_rels, since partial paths must not be
* parameterized.
*/
if (joinrel->consider_parallel && nestjoinOK &&
save_jointype != JOIN_UNIQUE_OUTER &&
bms_is_empty(joinrel->lateral_relids))
consider_parallel_nestloop(root, joinrel, outerrel, innerrel,
save_jointype, extra);
}
/*
* consider_parallel_nestloop
* Try to build partial paths for a joinrel by joining a partial path for the
* outer relation to a complete path for the inner relation.
*
* 'joinrel' is the join relation
* 'outerrel' is the outer join relation
* 'innerrel' is the inner join relation
* 'jointype' is the type of join to do
* 'extra' contains additional input values
*/
static void
consider_parallel_nestloop(PlannerInfo *root,
RelOptInfo *joinrel,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
JoinType jointype,
JoinPathExtraData *extra)
{
ListCell *lc1;
foreach(lc1, outerrel->partial_pathlist)
{
Path *outerpath = (Path *) lfirst(lc1);
List *pathkeys;
ListCell *lc2;
/* Figure out what useful ordering any paths we create will have. */
pathkeys = build_join_pathkeys(root, joinrel, jointype,
outerpath->pathkeys);
/*
* Try the cheapest parameterized paths; only those which will
* produce an unparameterized path when joined to this outerrel
* will survive try_partial_nestloop_path. The cheapest
* unparameterized path is also in this list.
*/
foreach(lc2, innerrel->cheapest_parameterized_paths)
{
Path *innerpath = (Path *) lfirst(lc2);
/* Can't join to an inner path that is not parallel-safe */
if (!innerpath->parallel_safe)
continue;
/*
* Like match_unsorted_outer, we only consider a single nestloop
* path when the jointype is JOIN_UNIQUE_INNER. But we have to scan
* cheapest_parameterized_paths to find the one we want to consider,
* because cheapest_total_path might not be parallel-safe.
*/
if (jointype == JOIN_UNIQUE_INNER)
{
if (!bms_is_empty(PATH_REQ_OUTER(innerpath)))
continue;
innerpath = (Path *) create_unique_path(root, innerrel,
innerpath, extra->sjinfo);
}
try_partial_nestloop_path(root, joinrel, outerpath, innerpath,
pathkeys, jointype, extra);
}
}
} }
/* /*
...@@ -1240,6 +1442,55 @@ hash_inner_and_outer(PlannerInfo *root, ...@@ -1240,6 +1442,55 @@ hash_inner_and_outer(PlannerInfo *root,
} }
} }
} }
/*
* If the joinrel is parallel-safe, we may be able to consider a
* partial hash join. However, we can't handle JOIN_UNIQUE_OUTER,
* because the outer path will be partial, and therefore we won't be
* able to properly guarantee uniqueness. Also, the resulting path
* must not be parameterized.
*/
if (joinrel->consider_parallel && jointype != JOIN_UNIQUE_OUTER &&
outerrel->partial_pathlist != NIL &&
bms_is_empty(joinrel->lateral_relids))
{
Path *cheapest_partial_outer;
Path *cheapest_safe_inner = NULL;
cheapest_partial_outer =
(Path *) linitial(outerrel->partial_pathlist);
/*
* Normally, given that the joinrel is parallel-safe, the cheapest
* total inner path will also be parallel-safe, but if not, we'll
* have to search cheapest_parameterized_paths for the cheapest
* unparameterized inner path.
*/
if (cheapest_total_inner->parallel_safe)
cheapest_safe_inner = cheapest_total_inner;
else
{
ListCell *lc;
foreach(lc, innerrel->cheapest_parameterized_paths)
{
Path *innerpath = (Path *) lfirst(lc);
if (innerpath->parallel_safe &&
bms_is_empty(PATH_REQ_OUTER(innerpath)))
{
cheapest_safe_inner = innerpath;
break;
}
}
}
if (cheapest_safe_inner != NULL)
try_partial_hashjoin_path(root, joinrel,
cheapest_partial_outer,
cheapest_safe_inner,
hashclauses, jointype, extra);
}
} }
} }
......
...@@ -1194,9 +1194,10 @@ mark_dummy_rel(RelOptInfo *rel) ...@@ -1194,9 +1194,10 @@ mark_dummy_rel(RelOptInfo *rel)
/* Evict any previously chosen paths */ /* Evict any previously chosen paths */
rel->pathlist = NIL; rel->pathlist = NIL;
rel->partial_pathlist = NIL;
/* Set up the dummy path */ /* Set up the dummy path */
add_path(rel, (Path *) create_append_path(rel, NIL, NULL)); add_path(rel, (Path *) create_append_path(rel, NIL, NULL, 0));
/* Set or update cheapest_total_path and related fields */ /* Set or update cheapest_total_path and related fields */
set_cheapest(rel); set_cheapest(rel);
......
...@@ -1130,7 +1130,7 @@ create_gather_plan(PlannerInfo *root, GatherPath *best_path) ...@@ -1130,7 +1130,7 @@ create_gather_plan(PlannerInfo *root, GatherPath *best_path)
gather_plan = make_gather(subplan->targetlist, gather_plan = make_gather(subplan->targetlist,
NIL, NIL,
best_path->num_workers, best_path->path.parallel_degree,
best_path->single_copy, best_path->single_copy,
subplan); subplan);
......
...@@ -84,7 +84,8 @@ query_planner(PlannerInfo *root, List *tlist, ...@@ -84,7 +84,8 @@ query_planner(PlannerInfo *root, List *tlist,
/* The only path for it is a trivial Result path */ /* The only path for it is a trivial Result path */
add_path(final_rel, (Path *) add_path(final_rel, (Path *)
create_result_path((List *) parse->jointree->quals)); create_result_path(final_rel,
(List *) parse->jointree->quals));
/* Select cheapest path (pretty easy in this case...) */ /* Select cheapest path (pretty easy in this case...) */
set_cheapest(final_rel); set_cheapest(final_rel);
......
This diff is collapsed.
...@@ -107,6 +107,7 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptKind reloptkind) ...@@ -107,6 +107,7 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptKind reloptkind)
rel->reltargetlist = NIL; rel->reltargetlist = NIL;
rel->pathlist = NIL; rel->pathlist = NIL;
rel->ppilist = NIL; rel->ppilist = NIL;
rel->partial_pathlist = NIL;
rel->cheapest_startup_path = NULL; rel->cheapest_startup_path = NULL;
rel->cheapest_total_path = NULL; rel->cheapest_total_path = NULL;
rel->cheapest_unique_path = NULL; rel->cheapest_unique_path = NULL;
...@@ -370,6 +371,7 @@ build_join_rel(PlannerInfo *root, ...@@ -370,6 +371,7 @@ build_join_rel(PlannerInfo *root,
joinrel->reltargetlist = NIL; joinrel->reltargetlist = NIL;
joinrel->pathlist = NIL; joinrel->pathlist = NIL;
joinrel->ppilist = NIL; joinrel->ppilist = NIL;
joinrel->partial_pathlist = NIL;
joinrel->cheapest_startup_path = NULL; joinrel->cheapest_startup_path = NULL;
joinrel->cheapest_total_path = NULL; joinrel->cheapest_total_path = NULL;
joinrel->cheapest_unique_path = NULL; joinrel->cheapest_unique_path = NULL;
......
...@@ -458,6 +458,7 @@ typedef struct RelOptInfo ...@@ -458,6 +458,7 @@ typedef struct RelOptInfo
List *reltargetlist; /* Vars to be output by scan of relation */ List *reltargetlist; /* Vars to be output by scan of relation */
List *pathlist; /* Path structures */ List *pathlist; /* Path structures */
List *ppilist; /* ParamPathInfos used in pathlist */ List *ppilist; /* ParamPathInfos used in pathlist */
List *partial_pathlist; /* partial Paths */
struct Path *cheapest_startup_path; struct Path *cheapest_startup_path;
struct Path *cheapest_total_path; struct Path *cheapest_total_path;
struct Path *cheapest_unique_path; struct Path *cheapest_unique_path;
...@@ -761,6 +762,8 @@ typedef struct Path ...@@ -761,6 +762,8 @@ typedef struct Path
RelOptInfo *parent; /* the relation this path can build */ RelOptInfo *parent; /* the relation this path can build */
ParamPathInfo *param_info; /* parameterization info, or NULL if none */ ParamPathInfo *param_info; /* parameterization info, or NULL if none */
bool parallel_aware; /* engage parallel-aware logic? */ bool parallel_aware; /* engage parallel-aware logic? */
bool parallel_safe; /* OK to use as part of parallel plan? */
int parallel_degree; /* desired parallel degree; 0 = not parallel */
/* estimated size/costs for path (see costsize.c for more info) */ /* estimated size/costs for path (see costsize.c for more info) */
double rows; /* estimated number of result tuples */ double rows; /* estimated number of result tuples */
...@@ -1064,7 +1067,6 @@ typedef struct GatherPath ...@@ -1064,7 +1067,6 @@ typedef struct GatherPath
{ {
Path path; Path path;
Path *subpath; /* path for each worker */ Path *subpath; /* path for each worker */
int num_workers; /* number of workers sought to help */
bool single_copy; /* path must not be executed >1x */ bool single_copy; /* path must not be executed >1x */
} GatherPath; } GatherPath;
......
...@@ -72,7 +72,7 @@ extern double clamp_row_est(double nrows); ...@@ -72,7 +72,7 @@ extern double clamp_row_est(double nrows);
extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, extern double index_pages_fetched(double tuples_fetched, BlockNumber pages,
double index_pages, PlannerInfo *root); double index_pages, PlannerInfo *root);
extern void cost_seqscan(Path *path, PlannerInfo *root, RelOptInfo *baserel, extern void cost_seqscan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
ParamPathInfo *param_info, int nworkers); ParamPathInfo *param_info);
extern void cost_samplescan(Path *path, PlannerInfo *root, RelOptInfo *baserel, extern void cost_samplescan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
ParamPathInfo *param_info); ParamPathInfo *param_info);
extern void cost_index(IndexPath *path, PlannerInfo *root, extern void cost_index(IndexPath *path, PlannerInfo *root,
......
...@@ -29,9 +29,12 @@ extern void add_path(RelOptInfo *parent_rel, Path *new_path); ...@@ -29,9 +29,12 @@ extern void add_path(RelOptInfo *parent_rel, Path *new_path);
extern bool add_path_precheck(RelOptInfo *parent_rel, extern bool add_path_precheck(RelOptInfo *parent_rel,
Cost startup_cost, Cost total_cost, Cost startup_cost, Cost total_cost,
List *pathkeys, Relids required_outer); List *pathkeys, Relids required_outer);
extern void add_partial_path(RelOptInfo *parent_rel, Path *new_path);
extern bool add_partial_path_precheck(RelOptInfo *parent_rel,
Cost total_cost, List *pathkeys);
extern Path *create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, extern Path *create_seqscan_path(PlannerInfo *root, RelOptInfo *rel,
Relids required_outer, int nworkers); Relids required_outer, int parallel_degree);
extern Path *create_samplescan_path(PlannerInfo *root, RelOptInfo *rel, extern Path *create_samplescan_path(PlannerInfo *root, RelOptInfo *rel,
Relids required_outer); Relids required_outer);
extern IndexPath *create_index_path(PlannerInfo *root, extern IndexPath *create_index_path(PlannerInfo *root,
...@@ -59,19 +62,18 @@ extern BitmapOrPath *create_bitmap_or_path(PlannerInfo *root, ...@@ -59,19 +62,18 @@ extern BitmapOrPath *create_bitmap_or_path(PlannerInfo *root,
extern TidPath *create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, extern TidPath *create_tidscan_path(PlannerInfo *root, RelOptInfo *rel,
List *tidquals, Relids required_outer); List *tidquals, Relids required_outer);
extern AppendPath *create_append_path(RelOptInfo *rel, List *subpaths, extern AppendPath *create_append_path(RelOptInfo *rel, List *subpaths,
Relids required_outer); Relids required_outer, int parallel_degree);
extern MergeAppendPath *create_merge_append_path(PlannerInfo *root, extern MergeAppendPath *create_merge_append_path(PlannerInfo *root,
RelOptInfo *rel, RelOptInfo *rel,
List *subpaths, List *subpaths,
List *pathkeys, List *pathkeys,
Relids required_outer); Relids required_outer);
extern ResultPath *create_result_path(List *quals); extern ResultPath *create_result_path(RelOptInfo *rel, List *quals);
extern MaterialPath *create_material_path(RelOptInfo *rel, Path *subpath); extern MaterialPath *create_material_path(RelOptInfo *rel, Path *subpath);
extern UniquePath *create_unique_path(PlannerInfo *root, RelOptInfo *rel, extern UniquePath *create_unique_path(PlannerInfo *root, RelOptInfo *rel,
Path *subpath, SpecialJoinInfo *sjinfo); Path *subpath, SpecialJoinInfo *sjinfo);
extern GatherPath *create_gather_path(PlannerInfo *root, extern GatherPath *create_gather_path(PlannerInfo *root,
RelOptInfo *rel, Path *subpath, Relids required_outer, RelOptInfo *rel, Path *subpath, Relids required_outer);
int nworkers);
extern Path *create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel, extern Path *create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel,
List *pathkeys, Relids required_outer); List *pathkeys, Relids required_outer);
extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel, extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel,
......
...@@ -50,6 +50,8 @@ extern RelOptInfo *make_one_rel(PlannerInfo *root, List *joinlist); ...@@ -50,6 +50,8 @@ extern RelOptInfo *make_one_rel(PlannerInfo *root, List *joinlist);
extern RelOptInfo *standard_join_search(PlannerInfo *root, int levels_needed, extern RelOptInfo *standard_join_search(PlannerInfo *root, int levels_needed,
List *initial_rels); List *initial_rels);
extern void generate_gather_paths(PlannerInfo *root, RelOptInfo *rel);
#ifdef OPTIMIZER_DEBUG #ifdef OPTIMIZER_DEBUG
extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel);
#endif #endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment