Commit caf9c830 authored by Tom Lane

Improve planning of Materialize nodes inserted atop the inner input of a
mergejoin to shield it from doing mark/restore and refetches.  Put an explicit
flag in MergePath so we can centralize the logic that knows about this,
and add costing logic that considers using Materialize even when it's not
forced by the previously-existing considerations.  This is in response to
a discussion back in August that suggested that materializing an inner
indexscan can be helpful when the refetch percentage is high enough.
parent 29faadcd
......@@ -8,7 +8,7 @@
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.371 2009/10/28 14:55:38 tgl Exp $
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.372 2009/11/15 02:45:34 tgl Exp $
* Every node type that can appear in stored rules' parsetrees *must*
......@@ -1501,6 +1501,7 @@ _outMergePath(StringInfo str, MergePath *node)
static void
......@@ -8,7 +8,7 @@
* $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.188 2009/10/26 02:26:33 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/path/allpaths.c,v 1.189 2009/11/15 02:45:35 tgl Exp $
......@@ -1443,13 +1443,12 @@ print_path(PlannerInfo *root, Path *path, int indent)
MergePath *mp = (MergePath *) path;
if (mp->outersortkeys || mp->innersortkeys)
for (i = 0; i < indent; i++)
printf(" sortouter=%d sortinner=%d\n",
printf(" sortouter=%d sortinner=%d materializeinner=%d\n",
((mp->outersortkeys) ? 1 : 0),
((mp->innersortkeys) ? 1 : 0));
((mp->innersortkeys) ? 1 : 0),
((mp->materialize_inner) ? 1 : 0));
......@@ -54,7 +54,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.211 2009/09/12 22:12:03 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.212 2009/11/15 02:45:35 tgl Exp $
......@@ -1166,23 +1166,6 @@ cost_sort(Path *path, PlannerInfo *root,
path->total_cost = startup_cost + run_cost;
* sort_exceeds_work_mem
* Given a finished Sort plan node, detect whether it is expected to
* spill to disk (ie, will need more than work_mem workspace)
* This assumes there will be no available LIMIT.
sort_exceeds_work_mem(Sort *sort)
double input_bytes = relation_byte_size(sort->plan.plan_rows,
long work_mem_bytes = work_mem * 1024L;
return (input_bytes > work_mem_bytes);
* cost_material
* Determines and returns the cost of materializing a relation, including
......@@ -1543,7 +1526,18 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
* Determines and returns the cost of joining two relations using the
* merge join algorithm.
* 'path' is already filled in except for the cost fields
* Unlike other costsize functions, this routine makes one actual decision:
* whether we should materialize the inner path. We do that either because
* the inner path can't support mark/restore, or because it's cheaper to
* use an interposed Material node to handle mark/restore. When the decision
* is cost-based it would be logically cleaner to build and cost two separate
* paths with and without that flag set; but that would require repeating most
* of the calculations here, which are not all that cheap. Since the choice
* will not affect output pathkeys or startup cost, only total cost, there is
* no possibility of wanting to keep both paths. So it seems best to make
* the decision here and record it in the path's materialize_inner field.
* 'path' is already filled in except for the cost fields and materialize_inner
* 'sjinfo' is extra info about the join for selectivity estimation
* Notes: path's mergeclauses should be a subset of the joinrestrictinfo list;
......@@ -1561,7 +1555,10 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
List *innersortkeys = path->innersortkeys;
Cost startup_cost = 0;
Cost run_cost = 0;
Cost cpu_per_tuple;
Cost cpu_per_tuple,
QualCost merge_qual_cost;
QualCost qp_qual_cost;
double outer_path_rows = PATH_ROWS(outer_path);
......@@ -1606,10 +1603,7 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
* When there are equal merge keys in the outer relation, the mergejoin
* must rescan any matching tuples in the inner relation. This means
* re-fetching inner tuples. Our cost model for this is that a re-fetch
* costs the same as an original fetch, which is probably an overestimate;
* but on the other hand we ignore the bookkeeping costs of mark/restore.
* Not clear if it's worth developing a more refined model.
* re-fetching inner tuples; we have to estimate how often that happens.
* For regular inner and outer joins, the number of re-fetches can be
* estimated approximately as size of merge join output minus size of
......@@ -1641,7 +1635,7 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
if (rescannedtuples < 0)
rescannedtuples = 0;
/* We'll inflate inner run cost this much to account for rescanning */
/* We'll inflate various costs this much to account for rescanning */
rescanratio = 1.0 + (rescannedtuples / inner_path_rows);
......@@ -1778,31 +1772,82 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
startup_cost += sort_path.startup_cost;
startup_cost += (sort_path.total_cost - sort_path.startup_cost)
* innerstartsel * rescanratio;
run_cost += (sort_path.total_cost - sort_path.startup_cost)
* (innerendsel - innerstartsel) * rescanratio;
* If the inner sort is expected to spill to disk, we want to add a
* materialize node to shield it from the need to handle mark/restore.
* This will allow it to perform the last merge pass on-the-fly, while
* in most cases not requiring the materialize to spill to disk.
* Charge an extra cpu_tuple_cost per tuple to account for the
* materialize node. (Keep this estimate in sync with similar ones in
* create_mergejoin_path and create_mergejoin_plan.)
if (relation_byte_size(inner_path_rows, inner_path->parent->width) >
(work_mem * 1024L))
run_cost += cpu_tuple_cost * inner_path_rows;
* innerstartsel;
inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
* (innerendsel - innerstartsel);
startup_cost += inner_path->startup_cost;
startup_cost += (inner_path->total_cost - inner_path->startup_cost)
* innerstartsel * rescanratio;
run_cost += (inner_path->total_cost - inner_path->startup_cost)
* (innerendsel - innerstartsel) * rescanratio;
* innerstartsel;
inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
* (innerendsel - innerstartsel);
* Decide whether we want to materialize the inner input to shield it from
* mark/restore and performing re-fetches. Our cost model for regular
* re-fetches is that a re-fetch costs the same as an original fetch,
* which is probably an overestimate; but on the other hand we ignore the
* bookkeeping costs of mark/restore. Not clear if it's worth developing
* a more refined model. So we just need to inflate the inner run cost
* by rescanratio.
bare_inner_cost = inner_run_cost * rescanratio;
* When we interpose a Material node the re-fetch cost is assumed to be
* just cpu_tuple_cost per tuple, independently of the underlying plan's
* cost; but we have to charge an extra cpu_tuple_cost per original fetch
* as well. Note that we're assuming the materialize node will never
* spill to disk, since it only has to remember tuples back to the last
* mark. (If there are a huge number of duplicates, our other cost
* factors will make the path so expensive that it probably won't get
* chosen anyway.) So we don't use cost_rescan here.
* Note: keep this estimate in sync with create_mergejoin_plan's labeling
* of the generated Material node.
mat_inner_cost = inner_run_cost +
cpu_tuple_cost * inner_path_rows * rescanratio;
/* Prefer materializing if it looks cheaper */
if (mat_inner_cost < bare_inner_cost)
path->materialize_inner = true;
* Even if materializing doesn't look cheaper, we *must* do it if the
* inner path is to be used directly (without sorting) and it doesn't
* support mark/restore.
* Since the inner side must be ordered, and only Sorts and IndexScans can
* create order to begin with, and they both support mark/restore, you
* might think there's no problem --- but you'd be wrong. Nestloop and
* merge joins can *preserve* the order of their inputs, so they can be
* selected as the input of a mergejoin, and they don't support
* mark/restore at present.
else if (innersortkeys == NIL &&
path->materialize_inner = true;
* Also, force materializing if the inner path is to be sorted and the
* sort is expected to spill to disk. This is because the final merge
* pass can be done on-the-fly if it doesn't have to support mark/restore.
* We don't try to adjust the cost estimates for this consideration,
* though.
else if (innersortkeys != NIL &&
relation_byte_size(inner_path_rows, inner_path->parent->width) >
(work_mem * 1024L))
path->materialize_inner = true;
path->materialize_inner = false;
/* Charge the right incremental cost for the chosen case */
if (path->materialize_inner)
run_cost += mat_inner_cost;
run_cost += bare_inner_cost;
/* CPU costs */
......@@ -10,7 +10,7 @@
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.266 2009/10/26 02:26:33 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.267 2009/11/15 02:45:35 tgl Exp $
......@@ -1664,9 +1664,8 @@ create_mergejoin_plan(PlannerInfo *root,
* Create explicit sort nodes for the outer and inner join paths if
* necessary. The sort cost was already accounted for in the path. Make
* sure there are no excess columns in the inputs if sorting.
* Create explicit sort nodes for the outer and inner paths if necessary.
* Make sure there are no excess columns in the inputs if sorting.
if (best_path->outersortkeys)
......@@ -1695,23 +1694,17 @@ create_mergejoin_plan(PlannerInfo *root,
innerpathkeys = best_path->jpath.innerjoinpath->pathkeys;
* If inner plan is a sort that is expected to spill to disk, add a
* materialize node to shield it from the need to handle mark/restore.
* This will allow it to perform the last merge pass on-the-fly, while in
* most cases not requiring the materialize to spill to disk.
* XXX really, Sort oughta do this for itself, probably, to avoid the
* overhead of a separate plan node.
* If specified, add a materialize node to shield the inner plan from
* the need to handle mark/restore.
if (IsA(inner_plan, Sort) &&
sort_exceeds_work_mem((Sort *) inner_plan))
if (best_path->materialize_inner)
Plan *matplan = (Plan *) make_material(inner_plan);
* We assume the materialize will not spill to disk, and therefore
* charge just cpu_tuple_cost per tuple. (Keep this estimate in sync
* with similar ones in cost_mergejoin and create_mergejoin_path.)
* with cost_mergejoin.)
copy_plan_costsize(matplan, inner_plan);
matplan->total_cost += cpu_tuple_cost * matplan->plan_rows;
......@@ -1887,6 +1880,7 @@ create_mergejoin_plan(PlannerInfo *root,
/* Costs of sort and material steps are included in path cost already */
copy_path_costsize(&join_plan->join.plan, &best_path->jpath.path);
return join_plan;
......@@ -8,7 +8,7 @@
* $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.154 2009/09/17 20:49:29 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.155 2009/11/15 02:45:35 tgl Exp $
......@@ -17,7 +17,6 @@
#include <math.h>
#include "catalog/pg_operator.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
......@@ -1414,47 +1413,6 @@ create_mergejoin_path(PlannerInfo *root,
pathkeys_contained_in(innersortkeys, inner_path->pathkeys))
innersortkeys = NIL;
* If we are not sorting the inner path, we may need a materialize node to
* ensure it can be marked/restored.
* Since the inner side must be ordered, and only Sorts and IndexScans can
* create order to begin with, and they both support mark/restore, you
* might think there's no problem --- but you'd be wrong. Nestloop and
* merge joins can *preserve* the order of their inputs, so they can be
* selected as the input of a mergejoin, and they don't support
* mark/restore at present.
* Note: Sort supports mark/restore, so no materialize is really needed in
* that case; but one may be desirable anyway to optimize the sort.
* However, since we aren't representing the sort step separately in the
* Path tree, we can't explicitly represent the materialize either. So
* that case is not handled here. Instead, cost_mergejoin has to factor
* in the cost and create_mergejoin_plan has to add the plan node.
if (innersortkeys == NIL &&
Path *mpath;
mpath = (Path *) create_material_path(inner_path->parent, inner_path);
* We expect the materialize won't spill to disk (it could only do so
* if there were a whole lot of duplicate tuples, which is a case
* cost_mergejoin will avoid choosing anyway). Therefore
* cost_material's cost estimate is bogus and we should charge just
* cpu_tuple_cost per tuple. (Keep this estimate in sync with similar
* ones in cost_mergejoin and create_mergejoin_plan; also see
* cost_rescan.)
mpath->startup_cost = inner_path->startup_cost;
mpath->total_cost = inner_path->total_cost;
mpath->total_cost += cpu_tuple_cost * inner_path->parent->rows;
inner_path = mpath;
pathnode->jpath.path.pathtype = T_MergeJoin;
pathnode->jpath.path.parent = joinrel;
pathnode->jpath.jointype = jointype;
......@@ -1465,6 +1423,7 @@ create_mergejoin_path(PlannerInfo *root,
pathnode->path_mergeclauses = mergeclauses;
pathnode->outersortkeys = outersortkeys;
pathnode->innersortkeys = innersortkeys;
/* pathnode->materialize_inner will be set by cost_mergejoin */
cost_mergejoin(pathnode, root, sjinfo);
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.178 2009/10/26 02:26:43 tgl Exp $
* $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.179 2009/11/15 02:45:35 tgl Exp $
......@@ -835,6 +835,14 @@ typedef JoinPath NestPath;
* A mergejoin path has these fields.
* Unlike other path types, a MergePath node doesn't represent just a single
* run-time plan node: it can represent up to four. Aside from the MergeJoin
* node itself, there can be a Sort node for the outer input, a Sort node
* for the inner input, and/or a Material node for the inner input. We could
* represent these nodes by separate path nodes, but considering how many
* different merge paths are investigated during a complex join problem,
* it seems better to avoid unnecessary palloc overhead.
* path_mergeclauses lists the clauses (in the form of RestrictInfos)
* that will be used in the merge.
......@@ -846,7 +854,10 @@ typedef JoinPath NestPath;
* outersortkeys (resp. innersortkeys) is NIL if the outer path
* (resp. inner path) is already ordered appropriately for the
* mergejoin. If it is not NIL then it is a PathKeys list describing
* the ordering that must be created by an explicit sort step.
* the ordering that must be created by an explicit Sort node.
* materialize_inner is TRUE if a Material node should be placed atop the
* inner input. This may appear with or without an inner Sort step.
typedef struct MergePath
......@@ -855,6 +866,7 @@ typedef struct MergePath
List *path_mergeclauses; /* join clauses to be used for merge */
List *outersortkeys; /* keys for explicit sort, if any */
List *innersortkeys; /* keys for explicit sort, if any */
bool materialize_inner; /* add Materialize to inner? */
} MergePath;
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.98 2009/09/12 22:12:04 tgl Exp $
* $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.99 2009/11/15 02:45:35 tgl Exp $
......@@ -84,7 +84,6 @@ extern void cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm);
extern void cost_sort(Path *path, PlannerInfo *root,
List *pathkeys, Cost input_cost, double tuples, int width,
double limit_tuples);
extern bool sort_exceeds_work_mem(Sort *sort);
extern void cost_material(Path *path,
Cost input_startup_cost, Cost input_total_cost,
double tuples, int width);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment