Commit 2415ad98 authored by Tom Lane

Teach tuplestore.c to throw away data before the "mark" point when the caller
is using mark/restore but not rewind or backward-scan capability.  Insert a
materialize plan node between a mergejoin and its inner child if the inner
child is a sort that is expected to spill to disk.  The materialize shields
the sort from the need to do mark/restore and thereby allows it to perform
its final merge pass on-the-fly; while the materialize itself is normally
cheap since it won't spill to disk unless the number of tuples with equal
key values exceeds work_mem.

Greg Stark, with some kibitzing from Tom Lane.
parent 3963574d
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/executor/nodeMaterial.c,v 1.58 2007/01/05 22:19:28 momjian Exp $
* $PostgreSQL: pgsql/src/backend/executor/nodeMaterial.c,v 1.59 2007/05/21 17:57:33 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -56,10 +56,10 @@ ExecMaterial(MaterialState *node)
/*
* If first time through, and we need a tuplestore, initialize it.
*/
if (tuplestorestate == NULL && node->randomAccess)
if (tuplestorestate == NULL && node->eflags != 0)
{
tuplestorestate = tuplestore_begin_heap(true, false, work_mem);
tuplestore_set_eflags(tuplestorestate, node->eflags);
node->tuplestorestate = (void *) tuplestorestate;
}
......@@ -162,14 +162,14 @@ ExecInitMaterial(Material *node, EState *estate, int eflags)
matstate->ss.ps.state = estate;
/*
* We must have random access to the subplan output to do backward scan or
* mark/restore. We also prefer to materialize the subplan output if we
* might be called on to rewind and replay it many times. However, if none
* of these cases apply, we can skip storing the data.
* We must have a tuplestore buffering the subplan output to do backward
* scan or mark/restore. We also prefer to materialize the subplan output
* if we might be called on to rewind and replay it many times. However,
* if none of these cases apply, we can skip storing the data.
*/
matstate->randomAccess = (eflags & (EXEC_FLAG_REWIND |
EXEC_FLAG_BACKWARD |
EXEC_FLAG_MARK)) != 0;
matstate->eflags = (eflags & (EXEC_FLAG_REWIND |
EXEC_FLAG_BACKWARD |
EXEC_FLAG_MARK));
matstate->eof_underlying = false;
matstate->tuplestorestate = NULL;
......@@ -255,7 +255,7 @@ ExecEndMaterial(MaterialState *node)
void
ExecMaterialMarkPos(MaterialState *node)
{
Assert(node->randomAccess);
Assert(node->eflags & EXEC_FLAG_MARK);
/*
* if we haven't materialized yet, just return.
......@@ -275,7 +275,7 @@ ExecMaterialMarkPos(MaterialState *node)
void
ExecMaterialRestrPos(MaterialState *node)
{
Assert(node->randomAccess);
Assert(node->eflags & EXEC_FLAG_MARK);
/*
* if we haven't materialized yet, just return.
......@@ -300,7 +300,7 @@ ExecMaterialReScan(MaterialState *node, ExprContext *exprCtxt)
{
ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
if (node->randomAccess)
if (node->eflags != 0)
{
/*
* If we haven't materialized yet, just return. If outerplan's chgParam
......@@ -312,15 +312,21 @@ ExecMaterialReScan(MaterialState *node, ExprContext *exprCtxt)
/*
* If subnode is to be rescanned then we forget previous stored
* results; we have to re-read the subplan and re-store.
* results; we have to re-read the subplan and re-store. Also,
* if we told tuplestore it needn't support rescan, we lose and
* must re-read. (This last should not happen in common cases;
* else our caller lied by not passing EXEC_FLAG_REWIND to us.)
*
* Otherwise we can just rewind and rescan the stored output. The
* state of the subnode does not change.
*/
if (((PlanState *) node)->lefttree->chgParam != NULL)
if (((PlanState *) node)->lefttree->chgParam != NULL ||
(node->eflags & EXEC_FLAG_REWIND) == 0)
{
tuplestore_end((Tuplestorestate *) node->tuplestorestate);
node->tuplestorestate = NULL;
if (((PlanState *) node)->lefttree->chgParam == NULL)
ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
node->eof_underlying = false;
}
else
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/executor/nodeMergejoin.c,v 1.87 2007/02/02 00:07:03 tgl Exp $
* $PostgreSQL: pgsql/src/backend/executor/nodeMergejoin.c,v 1.88 2007/05/21 17:57:33 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -706,6 +706,9 @@ ExecMergeJoin(MergeJoinState *node)
}
else
{
/* Mark before advancing, if wanted */
if (node->mj_ExtraMarks)
ExecMarkPos(innerPlan);
/* Stay in same state to fetch next inner tuple */
if (doFillInner)
{
......@@ -830,6 +833,9 @@ ExecMergeJoin(MergeJoinState *node)
* now we get the next inner tuple, if any. If there's none,
* advance to next outer tuple (which may be able to join to
* previously marked tuples).
*
* NB: must NOT do "extraMarks" here, since we may need to
* return to previously marked tuples.
*/
innerTupleSlot = ExecProcNode(innerPlan);
node->mj_InnerTupleSlot = innerTupleSlot;
......@@ -1140,6 +1146,9 @@ ExecMergeJoin(MergeJoinState *node)
break;
/*
* SKIPOUTER_ADVANCE: advance over an outer tuple that is
* known not to join to any inner tuple.
*
* Before advancing, we check to see if we must emit an
* outer-join fill tuple for this outer tuple.
*/
......@@ -1204,6 +1213,9 @@ ExecMergeJoin(MergeJoinState *node)
break;
/*
* SKIPINNER_ADVANCE: advance over an inner tuple that is
* known not to join to any outer tuple.
*
* Before advancing, we check to see if we must emit an
* outer-join fill tuple for this inner tuple.
*/
......@@ -1225,6 +1237,10 @@ ExecMergeJoin(MergeJoinState *node)
return result;
}
/* Mark before advancing, if wanted */
if (node->mj_ExtraMarks)
ExecMarkPos(innerPlan);
/*
* now we get the next inner tuple, if any
*/
......@@ -1295,6 +1311,10 @@ ExecMergeJoin(MergeJoinState *node)
return result;
}
/* Mark before advancing, if wanted */
if (node->mj_ExtraMarks)
ExecMarkPos(innerPlan);
/*
* now we get the next inner tuple, if any
*/
......@@ -1425,6 +1445,22 @@ ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags)
innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate,
eflags | EXEC_FLAG_MARK);
/*
* For certain types of inner child nodes, it is advantageous to issue
* MARK every time we advance past an inner tuple we will never return
* to. For other types, MARK on a tuple we cannot return to is a waste
* of cycles. Detect which case applies and set mj_ExtraMarks if we
* want to issue "unnecessary" MARK calls.
*
* Currently, only Material wants the extra MARKs, and it will be helpful
* only if eflags doesn't specify REWIND.
*/
if (IsA(innerPlan(node), Material) &&
(eflags & EXEC_FLAG_REWIND) == 0)
mergestate->mj_ExtraMarks = true;
else
mergestate->mj_ExtraMarks = false;
#define MERGEJOIN_NSLOTS 4
/*
......
......@@ -54,7 +54,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.182 2007/05/04 01:13:44 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.183 2007/05/21 17:57:33 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -1038,6 +1038,23 @@ cost_sort(Path *path, PlannerInfo *root,
path->total_cost = startup_cost + run_cost;
}
/*
 * sort_exceeds_work_mem
 *	  Report whether a completed Sort plan node is predicted to need more
 *	  workspace than work_mem allows, i.e. whether it is expected to spill
 *	  to disk.
 *
 * The input size is estimated by relation_byte_size() from the node's
 * plan_rows and plan_width, and compared against work_mem scaled by 1024
 * to express it in bytes.
 *
 * No allowance is made for a LIMIT that might reduce the sort's memory
 * needs; the estimate assumes none is available.
 *
 * Returns true if the estimated input size exceeds the work_mem budget.
 */
bool
sort_exceeds_work_mem(Sort *sort)
{
	long		limit_bytes = work_mem * 1024L;
	double		sort_bytes;

	sort_bytes = relation_byte_size(sort->plan.plan_rows,
									sort->plan.plan_width);

	if (sort_bytes > limit_bytes)
		return true;
	return false;
}
/*
* cost_material
* Determines and returns the cost of materializing a relation, including
......
......@@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.230 2007/05/04 01:13:44 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.231 2007/05/21 17:57:34 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -1600,6 +1600,30 @@ create_mergejoin_plan(PlannerInfo *root,
else
innerpathkeys = best_path->jpath.innerjoinpath->pathkeys;
/*
* If inner plan is a sort that is expected to spill to disk, add a
* materialize node to shield it from the need to handle mark/restore.
* This will allow it to perform the last merge pass on-the-fly, while
* in most cases not requiring the materialize to spill to disk.
*
* XXX really, Sort oughta do this for itself, probably, to avoid the
* overhead of a separate plan node.
*/
if (IsA(inner_plan, Sort) &&
sort_exceeds_work_mem((Sort *) inner_plan))
{
Plan *matplan = (Plan *) make_material(inner_plan);
/*
* We assume the materialize will not spill to disk, and therefore
* charge just cpu_tuple_cost per tuple.
*/
copy_plan_costsize(matplan, inner_plan);
matplan->total_cost += cpu_tuple_cost * matplan->plan_rows;
inner_plan = matplan;
}
/*
* Compute the opfamily/strategy/nullsfirst arrays needed by the executor.
* The information is in the pathkeys for the two inputs, but we need to
......
This diff is collapsed.
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.174 2007/05/17 19:35:08 tgl Exp $
* $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.175 2007/05/21 17:57:34 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -1180,6 +1180,7 @@ typedef struct NestLoopState
* NumClauses number of mergejoinable join clauses
* Clauses info for each mergejoinable clause
* JoinState current "state" of join. see execdefs.h
* ExtraMarks true to issue extra Mark operations on inner scan
* FillOuter true if should emit unjoined outer tuples anyway
* FillInner true if should emit unjoined inner tuples anyway
* MatchedOuter true if found a join match for current outer tuple
......@@ -1202,6 +1203,7 @@ typedef struct MergeJoinState
int mj_NumClauses;
MergeJoinClause mj_Clauses; /* array of length mj_NumClauses */
int mj_JoinState;
bool mj_ExtraMarks;
bool mj_FillOuter;
bool mj_FillInner;
bool mj_MatchedOuter;
......@@ -1281,7 +1283,7 @@ typedef struct HashJoinState
typedef struct MaterialState
{
ScanState ss; /* its first field is NodeTag */
bool randomAccess; /* need random access to subplan output? */
int eflags; /* capability flags to pass to tuplestore */
bool eof_underlying; /* reached end of underlying plan? */
void *tuplestorestate; /* private state of tuplestore.c */
} MaterialState;
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.86 2007/05/04 01:13:45 tgl Exp $
* $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.87 2007/05/21 17:57:34 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -75,6 +75,7 @@ extern void cost_valuesscan(Path *path, PlannerInfo *root,
extern void cost_sort(Path *path, PlannerInfo *root,
List *pathkeys, Cost input_cost, double tuples, int width,
double limit_tuples);
extern bool sort_exceeds_work_mem(Sort *sort);
extern void cost_material(Path *path,
Cost input_cost, double tuples, int width);
extern void cost_agg(Path *path, PlannerInfo *root,
......
......@@ -22,7 +22,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/utils/tuplestore.h,v 1.20 2007/01/05 22:20:00 momjian Exp $
* $PostgreSQL: pgsql/src/include/utils/tuplestore.h,v 1.21 2007/05/21 17:57:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -46,6 +46,8 @@ extern Tuplestorestate *tuplestore_begin_heap(bool randomAccess,
bool interXact,
int maxKBytes);
extern void tuplestore_set_eflags(Tuplestorestate *state, int eflags);
extern void tuplestore_puttupleslot(Tuplestorestate *state,
TupleTableSlot *slot);
extern void tuplestore_puttuple(Tuplestorestate *state, HeapTuple tuple);
......@@ -53,7 +55,6 @@ extern void tuplestore_puttuple(Tuplestorestate *state, HeapTuple tuple);
/* tuplestore_donestoring() used to be required, but is no longer used */
#define tuplestore_donestoring(state) ((void) 0)
/* backwards scan is only allowed if randomAccess was specified 'true' */
extern bool tuplestore_gettupleslot(Tuplestorestate *state, bool forward,
TupleTableSlot *slot);
extern bool tuplestore_advance(Tuplestorestate *state, bool forward);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment