Commit 2415ad98 authored by Tom Lane

Teach tuplestore.c to throw away data before the "mark" point when the caller
is using mark/restore but not rewind or backward-scan capability.  Insert a
materialize plan node between a mergejoin and its inner child if the inner
child is a sort that is expected to spill to disk.  The materialize shields
the sort from the need to do mark/restore and thereby allows it to perform
its final merge pass on-the-fly, while the materialize itself is normally
cheap since it won't spill to disk unless the number of tuples with equal
key values exceeds work_mem.

Greg Stark, with some kibitzing from Tom Lane.
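
To make the new tuplestore behavior concrete, here is a minimal sketch of the
calling pattern this commit enables.  The helper function name is hypothetical;
tuplestore_begin_heap, tuplestore_set_eflags, work_mem, and the EXEC_FLAG_*
bits are the real APIs touched by the patch.

    #include "postgres.h"
    #include "executor/executor.h"      /* EXEC_FLAG_REWIND / _BACKWARD / _MARK */
    #include "miscadmin.h"              /* work_mem */
    #include "utils/tuplestore.h"

    /*
     * Hypothetical helper: create a tuplestore that promises only
     * mark/restore capability (no rewind, no backward scan).  With these
     * eflags, tuplestore_markpos() is allowed to throw away data before
     * the mark point, which is what keeps the Material node cheap when it
     * sits between a merge join and a large sort.
     */
    static Tuplestorestate *
    make_mark_only_tuplestore(void)
    {
        Tuplestorestate *ts;

        /* randomAccess = false; the flags below override the defaults anyway */
        ts = tuplestore_begin_heap(false, false, work_mem);

        /* must be called before any tuples are inserted */
        tuplestore_set_eflags(ts, EXEC_FLAG_MARK);

        return ts;
    }

In the executor this pairing is driven by mj_ExtraMarks: when the merge join's
inner child is a Material node and REWIND was not requested, the join issues an
extra ExecMarkPos() each time it advances past an inner tuple it can never
revisit, so the tuplestore stays trimmed to roughly the current group of
equal-keyed tuples.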
parent 3963574d
src/backend/executor/nodeMaterial.c

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeMaterial.c,v 1.58 2007/01/05 22:19:28 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeMaterial.c,v 1.59 2007/05/21 17:57:33 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -56,10 +56,10 @@ ExecMaterial(MaterialState *node)
 	/*
 	 * If first time through, and we need a tuplestore, initialize it.
 	 */
-	if (tuplestorestate == NULL && node->randomAccess)
+	if (tuplestorestate == NULL && node->eflags != 0)
 	{
 		tuplestorestate = tuplestore_begin_heap(true, false, work_mem);
+		tuplestore_set_eflags(tuplestorestate, node->eflags);
 		node->tuplestorestate = (void *) tuplestorestate;
 	}
@@ -162,14 +162,14 @@ ExecInitMaterial(Material *node, EState *estate, int eflags)
 	matstate->ss.ps.state = estate;

 	/*
-	 * We must have random access to the subplan output to do backward scan or
-	 * mark/restore. We also prefer to materialize the subplan output if we
-	 * might be called on to rewind and replay it many times. However, if none
-	 * of these cases apply, we can skip storing the data.
+	 * We must have a tuplestore buffering the subplan output to do backward
+	 * scan or mark/restore.  We also prefer to materialize the subplan output
+	 * if we might be called on to rewind and replay it many times.  However,
+	 * if none of these cases apply, we can skip storing the data.
 	 */
-	matstate->randomAccess = (eflags & (EXEC_FLAG_REWIND |
+	matstate->eflags = (eflags & (EXEC_FLAG_REWIND |
 										EXEC_FLAG_BACKWARD |
-										EXEC_FLAG_MARK)) != 0;
+										EXEC_FLAG_MARK));
 	matstate->eof_underlying = false;
 	matstate->tuplestorestate = NULL;
@@ -255,7 +255,7 @@ ExecEndMaterial(MaterialState *node)
 void
 ExecMaterialMarkPos(MaterialState *node)
 {
-	Assert(node->randomAccess);
+	Assert(node->eflags & EXEC_FLAG_MARK);

 	/*
 	 * if we haven't materialized yet, just return.
@@ -275,7 +275,7 @@ ExecMaterialMarkPos(MaterialState *node)
 void
 ExecMaterialRestrPos(MaterialState *node)
 {
-	Assert(node->randomAccess);
+	Assert(node->eflags & EXEC_FLAG_MARK);

 	/*
 	 * if we haven't materialized yet, just return.
@@ -300,7 +300,7 @@ ExecMaterialReScan(MaterialState *node, ExprContext *exprCtxt)
 {
 	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);

-	if (node->randomAccess)
+	if (node->eflags != 0)
 	{
 		/*
 		 * If we haven't materialized yet, just return. If outerplan' chgParam
@@ -312,15 +312,21 @@ ExecMaterialReScan(MaterialState *node, ExprContext *exprCtxt)
 		/*
 		 * If subnode is to be rescanned then we forget previous stored
-		 * results; we have to re-read the subplan and re-store.
+		 * results; we have to re-read the subplan and re-store.  Also,
+		 * if we told tuplestore it needn't support rescan, we lose and
+		 * must re-read.  (This last should not happen in common cases;
+		 * else our caller lied by not passing EXEC_FLAG_REWIND to us.)
 		 *
 		 * Otherwise we can just rewind and rescan the stored output. The
 		 * state of the subnode does not change.
 		 */
-		if (((PlanState *) node)->lefttree->chgParam != NULL)
+		if (((PlanState *) node)->lefttree->chgParam != NULL ||
+			(node->eflags & EXEC_FLAG_REWIND) == 0)
 		{
 			tuplestore_end((Tuplestorestate *) node->tuplestorestate);
 			node->tuplestorestate = NULL;
+			if (((PlanState *) node)->lefttree->chgParam == NULL)
+				ExecReScan(((PlanState *) node)->lefttree, exprCtxt);
 			node->eof_underlying = false;
 		}
 		else
...
src/backend/executor/nodeMergejoin.c

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeMergejoin.c,v 1.87 2007/02/02 00:07:03 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeMergejoin.c,v 1.88 2007/05/21 17:57:33 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -706,6 +706,9 @@ ExecMergeJoin(MergeJoinState *node)
 				}
 				else
 				{
+					/* Mark before advancing, if wanted */
+					if (node->mj_ExtraMarks)
+						ExecMarkPos(innerPlan);
 					/* Stay in same state to fetch next inner tuple */
 					if (doFillInner)
 					{
@@ -830,6 +833,9 @@ ExecMergeJoin(MergeJoinState *node)
 				 * now we get the next inner tuple, if any.  If there's none,
 				 * advance to next outer tuple (which may be able to join to
 				 * previously marked tuples).
+				 *
+				 * NB: must NOT do "extraMarks" here, since we may need to
+				 * return to previously marked tuples.
 				 */
 				innerTupleSlot = ExecProcNode(innerPlan);
 				node->mj_InnerTupleSlot = innerTupleSlot;
@@ -1140,6 +1146,9 @@ ExecMergeJoin(MergeJoinState *node)
 				break;

 				/*
+				 * SKIPOUTER_ADVANCE: advance over an outer tuple that is
+				 * known not to join to any inner tuple.
+				 *
 				 * Before advancing, we check to see if we must emit an
 				 * outer-join fill tuple for this outer tuple.
 				 */
@@ -1204,6 +1213,9 @@ ExecMergeJoin(MergeJoinState *node)
 				break;

 				/*
+				 * SKIPINNER_ADVANCE: advance over an inner tuple that is
+				 * known not to join to any outer tuple.
+				 *
 				 * Before advancing, we check to see if we must emit an
 				 * outer-join fill tuple for this inner tuple.
 				 */
@@ -1225,6 +1237,10 @@ ExecMergeJoin(MergeJoinState *node)
 					return result;
 				}

+				/* Mark before advancing, if wanted */
+				if (node->mj_ExtraMarks)
+					ExecMarkPos(innerPlan);
+
 				/*
 				 * now we get the next inner tuple, if any
 				 */
@@ -1295,6 +1311,10 @@ ExecMergeJoin(MergeJoinState *node)
 					return result;
 				}

+				/* Mark before advancing, if wanted */
+				if (node->mj_ExtraMarks)
+					ExecMarkPos(innerPlan);
+
 				/*
 				 * now we get the next inner tuple, if any
 				 */
@@ -1425,6 +1445,22 @@ ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags)
 	innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate,
 											  eflags | EXEC_FLAG_MARK);

+	/*
+	 * For certain types of inner child nodes, it is advantageous to issue
+	 * MARK every time we advance past an inner tuple we will never return
+	 * to.  For other types, MARK on a tuple we cannot return to is a waste
+	 * of cycles.  Detect which case applies and set mj_ExtraMarks if we
+	 * want to issue "unnecessary" MARK calls.
+	 *
+	 * Currently, only Material wants the extra MARKs, and it will be helpful
+	 * only if eflags doesn't specify REWIND.
+	 */
+	if (IsA(innerPlan(node), Material) &&
+		(eflags & EXEC_FLAG_REWIND) == 0)
+		mergestate->mj_ExtraMarks = true;
+	else
+		mergestate->mj_ExtraMarks = false;
+
 #define MERGEJOIN_NSLOTS 4

 	/*
...
src/backend/optimizer/path/costsize.c

@@ -54,7 +54,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.182 2007/05/04 01:13:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.183 2007/05/21 17:57:33 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1038,6 +1038,23 @@ cost_sort(Path *path, PlannerInfo *root,
 	path->total_cost = startup_cost + run_cost;
 }

+/*
+ * sort_exceeds_work_mem
+ *	  Given a finished Sort plan node, detect whether it is expected to
+ *	  spill to disk (ie, will need more than work_mem workspace)
+ *
+ * This assumes there will be no available LIMIT.
+ */
+bool
+sort_exceeds_work_mem(Sort *sort)
+{
+	double		input_bytes = relation_byte_size(sort->plan.plan_rows,
+												 sort->plan.plan_width);
+	long		work_mem_bytes = work_mem * 1024L;
+
+	return (input_bytes > work_mem_bytes);
+}
+
 /*
  * cost_material
  *	  Determines and returns the cost of materializing a relation, including
...
src/backend/optimizer/plan/createplan.c

@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.230 2007/05/04 01:13:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.231 2007/05/21 17:57:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1600,6 +1600,30 @@ create_mergejoin_plan(PlannerInfo *root,
 	else
 		innerpathkeys = best_path->jpath.innerjoinpath->pathkeys;

+	/*
+	 * If inner plan is a sort that is expected to spill to disk, add a
+	 * materialize node to shield it from the need to handle mark/restore.
+	 * This will allow it to perform the last merge pass on-the-fly, while
+	 * in most cases not requiring the materialize to spill to disk.
+	 *
+	 * XXX really, Sort oughta do this for itself, probably, to avoid the
+	 * overhead of a separate plan node.
+	 */
+	if (IsA(inner_plan, Sort) &&
+		sort_exceeds_work_mem((Sort *) inner_plan))
+	{
+		Plan	   *matplan = (Plan *) make_material(inner_plan);
+
+		/*
+		 * We assume the materialize will not spill to disk, and therefore
+		 * charge just cpu_tuple_cost per tuple.
+		 */
+		copy_plan_costsize(matplan, inner_plan);
+		matplan->total_cost += cpu_tuple_cost * matplan->plan_rows;
+
+		inner_plan = matplan;
+	}
+
 	/*
 	 * Compute the opfamily/strategy/nullsfirst arrays needed by the executor.
 	 * The information is in the pathkeys for the two inputs, but we need to
...
src/backend/utils/sort/tuplestore.c

@@ -20,10 +20,12 @@
  * maxKBytes, we dump all the tuples into a temp file and then read from that
  * when needed.
  *
- * When the caller requests random access to the data, we write the temp file
+ * When the caller requests backward-scan capability, we write the temp file
  * in a format that allows either forward or backward scan.  Otherwise, only
- * forward scan is allowed.  But rewind and markpos/restorepos are allowed
- * in any case.
+ * forward scan is allowed.  Rewind and markpos/restorepos are normally allowed
+ * but can be turned off via tuplestore_set_eflags; turning off both backward
+ * scan and rewind enables truncation of the tuplestore at the mark point
+ * (if any) for minimal memory usage.
  *
  * Because we allow reading before writing is complete, there are two
  * interesting positions in the temp file: the current read position and
@@ -36,7 +38,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplestore.c,v 1.30 2007/01/05 22:19:47 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplestore.c,v 1.31 2007/05/21 17:57:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -44,6 +46,7 @@
 #include "postgres.h"

 #include "access/heapam.h"
+#include "executor/executor.h"
 #include "storage/buffile.h"
 #include "utils/memutils.h"
 #include "utils/tuplestore.h"
@@ -66,7 +69,7 @@ typedef enum
 struct Tuplestorestate
 {
 	TupStoreStatus status;		/* enumerated value as shown above */
-	bool		randomAccess;	/* did caller request random access? */
+	int			eflags;			/* capability flags */
 	bool		interXact;		/* keep open through transactions? */
 	long		availMem;		/* remaining memory available, in bytes */
 	BufFile    *myfile;			/* underlying file, or NULL if none */
@@ -157,11 +160,11 @@ struct Tuplestorestate
  * may or may not match the in-memory representation of the tuple ---
  * any conversion needed is the job of the writetup and readtup routines.
  *
- * If state->randomAccess is true, then the stored representation of the
- * tuple must be followed by another "unsigned int" that is a copy of the
+ * If state->eflags & EXEC_FLAG_BACKWARD, then the stored representation of
+ * the tuple must be followed by another "unsigned int" that is a copy of the
  * length --- so the total tape space used is actually sizeof(unsigned int)
  * more than the stored length value.  This allows read-backwards.  When
- * randomAccess is not true, the write/read routines may omit the extra
+ * EXEC_FLAG_BACKWARD is not set, the write/read routines may omit the extra
  * length word.
  *
  * writetup is expected to write both length words as well as the tuple
@@ -192,11 +195,12 @@ struct Tuplestorestate
  */
-static Tuplestorestate *tuplestore_begin_common(bool randomAccess,
+static Tuplestorestate *tuplestore_begin_common(int eflags,
 												bool interXact,
 												int maxKBytes);
 static void tuplestore_puttuple_common(Tuplestorestate *state, void *tuple);
 static void dumptuples(Tuplestorestate *state);
+static void tuplestore_trim(Tuplestorestate *state, int ntuples);
 static unsigned int getlen(Tuplestorestate *state, bool eofOK);
 static void *copytup_heap(Tuplestorestate *state, void *tup);
 static void writetup_heap(Tuplestorestate *state, void *tup);
@@ -209,14 +213,14 @@ static void *readtup_heap(Tuplestorestate *state, unsigned int len);
  * Initialize for a tuple store operation.
  */
 static Tuplestorestate *
-tuplestore_begin_common(bool randomAccess, bool interXact, int maxKBytes)
+tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
 {
 	Tuplestorestate *state;

 	state = (Tuplestorestate *) palloc0(sizeof(Tuplestorestate));

 	state->status = TSS_INMEM;
-	state->randomAccess = randomAccess;
+	state->eflags = eflags;
 	state->interXact = interXact;
 	state->availMem = maxKBytes * 1024L;
 	state->myfile = NULL;
@@ -255,9 +259,18 @@ tuplestore_begin_common(bool randomAccess, bool interXact, int maxKBytes)
 Tuplestorestate *
 tuplestore_begin_heap(bool randomAccess, bool interXact, int maxKBytes)
 {
-	Tuplestorestate *state = tuplestore_begin_common(randomAccess,
-													 interXact,
-													 maxKBytes);
+	Tuplestorestate *state;
+	int			eflags;
+
+	/*
+	 * This interpretation of the meaning of randomAccess is compatible
+	 * with the pre-8.3 behavior of tuplestores.
+	 */
+	eflags = randomAccess ?
+		(EXEC_FLAG_BACKWARD | EXEC_FLAG_REWIND | EXEC_FLAG_MARK) :
+		(EXEC_FLAG_REWIND | EXEC_FLAG_MARK);
+
+	state = tuplestore_begin_common(eflags, interXact, maxKBytes);

 	state->copytup = copytup_heap;
 	state->writetup = writetup_heap;
@@ -266,6 +279,30 @@ tuplestore_begin_heap(bool randomAccess, bool interXact, int maxKBytes)
 	return state;
 }

+/*
+ * tuplestore_set_eflags
+ *
+ * Set capability flags at a finer grain than is allowed by
+ * tuplestore_begin_xxx.  This must be called before inserting any data
+ * into the tuplestore.
+ *
+ * eflags is a bitmask following the meanings used for executor node
+ * startup flags (see executor.h).  tuplestore pays attention to these bits:
+ *		EXEC_FLAG_REWIND		need rewind to start
+ *		EXEC_FLAG_BACKWARD		need backward fetch
+ *		EXEC_FLAG_MARK			need mark/restore
+ * If tuplestore_set_eflags is not called, REWIND and MARK are allowed,
+ * and BACKWARD is set per "randomAccess" in the tuplestore_begin_xxx call.
+ */
+void
+tuplestore_set_eflags(Tuplestorestate *state, int eflags)
+{
+	Assert(state->status == TSS_INMEM);
+	Assert(state->memtupcount == 0);
+
+	state->eflags = eflags;
+}
+
 /*
  * tuplestore_end
  *
@@ -420,6 +457,9 @@ tuplestore_puttuple_common(Tuplestorestate *state, void *tuple)
  * Fetch the next tuple in either forward or back direction.
  * Returns NULL if no more tuples.  If should_free is set, the
  * caller must pfree the returned tuple when done with it.
+ *
+ * Backward scan is only allowed if randomAccess was set true or
+ * EXEC_FLAG_BACKWARD was specified to tuplestore_set_eflags().
  */
 static void *
 tuplestore_gettuple(Tuplestorestate *state, bool forward,
@@ -428,7 +468,7 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward,
 	unsigned int tuplen;
 	void	   *tup;

-	Assert(forward || state->randomAccess);
+	Assert(forward || (state->eflags & EXEC_FLAG_BACKWARD));

 	switch (state->status)
 	{
@@ -643,6 +683,8 @@ dumptuples(Tuplestorestate *state)
 void
 tuplestore_rescan(Tuplestorestate *state)
 {
+	Assert(state->eflags & EXEC_FLAG_REWIND);
+
 	switch (state->status)
 	{
 		case TSS_INMEM:
@@ -671,10 +713,26 @@ tuplestore_rescan(Tuplestorestate *state)
 void
 tuplestore_markpos(Tuplestorestate *state)
 {
+	Assert(state->eflags & EXEC_FLAG_MARK);
+
 	switch (state->status)
 	{
 		case TSS_INMEM:
 			state->markpos_current = state->current;
+
+			/*
+			 * We can truncate the tuplestore if neither backward scan nor
+			 * rewind capability are required by the caller.  There will
+			 * never be a need to back up past the mark point.
+			 *
+			 * Note: you might think we could remove all the tuples before
+			 * "current", since that one is the next to be returned.  However,
+			 * since tuplestore_gettuple returns a direct pointer to our
+			 * internal copy of the tuple, it's likely that the caller has
+			 * still got the tuple just before "current" referenced in a slot.
+			 * Don't free it yet.
+			 */
+			if (!(state->eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_REWIND)))
+				tuplestore_trim(state, 1);
 			break;
 		case TSS_WRITEFILE:
 			if (state->eof_reached)
@@ -708,6 +766,8 @@ tuplestore_markpos(Tuplestorestate *state)
 void
 tuplestore_restorepos(Tuplestorestate *state)
 {
+	Assert(state->eflags & EXEC_FLAG_MARK);
+
 	switch (state->status)
 	{
 		case TSS_INMEM:
@@ -733,6 +793,55 @@ tuplestore_restorepos(Tuplestorestate *state)
 	}
 }

+/*
+ * tuplestore_trim	- remove all but ntuples tuples before current
+ */
+static void
+tuplestore_trim(Tuplestorestate *state, int ntuples)
+{
+	int			nremove;
+	int			i;
+
+	/*
+	 * We don't bother trimming temp files since it usually would mean more
+	 * work than just letting them sit in kernel buffers until they age out.
+	 */
+	if (state->status != TSS_INMEM)
+		return;
+
+	nremove = state->current - ntuples;
+	if (nremove <= 0)
+		return;					/* nothing to do */
+	Assert(nremove <= state->memtupcount);
+
+	/* Release no-longer-needed tuples */
+	for (i = 0; i < nremove; i++)
+	{
+		FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
+		pfree(state->memtuples[i]);
+	}
+
+	/*
+	 * Slide the array down and readjust pointers.  This may look pretty
+	 * stupid, but we expect that there will usually not be very many
+	 * tuple-pointers to move, so this isn't that expensive; and it keeps
+	 * a lot of other logic simple.
+	 *
+	 * In fact, in the current usage for merge joins, it's demonstrable that
+	 * there will always be exactly one non-removed tuple; so optimize that
+	 * case.
+	 */
+	if (nremove + 1 == state->memtupcount)
+		state->memtuples[0] = state->memtuples[nremove];
+	else
+		memmove(state->memtuples, state->memtuples + nremove,
+				(state->memtupcount - nremove) * sizeof(void *));
+
+	state->memtupcount -= nremove;
+	state->current -= nremove;
+	state->markpos_current -= nremove;
+}
+
 /*
  * Tape interface routines
@@ -783,7 +892,7 @@ writetup_heap(Tuplestorestate *state, void *tup)
 	if (BufFileWrite(state->myfile, (void *) tuple, tuplen) != (size_t) tuplen)
 		elog(ERROR, "write failed");
-	if (state->randomAccess)	/* need trailing length word? */
+	if (state->eflags & EXEC_FLAG_BACKWARD)		/* need trailing length word? */
 		if (BufFileWrite(state->myfile, (void *) &tuplen,
 						 sizeof(tuplen)) != sizeof(tuplen))
 			elog(ERROR, "write failed");
@@ -804,7 +913,7 @@ readtup_heap(Tuplestorestate *state, unsigned int len)
 	if (BufFileRead(state->myfile, (void *) ((char *) tuple + sizeof(int)),
 					len - sizeof(int)) != (size_t) (len - sizeof(int)))
 		elog(ERROR, "unexpected end of data");
-	if (state->randomAccess)	/* need trailing length word? */
+	if (state->eflags & EXEC_FLAG_BACKWARD)		/* need trailing length word? */
 		if (BufFileRead(state->myfile, (void *) &tuplen,
 						sizeof(tuplen)) != sizeof(tuplen))
 			elog(ERROR, "unexpected end of data");
...
src/include/nodes/execnodes.h

@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.174 2007/05/17 19:35:08 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.175 2007/05/21 17:57:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1180,6 +1180,7 @@ typedef struct NestLoopState
  *		NumClauses		   number of mergejoinable join clauses
  *		Clauses			   info for each mergejoinable clause
  *		JoinState		   current "state" of join.  see execdefs.h
+ *		ExtraMarks		   true to issue extra Mark operations on inner scan
  *		FillOuter		   true if should emit unjoined outer tuples anyway
  *		FillInner		   true if should emit unjoined inner tuples anyway
  *		MatchedOuter	   true if found a join match for current outer tuple
@@ -1202,6 +1203,7 @@ typedef struct MergeJoinState
 	int			mj_NumClauses;
 	MergeJoinClause mj_Clauses; /* array of length mj_NumClauses */
 	int			mj_JoinState;
+	bool		mj_ExtraMarks;
 	bool		mj_FillOuter;
 	bool		mj_FillInner;
 	bool		mj_MatchedOuter;
@@ -1281,7 +1283,7 @@ typedef struct HashJoinState
 typedef struct MaterialState
 {
 	ScanState	ss;				/* its first field is NodeTag */
-	bool		randomAccess;	/* need random access to subplan output? */
+	int			eflags;			/* capability flags to pass to tuplestore */
 	bool		eof_underlying; /* reached end of underlying plan? */
 	void	   *tuplestorestate;	/* private state of tuplestore.c */
 } MaterialState;
...
src/include/optimizer/cost.h

@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.86 2007/05/04 01:13:45 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.87 2007/05/21 17:57:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -75,6 +75,7 @@ extern void cost_valuesscan(Path *path, PlannerInfo *root,
 extern void cost_sort(Path *path, PlannerInfo *root,
 		  List *pathkeys, Cost input_cost, double tuples, int width,
 		  double limit_tuples);
+extern bool sort_exceeds_work_mem(Sort *sort);
 extern void cost_material(Path *path,
 			  Cost input_cost, double tuples, int width);
 extern void cost_agg(Path *path, PlannerInfo *root,
...
src/include/utils/tuplestore.h

@@ -22,7 +22,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/tuplestore.h,v 1.20 2007/01/05 22:20:00 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/tuplestore.h,v 1.21 2007/05/21 17:57:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -46,6 +46,8 @@ extern Tuplestorestate *tuplestore_begin_heap(bool randomAccess,
 											  bool interXact,
 											  int maxKBytes);
+extern void tuplestore_set_eflags(Tuplestorestate *state, int eflags);
+
 extern void tuplestore_puttupleslot(Tuplestorestate *state,
 									TupleTableSlot *slot);
 extern void tuplestore_puttuple(Tuplestorestate *state, HeapTuple tuple);
@@ -53,7 +55,6 @@ extern void tuplestore_puttuple(Tuplestorestate *state, HeapTuple tuple);
 /* tuplestore_donestoring() used to be required, but is no longer used */
 #define tuplestore_donestoring(state)	((void) 0)

-/* backwards scan is only allowed if randomAccess was specified 'true' */
 extern bool tuplestore_gettupleslot(Tuplestorestate *state, bool forward,
 									TupleTableSlot *slot);
 extern bool tuplestore_advance(Tuplestorestate *state, bool forward);
...