Commit 92ee2528 authored by Tom Lane

Tweak processing of multiple-index-scan plans to reduce overhead when
handling many-way scans: instead of re-evaluating all prior indexscan
quals to see if a tuple has been fetched more than once, use a hash table
indexed by tuple CTID.  But fall back to the old way if the hash table
grows to exceed SortMem.
parent 38e2bf62
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/executor/nodeIndexscan.c,v 1.82 2003/08/04 02:39:59 momjian Exp $ * $Header: /cvsroot/pgsql/src/backend/executor/nodeIndexscan.c,v 1.83 2003/08/22 20:26:43 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -28,19 +28,51 @@ ...@@ -28,19 +28,51 @@
#include "access/heapam.h" #include "access/heapam.h"
#include "executor/execdebug.h" #include "executor/execdebug.h"
#include "executor/nodeIndexscan.h" #include "executor/nodeIndexscan.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h" #include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h" #include "optimizer/clauses.h"
#include "parser/parsetree.h" #include "parser/parsetree.h"
/* ----------------
* Misc stuff to move to executor.h soon -cim 6/5/90
* ----------------
*/
#define NO_OP 0 #define NO_OP 0
#define LEFT_OP 1 #define LEFT_OP 1
#define RIGHT_OP 2 #define RIGHT_OP 2
/*
* In a multiple-index plan, we must take care to return any given tuple
* only once, even if it matches conditions of several index scans. Our
* preferred way to do this is to record already-returned tuples in a hash
* table (using the TID as unique identifier). However, in a very large
* scan this could conceivably run out of memory. We limit the hash table
* to no more than SortMem KB; if it grows past that, we fall back to the
* pre-7.4 technique: evaluate the prior-scan index quals again for each
* tuple (which is space-efficient, but slow).
*
* When scanning backwards, we use scannum to determine when to emit the
* tuple --- we have to re-emit a tuple in the same scan as it was first
* encountered.
*
* Note: this code would break if the planner were ever to create a multiple
* index plan with overall backwards direction, because the hashtable code
* will emit a tuple the first time it is encountered (which would be the
* highest scan in which it matches the index), but the evaluate-the-quals
* code will emit a tuple in the lowest-numbered scan in which it's valid.
* This could be fixed at need by making the evaluate-the-quals case more
* complex. Currently the planner will never create such a plan (since it
* considers multi-index plans unordered anyway), so there's no need for
* more complexity.
*/
/*
 * One entry in the duplicate-elimination hash table of a multiple-index
 * scan.  IndexNext enters an entry for each fetched tuple, looking it up
 * by TID, and records which scan first produced it.
 *
 * The table is created with keysize = SizeOfIptrData, so only the leading
 * ItemPointerData bytes of the entry take part in hashing and comparison.
 */
typedef struct
{
/* tid is the hash key and so must be first! */
ItemPointerData tid; /* TID of a tuple we've returned */
/*
 * Set from iss_IndexPtr when the entry is created.  A later hit with a
 * different iss_IndexPtr is treated as a duplicate; a hit with the same
 * value means we are backing up within that scan and may re-emit.
 */
int scannum; /* number of scan we returned it in */
} DupHashTabEntry;
static TupleTableSlot *IndexNext(IndexScanState *node); static TupleTableSlot *IndexNext(IndexScanState *node);
static void create_duphash(IndexScanState *node);
/* ---------------------------------------------------------------- /* ----------------------------------------------------------------
* IndexNext * IndexNext
...@@ -163,7 +195,7 @@ IndexNext(IndexScanState *node) ...@@ -163,7 +195,7 @@ IndexNext(IndexScanState *node)
while ((tuple = index_getnext(scandesc, direction)) != NULL) while ((tuple = index_getnext(scandesc, direction)) != NULL)
{ {
/* /*
* store the scanned tuple in the scan tuple slot of the scan * Store the scanned tuple in the scan tuple slot of the scan
* state. Note: we pass 'false' because tuples returned by * state. Note: we pass 'false' because tuples returned by
* amgetnext are pointers onto disk pages and must not be * amgetnext are pointers onto disk pages and must not be
* pfree()'d. * pfree()'d.
...@@ -174,12 +206,55 @@ IndexNext(IndexScanState *node) ...@@ -174,12 +206,55 @@ IndexNext(IndexScanState *node)
false); /* don't pfree */ false); /* don't pfree */
/* /*
* We must check to see if the current tuple was already * If it's a multiple-index scan, make sure not to double-report
* matched by an earlier index, so we don't double-report it. * a tuple matched by more than one index. (See notes above.)
* We do this by passing the tuple through ExecQual and
* checking for failure with all previous qualifications.
*/ */
if (node->iss_IndexPtr > 0) if (numIndices > 1)
{
/* First try the hash table */
if (node->iss_DupHash)
{
DupHashTabEntry *entry;
bool found;
entry = (DupHashTabEntry *)
hash_search(node->iss_DupHash,
&tuple->t_data->t_ctid,
HASH_ENTER,
&found);
if (entry == NULL ||
node->iss_DupHash->hctl->nentries > node->iss_MaxHash)
{
/* out of memory (either hard or soft limit) */
/* release hash table and fall thru to old code */
hash_destroy(node->iss_DupHash);
node->iss_DupHash = NULL;
}
else if (found)
{
/* pre-existing entry */
/*
* It's duplicate if first emitted in a different
* scan. If same scan, we must be backing up, so
* okay to emit again.
*/
if (entry->scannum != node->iss_IndexPtr)
{
/* Dup, so drop it and loop back for another */
ExecClearTuple(slot);
continue;
}
}
else
{
/* new entry, finish filling it in */
entry->scannum = node->iss_IndexPtr;
}
}
/* If hash table has overflowed, do it the hard way */
if (node->iss_DupHash == NULL &&
node->iss_IndexPtr > 0)
{ {
bool prev_matches = false; bool prev_matches = false;
int prev_index; int prev_index;
...@@ -201,11 +276,12 @@ IndexNext(IndexScanState *node) ...@@ -201,11 +276,12 @@ IndexNext(IndexScanState *node)
} }
if (prev_matches) if (prev_matches)
{ {
/* Duplicate, so drop it and loop back for another */ /* Dup, so drop it and loop back for another */
ExecClearTuple(slot); ExecClearTuple(slot);
continue; continue;
} }
} }
}
return slot; /* OK to return tuple */ return slot; /* OK to return tuple */
} }
...@@ -383,6 +459,14 @@ ExecIndexReScan(IndexScanState *node, ExprContext *exprCtxt) ...@@ -383,6 +459,14 @@ ExecIndexReScan(IndexScanState *node, ExprContext *exprCtxt)
return; return;
} }
/* reset hash table */
if (numIndices > 1)
{
if (node->iss_DupHash)
hash_destroy(node->iss_DupHash);
create_duphash(node);
}
/* reset index scans */ /* reset index scans */
if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indxorderdir)) if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indxorderdir))
node->iss_IndexPtr = numIndices; node->iss_IndexPtr = numIndices;
...@@ -432,6 +516,10 @@ ExecEndIndexScan(IndexScanState *node) ...@@ -432,6 +516,10 @@ ExecEndIndexScan(IndexScanState *node)
ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
ExecClearTuple(node->ss.ss_ScanTupleSlot); ExecClearTuple(node->ss.ss_ScanTupleSlot);
/* drop hash table */
if (node->iss_DupHash)
hash_destroy(node->iss_DupHash);
/* /*
* close the index relations * close the index relations
*/ */
...@@ -919,12 +1007,42 @@ ExecInitIndexScan(IndexScan *node, EState *estate) ...@@ -919,12 +1007,42 @@ ExecInitIndexScan(IndexScan *node, EState *estate)
ExecAssignResultTypeFromTL(&indexstate->ss.ps); ExecAssignResultTypeFromTL(&indexstate->ss.ps);
ExecAssignScanProjectionInfo(&indexstate->ss); ExecAssignScanProjectionInfo(&indexstate->ss);
/*
* Initialize hash table if needed.
*/
if (numIndices > 1)
create_duphash(indexstate);
else
indexstate->iss_DupHash = NULL;
/* /*
* all done. * all done.
*/ */
return indexstate; return indexstate;
} }
/*
 * create_duphash
 *		Build the duplicate-detection hash table for a multiple-index scan.
 *
 * Sets node->iss_MaxHash to the number of entries that fit in SortMem
 * kilobytes (counting the per-element hash overhead plus the aligned entry
 * size), and sets node->iss_DupHash to a freshly created table keyed by
 * tuple TID.  The table is allocated in CurrentMemoryContext.
 *
 * The initial size is taken from the planner's row estimate, but clamped to
 * the range [1, iss_MaxHash].  Without the clamp a very large estimate would
 * make hash_create pre-size the table well beyond the SortMem limit that
 * IndexNext enforces afterwards, and converting an out-of-range (or NaN)
 * double to long is undefined behavior in C.
 *
 * Reports ERROR (out of memory) if the table cannot be created.
 */
static void
create_duphash(IndexScanState *node)
{
	HASHCTL		hash_ctl;
	double		est_rows;
	long		nelem;

	/* Soft limit on table size; compute it first so we can clamp to it */
	node->iss_MaxHash = (SortMem * 1024L) /
		(MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(DupHashTabEntry)));

	/*
	 * Clamp the estimate while it is still a double, so the cast to long
	 * below is always in range.
	 */
	est_rows = ceil(node->ss.ps.plan->plan_rows);
	if (est_rows > (double) node->iss_MaxHash)
		nelem = node->iss_MaxHash;
	else if (est_rows >= 1.0)
		nelem = (long) est_rows;
	else
		nelem = 1;				/* zero, negative, or NaN estimate */
	if (nelem < 1)
		nelem = 1;				/* in case iss_MaxHash itself is < 1 */

	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
	hash_ctl.keysize = SizeOfIptrData;	/* key is the leading TID only */
	hash_ctl.entrysize = sizeof(DupHashTabEntry);
	hash_ctl.hash = tag_hash;
	hash_ctl.hcxt = CurrentMemoryContext;
	node->iss_DupHash = hash_create("DupHashTable",
									nelem,
									&hash_ctl,
									HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
	if (node->iss_DupHash == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
}
int int
ExecCountSlotsIndexScan(IndexScan *node) ExecCountSlotsIndexScan(IndexScan *node)
{ {
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $Id: execnodes.h,v 1.104 2003/08/19 01:13:41 tgl Exp $ * $Id: execnodes.h,v 1.105 2003/08/22 20:26:43 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -768,6 +768,8 @@ typedef ScanState SeqScanState; ...@@ -768,6 +768,8 @@ typedef ScanState SeqScanState;
* RuntimeKeysReady true if runtime Skeys have been computed * RuntimeKeysReady true if runtime Skeys have been computed
* RelationDescs ptr to array of relation descriptors * RelationDescs ptr to array of relation descriptors
* ScanDescs ptr to array of scan descriptors * ScanDescs ptr to array of scan descriptors
* DupHash hashtable for recognizing dups in multiple scan
* MaxHash max # entries we will allow in hashtable
* ---------------- * ----------------
*/ */
typedef struct IndexScanState typedef struct IndexScanState
...@@ -785,6 +787,8 @@ typedef struct IndexScanState ...@@ -785,6 +787,8 @@ typedef struct IndexScanState
bool iss_RuntimeKeysReady; bool iss_RuntimeKeysReady;
RelationPtr iss_RelationDescs; RelationPtr iss_RelationDescs;
IndexScanDescPtr iss_ScanDescs; IndexScanDescPtr iss_ScanDescs;
HTAB *iss_DupHash;
long iss_MaxHash;
} IndexScanState; } IndexScanState;
/* ---------------- /* ----------------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment