Commit 09cb5c0e authored by Tom Lane

Rewrite btree index scans to work a page at a time in all cases (both
btgettuple and btgetmulti).  This eliminates the problem of "re-finding" the
exact stopping point, since the stopping point is effectively always a page
boundary, and index items are never moved across pre-existing page boundaries.
A small penalty is that the keys_are_unique optimization is effectively
disabled (and, therefore, is removed in this patch), causing us to apply
_bt_checkkeys() to at least one more tuple than necessary when looking up a
unique key.  However, the advantages for non-unique cases seem great enough to
accept this tradeoff.  Aside from simplifying and (sometimes) speeding up the
indexscan code, this will allow us to reimplement btbulkdelete as a largely
sequential scan instead of index-order traversal, thereby significantly
reducing the cost of VACUUM.  Those changes will come in a separate patch.

Original patch by Heikki Linnakangas, rework by Tom Lane.
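
For orientation, here is a minimal sketch of the control flow this commit moves btree to. The wrapper name is hypothetical, kill_prior_tuple handling and error paths are omitted, and the usual backend headers are assumed; _bt_first, _bt_next, and BTScanPosIsValid are the interfaces the patch itself introduces (see the nbtree.h hunks below).

/*
 * Illustrative outline only (hypothetical wrapper name): a page-at-a-time
 * scan batches all matches from one leaf page while it is read-locked,
 * then hands them back one by one with no index page lock held.
 */
static bool
sketch_btree_gettuple(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	if (!BTScanPosIsValid(so->currPos))
		return _bt_first(scan, dir);	/* descend and batch the first page */

	/*
	 * Return the next saved item, stepping to (and batching) the next page
	 * only when the current batch is exhausted.
	 */
	return _bt_next(scan, dir);
}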
parent 88d94a11
src/backend/access/index/genam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.54 2006/03/05 15:58:21 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.55 2006/05/07 01:21:30 tgl Exp $
  *
  * NOTES
  *	  many of the old access method routines have been turned into
@@ -90,8 +90,6 @@ RelationGetIndexScan(Relation indexRelation,
 	scan->have_lock = false;	/* ditto */
 	scan->kill_prior_tuple = false;
 	scan->ignore_killed_tuples = true;	/* default setting */
-	scan->keys_are_unique = false;		/* may be set by index AM */
-	scan->got_tuple = false;
 	scan->opaque = NULL;
@@ -102,9 +100,6 @@ RelationGetIndexScan(Relation indexRelation,
 	scan->xs_ctup.t_data = NULL;
 	scan->xs_cbuf = InvalidBuffer;
-	scan->unique_tuple_pos = 0;
-	scan->unique_tuple_mark = 0;
 	pgstat_initstats(&scan->xs_pgstat_info, indexRelation);
 	/*
...
src/backend/access/index/indexam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.92 2006/05/02 22:25:10 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.93 2006/05/07 01:21:30 tgl Exp $
  *
  * INTERFACE ROUTINES
  *		index_open - open an index relation by relation OID
@@ -362,10 +362,6 @@ index_rescan(IndexScanDesc scan, ScanKey key)
 	}
 	scan->kill_prior_tuple = false;		/* for safety */
-	scan->keys_are_unique = false;		/* may be set by index AM */
-	scan->got_tuple = false;
-	scan->unique_tuple_pos = 0;
-	scan->unique_tuple_mark = 0;
 	FunctionCall2(procedure,
 				  PointerGetDatum(scan),
@@ -417,8 +413,6 @@ index_markpos(IndexScanDesc scan)
 	SCAN_CHECKS;
 	GET_SCAN_PROCEDURE(ammarkpos);
-	scan->unique_tuple_mark = scan->unique_tuple_pos;
 	FunctionCall1(procedure, PointerGetDatum(scan));
 }
@@ -440,13 +434,6 @@ index_restrpos(IndexScanDesc scan)
 	scan->kill_prior_tuple = false;		/* for safety */
-	/*
-	 * We do not reset got_tuple; so if the scan is actually being
-	 * short-circuited by index_getnext, the effective position restoration is
-	 * done by restoring unique_tuple_pos.
-	 */
-	scan->unique_tuple_pos = scan->unique_tuple_mark;
 	FunctionCall1(procedure, PointerGetDatum(scan));
 }
@@ -456,8 +443,7 @@
  * The result is the next heap tuple satisfying the scan keys and the
  * snapshot, or NULL if no more matching tuples exist.  On success,
  * the buffer containing the heap tuple is pinned (the pin will be dropped
- * at the next index_getnext or index_endscan).  The index TID corresponding
- * to the heap tuple can be obtained if needed from scan->currentItemData.
+ * at the next index_getnext or index_endscan).
  * ----------------
  */
 HeapTuple
@@ -469,65 +455,6 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
 	SCAN_CHECKS;
 	GET_SCAN_PROCEDURE(amgettuple);
-	/*
-	 * If we already got a tuple and it must be unique, there's no need to
-	 * make the index AM look through any additional tuples.  (This can save a
-	 * useful amount of work in scenarios where there are many dead tuples due
-	 * to heavy update activity.)
-	 *
-	 * To do this we must keep track of the logical scan position
-	 * (before/on/after tuple).  Also, we have to be sure to release scan
-	 * resources before returning NULL; if we fail to do so then a multi-index
-	 * scan can easily run the system out of free buffers.  We can release
-	 * index-level resources fairly cheaply by calling index_rescan.  This
-	 * means there are two persistent states as far as the index AM is
-	 * concerned: on-tuple and rescanned.  If we are actually asked to
-	 * re-fetch the single tuple, we have to go through a fresh indexscan
-	 * startup, which penalizes that (infrequent) case.
-	 */
-	if (scan->keys_are_unique && scan->got_tuple)
-	{
-		int			new_tuple_pos = scan->unique_tuple_pos;
-
-		if (ScanDirectionIsForward(direction))
-		{
-			if (new_tuple_pos <= 0)
-				new_tuple_pos++;
-		}
-		else
-		{
-			if (new_tuple_pos >= 0)
-				new_tuple_pos--;
-		}
-		if (new_tuple_pos == 0)
-		{
-			/*
-			 * We are moving onto the unique tuple from having been off it. We
-			 * just fall through and let the index AM do the work.  Note we
-			 * should get the right answer regardless of scan direction.
-			 */
-			scan->unique_tuple_pos = 0; /* need to update position */
-		}
-		else
-		{
-			/*
-			 * Moving off the tuple; must do amrescan to release index-level
-			 * pins before we return NULL.  Since index_rescan will reset my
-			 * state, must save and restore...
-			 */
-			int			unique_tuple_mark = scan->unique_tuple_mark;
-
-			index_rescan(scan, NULL /* no change to key */ );
-
-			scan->keys_are_unique = true;
-			scan->got_tuple = true;
-			scan->unique_tuple_pos = new_tuple_pos;
-			scan->unique_tuple_mark = unique_tuple_mark;
-
-			return NULL;
-		}
-	}
 	/* just make sure this is false... */
 	scan->kill_prior_tuple = false;
@@ -588,14 +515,6 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
 	}
 	/* Success exit */
-	scan->got_tuple = true;
-
-	/*
-	 * If we just fetched a known-unique tuple, then subsequent calls will go
-	 * through the short-circuit code above.  unique_tuple_pos has been
-	 * initialized to 0, which is the correct state ("on row").
-	 */
 	return heapTuple;
 }
@@ -608,8 +527,8 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
  * (which most callers of this routine will probably want to suppress by
  * setting scan->ignore_killed_tuples = false).
  *
- * On success (TRUE return), the found index TID is in scan->currentItemData,
- * and its heap TID is in scan->xs_ctup.t_self.  scan->xs_cbuf is untouched.
+ * On success (TRUE return), the heap TID of the found index entry is in
+ * scan->xs_ctup.t_self.  scan->xs_cbuf is untouched.
  * ----------------
  */
 bool
...
src/backend/access/nbtree/README
-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.10 2006/04/25 22:46:05 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.11 2006/05/07 01:21:30 tgl Exp $
 
 This directory contains a correct implementation of Lehman and Yao's
 high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@@ -67,13 +67,22 @@ move right until we find a page whose right-link matches the page we
 came from.  (Actually, it's even harder than that; see deletion discussion
 below.)
 
-Read locks on a page are held for as long as a scan is examining a page.
-But nbtree.c arranges to drop the read lock, but not the buffer pin,
-on the current page of a scan before control leaves nbtree.  When we
-come back to resume the scan, we have to re-grab the read lock and
-then move right if the current item moved (see _bt_restscan()).  Keeping
-the pin ensures that the current item cannot move left or be deleted
-(see btbulkdelete).
+Page read locks are held only for as long as a scan is examining a page.
+To minimize lock/unlock traffic, an index scan always searches a leaf page
+to identify all the matching items at once, copying their heap tuple IDs
+into backend-local storage.  The heap tuple IDs are then processed while
+not holding any page lock within the index.  We do continue to hold a pin
+on the leaf page, to protect against concurrent deletions (see below).
+In this state the scan is effectively stopped "between" pages, either
+before or after the page it has pinned.  This is safe in the presence of
+concurrent insertions and even page splits, because items are never moved
+across pre-existing page boundaries --- so the scan cannot miss any items
+it should have seen, nor accidentally return the same item twice.  The scan
+must remember the page's right-link at the time it was scanned, since that
+is the page to move right to; if we move right to the current right-link
+then we'd re-scan any items moved by a page split.  We don't similarly
+remember the left-link, since it's best to use the most up-to-date
+left-link when trying to move left (see detailed move-left algorithm below).
 
 In most cases we release our lock and pin on a page before attempting
 to acquire pin and lock on the page we are moving to.  In a few places
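
As a concrete illustration of the right-link rule just described, here is a rough sketch of a step-right helper. The function name is hypothetical; error handling, releasing the old page's pin, and the actual batching of the new page's matches (done elsewhere in the patch) are omitted, and the usual backend headers are assumed.

/*
 * Hypothetical sketch: step to the right sibling using the right-link that
 * was remembered when the current page was batched.
 */
static bool
sketch_step_right(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BlockNumber next = so->currPos.nextPage;	/* saved right-link, not the
												 * page's current right-link */

	if (next == P_NONE)
		return false;			/* no further pages to the right */

	/*
	 * Pin and read-lock the remembered sibling, batch its matching items,
	 * then drop the lock while keeping the pin (see the text above).
	 */
	so->currPos.buf = ReadBuffer(scan->indexRelation, next);
	LockBuffer(so->currPos.buf, BT_READ);
	/* ... collect matching heap TIDs into so->currPos.items[] here ... */
	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

	return true;
}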
@@ -119,14 +128,33 @@ item doesn't fit on the split page where it needs to go!
 The deletion algorithm
 ----------------------
 
-Deletions of leaf items are handled by getting a super-exclusive lock on
-the target page, so that no other backend has a pin on the page when the
-deletion starts.  This means no scan is pointing at the page, so no other
-backend can lose its place due to the item deletion.
-
-The above does not work for deletion of items in internal pages, since
-other backends keep no lock nor pin on a page they have descended past.
-Instead, when a backend is ascending the tree using its stack, it must
+Before deleting a leaf item, we get a super-exclusive lock on the target
+page, so that no other backend has a pin on the page when the deletion
+starts.  This is not necessary for correctness in terms of the btree index
+operations themselves; as explained above, index scans logically stop
+"between" pages and so can't lose their place.  The reason we do it is to
+provide an interlock between non-full VACUUM and indexscans.  Since VACUUM
+deletes index entries before deleting tuples, the super-exclusive lock
+guarantees that VACUUM can't delete any heap tuple that an indexscanning
+process might be about to visit.  (This guarantee works only for simple
+indexscans that visit the heap in sync with the index scan, not for bitmap
+scans.  We only need the guarantee when using non-MVCC snapshot rules such
+as SnapshotNow, so in practice this is only important for system catalog
+accesses.)
+
+Because a page can be split even while someone holds a pin on it, it is
+possible that an indexscan will return items that are no longer stored on
+the page it has a pin on, but rather somewhere to the right of that page.
+To ensure that VACUUM can't prematurely remove such heap tuples, we require
+btbulkdelete to obtain super-exclusive lock on every leaf page in the index
+(even pages that don't contain any deletable tuples).  This guarantees that
+the btbulkdelete call cannot return while any indexscan is still holding
+a copy of a deleted index tuple.  Note that this requirement does not say
+that btbulkdelete must visit the pages in any particular order.
+
+There is no such interlocking for deletion of items in internal pages,
+since backends keep no lock nor pin on a page they have descended past.
+Hence, when a backend is ascending the tree using its stack, it must
 be prepared for the possibility that the item it wants is to the left of
 the recorded position (but it can't have moved left out of the recorded
 page).  Since we hold a lock on the lower page (per L&Y) until we have
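
To illustrate the btbulkdelete side of this interlock (the bulk-delete code itself is in one of the collapsed diffs), the key step is taking a cleanup lock, PostgreSQL's super-exclusive page lock, on each leaf page; LockBufferForCleanup waits until the caller holds the only pin. The helper below is a sketch under those assumptions, with a hypothetical name, not the patch's actual code.

/*
 * Illustrative sketch only: visit one leaf page with a super-exclusive
 * (cleanup) lock so that no scan can still hold a pin on it, then remove
 * whatever index entries are deletable.  The order in which pages are
 * visited is up to the caller; the interlock does not require index order.
 */
static void
sketch_bulkdelete_leaf(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBufferForCleanup(buf);	/* blocks until we hold the only pin */

	/* ... delete any index entries whose heap tuples are known dead ... */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buf);
}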
@@ -201,7 +229,7 @@ accordingly.  Searches and forward scans simply follow the right-link
 until they find a non-dead page --- this will be where the deleted page's
 key-space moved to.
 
-Stepping left in a backward scan is complicated because we must consider
+Moving left in a backward scan is complicated because we must consider
 the possibility that the left sibling was just split (meaning we must find
 the rightmost page derived from the left sibling), plus the possibility
 that the page we were just on has now been deleted and hence isn't in the
...
[Two further file diffs in this commit are collapsed and not shown here.]
src/backend/access/nbtree/nbtutils.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.72 2006/03/05 15:58:21 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.73 2006/05/07 01:21:30 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -196,11 +196,6 @@ _bt_freestack(BTStack stack)
  * Again though, only keys with RHS datatype equal to the index datatype
  * can be checked for contradictions.
  *
- * Furthermore, we detect the case where the index is unique and we have
- * equality quals for all columns.  In this case there can be at most one
- * (visible) matching tuple.  index_getnext uses this to avoid uselessly
- * continuing the scan after finding one match.
- *
  * Row comparison keys are treated the same as comparisons to nondefault
  * datatypes: we just transfer them into the preprocessed array without any
  * editorialization.  We can treat them the same as an ordinary inequality
@@ -216,7 +211,6 @@ _bt_freestack(BTStack stack)
 void
 _bt_preprocess_keys(IndexScanDesc scan)
 {
-	Relation	relation = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	int			numberOfKeys = scan->numberOfKeys;
 	int			new_numberOfKeys;
@@ -234,7 +228,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
 	/* initialize result variables */
 	so->qual_ok = true;
 	so->numberOfKeys = 0;
-	scan->keys_are_unique = false;
 
 	if (numberOfKeys < 1)
 		return;					/* done if qual-less scan */
@@ -256,13 +249,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
 		 */
 		if (cur->sk_flags & SK_ISNULL)
 			so->qual_ok = false;
-		else if (relation->rd_index->indisunique &&
-				 relation->rd_rel->relnatts == 1)
-		{
-			/* it's a unique index, do we have an equality qual? */
-			if (cur->sk_strategy == BTEqualStrategyNumber)
-				scan->keys_are_unique = true;
-		}
 		memcpy(outkeys, inkeys, sizeof(ScanKeyData));
 		so->numberOfKeys = 1;
 		/* We can mark the qual as required if it's for first index col */
@@ -464,14 +450,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
 	}
 
 	so->numberOfKeys = new_numberOfKeys;
-
-	/*
-	 * If unique index and we have equality keys for all columns, set
-	 * keys_are_unique flag for higher levels.
-	 */
-	if (relation->rd_index->indisunique &&
-		relation->rd_rel->relnatts == numberOfEqualCols)
-		scan->keys_are_unique = true;
 }
 
 /*
@@ -826,3 +804,89 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
 	return result;
 }
+
+/*
+ * _bt_killitems - set LP_DELETE bit for items an indexscan caller has
+ * told us were killed
+ *
+ * scan->so contains information about the current page and killed tuples
+ * thereon (generally, this should only be called if so->numKilled > 0).
+ *
+ * The caller must have pin on so->currPos.buf, but may or may not have
+ * read-lock, as indicated by haveLock.  Note that we assume read-lock
+ * is sufficient for setting LP_DELETE hint bits.
+ *
+ * We match items by heap TID before assuming they are the right ones to
+ * delete.  We cope with cases where items have moved right due to insertions.
+ * If an item has moved off the current page due to a split, we'll fail to
+ * find it and do nothing (this is not an error case --- we assume the item
+ * will eventually get marked in a future indexscan).  Likewise, if the item
+ * has moved left due to deletions or disappeared itself, we'll not find it,
+ * but these cases are not worth optimizing.  (Since deletions are only done
+ * by VACUUM, any deletion makes it highly likely that the dead item has been
+ * removed itself, and therefore searching left is not worthwhile.)
+ */
+void
+_bt_killitems(IndexScanDesc scan, bool haveLock)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	Page		page;
+	BTPageOpaque opaque;
+	OffsetNumber minoff;
+	OffsetNumber maxoff;
+	int			i;
+	bool		killedsomething = false;
+
+	Assert(BufferIsValid(so->currPos.buf));
+
+	if (!haveLock)
+		LockBuffer(so->currPos.buf, BT_READ);
+
+	page = BufferGetPage(so->currPos.buf);
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	minoff = P_FIRSTDATAKEY(opaque);
+	maxoff = PageGetMaxOffsetNumber(page);
+
+	for (i = 0; i < so->numKilled; i++)
+	{
+		int			itemIndex = so->killedItems[i];
+		BTScanPosItem *kitem = &so->currPos.items[itemIndex];
+		OffsetNumber offnum = kitem->indexOffset;
+
+		Assert(itemIndex >= so->currPos.firstItem &&
+			   itemIndex <= so->currPos.lastItem);
+		if (offnum < minoff)
+			continue;			/* pure paranoia */
+		while (offnum <= maxoff)
+		{
+			ItemId		iid = PageGetItemId(page, offnum);
+			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);
+
+			if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+			{
+				/* found the item */
+				iid->lp_flags |= LP_DELETE;
+				killedsomething = true;
+				break;			/* out of inner search loop */
+			}
+			offnum = OffsetNumberNext(offnum);
+		}
+	}
+
+	/*
+	 * Since this can be redone later if needed, it's treated the same
+	 * as a commit-hint-bit status update for heap tuples: we mark the
+	 * buffer dirty but don't make a WAL log entry.
+	 */
+	if (killedsomething)
+		SetBufferCommitInfoNeedsSave(so->currPos.buf);
+
+	if (!haveLock)
+		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+
+	/*
+	 * Always reset the scan state, so we don't look for same items
+	 * on other pages.
+	 */
+	so->numKilled = 0;
+}
...
src/include/access/itup.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/itup.h,v 1.45 2006/03/05 15:58:53 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/itup.h,v 1.46 2006/05/07 01:21:30 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -126,6 +126,17 @@ typedef IndexAttributeBitMapData *IndexAttributeBitMap;
 	) \
 )
 
+/*
+ * MaxIndexTuplesPerPage is an upper bound on the number of tuples that can
+ * fit on one index page.  An index tuple must have either data or a null
+ * bitmap, so we can safely assume it's at least 1 byte bigger than a bare
+ * IndexTupleData struct.  We arrive at the divisor because each tuple
+ * must be maxaligned, and it must have an associated item pointer.
+ */
+#define MaxIndexTuplesPerPage	\
+	((int) ((BLCKSZ - offsetof(PageHeaderData, pd_linp)) / \
+			(MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))))
+
 /* routines in indextuple.c */
 extern IndexTuple index_form_tuple(TupleDesc tupleDescriptor,
...
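
As a rough worked example of the new MaxIndexTuplesPerPage bound (assuming the default 8 kB BLCKSZ, 8-byte MAXALIGN, an 8-byte bare IndexTupleData, a 4-byte ItemIdData, and a page header of roughly 24 bytes): (8192 - 24) / (MAXALIGN(8 + 1) + 4) = 8168 / 20, or about 408 index tuples per page. This bound sizes the items[] array added to the btree scan state in the nbtree.h hunk below.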
src/include/access/nbtree.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.96 2006/04/13 03:53:05 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.97 2006/05/07 01:21:30 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -336,30 +336,82 @@ typedef struct BTStackData
 typedef BTStackData *BTStack;
 
 /*
- * BTScanOpaqueData is used to remember which buffers we're currently
- * examining in an indexscan.  Between calls to btgettuple or btgetmulti,
- * we keep these buffers pinned (but not locked, see nbtree.c) to avoid
- * doing a ReadBuffer() for every tuple in the index.
+ * BTScanOpaqueData is the btree-private state needed for an indexscan.
+ * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
+ * details of the preprocessing), information about the current location
+ * of the scan, and information about the marked location, if any.  (We use
+ * BTScanPosData to represent the data needed for each of current and marked
+ * locations.)  In addition we can remember some known-killed index entries
+ * that must be marked before we can move off the current page.
  *
- * We also store preprocessed versions of the scan keys in this structure.
- * See _bt_preprocess_keys() for details of the preprocessing.
+ * Index scans work a page at a time: we pin and read-lock the page, identify
+ * all the matching items on the page and save them in BTScanPosData, then
+ * release the read-lock while returning the items to the caller for
+ * processing.  This approach minimizes lock/unlock traffic.  Note that we
+ * keep the pin on the index page until the caller is done with all the items
+ * (this is needed for VACUUM synchronization, see nbtree/README).  When we
+ * are ready to step to the next page, if the caller has told us any of the
+ * items were killed, we re-lock the page to mark them killed, then unlock.
+ * Finally we drop the pin and step to the next page in the appropriate
+ * direction.
  *
- * curHeapIptr & mrkHeapIptr are heap iptr-s from current/marked
- * index tuples: we don't adjust scans on insertions - instead we
- * use these pointers to restore index scan positions...
- *		- vadim 07/29/98
+ * NOTE: in this implementation, btree does not use or set the
+ * currentItemData and currentMarkData fields of IndexScanDesc at all.
  */
+
+typedef struct BTScanPosItem	/* what we remember about each match */
+{
+	ItemPointerData heapTid;	/* TID of referenced heap item */
+	OffsetNumber indexOffset;	/* index item's location within page */
+} BTScanPosItem;
+
+typedef struct BTScanPosData
+{
+	Buffer		buf;			/* if valid, the buffer is pinned */
+
+	BlockNumber nextPage;		/* page's right link when we scanned it */
+
+	/*
+	 * moreLeft and moreRight track whether we think there may be matching
+	 * index entries to the left and right of the current page, respectively.
+	 * We can clear the appropriate one of these flags when _bt_checkkeys()
+	 * returns continuescan = false.
+	 */
+	bool		moreLeft;
+	bool		moreRight;
+
+	/*
+	 * The items array is always ordered in index order (ie, increasing
+	 * indexoffset).  When scanning backwards it is convenient to fill the
+	 * array back-to-front, so we start at the last slot and fill downwards.
+	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
+	 * itemIndex is a cursor showing which entry was last returned to caller.
+	 */
+	int			firstItem;		/* first valid index in items[] */
+	int			lastItem;		/* last valid index in items[] */
+	int			itemIndex;		/* current index in items[] */
+
+	BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+} BTScanPosData;
+
+typedef BTScanPosData *BTScanPos;
+
+#define BTScanPosIsValid(scanpos)	BufferIsValid((scanpos).buf)
+
 typedef struct BTScanOpaqueData
 {
-	Buffer		btso_curbuf;
-	Buffer		btso_mrkbuf;
-	ItemPointerData curHeapIptr;
-	ItemPointerData mrkHeapIptr;
 	/* these fields are set by _bt_preprocess_keys(): */
 	bool		qual_ok;		/* false if qual can never be satisfied */
 	int			numberOfKeys;	/* number of preprocessed scan keys */
 	ScanKey		keyData;		/* array of preprocessed scan keys */
+
+	/* info about killed items if any (killedItems is NULL if never used) */
+	int		   *killedItems;	/* currPos.items indexes of killed items */
+	int			numKilled;		/* number of currently stored items */
+
+	/* keep these last in struct for efficiency */
+	BTScanPosData currPos;		/* current position data */
+	BTScanPosData markPos;		/* marked position, if any */
 } BTScanOpaqueData;
 
 typedef BTScanOpaqueData *BTScanOpaque;
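
To show how the itemIndex cursor in BTScanPosData is intended to be consumed, here is a sketch; the helper name is hypothetical, and stepping to another page when the batch runs out is left to the caller.

/*
 * Hypothetical sketch: return the heap TID of the next saved match in the
 * requested direction, or false when the current page's batch is exhausted
 * (the caller would then step to currPos.nextPage or to the left sibling).
 */
static bool
sketch_next_saved_item(BTScanOpaque so, ScanDirection dir, ItemPointer tid)
{
	BTScanPos	pos = &so->currPos;

	if (ScanDirectionIsForward(dir))
	{
		if (pos->itemIndex >= pos->lastItem)
			return false;		/* batch exhausted going right */
		pos->itemIndex++;
	}
	else
	{
		if (pos->itemIndex <= pos->firstItem)
			return false;		/* batch exhausted going left */
		pos->itemIndex--;
	}

	*tid = pos->items[pos->itemIndex].heapTid;	/* itemIndex = last returned */
	return true;
}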
@@ -424,9 +476,8 @@ extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
 			ScanKey scankey, bool nextkey);
 extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
 			Page page, OffsetNumber offnum);
-extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
-extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
+extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
 
 /*
@@ -440,6 +491,7 @@ extern void _bt_preprocess_keys(IndexScanDesc scan);
 extern bool _bt_checkkeys(IndexScanDesc scan,
 			Page page, OffsetNumber offnum,
 			ScanDirection dir, bool *continuescan);
+extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
 
 /*
  * prototypes for functions in nbtsort.c
...
src/include/access/relscan.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.44 2006/03/05 15:58:53 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.45 2006/05/07 01:21:30 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -67,12 +67,9 @@ typedef struct IndexScanDescData
 	bool		kill_prior_tuple;		/* last-returned tuple is dead */
 	bool		ignore_killed_tuples;	/* do not return killed entries */
 
-	/* set by index AM if scan keys satisfy index's uniqueness constraint */
-	bool		keys_are_unique;
-
-	/* scan current state */
-	bool		got_tuple;		/* true after successful index_getnext */
+	/* index access method's private state */
 	void	   *opaque;			/* access-method-specific info */
 
+	/* these fields are used by some but not all AMs: */
 	ItemPointerData currentItemData;	/* current index pointer */
 	ItemPointerData currentMarkData;	/* marked position, if any */
@@ -85,15 +82,6 @@ typedef struct IndexScanDescData
 	Buffer		xs_cbuf;		/* current heap buffer in scan, if any */
 	/* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
 
-	/*
-	 * If keys_are_unique and got_tuple are both true, we stop calling the
-	 * index AM; it is then necessary for index_getnext to keep track of the
-	 * logical scan position for itself.  It does that using unique_tuple_pos:
-	 * -1 = before row, 0 = on row, +1 = after row.
-	 */
-	int			unique_tuple_pos;		/* logical position */
-	int			unique_tuple_mark;		/* logical marked position */
-
 	PgStat_Info xs_pgstat_info; /* statistics collector hook */
 } IndexScanDescData;
...