Commit cbfa92c2 authored by Tom Lane's avatar Tom Lane

Improve index-only scans to avoid repeated access to the index page.

We copy all the matched tuples off the page during _bt_readpage, instead of
expensively re-locking the page during each subsequent tuple fetch.  This
costs a bit more local storage, but not more than 2*BLCKSZ worth, and the
reduction in LWLock traffic is certainly worth that.  What's more, this
lets us get rid of the API wart in the original patch that said an index AM
could randomly decline to supply an index tuple despite having asserted
pg_am.amcanreturn.  That will be important for future improvements in the
index-only-scan feature, since the executor will now be able to rely on
having the index data available.
parent 45401c1c
...@@ -394,12 +394,13 @@ amgettuple (IndexScanDesc scan, ...@@ -394,12 +394,13 @@ amgettuple (IndexScanDesc scan,
If the access method supports index-only scans (i.e., If the access method supports index-only scans (i.e.,
<structfield>amcanreturn</structfield> is TRUE in its <structname>pg_am</> <structfield>amcanreturn</structfield> is TRUE in its <structname>pg_am</>
row), then on success it must also check row), then on success it must also check
<literal>scan-&gt;xs_want_itup</>, and if that is true it should return <literal>scan-&gt;xs_want_itup</>, and if that is true it must return
the original indexed data for the index entry, in the form of an the original indexed data for the index entry, in the form of an
<structname>IndexTuple</> stored at <literal>scan-&gt;xs_itup</>. However, <structname>IndexTuple</> pointer stored at <literal>scan-&gt;xs_itup</>.
it is permissible for the access method to sometimes fail to provide this (Management of the data referenced by the pointer is the access method's
data, in which case it must set <literal>scan-&gt;xs_itup</> to NULL. That responsibility. The data must remain good at least until the next
will result in a regular heap fetch occurring. <function>amgettuple</>, <function>amrescan</>, or <function>amendscan</>
call for the scan.)
</para> </para>
<para> <para>
......
...@@ -443,9 +443,10 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) ...@@ -443,9 +443,10 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
Assert(TransactionIdIsValid(RecentGlobalXmin)); Assert(TransactionIdIsValid(RecentGlobalXmin));
/* /*
* The AM's gettuple proc finds the next index entry matching the scan * The AM's amgettuple proc finds the next index entry matching the scan
* keys, and puts the TID in xs_ctup.t_self. It should also set * keys, and puts the TID into scan->xs_ctup.t_self. It should also set
* scan->xs_recheck, though we pay no attention to that here. * scan->xs_recheck and possibly scan->xs_itup, though we pay no attention
* to those fields here.
*/ */
found = DatumGetBool(FunctionCall2(procedure, found = DatumGetBool(FunctionCall2(procedure,
PointerGetDatum(scan), PointerGetDatum(scan),
......
...@@ -73,7 +73,6 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, ...@@ -73,7 +73,6 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
BTCycleId cycleid); BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno); BlockNumber orig_blkno);
static IndexTuple bt_getindextuple(IndexScanDesc scan);
/* /*
...@@ -311,94 +310,9 @@ btgettuple(PG_FUNCTION_ARGS) ...@@ -311,94 +310,9 @@ btgettuple(PG_FUNCTION_ARGS)
else else
res = _bt_first(scan, dir); res = _bt_first(scan, dir);
/* Return the whole index tuple if requested */
if (scan->xs_want_itup)
{
/* First, free the last one ... */
if (scan->xs_itup != NULL)
{
pfree(scan->xs_itup);
scan->xs_itup = NULL;
}
if (res)
scan->xs_itup = bt_getindextuple(scan);
}
PG_RETURN_BOOL(res); PG_RETURN_BOOL(res);
} }
/*
* bt_getindextuple - fetch index tuple at current position.
*
* This can fail to find the tuple if new tuples have been inserted on the
* index page since we stepped onto the page. NULL is returned in that case.
* (We could try a bit harder by searching for the TID; but if insertions
* are happening, it's reasonably likely that an index-only scan will fail
* anyway because of visibility. So probably not worth the trouble.)
*
* The tuple returned is a palloc'd copy, so that we don't need to keep a
* lock on the index page.
*
* The caller must have pin on so->currPos.buf.
*/
static IndexTuple
bt_getindextuple(IndexScanDesc scan)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Page page;
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
int itemIndex;
OffsetNumber offnum;
IndexTuple ituple,
result;
Assert(BufferIsValid(so->currPos.buf));
LockBuffer(so->currPos.buf, BT_READ);
/* Locate the tuple, being paranoid about possibility the page changed */
page = BufferGetPage(so->currPos.buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
itemIndex = so->currPos.itemIndex;
/* pure paranoia */
Assert(itemIndex >= so->currPos.firstItem &&
itemIndex <= so->currPos.lastItem);
offnum = so->currPos.items[itemIndex].indexOffset;
if (offnum < minoff || offnum > maxoff)
{
/* should never happen, since we have pin on page, but be careful */
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
return NULL;
}
ituple = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
if (ItemPointerEquals(&ituple->t_tid, &scan->xs_ctup.t_self))
{
/* yup, it's the desired tuple, so make a copy */
Size itupsz = IndexTupleSize(ituple);
result = palloc(itupsz);
memcpy(result, ituple, itupsz);
}
else
{
/* oops, it got moved */
result = NULL;
}
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
return result;
}
/* /*
* btgetbitmap() -- gets all matching tuples, and adds them to a bitmap * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
*/ */
...@@ -471,6 +385,15 @@ btbeginscan(PG_FUNCTION_ARGS) ...@@ -471,6 +385,15 @@ btbeginscan(PG_FUNCTION_ARGS)
so->keyData = NULL; so->keyData = NULL;
so->killedItems = NULL; /* until needed */ so->killedItems = NULL; /* until needed */
so->numKilled = 0; so->numKilled = 0;
/*
* We don't know yet whether the scan will be index-only, so we do not
* allocate the tuple workspace arrays until btrescan.
*/
so->currTuples = so->markTuples = NULL;
so->currPos.nextTupleOffset = 0;
so->markPos.nextTupleOffset = 0;
scan->opaque = so; scan->opaque = so;
PG_RETURN_POINTER(scan); PG_RETURN_POINTER(scan);
...@@ -505,6 +428,18 @@ btrescan(PG_FUNCTION_ARGS) ...@@ -505,6 +428,18 @@ btrescan(PG_FUNCTION_ARGS)
} }
so->markItemIndex = -1; so->markItemIndex = -1;
/*
* Allocate tuple workspace arrays, if needed for an index-only scan and
* not already done in a previous rescan call. To save on palloc
* overhead, both workspaces are allocated as one palloc block; only this
* function and btendscan know that.
*/
if (scan->xs_want_itup && so->currTuples == NULL)
{
so->currTuples = (char *) palloc(BLCKSZ * 2);
so->markTuples = so->currTuples + BLCKSZ;
}
/* /*
* Reset the scan keys. Note that keys ordering stuff moved to _bt_first. * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
* - vadim 05/05/97 * - vadim 05/05/97
...@@ -544,18 +479,16 @@ btendscan(PG_FUNCTION_ARGS) ...@@ -544,18 +479,16 @@ btendscan(PG_FUNCTION_ARGS)
} }
so->markItemIndex = -1; so->markItemIndex = -1;
/* Release storage */
if (so->killedItems != NULL) if (so->killedItems != NULL)
pfree(so->killedItems); pfree(so->killedItems);
if (so->keyData != NULL) if (so->keyData != NULL)
pfree(so->keyData); pfree(so->keyData);
if (so->currTuples != NULL)
pfree(so->currTuples);
/* so->markTuples should not be pfree'd, see btrescan */
pfree(so); pfree(so);
if (scan->xs_itup != NULL)
{
pfree(scan->xs_itup);
scan->xs_itup = NULL;
}
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
...@@ -626,6 +559,9 @@ btrestrpos(PG_FUNCTION_ARGS) ...@@ -626,6 +559,9 @@ btrestrpos(PG_FUNCTION_ARGS)
memcpy(&so->currPos, &so->markPos, memcpy(&so->currPos, &so->markPos,
offsetof(BTScanPosData, items[1]) + offsetof(BTScanPosData, items[1]) +
so->markPos.lastItem * sizeof(BTScanPosItem)); so->markPos.lastItem * sizeof(BTScanPosItem));
if (so->currTuples)
memcpy(so->currTuples, so->markTuples,
so->markPos.nextTupleOffset);
} }
} }
......
...@@ -26,6 +26,8 @@ ...@@ -26,6 +26,8 @@
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
OffsetNumber offnum); OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, IndexTuple itup);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static Buffer _bt_walk_left(Relation rel, Buffer buf); static Buffer _bt_walk_left(Relation rel, Buffer buf);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
...@@ -429,8 +431,9 @@ _bt_compare(Relation rel, ...@@ -429,8 +431,9 @@ _bt_compare(Relation rel,
* if backwards scan, the last item) in the tree that satisfies the * if backwards scan, the last item) in the tree that satisfies the
* qualifications in the scan key. On success exit, the page containing * qualifications in the scan key. On success exit, the page containing
* the current index tuple is pinned but not locked, and data about * the current index tuple is pinned but not locked, and data about
* the matching tuple(s) on the page has been loaded into so->currPos, * the matching tuple(s) on the page has been loaded into so->currPos.
* and scan->xs_ctup.t_self is set to the heap TID of the current tuple. * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
* and if requested, scan->xs_itup points to a copy of the index tuple.
* *
* If there are no matching items in the index, we return FALSE, with no * If there are no matching items in the index, we return FALSE, with no
* pins or locks held. * pins or locks held.
...@@ -456,6 +459,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ...@@ -456,6 +459,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
int keysCount = 0; int keysCount = 0;
int i; int i;
StrategyNumber strat_total; StrategyNumber strat_total;
BTScanPosItem *currItem;
pgstat_count_index_scan(rel); pgstat_count_index_scan(rel);
...@@ -912,7 +916,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ...@@ -912,7 +916,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
/* OK, itemIndex says what to return */ /* OK, itemIndex says what to return */
scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid; currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_ctup.t_self = currItem->heapTid;
if (scan->xs_want_itup)
scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
return true; return true;
} }
...@@ -925,7 +932,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ...@@ -925,7 +932,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* previously returned. * previously returned.
* *
* On successful exit, scan->xs_ctup.t_self is set to the TID of the * On successful exit, scan->xs_ctup.t_self is set to the TID of the
* next heap tuple, and so->currPos is updated as needed. * next heap tuple, and if requested, scan->xs_itup points to a copy of
* the index tuple. so->currPos is updated as needed.
* *
* On failure exit (no more tuples), we release pin and set * On failure exit (no more tuples), we release pin and set
* so->currPos.buf to InvalidBuffer. * so->currPos.buf to InvalidBuffer.
...@@ -934,6 +942,7 @@ bool ...@@ -934,6 +942,7 @@ bool
_bt_next(IndexScanDesc scan, ScanDirection dir) _bt_next(IndexScanDesc scan, ScanDirection dir)
{ {
BTScanOpaque so = (BTScanOpaque) scan->opaque; BTScanOpaque so = (BTScanOpaque) scan->opaque;
BTScanPosItem *currItem;
/* /*
* Advance to next tuple on current page; or if there's no more, try to * Advance to next tuple on current page; or if there's no more, try to
...@@ -967,7 +976,10 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) ...@@ -967,7 +976,10 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
} }
/* OK, itemIndex says what to return */ /* OK, itemIndex says what to return */
scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid; currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_ctup.t_self = currItem->heapTid;
if (scan->xs_want_itup)
scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
return true; return true;
} }
...@@ -996,6 +1008,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) ...@@ -996,6 +1008,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
OffsetNumber minoff; OffsetNumber minoff;
OffsetNumber maxoff; OffsetNumber maxoff;
int itemIndex; int itemIndex;
IndexTuple itup;
bool continuescan; bool continuescan;
/* we must have the buffer pinned and locked */ /* we must have the buffer pinned and locked */
...@@ -1013,6 +1026,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) ...@@ -1013,6 +1026,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
*/ */
so->currPos.nextPage = opaque->btpo_next; so->currPos.nextPage = opaque->btpo_next;
/* initialize tuple workspace to empty */
so->currPos.nextTupleOffset = 0;
if (ScanDirectionIsForward(dir)) if (ScanDirectionIsForward(dir))
{ {
/* load items[] in ascending order */ /* load items[] in ascending order */
...@@ -1022,12 +1038,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) ...@@ -1022,12 +1038,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
while (offnum <= maxoff) while (offnum <= maxoff)
{ {
if (_bt_checkkeys(scan, page, offnum, dir, &continuescan)) itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
if (itup != NULL)
{ {
/* tuple passes all scan key conditions, so remember it */ /* tuple passes all scan key conditions, so remember it */
/* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */ _bt_saveitem(so, itemIndex, offnum, itup);
so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
so->currPos.items[itemIndex].indexOffset = offnum;
itemIndex++; itemIndex++;
} }
if (!continuescan) if (!continuescan)
...@@ -1054,13 +1069,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) ...@@ -1054,13 +1069,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
while (offnum >= minoff) while (offnum >= minoff)
{ {
if (_bt_checkkeys(scan, page, offnum, dir, &continuescan)) itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
if (itup != NULL)
{ {
/* tuple passes all scan key conditions, so remember it */ /* tuple passes all scan key conditions, so remember it */
/* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
itemIndex--; itemIndex--;
so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self; _bt_saveitem(so, itemIndex, offnum, itup);
so->currPos.items[itemIndex].indexOffset = offnum;
} }
if (!continuescan) if (!continuescan)
{ {
...@@ -1081,6 +1095,25 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) ...@@ -1081,6 +1095,25 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
return (so->currPos.firstItem <= so->currPos.lastItem); return (so->currPos.firstItem <= so->currPos.lastItem);
} }
/* Save an index item into so->currPos.items[itemIndex] */
static void
_bt_saveitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, IndexTuple itup)
{
BTScanPosItem *currItem = &so->currPos.items[itemIndex];
currItem->heapTid = itup->t_tid;
currItem->indexOffset = offnum;
if (so->currTuples)
{
Size itupsz = IndexTupleSize(itup);
currItem->tupleOffset = so->currPos.nextTupleOffset;
memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
so->currPos.nextTupleOffset += MAXALIGN(itupsz);
}
}
/* /*
* _bt_steppage() -- Step to next page containing valid data for scan * _bt_steppage() -- Step to next page containing valid data for scan
* *
...@@ -1119,6 +1152,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) ...@@ -1119,6 +1152,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
memcpy(&so->markPos, &so->currPos, memcpy(&so->markPos, &so->currPos,
offsetof(BTScanPosData, items[1]) + offsetof(BTScanPosData, items[1]) +
so->currPos.lastItem * sizeof(BTScanPosItem)); so->currPos.lastItem * sizeof(BTScanPosItem));
if (so->markTuples)
memcpy(so->markTuples, so->currTuples,
so->currPos.nextTupleOffset);
so->markPos.itemIndex = so->markItemIndex; so->markPos.itemIndex = so->markItemIndex;
so->markItemIndex = -1; so->markItemIndex = -1;
} }
...@@ -1428,6 +1464,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) ...@@ -1428,6 +1464,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Page page; Page page;
BTPageOpaque opaque; BTPageOpaque opaque;
OffsetNumber start; OffsetNumber start;
BTScanPosItem *currItem;
/* /*
* Scan down to the leftmost or rightmost leaf page. This is a simplified * Scan down to the leftmost or rightmost leaf page. This is a simplified
...@@ -1505,7 +1542,10 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) ...@@ -1505,7 +1542,10 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
/* OK, itemIndex says what to return */ /* OK, itemIndex says what to return */
scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid; currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_ctup.t_self = currItem->heapTid;
if (scan->xs_want_itup)
scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
return true; return true;
} }
...@@ -835,8 +835,8 @@ _bt_mark_scankey_required(ScanKey skey) ...@@ -835,8 +835,8 @@ _bt_mark_scankey_required(ScanKey skey)
/* /*
* Test whether an indextuple satisfies all the scankey conditions. * Test whether an indextuple satisfies all the scankey conditions.
* *
* If so, copy its TID into scan->xs_ctup.t_self, and return TRUE. * If so, return the address of the index tuple on the index page.
* If not, return FALSE (xs_ctup is not changed). * If not, return NULL.
* *
* If the tuple fails to pass the qual, we also determine whether there's * If the tuple fails to pass the qual, we also determine whether there's
* any need to continue the scan beyond this tuple, and set *continuescan * any need to continue the scan beyond this tuple, and set *continuescan
...@@ -848,14 +848,16 @@ _bt_mark_scankey_required(ScanKey skey) ...@@ -848,14 +848,16 @@ _bt_mark_scankey_required(ScanKey skey)
* offnum: offset number of index tuple (must be a valid item!) * offnum: offset number of index tuple (must be a valid item!)
* dir: direction we are scanning in * dir: direction we are scanning in
* continuescan: output parameter (will be set correctly in all cases) * continuescan: output parameter (will be set correctly in all cases)
*
* Caller must hold pin and lock on the index page.
*/ */
bool IndexTuple
_bt_checkkeys(IndexScanDesc scan, _bt_checkkeys(IndexScanDesc scan,
Page page, OffsetNumber offnum, Page page, OffsetNumber offnum,
ScanDirection dir, bool *continuescan) ScanDirection dir, bool *continuescan)
{ {
ItemId iid = PageGetItemId(page, offnum); ItemId iid = PageGetItemId(page, offnum);
bool tuple_valid; bool tuple_alive;
IndexTuple tuple; IndexTuple tuple;
TupleDesc tupdesc; TupleDesc tupdesc;
BTScanOpaque so; BTScanOpaque so;
...@@ -879,24 +881,24 @@ _bt_checkkeys(IndexScanDesc scan, ...@@ -879,24 +881,24 @@ _bt_checkkeys(IndexScanDesc scan,
if (ScanDirectionIsForward(dir)) if (ScanDirectionIsForward(dir))
{ {
if (offnum < PageGetMaxOffsetNumber(page)) if (offnum < PageGetMaxOffsetNumber(page))
return false; return NULL;
} }
else else
{ {
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (offnum > P_FIRSTDATAKEY(opaque)) if (offnum > P_FIRSTDATAKEY(opaque))
return false; return NULL;
} }
/* /*
* OK, we want to check the keys, but we'll return FALSE even if the * OK, we want to check the keys so we can set continuescan correctly,
* tuple passes the key tests. * but we'll return NULL even if the tuple passes the key tests.
*/ */
tuple_valid = false; tuple_alive = false;
} }
else else
tuple_valid = true; tuple_alive = true;
tuple = (IndexTuple) PageGetItem(page, iid); tuple = (IndexTuple) PageGetItem(page, iid);
...@@ -915,7 +917,7 @@ _bt_checkkeys(IndexScanDesc scan, ...@@ -915,7 +917,7 @@ _bt_checkkeys(IndexScanDesc scan,
{ {
if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan)) if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
continue; continue;
return false; return NULL;
} }
datum = index_getattr(tuple, datum = index_getattr(tuple,
...@@ -953,7 +955,7 @@ _bt_checkkeys(IndexScanDesc scan, ...@@ -953,7 +955,7 @@ _bt_checkkeys(IndexScanDesc scan,
/* /*
* In any case, this indextuple doesn't match the qual. * In any case, this indextuple doesn't match the qual.
*/ */
return false; return NULL;
} }
if (isNull) if (isNull)
...@@ -988,7 +990,7 @@ _bt_checkkeys(IndexScanDesc scan, ...@@ -988,7 +990,7 @@ _bt_checkkeys(IndexScanDesc scan,
/* /*
* In any case, this indextuple doesn't match the qual. * In any case, this indextuple doesn't match the qual.
*/ */
return false; return NULL;
} }
test = FunctionCall2Coll(&key->sk_func, key->sk_collation, test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
...@@ -1016,15 +1018,16 @@ _bt_checkkeys(IndexScanDesc scan, ...@@ -1016,15 +1018,16 @@ _bt_checkkeys(IndexScanDesc scan,
/* /*
* In any case, this indextuple doesn't match the qual. * In any case, this indextuple doesn't match the qual.
*/ */
return false; return NULL;
} }
} }
/* If we get here, the tuple passes all index quals. */ /* Check for failure due to it being a killed tuple. */
if (tuple_valid) if (!tuple_alive)
scan->xs_ctup.t_self = tuple->t_tid; return NULL;
return tuple_valid; /* If we get here, the tuple passes all index quals. */
return tuple;
} }
/* /*
......
...@@ -92,7 +92,7 @@ IndexNext(IndexScanState *node) ...@@ -92,7 +92,7 @@ IndexNext(IndexScanState *node)
* scan's xs_cbuf, ie, the previously visited heap page. It's not * scan's xs_cbuf, ie, the previously visited heap page. It's not
* clear whether it'd be better to release that pin. * clear whether it'd be better to release that pin.
*/ */
if (scandesc->xs_itup != NULL && if (scandesc->xs_want_itup &&
visibilitymap_test(scandesc->heapRelation, visibilitymap_test(scandesc->heapRelation,
ItemPointerGetBlockNumber(tid), ItemPointerGetBlockNumber(tid),
&node->iss_VMBuffer)) &node->iss_VMBuffer))
......
...@@ -472,12 +472,18 @@ typedef BTStackData *BTStack; ...@@ -472,12 +472,18 @@ typedef BTStackData *BTStack;
* items were killed, we re-lock the page to mark them killed, then unlock. * items were killed, we re-lock the page to mark them killed, then unlock.
* Finally we drop the pin and step to the next page in the appropriate * Finally we drop the pin and step to the next page in the appropriate
* direction. * direction.
*
* If we are doing an index-only scan, we save the entire IndexTuple for each
* matched item, otherwise only its heap TID and offset. The IndexTuples go
* into a separate workspace array; each BTScanPosItem stores its tuple's
* offset within that array.
*/ */
typedef struct BTScanPosItem /* what we remember about each match */ typedef struct BTScanPosItem /* what we remember about each match */
{ {
ItemPointerData heapTid; /* TID of referenced heap item */ ItemPointerData heapTid; /* TID of referenced heap item */
OffsetNumber indexOffset; /* index item's location within page */ OffsetNumber indexOffset; /* index item's location within page */
LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */
} BTScanPosItem; } BTScanPosItem;
typedef struct BTScanPosData typedef struct BTScanPosData
...@@ -495,6 +501,12 @@ typedef struct BTScanPosData ...@@ -495,6 +501,12 @@ typedef struct BTScanPosData
bool moreLeft; bool moreLeft;
bool moreRight; bool moreRight;
/*
* If we are doing an index-only scan, nextTupleOffset is the first free
* location in the associated tuple storage workspace.
*/
int nextTupleOffset;
/* /*
* The items array is always ordered in index order (ie, increasing * The items array is always ordered in index order (ie, increasing
* indexoffset). When scanning backwards it is convenient to fill the * indexoffset). When scanning backwards it is convenient to fill the
...@@ -524,6 +536,14 @@ typedef struct BTScanOpaqueData ...@@ -524,6 +536,14 @@ typedef struct BTScanOpaqueData
int *killedItems; /* currPos.items indexes of killed items */ int *killedItems; /* currPos.items indexes of killed items */
int numKilled; /* number of currently stored items */ int numKilled; /* number of currently stored items */
/*
* If we are doing an index-only scan, these are the tuple storage
* workspaces for the currPos and markPos respectively. Each is of
* size BLCKSZ, so it can hold as much as a full page's worth of tuples.
*/
char *currTuples; /* tuple storage for currPos */
char *markTuples; /* tuple storage for markPos */
/* /*
* If the marked position is on the same page as current position, we * If the marked position is on the same page as current position, we
* don't use markPos, but just keep the marked itemIndex in markItemIndex * don't use markPos, but just keep the marked itemIndex in markItemIndex
...@@ -620,7 +640,7 @@ extern ScanKey _bt_mkscankey_nodata(Relation rel); ...@@ -620,7 +640,7 @@ extern ScanKey _bt_mkscankey_nodata(Relation rel);
extern void _bt_freeskey(ScanKey skey); extern void _bt_freeskey(ScanKey skey);
extern void _bt_freestack(BTStack stack); extern void _bt_freestack(BTStack stack);
extern void _bt_preprocess_keys(IndexScanDesc scan); extern void _bt_preprocess_keys(IndexScanDesc scan);
extern bool _bt_checkkeys(IndexScanDesc scan, extern IndexTuple _bt_checkkeys(IndexScanDesc scan,
Page page, OffsetNumber offnum, Page page, OffsetNumber offnum,
ScanDirection dir, bool *continuescan); ScanDirection dir, bool *continuescan);
extern void _bt_killitems(IndexScanDesc scan, bool haveLock); extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
......
...@@ -79,7 +79,7 @@ typedef struct IndexScanDescData ...@@ -79,7 +79,7 @@ typedef struct IndexScanDescData
void *opaque; /* access-method-specific info */ void *opaque; /* access-method-specific info */
/* in an index-only scan, this is valid after a successful amgettuple */ /* in an index-only scan, this is valid after a successful amgettuple */
IndexTuple xs_itup; /* index tuple returned by AM, or NULL */ IndexTuple xs_itup; /* index tuple returned by AM */
/* xs_ctup/xs_cbuf/xs_recheck are valid after a successful index_getnext */ /* xs_ctup/xs_cbuf/xs_recheck are valid after a successful index_getnext */
HeapTupleData xs_ctup; /* current heap tuple, if any */ HeapTupleData xs_ctup; /* current heap tuple, if any */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment