Commit cbfa92c2 authored by Tom Lane

Improve index-only scans to avoid repeated access to the index page.

We copy all the matched tuples off the page during _bt_readpage, instead of
expensively re-locking the page during each subsequent tuple fetch.  This
costs a bit more local storage, but not more than 2*BLCKSZ worth, and the
reduction in LWLock traffic is certainly worth that.  What's more, this
lets us get rid of the API wart in the original patch that said an index AM
could randomly decline to supply an index tuple despite having asserted
pg_am.amcanreturn.  That will be important for future improvements in the
index-only-scan feature, since the executor will now be able to rely on
having the index data available.
parent 45401c1c
......@@ -394,12 +394,13 @@ amgettuple (IndexScanDesc scan,
If the access method supports index-only scans (i.e.,
<structfield>amcanreturn</structfield> is TRUE in its <structname>pg_am</>
row), then on success it must also check
<literal>scan-&gt;xs_want_itup</>, and if that is true it should return
<literal>scan-&gt;xs_want_itup</>, and if that is true it must return
the original indexed data for the index entry, in the form of an
<structname>IndexTuple</> stored at <literal>scan-&gt;xs_itup</>. However,
it is permissible for the access method to sometimes fail to provide this
data, in which case it must set <literal>scan-&gt;xs_itup</> to NULL. That
will result in a regular heap fetch occurring.
<structname>IndexTuple</> pointer stored at <literal>scan-&gt;xs_itup</>.
(Management of the data referenced by the pointer is the access method's
responsibility. The data must remain good at least until the next
<function>amgettuple</>, <function>amrescan</>, or <function>amendscan</>
call for the scan.)
</para>
<para>
......
......@@ -443,9 +443,10 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
Assert(TransactionIdIsValid(RecentGlobalXmin));
/*
* The AM's gettuple proc finds the next index entry matching the scan
* keys, and puts the TID in xs_ctup.t_self. It should also set
* scan->xs_recheck, though we pay no attention to that here.
* The AM's amgettuple proc finds the next index entry matching the scan
* keys, and puts the TID into scan->xs_ctup.t_self. It should also set
* scan->xs_recheck and possibly scan->xs_itup, though we pay no attention
* to those fields here.
*/
found = DatumGetBool(FunctionCall2(procedure,
PointerGetDatum(scan),
......
......@@ -73,7 +73,6 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno);
static IndexTuple bt_getindextuple(IndexScanDesc scan);
/*
......@@ -311,94 +310,9 @@ btgettuple(PG_FUNCTION_ARGS)
else
res = _bt_first(scan, dir);
/* Return the whole index tuple if requested */
if (scan->xs_want_itup)
{
/* First, free the last one ... */
if (scan->xs_itup != NULL)
{
pfree(scan->xs_itup);
scan->xs_itup = NULL;
}
if (res)
scan->xs_itup = bt_getindextuple(scan);
}
PG_RETURN_BOOL(res);
}
/*
 * bt_getindextuple - return a copy of the index tuple at the scan's
 *		current position, or NULL if it can no longer be located.
 *
 * The tuple is looked up at the page offset that so->currPos remembered for
 * the current item.  If new tuples were inserted on the index page after we
 * stepped onto it, the entry may no longer sit at that offset; in that case
 * we simply give up and return NULL.  (We could search the page for the TID
 * instead, but when insertions are happening it is reasonably likely that an
 * index-only scan will fail on visibility grounds anyway, so it is probably
 * not worth the trouble.)
 *
 * The result is a palloc'd copy, so no lock on the index page has to be
 * kept once we return; the page is read-locked only inside this function.
 *
 * The caller must hold a pin on so->currPos.buf.
 */
static IndexTuple
bt_getindextuple(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	IndexTuple	result = NULL;	/* stays NULL unless the tuple is found */
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	OffsetNumber offnum;
	int			itemIndex;

	Assert(BufferIsValid(so->currPos.buf));

	LockBuffer(so->currPos.buf, BT_READ);

	/* Work out the currently valid range of data offsets on the page */
	page = BufferGetPage(so->currPos.buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/* The current item must be one of those loaded into currPos */
	itemIndex = so->currPos.itemIndex;
	Assert(itemIndex >= so->currPos.firstItem &&
		   itemIndex <= so->currPos.lastItem);

	offnum = so->currPos.items[itemIndex].indexOffset;

	/*
	 * An out-of-range offset should never happen while we hold a pin on the
	 * page, but be careful: treat it like a tuple that has moved.
	 */
	if (offnum >= minoff && offnum <= maxoff)
	{
		IndexTuple	ituple;

		ituple = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

		/* Only accept the tuple if it still points at the expected heap TID */
		if (ItemPointerEquals(&ituple->t_tid, &scan->xs_ctup.t_self))
		{
			Size		itupsz = IndexTupleSize(ituple);

			result = palloc(itupsz);
			memcpy(result, ituple, itupsz);
		}
	}

	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

	return result;
}
/*
* btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
*/
......@@ -471,6 +385,15 @@ btbeginscan(PG_FUNCTION_ARGS)
so->keyData = NULL;
so->killedItems = NULL; /* until needed */
so->numKilled = 0;
/*
* We don't know yet whether the scan will be index-only, so we do not
* allocate the tuple workspace arrays until btrescan.
*/
so->currTuples = so->markTuples = NULL;
so->currPos.nextTupleOffset = 0;
so->markPos.nextTupleOffset = 0;
scan->opaque = so;
PG_RETURN_POINTER(scan);
......@@ -505,6 +428,18 @@ btrescan(PG_FUNCTION_ARGS)
}
so->markItemIndex = -1;
/*
* Allocate tuple workspace arrays, if needed for an index-only scan and
* not already done in a previous rescan call. To save on palloc
* overhead, both workspaces are allocated as one palloc block; only this
* function and btendscan know that.
*/
if (scan->xs_want_itup && so->currTuples == NULL)
{
so->currTuples = (char *) palloc(BLCKSZ * 2);
so->markTuples = so->currTuples + BLCKSZ;
}
/*
* Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
* - vadim 05/05/97
......@@ -544,18 +479,16 @@ btendscan(PG_FUNCTION_ARGS)
}
so->markItemIndex = -1;
/* Release storage */
if (so->killedItems != NULL)
pfree(so->killedItems);
if (so->keyData != NULL)
pfree(so->keyData);
if (so->currTuples != NULL)
pfree(so->currTuples);
/* so->markTuples should not be pfree'd, see btrescan */
pfree(so);
if (scan->xs_itup != NULL)
{
pfree(scan->xs_itup);
scan->xs_itup = NULL;
}
PG_RETURN_VOID();
}
......@@ -626,6 +559,9 @@ btrestrpos(PG_FUNCTION_ARGS)
memcpy(&so->currPos, &so->markPos,
offsetof(BTScanPosData, items[1]) +
so->markPos.lastItem * sizeof(BTScanPosItem));
if (so->currTuples)
memcpy(so->currTuples, so->markTuples,
so->markPos.nextTupleOffset);
}
}
......
......@@ -26,6 +26,8 @@
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, IndexTuple itup);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static Buffer _bt_walk_left(Relation rel, Buffer buf);
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
......@@ -429,8 +431,9 @@ _bt_compare(Relation rel,
* if backwards scan, the last item) in the tree that satisfies the
* qualifications in the scan key. On success exit, the page containing
* the current index tuple is pinned but not locked, and data about
* the matching tuple(s) on the page has been loaded into so->currPos,
* and scan->xs_ctup.t_self is set to the heap TID of the current tuple.
* the matching tuple(s) on the page has been loaded into so->currPos.
* scan->xs_ctup.t_self is set to the heap TID of the current tuple,
* and if requested, scan->xs_itup points to a copy of the index tuple.
*
* If there are no matching items in the index, we return FALSE, with no
* pins or locks held.
......@@ -456,6 +459,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
int keysCount = 0;
int i;
StrategyNumber strat_total;
BTScanPosItem *currItem;
pgstat_count_index_scan(rel);
......@@ -912,7 +916,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
/* OK, itemIndex says what to return */
scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_ctup.t_self = currItem->heapTid;
if (scan->xs_want_itup)
scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
return true;
}
......@@ -925,7 +932,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* previously returned.
*
* On successful exit, scan->xs_ctup.t_self is set to the TID of the
* next heap tuple, and so->currPos is updated as needed.
* next heap tuple, and if requested, scan->xs_itup points to a copy of
* the index tuple. so->currPos is updated as needed.
*
* On failure exit (no more tuples), we release pin and set
* so->currPos.buf to InvalidBuffer.
......@@ -934,6 +942,7 @@ bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
BTScanOpaque so = (BTScanOpaque) scan->opaque;
BTScanPosItem *currItem;
/*
* Advance to next tuple on current page; or if there's no more, try to
......@@ -967,7 +976,10 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
}
/* OK, itemIndex says what to return */
scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_ctup.t_self = currItem->heapTid;
if (scan->xs_want_itup)
scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
return true;
}
......@@ -996,6 +1008,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
OffsetNumber minoff;
OffsetNumber maxoff;
int itemIndex;
IndexTuple itup;
bool continuescan;
/* we must have the buffer pinned and locked */
......@@ -1013,6 +1026,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
*/
so->currPos.nextPage = opaque->btpo_next;
/* initialize tuple workspace to empty */
so->currPos.nextTupleOffset = 0;
if (ScanDirectionIsForward(dir))
{
/* load items[] in ascending order */
......@@ -1022,12 +1038,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
while (offnum <= maxoff)
{
if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
if (itup != NULL)
{
/* tuple passes all scan key conditions, so remember it */
/* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
so->currPos.items[itemIndex].indexOffset = offnum;
_bt_saveitem(so, itemIndex, offnum, itup);
itemIndex++;
}
if (!continuescan)
......@@ -1054,13 +1069,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
while (offnum >= minoff)
{
if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
if (itup != NULL)
{
/* tuple passes all scan key conditions, so remember it */
/* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
itemIndex--;
so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
so->currPos.items[itemIndex].indexOffset = offnum;
_bt_saveitem(so, itemIndex, offnum, itup);
}
if (!continuescan)
{
......@@ -1081,6 +1095,25 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
return (so->currPos.firstItem <= so->currPos.lastItem);
}
/*
 * _bt_saveitem() -- remember one matching index entry in
 *		so->currPos.items[itemIndex]
 *
 * The heap TID is taken from the index tuple itself and stored together
 * with the tuple's offset number on the index page.  If the scan has a
 * tuple workspace (so->currTuples is set), the whole index tuple is also
 * appended to that workspace and its starting offset is recorded in the
 * item; the workspace fill pointer is then advanced by the MAXALIGN'd
 * tuple size.
 */
static void
_bt_saveitem(BTScanOpaque so, int itemIndex,
			 OffsetNumber offnum, IndexTuple itup)
{
	BTScanPosItem *slot = so->currPos.items + itemIndex;
	Size		tuplen;

	slot->heapTid = itup->t_tid;
	slot->indexOffset = offnum;

	/* Without a workspace there is nothing more to save */
	if (!so->currTuples)
		return;

	tuplen = IndexTupleSize(itup);

	/* The copy goes at the first free location of the workspace */
	slot->tupleOffset = so->currPos.nextTupleOffset;
	memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, tuplen);

	/* Keep the next saved tuple maxaligned */
	so->currPos.nextTupleOffset += MAXALIGN(tuplen);
}
/*
* _bt_steppage() -- Step to next page containing valid data for scan
*
......@@ -1119,6 +1152,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
memcpy(&so->markPos, &so->currPos,
offsetof(BTScanPosData, items[1]) +
so->currPos.lastItem * sizeof(BTScanPosItem));
if (so->markTuples)
memcpy(so->markTuples, so->currTuples,
so->currPos.nextTupleOffset);
so->markPos.itemIndex = so->markItemIndex;
so->markItemIndex = -1;
}
......@@ -1428,6 +1464,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Page page;
BTPageOpaque opaque;
OffsetNumber start;
BTScanPosItem *currItem;
/*
* Scan down to the leftmost or rightmost leaf page. This is a simplified
......@@ -1505,7 +1542,10 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
/* OK, itemIndex says what to return */
scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
currItem = &so->currPos.items[so->currPos.itemIndex];
scan->xs_ctup.t_self = currItem->heapTid;
if (scan->xs_want_itup)
scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
return true;
}
......@@ -835,8 +835,8 @@ _bt_mark_scankey_required(ScanKey skey)
/*
* Test whether an indextuple satisfies all the scankey conditions.
*
* If so, copy its TID into scan->xs_ctup.t_self, and return TRUE.
* If not, return FALSE (xs_ctup is not changed).
* If so, return the address of the index tuple on the index page.
* If not, return NULL.
*
* If the tuple fails to pass the qual, we also determine whether there's
* any need to continue the scan beyond this tuple, and set *continuescan
......@@ -848,14 +848,16 @@ _bt_mark_scankey_required(ScanKey skey)
* offnum: offset number of index tuple (must be a valid item!)
* dir: direction we are scanning in
* continuescan: output parameter (will be set correctly in all cases)
*
* Caller must hold pin and lock on the index page.
*/
bool
IndexTuple
_bt_checkkeys(IndexScanDesc scan,
Page page, OffsetNumber offnum,
ScanDirection dir, bool *continuescan)
{
ItemId iid = PageGetItemId(page, offnum);
bool tuple_valid;
bool tuple_alive;
IndexTuple tuple;
TupleDesc tupdesc;
BTScanOpaque so;
......@@ -879,24 +881,24 @@ _bt_checkkeys(IndexScanDesc scan,
if (ScanDirectionIsForward(dir))
{
if (offnum < PageGetMaxOffsetNumber(page))
return false;
return NULL;
}
else
{
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (offnum > P_FIRSTDATAKEY(opaque))
return false;
return NULL;
}
/*
* OK, we want to check the keys, but we'll return FALSE even if the
* tuple passes the key tests.
* OK, we want to check the keys so we can set continuescan correctly,
* but we'll return NULL even if the tuple passes the key tests.
*/
tuple_valid = false;
tuple_alive = false;
}
else
tuple_valid = true;
tuple_alive = true;
tuple = (IndexTuple) PageGetItem(page, iid);
......@@ -915,7 +917,7 @@ _bt_checkkeys(IndexScanDesc scan,
{
if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
continue;
return false;
return NULL;
}
datum = index_getattr(tuple,
......@@ -953,7 +955,7 @@ _bt_checkkeys(IndexScanDesc scan,
/*
* In any case, this indextuple doesn't match the qual.
*/
return false;
return NULL;
}
if (isNull)
......@@ -988,7 +990,7 @@ _bt_checkkeys(IndexScanDesc scan,
/*
* In any case, this indextuple doesn't match the qual.
*/
return false;
return NULL;
}
test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
......@@ -1016,15 +1018,16 @@ _bt_checkkeys(IndexScanDesc scan,
/*
* In any case, this indextuple doesn't match the qual.
*/
return false;
return NULL;
}
}
/* If we get here, the tuple passes all index quals. */
if (tuple_valid)
scan->xs_ctup.t_self = tuple->t_tid;
/* Check for failure due to it being a killed tuple. */
if (!tuple_alive)
return NULL;
return tuple_valid;
/* If we get here, the tuple passes all index quals. */
return tuple;
}
/*
......
......@@ -92,7 +92,7 @@ IndexNext(IndexScanState *node)
* scan's xs_cbuf, ie, the previously visited heap page. It's not
* clear whether it'd be better to release that pin.
*/
if (scandesc->xs_itup != NULL &&
if (scandesc->xs_want_itup &&
visibilitymap_test(scandesc->heapRelation,
ItemPointerGetBlockNumber(tid),
&node->iss_VMBuffer))
......
......@@ -472,12 +472,18 @@ typedef BTStackData *BTStack;
* items were killed, we re-lock the page to mark them killed, then unlock.
* Finally we drop the pin and step to the next page in the appropriate
* direction.
*
* If we are doing an index-only scan, we save the entire IndexTuple for each
* matched item, otherwise only its heap TID and offset. The IndexTuples go
* into a separate workspace array; each BTScanPosItem stores its tuple's
* offset within that array.
*/
typedef struct BTScanPosItem /* what we remember about each matching index entry */
{
ItemPointerData heapTid; /* TID of referenced heap item (copied from the index tuple's t_tid) */
OffsetNumber indexOffset; /* index item's offset number within its index page */
/*
 * tupleOffset is filled in only when the scan has a tuple workspace
 * (so->currTuples is non-NULL); the saved copy of the index tuple is then
 * found at so->currTuples + tupleOffset.  Otherwise it is left unset.
 */
LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */
} BTScanPosItem;
typedef struct BTScanPosData
......@@ -495,6 +501,12 @@ typedef struct BTScanPosData
bool moreLeft;
bool moreRight;
/*
* If we are doing an index-only scan, nextTupleOffset is the first free
* location in the associated tuple storage workspace.
*/
int nextTupleOffset;
/*
* The items array is always ordered in index order (ie, increasing
* indexoffset). When scanning backwards it is convenient to fill the
......@@ -524,6 +536,14 @@ typedef struct BTScanOpaqueData
int *killedItems; /* currPos.items indexes of killed items */
int numKilled; /* number of currently stored items */
/*
* If we are doing an index-only scan, these are the tuple storage
* workspaces for the currPos and markPos respectively. Each is of
* size BLCKSZ, so it can hold as much as a full page's worth of tuples.
*/
char *currTuples; /* tuple storage for currPos */
char *markTuples; /* tuple storage for markPos */
/*
* If the marked position is on the same page as current position, we
* don't use markPos, but just keep the marked itemIndex in markItemIndex
......@@ -620,7 +640,7 @@ extern ScanKey _bt_mkscankey_nodata(Relation rel);
extern void _bt_freeskey(ScanKey skey);
extern void _bt_freestack(BTStack stack);
extern void _bt_preprocess_keys(IndexScanDesc scan);
extern bool _bt_checkkeys(IndexScanDesc scan,
extern IndexTuple _bt_checkkeys(IndexScanDesc scan,
Page page, OffsetNumber offnum,
ScanDirection dir, bool *continuescan);
extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
......
......@@ -79,7 +79,7 @@ typedef struct IndexScanDescData
void *opaque; /* access-method-specific info */
/* in an index-only scan, this is valid after a successful amgettuple */
IndexTuple xs_itup; /* index tuple returned by AM, or NULL */
IndexTuple xs_itup; /* index tuple returned by AM */
/* xs_ctup/xs_cbuf/xs_recheck are valid after a successful index_getnext */
HeapTupleData xs_ctup; /* current heap tuple, if any */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment