Commit 29b64d1d authored by Peter Geoghegan's avatar Peter Geoghegan

Add nbtree high key "continuescan" optimization.

Teach nbtree forward index scans to check the high key before moving to
the right sibling page in the hope of finding that it isn't actually
necessary to do so.  The new check may indicate that the scan definitely
cannot find matching tuples to the right, ending the scan immediately.
We already opportunistically force a similar "continuescan orientated"
key check of the final non-pivot tuple when it's clear that it cannot be
returned to the scan due to being dead-to-all.  The new high key check
is complementary.

The new approach for forward scans is more effective than checking the
final non-pivot tuple, especially with composite indexes and non-unique
indexes.  The improvements to the logic for picking a split point added
by commit fab25024 make it likely that relatively dissimilar high keys
will appear on a page.  A distinguishing key value that can only appear
on non-pivot tuples on the right sibling page will often be present in
leaf page high keys.

Since forcing the final item to be key checked no longer makes any
difference in the case of forward scans, the existing extra key check is
now only used for backwards scans.  Backward scans continue to
opportunistically check the final non-pivot tuple, which is actually the
first non-pivot tuple on the page (not the last).

Note that even pg_upgrade'd v3 indexes make use of this optimization.

Author: Peter Geoghegan, Heikki Linnakangas
Reviewed-By: Heikki Linnakangas
Discussion: https://postgr.es/m/CAH2-WzkOmUduME31QnuTFpimejuQoiZ-HOf0pOWeFZNhTMctvA@mail.gmail.com
parent 4ba96d1b
......@@ -1406,8 +1406,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
OffsetNumber minoff;
OffsetNumber maxoff;
int itemIndex;
IndexTuple itup;
bool continuescan;
int indnatts;
/*
* We must have the buffer pinned and locked, but the usual macro can't be
......@@ -1427,6 +1427,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
}
continuescan = true; /* default assumption */
indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
......@@ -1468,23 +1470,58 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
while (offnum <= maxoff)
{
itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
if (itup != NULL)
ItemId iid = PageGetItemId(page, offnum);
IndexTuple itup;
/*
* If the scan specifies not to return killed tuples, then we
* treat a killed tuple as not passing the qual
*/
if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
{
offnum = OffsetNumberNext(offnum);
continue;
}
itup = (IndexTuple) PageGetItem(page, iid);
if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
{
/* tuple passes all scan key conditions, so remember it */
_bt_saveitem(so, itemIndex, offnum, itup);
itemIndex++;
}
/* When !continuescan, there can't be any more matches, so stop */
if (!continuescan)
{
/* there can't be any more matches, so stop */
so->currPos.moreRight = false;
break;
}
offnum = OffsetNumberNext(offnum);
}
/*
* We don't need to visit page to the right when the high key
* indicates that no more matches will be found there.
*
* Checking the high key like this works out more often than you might
* think. Leaf page splits pick a split point between the two most
* dissimilar tuples (this is weighed against the need to evenly share
* free space). Leaf pages with high key attribute values that can
* only appear on non-pivot tuples on the right sibling page are
* common.
*/
if (continuescan && !P_RIGHTMOST(opaque))
{
ItemId iid = PageGetItemId(page, P_HIKEY);
IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
int truncatt;
truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
_bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
}
if (!continuescan)
so->currPos.moreRight = false;
Assert(itemIndex <= MaxIndexTuplesPerPage);
so->currPos.firstItem = 0;
so->currPos.lastItem = itemIndex - 1;
......@@ -1499,8 +1536,40 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
while (offnum >= minoff)
{
itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
if (itup != NULL)
ItemId iid = PageGetItemId(page, offnum);
IndexTuple itup;
bool tuple_alive;
bool passes_quals;
/*
* If the scan specifies not to return killed tuples, then we
* treat a killed tuple as not passing the qual. Most of the
* time, it's a win to not bother examining the tuple's index
* keys, but just skip to the next tuple (previous, actually,
* since we're scanning backwards). However, if this is the first
* tuple on the page, we do check the index keys, to prevent
* uselessly advancing to the page to the left. This is similar
* to the high key optimization used by forward scans.
*/
if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
{
Assert(offnum >= P_FIRSTDATAKEY(opaque));
if (offnum > P_FIRSTDATAKEY(opaque))
{
offnum = OffsetNumberPrev(offnum);
continue;
}
tuple_alive = false;
}
else
tuple_alive = true;
itup = (IndexTuple) PageGetItem(page, iid);
passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
&continuescan);
if (passes_quals && tuple_alive)
{
/* tuple passes all scan key conditions, so remember it */
itemIndex--;
......
......@@ -48,7 +48,7 @@ static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
static void _bt_mark_scankey_required(ScanKey skey);
static bool _bt_check_rowcompare(ScanKey skey,
IndexTuple tuple, TupleDesc tupdesc,
IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
ScanDirection dir, bool *continuescan);
static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
IndexTuple firstright, BTScanInsert itup_key);
......@@ -1333,72 +1333,34 @@ _bt_mark_scankey_required(ScanKey skey)
/*
* Test whether an indextuple satisfies all the scankey conditions.
*
* If so, return the address of the index tuple on the index page.
* If not, return NULL.
* Return true if so, false if not. If the tuple fails to pass the qual,
* we also determine whether there's any need to continue the scan beyond
* this tuple, and set *continuescan accordingly. See comments for
* _bt_preprocess_keys(), above, about how this is done.
*
* If the tuple fails to pass the qual, we also determine whether there's
* any need to continue the scan beyond this tuple, and set *continuescan
* accordingly. See comments for _bt_preprocess_keys(), above, about how
* this is done.
* Forward scan callers can pass a high key tuple in the hopes of having
* us set *continuescanthat to false, and avoiding an unnecessary visit to
* the page to the right.
*
* scan: index scan descriptor (containing a search-type scankey)
* page: buffer page containing index tuple
* offnum: offset number of index tuple (must be a valid item!)
* tuple: index tuple to test
* tupnatts: number of attributes in tupnatts (high key may be truncated)
* dir: direction we are scanning in
* continuescan: output parameter (will be set correctly in all cases)
*
* Caller must hold pin and lock on the index page.
*/
IndexTuple
_bt_checkkeys(IndexScanDesc scan,
Page page, OffsetNumber offnum,
bool
_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
ScanDirection dir, bool *continuescan)
{
ItemId iid = PageGetItemId(page, offnum);
bool tuple_alive;
IndexTuple tuple;
TupleDesc tupdesc;
BTScanOpaque so;
int keysz;
int ikey;
ScanKey key;
*continuescan = true; /* default assumption */
/*
* If the scan specifies not to return killed tuples, then we treat a
* killed tuple as not passing the qual. Most of the time, it's a win to
* not bother examining the tuple's index keys, but just return
* immediately with continuescan = true to proceed to the next tuple.
* However, if this is the last tuple on the page, we should check the
* index keys to prevent uselessly advancing to the next page.
*/
if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
{
/* return immediately if there are more tuples on the page */
if (ScanDirectionIsForward(dir))
{
if (offnum < PageGetMaxOffsetNumber(page))
return NULL;
}
else
{
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (offnum > P_FIRSTDATAKEY(opaque))
return NULL;
}
/*
* OK, we want to check the keys so we can set continuescan correctly,
* but we'll return NULL even if the tuple passes the key tests.
*/
tuple_alive = false;
}
else
tuple_alive = true;
Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
tuple = (IndexTuple) PageGetItem(page, iid);
*continuescan = true; /* default assumption */
tupdesc = RelationGetDescr(scan->indexRelation);
so = (BTScanOpaque) scan->opaque;
......@@ -1410,13 +1372,25 @@ _bt_checkkeys(IndexScanDesc scan,
bool isNull;
Datum test;
Assert(key->sk_attno <= BTreeTupleGetNAtts(tuple, scan->indexRelation));
if (key->sk_attno > tupnatts)
{
/*
* This attribute is truncated (must be high key). The value for
* this attribute in the first non-pivot tuple on the page to the
* right could be any possible value. Assume that truncated
* attribute passes the qual.
*/
Assert(ScanDirectionIsForward(dir));
continue;
}
/* row-comparison keys need special processing */
if (key->sk_flags & SK_ROW_HEADER)
{
if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
continuescan))
continue;
return NULL;
return false;
}
datum = index_getattr(tuple,
......@@ -1454,7 +1428,7 @@ _bt_checkkeys(IndexScanDesc scan,
/*
* In any case, this indextuple doesn't match the qual.
*/
return NULL;
return false;
}
if (isNull)
......@@ -1495,7 +1469,7 @@ _bt_checkkeys(IndexScanDesc scan,
/*
* In any case, this indextuple doesn't match the qual.
*/
return NULL;
return false;
}
test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
......@@ -1523,16 +1497,12 @@ _bt_checkkeys(IndexScanDesc scan,
/*
* In any case, this indextuple doesn't match the qual.
*/
return NULL;
return false;
}
}
/* Check for failure due to it being a killed tuple. */
if (!tuple_alive)
return NULL;
/* If we get here, the tuple passes all index quals. */
return tuple;
return true;
}
/*
......@@ -1545,8 +1515,8 @@ _bt_checkkeys(IndexScanDesc scan,
* This is a subroutine for _bt_checkkeys, which see for more info.
*/
static bool
_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
ScanDirection dir, bool *continuescan)
_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
{
ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
int32 cmpresult = 0;
......@@ -1563,6 +1533,19 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
Assert(subkey->sk_flags & SK_ROW_MEMBER);
if (subkey->sk_attno > tupnatts)
{
/*
* This attribute is truncated (must be high key). The value for
* this attribute in the first non-pivot tuple on the page to the
* right could be any possible value. Assume that truncated
* attribute passes the qual.
*/
Assert(ScanDirectionIsForward(dir));
cmpresult = 0;
continue;
}
datum = index_getattr(tuple,
subkey->sk_attno,
tupdesc,
......
......@@ -772,9 +772,8 @@ extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
extern void _bt_mark_array_keys(IndexScanDesc scan);
extern void _bt_restore_array_keys(IndexScanDesc scan);
extern void _bt_preprocess_keys(IndexScanDesc scan);
extern IndexTuple _bt_checkkeys(IndexScanDesc scan,
Page page, OffsetNumber offnum,
ScanDirection dir, bool *continuescan);
extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
int tupnatts, ScanDirection dir, bool *continuescan);
extern void _bt_killitems(IndexScanDesc scan);
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
extern BTCycleId _bt_start_vacuum(Relation rel);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment