Commit e20c70cb authored by Heikki Linnakangas's avatar Heikki Linnakangas

Allow skipping some items in a multi-key GIN search.

In a multi-key search, ie. something like "col @> 'foo' AND col @> 'bar'",
as soon as we find the next item that matches the first criteria, we don't
need to check the second criteria for TIDs smaller the first match. That
saves a lot of effort, especially if one of the terms is rare, while the
second occurs very frequently.

Based on ideas from Alexander Korotkov's fast scan patch.
parent 2013e5ee
......@@ -67,29 +67,6 @@ callConsistentFn(GinState *ginstate, GinScanKey key)
PointerGetDatum(key->queryCategories)));
}
/*
* Tries to refind previously taken ItemPointer on a posting page.
*/
static bool
needToStepRight(Page page, ItemPointer item)
{
if (GinPageGetOpaque(page)->flags & GIN_DELETED)
/* page was deleted by concurrent vacuum */
return true;
if (ginCompareItemPointers(item, GinDataPageGetRightBound(page)) > 0
&& !GinPageRightMost(page))
{
/*
* the item we're looking is > the right bound of the page, so it
* can't be on this page.
*/
return true;
}
return false;
}
/*
* Goes to the next page if current offset is outside of bounds
*/
......@@ -447,8 +424,7 @@ restartScanEntry:
page = BufferGetPage(entry->buffer);
/*
* Copy page content to memory to avoid keeping it locked for
* a long time.
* Load the first page into memory.
*/
entry->list = GinDataLeafPageGetItems(page, &entry->nlist);
......@@ -518,88 +494,78 @@ startScan(IndexScanDesc scan)
}
/*
* Gets next ItemPointer from PostingTree. Note, that we copy
* page into GinScanEntry->list array and unlock page, but keep it pinned
* to prevent interference with vacuum
* Load the next batch of item pointers from a posting tree.
*
* Note that we copy the page into GinScanEntry->list array and unlock it, but
* keep it pinned to prevent interference with vacuum.
*/
static void
entryGetNextItem(GinState *ginstate, GinScanEntry entry)
entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast)
{
Page page;
int i;
LockBuffer(entry->buffer, GIN_SHARE);
page = BufferGetPage(entry->buffer);
for (;;)
{
if (entry->offset < entry->nlist)
entry->offset = InvalidOffsetNumber;
if (entry->list)
{
entry->curItem = entry->list[entry->offset++];
return;
pfree(entry->list);
entry->list = NULL;
entry->nlist = 0;
}
LockBuffer(entry->buffer, GIN_SHARE);
page = BufferGetPage(entry->buffer);
for (;;)
{
/*
* It's needed to go by right link. During that we should refind
* first ItemPointer greater that stored
* We've processed all the entries on this page. If it was the last
* page in the tree, we're done.
*/
if (GinPageRightMost(page))
{
UnlockReleaseBuffer(entry->buffer);
ItemPointerSetInvalid(&entry->curItem);
entry->buffer = InvalidBuffer;
entry->isFinished = TRUE;
return;
}
if (GinPageGetOpaque(page)->flags & GIN_DELETED)
continue; /* page was deleted by concurrent vacuum */
/*
* Step to next page, following the right link. then find the first
* ItemPointer greater than advancePast.
*/
entry->buffer = ginStepRight(entry->buffer,
ginstate->index,
GIN_SHARE);
page = BufferGetPage(entry->buffer);
entry->offset = InvalidOffsetNumber;
if (entry->list)
{
pfree(entry->list);
entry->list = NULL;
}
/*
* If the page was concurrently split, we have to re-find the
* item we were stopped on. If the page was split more than once,
* the item might not be on this page, but somewhere to the right.
* Keep following the right-links until we re-find the correct
* The first item > advancePast might not be on this page, but
* somewhere to the right, if the page was split, or a non-match from
* another key in the query allowed us to skip some items from this
* entry. Keep following the right-links until we re-find the correct
* page.
*/
if (ItemPointerIsValid(&entry->curItem) &&
needToStepRight(page, &entry->curItem))
if (!GinPageRightMost(page) &&
ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
{
/*
* the item we're looking is > the right bound of the page, so it
* can't be on this page.
*/
continue;
}
entry->list = GinDataLeafPageGetItems(page, &entry->nlist);
/* re-find the item we were stopped on. */
if (ItemPointerIsValid(&entry->curItem))
{
for (i = 0; i < entry->nlist; i++)
{
if (ginCompareItemPointers(&entry->curItem,
&entry->list[i]) < 0)
{
LockBuffer(entry->buffer, GIN_UNLOCK);
entry->offset = i + 1;
entry->curItem = entry->list[entry->offset - 1];
return;
}
}
}
else
if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
{
LockBuffer(entry->buffer, GIN_UNLOCK);
entry->offset = 1; /* scan all items on the page. */
entry->curItem = entry->list[entry->offset - 1];
entry->offset = i;
return;
}
}
......@@ -610,10 +576,10 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
/*
* Sets entry->curItem to next heap item pointer for one entry of one scan key,
* or sets entry->isFinished to TRUE if there are no more.
* Sets entry->curItem to next heap item pointer > advancePast, for one entry
* of one scan key, or sets entry->isFinished to TRUE if there are no more.
*
* Item pointers must be returned in ascending order.
* Item pointers are returned in ascending order.
*
* Note: this can return a "lossy page" item pointer, indicating that the
* entry potentially matches all items on that heap page. However, it is
......@@ -623,12 +589,20 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
* current implementation this is guaranteed by the behavior of tidbitmaps.
*/
static void
entryGetItem(GinState *ginstate, GinScanEntry entry)
entryGetItem(GinState *ginstate, GinScanEntry entry,
ItemPointerData advancePast)
{
Assert(!entry->isFinished);
Assert(!ItemPointerIsValid(&entry->curItem) ||
ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
if (entry->matchBitmap)
{
/* A bitmap result */
BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
do
{
if (entry->matchResult == NULL ||
......@@ -645,6 +619,18 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
break;
}
/*
* If all the matches on this page are <= advancePast, skip
* to next page.
*/
if (entry->matchResult->blockno < advancePastBlk ||
(entry->matchResult->blockno == advancePastBlk &&
entry->matchResult->offsets[entry->offset] <= advancePastOff))
{
entry->offset = entry->matchResult->ntuples;
continue;
}
/*
* Reset counter to the beginning of entry->matchResult. Note:
* entry->offset is still greater than matchResult->ntuples if
......@@ -670,6 +656,17 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
break;
}
if (entry->matchResult->blockno == advancePastBlk)
{
/*
* Skip to the right offset on this page. We already checked
* in above loop that there is at least one item > advancePast
* on the page.
*/
while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
entry->offset++;
}
ItemPointerSet(&entry->curItem,
entry->matchResult->blockno,
entry->matchResult->offsets[entry->offset]);
......@@ -678,29 +675,48 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
}
else if (!BufferIsValid(entry->buffer))
{
entry->offset++;
if (entry->offset <= entry->nlist)
entry->curItem = entry->list[entry->offset - 1];
else
/* A posting list from an entry tuple */
do
{
if (entry->offset >= entry->nlist)
{
ItemPointerSetInvalid(&entry->curItem);
entry->isFinished = TRUE;
break;
}
entry->curItem = entry->list[entry->offset++];
} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
/* XXX: shouldn't we apply the fuzzy search limit here? */
}
else
{
/* A posting tree */
do
{
entryGetNextItem(ginstate, entry);
} while (entry->isFinished == FALSE &&
entry->reduceResult == TRUE &&
dropItem(entry));
/* If we've processed the current batch, load more items */
while (entry->offset >= entry->nlist)
{
entryLoadMoreItems(ginstate, entry, advancePast);
if (entry->isFinished)
{
ItemPointerSetInvalid(&entry->curItem);
return;
}
}
entry->curItem = entry->list[entry->offset++];
} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
(entry->reduceResult == TRUE && dropItem(entry)));
}
}
/*
* Identify the "current" item among the input entry streams for this scan key,
* and test whether it passes the scan key qual condition.
* Identify the "current" item among the input entry streams for this scan key
* that is greater than advancePast, and test whether it passes the scan key
* qual condition.
*
* The current item is the smallest curItem among the inputs. key->curItem
* is set to that value. key->curItemMatches is set to indicate whether that
......@@ -719,7 +735,8 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
* logic in scanGetItem.)
*/
static void
keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
ItemPointerData advancePast)
{
ItemPointerData minItem;
ItemPointerData curPageLossy;
......@@ -729,11 +746,20 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
GinScanEntry entry;
bool res;
MemoryContext oldCtx;
bool allFinished;
Assert(!key->isFinished);
/*
* Find the minimum of the active entry curItems.
* We might have already tested this item; if so, no need to repeat work.
* (Note: the ">" case can happen, if minItem is exact but we previously
* had to set curItem to a lossy-page pointer.)
*/
if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
return;
/*
* Find the minimum item > advancePast among the active entry streams.
*
* Note: a lossy-page entry is encoded by a ItemPointer with max value for
* offset (0xffff), so that it will sort after any exact entries for the
......@@ -741,16 +767,33 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
* pointers, which is good.
*/
ItemPointerSetMax(&minItem);
allFinished = true;
for (i = 0; i < key->nentries; i++)
{
entry = key->scanEntry[i];
if (entry->isFinished == FALSE &&
ginCompareItemPointers(&entry->curItem, &minItem) < 0)
/*
* Advance this stream if necessary.
*
* In particular, since entry->curItem was initialized with
* ItemPointerSetMin, this ensures we fetch the first item for each
* entry on the first call.
*/
while (entry->isFinished == FALSE &&
ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
{
entryGetItem(ginstate, entry, advancePast);
}
if (!entry->isFinished)
{
allFinished = FALSE;
if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
minItem = entry->curItem;
}
}
if (ItemPointerIsMax(&minItem))
if (allFinished)
{
/* all entries are finished */
key->isFinished = TRUE;
......@@ -758,15 +801,7 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
}
/*
* We might have already tested this item; if so, no need to repeat work.
* (Note: the ">" case can happen, if minItem is exact but we previously
* had to set curItem to a lossy-page pointer.)
*/
if (ginCompareItemPointers(&key->curItem, &minItem) >= 0)
return;
/*
* OK, advance key->curItem and perform consistentFn test.
* OK, set key->curItem and perform consistentFn test.
*/
key->curItem = minItem;
......@@ -895,72 +930,18 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
* keyGetItem() the combination logic is known only to the consistentFn.
*/
static bool
scanGetItem(IndexScanDesc scan, ItemPointer advancePast,
scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
ItemPointerData *item, bool *recheck)
{
GinScanOpaque so = (GinScanOpaque) scan->opaque;
GinState *ginstate = &so->ginstate;
ItemPointerData myAdvancePast = *advancePast;
uint32 i;
bool allFinished;
bool match;
for (;;)
{
/*
* Advance any entries that are <= myAdvancePast. In particular,
* since entry->curItem was initialized with ItemPointerSetMin, this
* ensures we fetch the first item for each entry on the first call.
*/
allFinished = TRUE;
for (i = 0; i < so->totalentries; i++)
{
GinScanEntry entry = so->entries[i];
while (entry->isFinished == FALSE &&
ginCompareItemPointers(&entry->curItem,
&myAdvancePast) <= 0)
entryGetItem(ginstate, entry);
if (entry->isFinished == FALSE)
allFinished = FALSE;
}
if (allFinished)
{
/* all entries exhausted, so we're done */
return false;
}
/*
* Perform the consistentFn test for each scan key. If any key
* reports isFinished, meaning its subset of the entries is exhausted,
* we can stop. Otherwise, set *item to the minimum of the key
* curItems.
*/
ItemPointerSetMax(item);
for (i = 0; i < so->nkeys; i++)
{
GinScanKey key = so->keys + i;
keyGetItem(&so->ginstate, so->tempCtx, key);
if (key->isFinished)
return false; /* finished one of keys */
if (ginCompareItemPointers(&key->curItem, item) < 0)
*item = key->curItem;
}
Assert(!ItemPointerIsMax(item));
/*----------
* Now *item contains first ItemPointer after previous result.
*
* The item is a valid hit only if all the keys succeeded for either
* that exact TID, or a lossy reference to the same page.
* Advance the scan keys in lock-step, until we find an item that matches
* all the keys. If any key reports isFinished, meaning its subset of the
* entries is exhausted, we can stop. Otherwise, set *item to the next
* matching item.
*
* This logic works only if a keyGetItem stream can never contain both
* exact and lossy pointers for the same page. Else we could have a
......@@ -977,35 +958,94 @@ scanGetItem(IndexScanDesc scan, ItemPointer advancePast,
* (keyGetItem has a similar problem versus entryGetItem.)
*----------
*/
do
{
ItemPointerSetMin(item);
match = true;
for (i = 0; i < so->nkeys; i++)
for (i = 0; i < so->nkeys && match; i++)
{
GinScanKey key = so->keys + i;
if (key->curItemMatches)
/* Fetch the next item for this key that is > advancePast. */
keyGetItem(&so->ginstate, so->tempCtx, key, advancePast);
if (key->isFinished)
return false;
/*
* If it's not a match, we can immediately conclude that nothing
* <= this item matches, without checking the rest of the keys.
*/
if (!key->curItemMatches)
{
if (ginCompareItemPointers(item, &key->curItem) == 0)
continue;
if (ItemPointerIsLossyPage(&key->curItem) &&
GinItemPointerGetBlockNumber(&key->curItem) ==
GinItemPointerGetBlockNumber(item))
continue;
}
advancePast = key->curItem;
match = false;
break;
}
if (match)
break;
/*
* It's a match. We can conclude that nothing < matches, so
* the other key streams can skip to this item.
*
* Beware of lossy pointers, though; from a lossy pointer, we
* can only conclude that nothing smaller than this *block*
* matches.
*/
if (ItemPointerIsLossyPage(&key->curItem))
{
if (GinItemPointerGetBlockNumber(&advancePast) <
GinItemPointerGetBlockNumber(&key->curItem))
{
advancePast.ip_blkid = key->curItem.ip_blkid;
advancePast.ip_posid = 0;
}
}
else
{
Assert(key->curItem.ip_posid > 0);
advancePast = key->curItem;
advancePast.ip_posid--;
}
/*
* No hit. Update myAdvancePast to this TID, so that on the next pass
* we'll move to the next possible entry.
* If this is the first key, remember this location as a
* potential match.
*
* Otherwise, check if this is the same item that we checked the
* previous keys for (or a lossy pointer for the same page). If
* not, loop back to check the previous keys for this item (we
* will check this key again too, but keyGetItem returns quickly
* for that)
*/
myAdvancePast = *item;
if (i == 0)
{
*item = key->curItem;
}
else
{
if (ItemPointerIsLossyPage(&key->curItem) ||
ItemPointerIsLossyPage(item))
{
Assert (GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
match = (GinItemPointerGetBlockNumber(&key->curItem) ==
GinItemPointerGetBlockNumber(item));
}
else
{
Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
match = (ginCompareItemPointers(&key->curItem, item) == 0);
}
}
}
} while (!match);
Assert(!ItemPointerIsMin(item));
/*
* Now *item contains the first ItemPointer after previous result that
* satisfied all the keys for that exact TID, or a lossy reference
* to the same page.
*
* We must return recheck = true if any of the keys are marked recheck.
*/
*recheck = false;
......@@ -1536,7 +1576,7 @@ gingetbitmap(PG_FUNCTION_ARGS)
{
CHECK_FOR_INTERRUPTS();
if (!scanGetItem(scan, &iptr, &iptr, &recheck))
if (!scanGetItem(scan, iptr, &iptr, &recheck))
break;
if (ItemPointerIsLossyPage(&iptr))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment