Commit e20c70cb authored by Heikki Linnakangas

Allow skipping some items in a multi-key GIN search.

In a multi-key search, i.e. something like "col @> 'foo' AND col @> 'bar'",
as soon as we find the next item that matches the first criterion, we don't
need to check the second criterion for TIDs smaller than the first match. That
saves a lot of effort, especially if one of the terms is rare, while the
second occurs very frequently.

Based on ideas from Alexander Korotkov's fast scan patch.
parent 2013e5ee
...@@ -67,29 +67,6 @@ callConsistentFn(GinState *ginstate, GinScanKey key) ...@@ -67,29 +67,6 @@ callConsistentFn(GinState *ginstate, GinScanKey key)
PointerGetDatum(key->queryCategories))); PointerGetDatum(key->queryCategories)));
} }
/*
* Tries to refind previously taken ItemPointer on a posting page.
*/
static bool
needToStepRight(Page page, ItemPointer item)
{
if (GinPageGetOpaque(page)->flags & GIN_DELETED)
/* page was deleted by concurrent vacuum */
return true;
if (ginCompareItemPointers(item, GinDataPageGetRightBound(page)) > 0
&& !GinPageRightMost(page))
{
/*
* the item we're looking is > the right bound of the page, so it
* can't be on this page.
*/
return true;
}
return false;
}
/* /*
* Goes to the next page if current offset is outside of bounds * Goes to the next page if current offset is outside of bounds
*/ */
...@@ -447,8 +424,7 @@ restartScanEntry: ...@@ -447,8 +424,7 @@ restartScanEntry:
page = BufferGetPage(entry->buffer); page = BufferGetPage(entry->buffer);
/* /*
* Copy page content to memory to avoid keeping it locked for * Load the first page into memory.
* a long time.
*/ */
entry->list = GinDataLeafPageGetItems(page, &entry->nlist); entry->list = GinDataLeafPageGetItems(page, &entry->nlist);
...@@ -518,88 +494,78 @@ startScan(IndexScanDesc scan) ...@@ -518,88 +494,78 @@ startScan(IndexScanDesc scan)
} }
/* /*
* Gets next ItemPointer from PostingTree. Note, that we copy * Load the next batch of item pointers from a posting tree.
* page into GinScanEntry->list array and unlock page, but keep it pinned *
* to prevent interference with vacuum * Note that we copy the page into GinScanEntry->list array and unlock it, but
* keep it pinned to prevent interference with vacuum.
*/ */
static void static void
entryGetNextItem(GinState *ginstate, GinScanEntry entry) entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast)
{ {
Page page; Page page;
int i; int i;
LockBuffer(entry->buffer, GIN_SHARE);
page = BufferGetPage(entry->buffer);
for (;;) for (;;)
{ {
if (entry->offset < entry->nlist) entry->offset = InvalidOffsetNumber;
if (entry->list)
{ {
entry->curItem = entry->list[entry->offset++]; pfree(entry->list);
return; entry->list = NULL;
entry->nlist = 0;
} }
LockBuffer(entry->buffer, GIN_SHARE);
page = BufferGetPage(entry->buffer);
for (;;)
{
/* /*
* It's needed to go by right link. During that we should refind * We've processed all the entries on this page. If it was the last
* first ItemPointer greater that stored * page in the tree, we're done.
*/ */
if (GinPageRightMost(page)) if (GinPageRightMost(page))
{ {
UnlockReleaseBuffer(entry->buffer); UnlockReleaseBuffer(entry->buffer);
ItemPointerSetInvalid(&entry->curItem);
entry->buffer = InvalidBuffer; entry->buffer = InvalidBuffer;
entry->isFinished = TRUE; entry->isFinished = TRUE;
return; return;
} }
if (GinPageGetOpaque(page)->flags & GIN_DELETED)
continue; /* page was deleted by concurrent vacuum */
/*
* Step to next page, following the right link. Then find the first
* ItemPointer greater than advancePast.
*/
entry->buffer = ginStepRight(entry->buffer, entry->buffer = ginStepRight(entry->buffer,
ginstate->index, ginstate->index,
GIN_SHARE); GIN_SHARE);
page = BufferGetPage(entry->buffer); page = BufferGetPage(entry->buffer);
entry->offset = InvalidOffsetNumber;
if (entry->list)
{
pfree(entry->list);
entry->list = NULL;
}
/* /*
* If the page was concurrently split, we have to re-find the * The first item > advancePast might not be on this page, but
* item we were stopped on. If the page was split more than once, * somewhere to the right, if the page was split, or a non-match from
* the item might not be on this page, but somewhere to the right. * another key in the query allowed us to skip some items from this
* Keep following the right-links until we re-find the correct * entry. Keep following the right-links until we re-find the correct
* page. * page.
*/ */
if (ItemPointerIsValid(&entry->curItem) && if (!GinPageRightMost(page) &&
needToStepRight(page, &entry->curItem)) ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
{ {
/*
* the item we're looking is > the right bound of the page, so it
* can't be on this page.
*/
continue; continue;
} }
entry->list = GinDataLeafPageGetItems(page, &entry->nlist); entry->list = GinDataLeafPageGetItems(page, &entry->nlist);
/* re-find the item we were stopped on. */
if (ItemPointerIsValid(&entry->curItem))
{
for (i = 0; i < entry->nlist; i++) for (i = 0; i < entry->nlist; i++)
{ {
if (ginCompareItemPointers(&entry->curItem, if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
&entry->list[i]) < 0)
{
LockBuffer(entry->buffer, GIN_UNLOCK);
entry->offset = i + 1;
entry->curItem = entry->list[entry->offset - 1];
return;
}
}
}
else
{ {
LockBuffer(entry->buffer, GIN_UNLOCK); LockBuffer(entry->buffer, GIN_UNLOCK);
entry->offset = 1; /* scan all items on the page. */ entry->offset = i;
entry->curItem = entry->list[entry->offset - 1];
return; return;
} }
} }
...@@ -610,10 +576,10 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry) ...@@ -610,10 +576,10 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
/* /*
* Sets entry->curItem to next heap item pointer for one entry of one scan key, * Sets entry->curItem to next heap item pointer > advancePast, for one entry
* or sets entry->isFinished to TRUE if there are no more. * of one scan key, or sets entry->isFinished to TRUE if there are no more.
* *
* Item pointers must be returned in ascending order. * Item pointers are returned in ascending order.
* *
* Note: this can return a "lossy page" item pointer, indicating that the * Note: this can return a "lossy page" item pointer, indicating that the
* entry potentially matches all items on that heap page. However, it is * entry potentially matches all items on that heap page. However, it is
...@@ -623,12 +589,20 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry) ...@@ -623,12 +589,20 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
* current implementation this is guaranteed by the behavior of tidbitmaps. * current implementation this is guaranteed by the behavior of tidbitmaps.
*/ */
static void static void
entryGetItem(GinState *ginstate, GinScanEntry entry) entryGetItem(GinState *ginstate, GinScanEntry entry,
ItemPointerData advancePast)
{ {
Assert(!entry->isFinished); Assert(!entry->isFinished);
Assert(!ItemPointerIsValid(&entry->curItem) ||
ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
if (entry->matchBitmap) if (entry->matchBitmap)
{ {
/* A bitmap result */
BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
do do
{ {
if (entry->matchResult == NULL || if (entry->matchResult == NULL ||
...@@ -645,6 +619,18 @@ entryGetItem(GinState *ginstate, GinScanEntry entry) ...@@ -645,6 +619,18 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
break; break;
} }
/*
* If all the matches on this page are <= advancePast, skip
* to next page.
*/
if (entry->matchResult->blockno < advancePastBlk ||
(entry->matchResult->blockno == advancePastBlk &&
entry->matchResult->offsets[entry->offset] <= advancePastOff))
{
entry->offset = entry->matchResult->ntuples;
continue;
}
/* /*
* Reset counter to the beginning of entry->matchResult. Note: * Reset counter to the beginning of entry->matchResult. Note:
* entry->offset is still greater than matchResult->ntuples if * entry->offset is still greater than matchResult->ntuples if
...@@ -670,6 +656,17 @@ entryGetItem(GinState *ginstate, GinScanEntry entry) ...@@ -670,6 +656,17 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
break; break;
} }
if (entry->matchResult->blockno == advancePastBlk)
{
/*
* Skip to the right offset on this page. We already checked
* in above loop that there is at least one item > advancePast
* on the page.
*/
while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
entry->offset++;
}
ItemPointerSet(&entry->curItem, ItemPointerSet(&entry->curItem,
entry->matchResult->blockno, entry->matchResult->blockno,
entry->matchResult->offsets[entry->offset]); entry->matchResult->offsets[entry->offset]);
...@@ -678,29 +675,48 @@ entryGetItem(GinState *ginstate, GinScanEntry entry) ...@@ -678,29 +675,48 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
} }
else if (!BufferIsValid(entry->buffer)) else if (!BufferIsValid(entry->buffer))
{ {
entry->offset++; /* A posting list from an entry tuple */
if (entry->offset <= entry->nlist) do
entry->curItem = entry->list[entry->offset - 1]; {
else if (entry->offset >= entry->nlist)
{ {
ItemPointerSetInvalid(&entry->curItem); ItemPointerSetInvalid(&entry->curItem);
entry->isFinished = TRUE; entry->isFinished = TRUE;
break;
} }
entry->curItem = entry->list[entry->offset++];
} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
/* XXX: shouldn't we apply the fuzzy search limit here? */
} }
else else
{ {
/* A posting tree */
do do
{ {
entryGetNextItem(ginstate, entry); /* If we've processed the current batch, load more items */
} while (entry->isFinished == FALSE && while (entry->offset >= entry->nlist)
entry->reduceResult == TRUE && {
dropItem(entry)); entryLoadMoreItems(ginstate, entry, advancePast);
if (entry->isFinished)
{
ItemPointerSetInvalid(&entry->curItem);
return;
}
}
entry->curItem = entry->list[entry->offset++];
} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
(entry->reduceResult == TRUE && dropItem(entry)));
} }
} }
/* /*
* Identify the "current" item among the input entry streams for this scan key, * Identify the "current" item among the input entry streams for this scan key
* and test whether it passes the scan key qual condition. * that is greater than advancePast, and test whether it passes the scan key
* qual condition.
* *
* The current item is the smallest curItem among the inputs. key->curItem * The current item is the smallest curItem among the inputs. key->curItem
* is set to that value. key->curItemMatches is set to indicate whether that * is set to that value. key->curItemMatches is set to indicate whether that
...@@ -719,7 +735,8 @@ entryGetItem(GinState *ginstate, GinScanEntry entry) ...@@ -719,7 +735,8 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
* logic in scanGetItem.) * logic in scanGetItem.)
*/ */
static void static void
keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
ItemPointerData advancePast)
{ {
ItemPointerData minItem; ItemPointerData minItem;
ItemPointerData curPageLossy; ItemPointerData curPageLossy;
...@@ -729,11 +746,20 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) ...@@ -729,11 +746,20 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
GinScanEntry entry; GinScanEntry entry;
bool res; bool res;
MemoryContext oldCtx; MemoryContext oldCtx;
bool allFinished;
Assert(!key->isFinished); Assert(!key->isFinished);
/* /*
* Find the minimum of the active entry curItems. * We might have already tested this item; if so, no need to repeat work.
* (Note: the ">" case can happen, if minItem is exact but we previously
* had to set curItem to a lossy-page pointer.)
*/
if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
return;
/*
* Find the minimum item > advancePast among the active entry streams.
* *
* Note: a lossy-page entry is encoded by a ItemPointer with max value for * Note: a lossy-page entry is encoded by a ItemPointer with max value for
* offset (0xffff), so that it will sort after any exact entries for the * offset (0xffff), so that it will sort after any exact entries for the
...@@ -741,16 +767,33 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) ...@@ -741,16 +767,33 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
* pointers, which is good. * pointers, which is good.
*/ */
ItemPointerSetMax(&minItem); ItemPointerSetMax(&minItem);
allFinished = true;
for (i = 0; i < key->nentries; i++) for (i = 0; i < key->nentries; i++)
{ {
entry = key->scanEntry[i]; entry = key->scanEntry[i];
if (entry->isFinished == FALSE &&
ginCompareItemPointers(&entry->curItem, &minItem) < 0) /*
* Advance this stream if necessary.
*
* In particular, since entry->curItem was initialized with
* ItemPointerSetMin, this ensures we fetch the first item for each
* entry on the first call.
*/
while (entry->isFinished == FALSE &&
ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
{
entryGetItem(ginstate, entry, advancePast);
}
if (!entry->isFinished)
{
allFinished = FALSE;
if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
minItem = entry->curItem; minItem = entry->curItem;
} }
}
if (ItemPointerIsMax(&minItem)) if (allFinished)
{ {
/* all entries are finished */ /* all entries are finished */
key->isFinished = TRUE; key->isFinished = TRUE;
...@@ -758,15 +801,7 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) ...@@ -758,15 +801,7 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
} }
/* /*
* We might have already tested this item; if so, no need to repeat work. * OK, set key->curItem and perform consistentFn test.
* (Note: the ">" case can happen, if minItem is exact but we previously
* had to set curItem to a lossy-page pointer.)
*/
if (ginCompareItemPointers(&key->curItem, &minItem) >= 0)
return;
/*
* OK, advance key->curItem and perform consistentFn test.
*/ */
key->curItem = minItem; key->curItem = minItem;
...@@ -895,72 +930,18 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key) ...@@ -895,72 +930,18 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
* keyGetItem() the combination logic is known only to the consistentFn. * keyGetItem() the combination logic is known only to the consistentFn.
*/ */
static bool static bool
scanGetItem(IndexScanDesc scan, ItemPointer advancePast, scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
ItemPointerData *item, bool *recheck) ItemPointerData *item, bool *recheck)
{ {
GinScanOpaque so = (GinScanOpaque) scan->opaque; GinScanOpaque so = (GinScanOpaque) scan->opaque;
GinState *ginstate = &so->ginstate;
ItemPointerData myAdvancePast = *advancePast;
uint32 i; uint32 i;
bool allFinished;
bool match; bool match;
for (;;)
{
/*
* Advance any entries that are <= myAdvancePast. In particular,
* since entry->curItem was initialized with ItemPointerSetMin, this
* ensures we fetch the first item for each entry on the first call.
*/
allFinished = TRUE;
for (i = 0; i < so->totalentries; i++)
{
GinScanEntry entry = so->entries[i];
while (entry->isFinished == FALSE &&
ginCompareItemPointers(&entry->curItem,
&myAdvancePast) <= 0)
entryGetItem(ginstate, entry);
if (entry->isFinished == FALSE)
allFinished = FALSE;
}
if (allFinished)
{
/* all entries exhausted, so we're done */
return false;
}
/*
* Perform the consistentFn test for each scan key. If any key
* reports isFinished, meaning its subset of the entries is exhausted,
* we can stop. Otherwise, set *item to the minimum of the key
* curItems.
*/
ItemPointerSetMax(item);
for (i = 0; i < so->nkeys; i++)
{
GinScanKey key = so->keys + i;
keyGetItem(&so->ginstate, so->tempCtx, key);
if (key->isFinished)
return false; /* finished one of keys */
if (ginCompareItemPointers(&key->curItem, item) < 0)
*item = key->curItem;
}
Assert(!ItemPointerIsMax(item));
/*---------- /*----------
* Now *item contains first ItemPointer after previous result. * Advance the scan keys in lock-step, until we find an item that matches
* * all the keys. If any key reports isFinished, meaning its subset of the
* The item is a valid hit only if all the keys succeeded for either * entries is exhausted, we can stop. Otherwise, set *item to the next
* that exact TID, or a lossy reference to the same page. * matching item.
* *
* This logic works only if a keyGetItem stream can never contain both * This logic works only if a keyGetItem stream can never contain both
* exact and lossy pointers for the same page. Else we could have a * exact and lossy pointers for the same page. Else we could have a
...@@ -977,35 +958,94 @@ scanGetItem(IndexScanDesc scan, ItemPointer advancePast, ...@@ -977,35 +958,94 @@ scanGetItem(IndexScanDesc scan, ItemPointer advancePast,
* (keyGetItem has a similar problem versus entryGetItem.) * (keyGetItem has a similar problem versus entryGetItem.)
*---------- *----------
*/ */
do
{
ItemPointerSetMin(item);
match = true; match = true;
for (i = 0; i < so->nkeys; i++) for (i = 0; i < so->nkeys && match; i++)
{ {
GinScanKey key = so->keys + i; GinScanKey key = so->keys + i;
if (key->curItemMatches) /* Fetch the next item for this key that is > advancePast. */
keyGetItem(&so->ginstate, so->tempCtx, key, advancePast);
if (key->isFinished)
return false;
/*
* If it's not a match, we can immediately conclude that nothing
* <= this item matches, without checking the rest of the keys.
*/
if (!key->curItemMatches)
{ {
if (ginCompareItemPointers(item, &key->curItem) == 0) advancePast = key->curItem;
continue;
if (ItemPointerIsLossyPage(&key->curItem) &&
GinItemPointerGetBlockNumber(&key->curItem) ==
GinItemPointerGetBlockNumber(item))
continue;
}
match = false; match = false;
break; break;
} }
if (match) /*
break; * It's a match. We can conclude that nothing < matches, so
* the other key streams can skip to this item.
*
* Beware of lossy pointers, though; from a lossy pointer, we
* can only conclude that nothing smaller than this *block*
* matches.
*/
if (ItemPointerIsLossyPage(&key->curItem))
{
if (GinItemPointerGetBlockNumber(&advancePast) <
GinItemPointerGetBlockNumber(&key->curItem))
{
advancePast.ip_blkid = key->curItem.ip_blkid;
advancePast.ip_posid = 0;
}
}
else
{
Assert(key->curItem.ip_posid > 0);
advancePast = key->curItem;
advancePast.ip_posid--;
}
/* /*
* No hit. Update myAdvancePast to this TID, so that on the next pass * If this is the first key, remember this location as a
* we'll move to the next possible entry. * potential match.
*
* Otherwise, check if this is the same item that we checked the
* previous keys for (or a lossy pointer for the same page). If
* not, loop back to check the previous keys for this item (we
* will check this key again too, but keyGetItem returns quickly
* for that)
*/ */
myAdvancePast = *item; if (i == 0)
{
*item = key->curItem;
} }
else
{
if (ItemPointerIsLossyPage(&key->curItem) ||
ItemPointerIsLossyPage(item))
{
Assert (GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
match = (GinItemPointerGetBlockNumber(&key->curItem) ==
GinItemPointerGetBlockNumber(item));
}
else
{
Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
match = (ginCompareItemPointers(&key->curItem, item) == 0);
}
}
}
} while (!match);
Assert(!ItemPointerIsMin(item));
/* /*
* Now *item contains the first ItemPointer after previous result that
* satisfied all the keys for that exact TID, or a lossy reference
* to the same page.
*
* We must return recheck = true if any of the keys are marked recheck. * We must return recheck = true if any of the keys are marked recheck.
*/ */
*recheck = false; *recheck = false;
...@@ -1536,7 +1576,7 @@ gingetbitmap(PG_FUNCTION_ARGS) ...@@ -1536,7 +1576,7 @@ gingetbitmap(PG_FUNCTION_ARGS)
{ {
CHECK_FOR_INTERRUPTS(); CHECK_FOR_INTERRUPTS();
if (!scanGetItem(scan, &iptr, &iptr, &recheck)) if (!scanGetItem(scan, iptr, &iptr, &recheck))
break; break;
if (ItemPointerIsLossyPage(&iptr)) if (ItemPointerIsLossyPage(&iptr))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment