Commit 626a1206 authored by Heikki Linnakangas

Further optimize GIN multi-key searches.

When skipping over some items in a posting tree, re-find the new location
by descending the tree from root, rather than walking the right links.
This can save a lot of I/O.

Heavily modified from Alexander Korotkov's fast scan patch.
parent 8440897b
...@@ -1639,16 +1639,15 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno, ...@@ -1639,16 +1639,15 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
* Starts a new scan on a posting tree. * Starts a new scan on a posting tree.
*/ */
GinBtreeStack * GinBtreeStack *
ginScanBeginPostingTree(Relation index, BlockNumber rootBlkno) ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno)
{ {
GinBtreeData btree;
GinBtreeStack *stack; GinBtreeStack *stack;
ginPrepareDataScan(&btree, index, rootBlkno); ginPrepareDataScan(btree, index, rootBlkno);
btree.fullScan = TRUE; btree->fullScan = TRUE;
stack = ginFindLeafPage(&btree, TRUE); stack = ginFindLeafPage(btree, TRUE);
return stack; return stack;
} }
...@@ -99,12 +99,13 @@ static void ...@@ -99,12 +99,13 @@ static void
scanPostingTree(Relation index, GinScanEntry scanEntry, scanPostingTree(Relation index, GinScanEntry scanEntry,
BlockNumber rootPostingTree) BlockNumber rootPostingTree)
{ {
GinBtreeData btree;
GinBtreeStack *stack; GinBtreeStack *stack;
Buffer buffer; Buffer buffer;
Page page; Page page;
/* Descend to the leftmost leaf page */ /* Descend to the leftmost leaf page */
stack = ginScanBeginPostingTree(index, rootPostingTree); stack = ginScanBeginPostingTree(&btree, index, rootPostingTree);
buffer = stack->buffer; buffer = stack->buffer;
IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */ IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
...@@ -412,7 +413,8 @@ restartScanEntry: ...@@ -412,7 +413,8 @@ restartScanEntry:
LockBuffer(stackEntry->buffer, GIN_UNLOCK); LockBuffer(stackEntry->buffer, GIN_UNLOCK);
needUnlock = FALSE; needUnlock = FALSE;
stack = ginScanBeginPostingTree(ginstate->index, rootPostingTree); stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
rootPostingTree);
entry->buffer = stack->buffer; entry->buffer = stack->buffer;
/* /*
...@@ -506,8 +508,60 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advan ...@@ -506,8 +508,60 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advan
{ {
Page page; Page page;
int i; int i;
bool stepright;
if (!BufferIsValid(entry->buffer))
{
entry->isFinished = true;
return;
}
/*
* We have two strategies for finding the correct page: step right from
* the current page, or descend the tree again from the root. If
* advancePast equals the current item, the next matching item should be
* on the next page, so we step right. Otherwise, descend from root.
*/
if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
{
stepright = true;
LockBuffer(entry->buffer, GIN_SHARE);
}
else
{
GinBtreeStack *stack;
ReleaseBuffer(entry->buffer);
/*
* Set the search key, and find the correct leaf page.
*/
if (ItemPointerIsLossyPage(&advancePast))
{
ItemPointerSet(&entry->btree.itemptr,
GinItemPointerGetBlockNumber(&advancePast) + 1,
FirstOffsetNumber);
}
else
{
entry->btree.itemptr = advancePast;
entry->btree.itemptr.ip_posid++;
}
entry->btree.fullScan = false;
stack = ginFindLeafPage(&entry->btree, true);
/* we don't need the stack, just the buffer. */
entry->buffer = stack->buffer;
IncrBufferRefCount(entry->buffer);
freeGinBtreeStack(stack);
stepright = false;
}
elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
GinItemPointerGetBlockNumber(&advancePast),
GinItemPointerGetOffsetNumber(&advancePast),
!stepright);
LockBuffer(entry->buffer, GIN_SHARE);
page = BufferGetPage(entry->buffer); page = BufferGetPage(entry->buffer);
for (;;) for (;;)
{ {
...@@ -519,30 +573,34 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advan ...@@ -519,30 +573,34 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advan
entry->nlist = 0; entry->nlist = 0;
} }
/* if (stepright)
* We've processed all the entries on this page. If it was the last
* page in the tree, we're done.
*/
if (GinPageRightMost(page))
{ {
UnlockReleaseBuffer(entry->buffer); /*
entry->buffer = InvalidBuffer; * We've processed all the entries on this page. If it was the last
entry->isFinished = TRUE; * page in the tree, we're done.
return; */
if (GinPageRightMost(page))
{
UnlockReleaseBuffer(entry->buffer);
entry->buffer = InvalidBuffer;
entry->isFinished = TRUE;
return;
}
/*
* Step to next page, following the right link. then find the first
* ItemPointer greater than advancePast.
*/
entry->buffer = ginStepRight(entry->buffer,
ginstate->index,
GIN_SHARE);
page = BufferGetPage(entry->buffer);
} }
stepright = true;
if (GinPageGetOpaque(page)->flags & GIN_DELETED) if (GinPageGetOpaque(page)->flags & GIN_DELETED)
continue; /* page was deleted by concurrent vacuum */ continue; /* page was deleted by concurrent vacuum */
/*
* Step to next page, following the right link. then find the first
* ItemPointer greater than advancePast.
*/
entry->buffer = ginStepRight(entry->buffer,
ginstate->index,
GIN_SHARE);
page = BufferGetPage(entry->buffer);
/* /*
* The first item > advancePast might not be on this page, but * The first item > advancePast might not be on this page, but
* somewhere to the right, if the page was split, or a non-match from * somewhere to the right, if the page was split, or a non-match from
...@@ -566,8 +624,16 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advan ...@@ -566,8 +624,16 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advan
{ {
if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0) if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
{ {
LockBuffer(entry->buffer, GIN_UNLOCK);
entry->offset = i; entry->offset = i;
if (GinPageRightMost(page))
{
/* after processing the copied items, we're done. */
UnlockReleaseBuffer(entry->buffer);
entry->buffer = InvalidBuffer;
}
else
LockBuffer(entry->buffer, GIN_UNLOCK);
return; return;
} }
} }
...@@ -677,7 +743,10 @@ entryGetItem(GinState *ginstate, GinScanEntry entry, ...@@ -677,7 +743,10 @@ entryGetItem(GinState *ginstate, GinScanEntry entry,
} }
else if (!BufferIsValid(entry->buffer)) else if (!BufferIsValid(entry->buffer))
{ {
/* A posting list from an entry tuple */ /*
* A posting list from an entry tuple, or the last page of a posting
* tree.
*/
do do
{ {
if (entry->offset >= entry->nlist) if (entry->offset >= entry->nlist)
......
...@@ -702,7 +702,7 @@ extern void GinPageDeletePostingItem(Page page, OffsetNumber offset); ...@@ -702,7 +702,7 @@ extern void GinPageDeletePostingItem(Page page, OffsetNumber offset);
extern void ginInsertItemPointers(Relation index, BlockNumber rootBlkno, extern void ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
ItemPointerData *items, uint32 nitem, ItemPointerData *items, uint32 nitem,
GinStatsData *buildStats); GinStatsData *buildStats);
extern GinBtreeStack *ginScanBeginPostingTree(Relation index, BlockNumber rootBlkno); extern GinBtreeStack *ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno);
extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage); extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage);
extern void ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno); extern void ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno);
...@@ -802,6 +802,7 @@ typedef struct GinScanEntryData ...@@ -802,6 +802,7 @@ typedef struct GinScanEntryData
bool isFinished; bool isFinished;
bool reduceResult; bool reduceResult;
uint32 predictNumberResult; uint32 predictNumberResult;
GinBtreeData btree;
} GinScanEntryData; } GinScanEntryData;
typedef struct GinScanOpaqueData typedef struct GinScanOpaqueData
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment