Commit d1996ed5 authored by Heikki Linnakangas

Change the way parent pages are tracked during buffered GiST build.

We used to mimic the way a stack is constructed when descending the tree
during normal GiST inserts, but that was quite complicated during a buffered
build. It was also wrong: in GiST, the left-to-right relationships on
different levels might not match each other, so that when you know the
parent of a child page, you won't necessarily find the parent of the page to
the right of the child page by following the rightlinks at the parent level.
This sometimes led to "could not re-find parent" errors while building a
GiST index.

We now use a simple hash table to track the parent of every internal page.
Whenever a page is split, and downlinks are moved from one page to another,
we update the hash table accordingly. This is also better for performance
than the old method, as we never need to move right to re-find the parent
page, which could take a significant amount of time for buffers that were
created much earlier in the index build.
parent be02b168
...@@ -55,16 +55,24 @@ typedef struct ...@@ -55,16 +55,24 @@ typedef struct
{ {
Relation indexrel; Relation indexrel;
GISTSTATE *giststate; GISTSTATE *giststate;
GISTBuildBuffers *gfbb;
int64 indtuples; /* number of tuples indexed */ int64 indtuples; /* number of tuples indexed */
int64 indtuplesSize; /* total size of all indexed tuples */ int64 indtuplesSize; /* total size of all indexed tuples */
Size freespace; /* amount of free space to leave on pages */ Size freespace; /* amount of free space to leave on pages */
/*
* Extra data structures used during a buffering build. 'gfbb' contains
* information related to managing the build buffers. 'parentMap' is a
* lookup table of the parent of each internal page.
*/
GISTBuildBuffers *gfbb;
HTAB *parentMap;
GistBufferingMode bufferingMode; GistBufferingMode bufferingMode;
} GISTBuildState; } GISTBuildState;
/* prototypes for private functions */
static void gistInitBuffering(GISTBuildState *buildstate); static void gistInitBuffering(GISTBuildState *buildstate);
static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep); static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
static void gistBuildCallback(Relation index, static void gistBuildCallback(Relation index,
...@@ -76,18 +84,24 @@ static void gistBuildCallback(Relation index, ...@@ -76,18 +84,24 @@ static void gistBuildCallback(Relation index,
static void gistBufferingBuildInsert(GISTBuildState *buildstate, static void gistBufferingBuildInsert(GISTBuildState *buildstate,
IndexTuple itup); IndexTuple itup);
static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
GISTBufferingInsertStack *startparent); BlockNumber startblkno, int startlevel);
static void gistbufferinginserttuples(GISTBuildState *buildstate, static void gistbufferinginserttuples(GISTBuildState *buildstate,
Buffer buffer, Buffer buffer, int level,
IndexTuple *itup, int ntup, OffsetNumber oldoffnum, IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
GISTBufferingInsertStack *path); BlockNumber parentblk, OffsetNumber downlinkoffnum);
static void gistBufferingFindCorrectParent(GISTBuildState *buildstate, static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
GISTBufferingInsertStack *child); BlockNumber childblkno, int level,
BlockNumber *parentblk,
OffsetNumber *downlinkoffnum);
static void gistProcessEmptyingQueue(GISTBuildState *buildstate); static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
static void gistEmptyAllBuffers(GISTBuildState *buildstate); static void gistEmptyAllBuffers(GISTBuildState *buildstate);
static void gistFreeUnreferencedPath(GISTBufferingInsertStack *path);
static int gistGetMaxLevel(Relation index); static int gistGetMaxLevel(Relation index);
static void gistInitParentMap(GISTBuildState *buildstate);
static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
BlockNumber parent);
static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
/* /*
* Main entry point to GiST index build. Initially calls insert over and over, * Main entry point to GiST index build. Initially calls insert over and over,
...@@ -407,6 +421,8 @@ gistInitBuffering(GISTBuildState *buildstate) ...@@ -407,6 +421,8 @@ gistInitBuffering(GISTBuildState *buildstate)
buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep, buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
gistGetMaxLevel(index)); gistGetMaxLevel(index));
gistInitParentMap(buildstate);
buildstate->bufferingMode = GIST_BUFFERING_ACTIVE; buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;
elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d", elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
...@@ -529,7 +545,7 @@ static void ...@@ -529,7 +545,7 @@ static void
gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup) gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
{ {
/* Insert the tuple to buffers. */ /* Insert the tuple to buffers. */
gistProcessItup(buildstate, itup, NULL); gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
/* If we filled up (half of a) buffer, process buffer emptying. */ /* If we filled up (half of a) buffer, process buffer emptying. */
gistProcessEmptyingQueue(buildstate); gistProcessEmptyingQueue(buildstate);
...@@ -543,30 +559,28 @@ gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup) ...@@ -543,30 +559,28 @@ gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
*/ */
static bool static bool
gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
GISTBufferingInsertStack *startparent) BlockNumber startblkno, int startlevel)
{ {
GISTSTATE *giststate = buildstate->giststate; GISTSTATE *giststate = buildstate->giststate;
GISTBuildBuffers *gfbb = buildstate->gfbb; GISTBuildBuffers *gfbb = buildstate->gfbb;
Relation indexrel = buildstate->indexrel; Relation indexrel = buildstate->indexrel;
GISTBufferingInsertStack *path;
BlockNumber childblkno; BlockNumber childblkno;
Buffer buffer; Buffer buffer;
bool result = false; bool result = false;
BlockNumber blkno;
int level;
OffsetNumber downlinkoffnum = InvalidOffsetNumber;
BlockNumber parentblkno = InvalidBlockNumber;
/* CHECK_FOR_INTERRUPTS();
* NULL passed in startparent means that we start index tuple processing
* from the root.
*/
if (!startparent)
path = gfbb->rootitem;
else
path = startparent;
/* /*
* Loop until we reach a leaf page (level == 0) or a level with buffers * Loop until we reach a leaf page (level == 0) or a level with buffers
* (not including the level we start at, because we would otherwise make * (not including the level we start at, because we would otherwise make
* no progress). * no progress).
*/ */
blkno = startblkno;
level = startlevel;
for (;;) for (;;)
{ {
ItemId iid; ItemId iid;
...@@ -574,21 +588,21 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, ...@@ -574,21 +588,21 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
newtup; newtup;
Page page; Page page;
OffsetNumber childoffnum; OffsetNumber childoffnum;
GISTBufferingInsertStack *parent;
/* Have we reached a level with buffers? */ /* Have we reached a level with buffers? */
if (LEVEL_HAS_BUFFERS(path->level, gfbb) && path != startparent) if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
break; break;
/* Have we reached a leaf page? */ /* Have we reached a leaf page? */
if (path->level == 0) if (level == 0)
break; break;
/* /*
* Nope. Descend down to the next level then. Choose a child to * Nope. Descend down to the next level then. Choose a child to
* descend down to. * descend down to.
*/ */
buffer = ReadBuffer(indexrel, path->blkno);
buffer = ReadBuffer(indexrel, blkno);
LockBuffer(buffer, GIST_EXCLUSIVE); LockBuffer(buffer, GIST_EXCLUSIVE);
page = (Page) BufferGetPage(buffer); page = (Page) BufferGetPage(buffer);
...@@ -597,32 +611,33 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, ...@@ -597,32 +611,33 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
idxtuple = (IndexTuple) PageGetItem(page, iid); idxtuple = (IndexTuple) PageGetItem(page, iid);
childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
if (level > 1)
gistMemorizeParent(buildstate, childblkno, blkno);
/* /*
* Check that the key representing the target child node is consistent * Check that the key representing the target child node is consistent
* with the key we're inserting. Update it if it's not. * with the key we're inserting. Update it if it's not.
*/ */
newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate); newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
if (newtup) if (newtup)
gistbufferinginserttuples(buildstate, buffer, &newtup, 1, {
childoffnum, path); gistbufferinginserttuples(buildstate, buffer, level,
UnlockReleaseBuffer(buffer); &newtup, 1, childoffnum,
InvalidBlockNumber, InvalidOffsetNumber);
/* gistbufferinginserttuples() released the buffer */
}
else
UnlockReleaseBuffer(buffer);
/* Create new path item representing current page */ /* Descend to the child */
parent = path; parentblkno = blkno;
path = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context, blkno = childblkno;
sizeof(GISTBufferingInsertStack)); downlinkoffnum = childoffnum;
path->parent = parent; Assert(level > 0);
path->level = parent->level - 1; level--;
path->blkno = childblkno;
path->downlinkoffnum = childoffnum;
path->refCount = 0; /* it's unreferenced for now */
/* Adjust reference count of parent */
if (parent)
parent->refCount++;
} }
if (LEVEL_HAS_BUFFERS(path->level, gfbb)) if (LEVEL_HAS_BUFFERS(level, gfbb))
{ {
/* /*
* We've reached level with buffers. Place the index tuple to the * We've reached level with buffers. Place the index tuple to the
...@@ -631,8 +646,7 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, ...@@ -631,8 +646,7 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
GISTNodeBuffer *childNodeBuffer; GISTNodeBuffer *childNodeBuffer;
/* Find the buffer or create a new one */ /* Find the buffer or create a new one */
childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, path->blkno, childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);
path->downlinkoffnum, path->parent);
/* Add index tuple to it */ /* Add index tuple to it */
gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup); gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
...@@ -645,19 +659,15 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, ...@@ -645,19 +659,15 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
/* /*
* We've reached a leaf page. Place the tuple here. * We've reached a leaf page. Place the tuple here.
*/ */
buffer = ReadBuffer(indexrel, path->blkno); Assert(level == 0);
buffer = ReadBuffer(indexrel, blkno);
LockBuffer(buffer, GIST_EXCLUSIVE); LockBuffer(buffer, GIST_EXCLUSIVE);
gistbufferinginserttuples(buildstate, buffer, &itup, 1, gistbufferinginserttuples(buildstate, buffer, level,
InvalidOffsetNumber, path); &itup, 1, InvalidOffsetNumber,
UnlockReleaseBuffer(buffer); parentblkno, downlinkoffnum);
/* gistbufferinginserttuples() released the buffer */
} }
/*
* Free unreferenced path items, if any. Path item may be referenced by
* node buffer.
*/
gistFreeUnreferencedPath(path);
return result; return result;
} }
...@@ -665,11 +675,14 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, ...@@ -665,11 +675,14 @@ gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
* Insert tuples to a given page. * Insert tuples to a given page.
* *
* This is analogous with gistinserttuples() in the regular insertion code. * This is analogous with gistinserttuples() in the regular insertion code.
*
* Caller should hold a lock on 'buffer' on entry. This function will unlock
* and unpin it.
*/ */
static void static void
gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
IndexTuple *itup, int ntup, OffsetNumber oldoffnum, IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
GISTBufferingInsertStack *path) BlockNumber parentblk, OffsetNumber downlinkoffnum)
{ {
GISTBuildBuffers *gfbb = buildstate->gfbb; GISTBuildBuffers *gfbb = buildstate->gfbb;
List *splitinfo; List *splitinfo;
...@@ -692,33 +705,41 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, ...@@ -692,33 +705,41 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
*/ */
if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO) if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
{ {
GISTBufferingInsertStack *oldroot = gfbb->rootitem;
Page page = BufferGetPage(buffer); Page page = BufferGetPage(buffer);
ItemId iid; OffsetNumber off;
IndexTuple idxtuple; OffsetNumber maxoff;
BlockNumber leftmostchild;
gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc( Assert(level == gfbb->rootlevel);
gfbb->context, sizeof(GISTBufferingInsertStack)); gfbb->rootlevel++;
gfbb->rootitem->parent = NULL;
gfbb->rootitem->blkno = GIST_ROOT_BLKNO; elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
gfbb->rootitem->level = oldroot->level + 1;
gfbb->rootitem->refCount = 1;
/* /*
* All the downlinks on the old root page are now on one of the child * All the downlinks on the old root page are now on one of the child
* pages. Change the block number of the old root entry in the stack * pages. Visit all the new child pages to memorize the parents of
* to point to the leftmost child. The other child pages will be * the grandchildren.
* accessible from there by walking right.
*/ */
iid = PageGetItemId(page, FirstOffsetNumber); if (gfbb->rootlevel > 1)
idxtuple = (IndexTuple) PageGetItem(page, iid); {
leftmostchild = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); maxoff = PageGetMaxOffsetNumber(page);
for (off = FirstOffsetNumber; off <= maxoff; off++)
{
ItemId iid = PageGetItemId(page, off);
IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno);
oldroot->parent = gfbb->rootitem; LockBuffer(childbuf, GIST_SHARE);
oldroot->blkno = leftmostchild; gistMemorizeAllDownlinks(buildstate, childbuf);
oldroot->downlinkoffnum = InvalidOffsetNumber; UnlockReleaseBuffer(childbuf);
/*
* Also remember that the parent of the new child page is
* the root block.
*/
gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
}
}
} }
if (splitinfo) if (splitinfo)
...@@ -726,7 +747,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, ...@@ -726,7 +747,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
/* /*
* Insert the downlinks to the parent. This is analogous with * Insert the downlinks to the parent. This is analogous with
* gistfinishsplit() in the regular insertion code, but the locking is * gistfinishsplit() in the regular insertion code, but the locking is
* simpler, and we have to maintain the buffers. * simpler, and we have to maintain the buffers on internal nodes and
* the parent map.
*/ */
IndexTuple *downlinks; IndexTuple *downlinks;
int ndownlinks, int ndownlinks,
...@@ -735,7 +757,12 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, ...@@ -735,7 +757,12 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
ListCell *lc; ListCell *lc;
/* Parent may have changed since we memorized this path. */ /* Parent may have changed since we memorized this path. */
gistBufferingFindCorrectParent(buildstate, path); parentBuffer =
gistBufferingFindCorrectParent(buildstate,
BufferGetBlockNumber(buffer),
level,
&parentblk,
&downlinkoffnum);
/* /*
* If there's a buffer associated with this page, that needs to be * If there's a buffer associated with this page, that needs to be
...@@ -747,7 +774,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, ...@@ -747,7 +774,8 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
gistRelocateBuildBuffersOnSplit(gfbb, gistRelocateBuildBuffersOnSplit(gfbb,
buildstate->giststate, buildstate->giststate,
buildstate->indexrel, buildstate->indexrel,
path, buffer, splitinfo); level,
buffer, splitinfo);
/* Create an array of all the downlink tuples */ /* Create an array of all the downlink tuples */
ndownlinks = list_length(splitinfo); ndownlinks = list_length(splitinfo);
...@@ -757,125 +785,130 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, ...@@ -757,125 +785,130 @@ gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
{ {
GISTPageSplitInfo *splitinfo = lfirst(lc); GISTPageSplitInfo *splitinfo = lfirst(lc);
/*
* Remember the parent of each new child page in our parent map.
* This assumes that the downlinks fit on the parent page. If the
* parent page is split, too, when we recurse up to insert the
* downlinks, the recursive gistbufferinginserttuples() call
* will update the map again.
*/
if (level > 0)
gistMemorizeParent(buildstate,
BufferGetBlockNumber(splitinfo->buf),
BufferGetBlockNumber(parentBuffer));
/*
* Also update the parent map for all the downlinks that got moved
* to a different page. (actually this also loops through the
* downlinks that stayed on the original page, but it does no
* harm).
*/
if (level > 1)
gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
/* /*
* Since there's no concurrent access, we can release the lower * Since there's no concurrent access, we can release the lower
* level buffers immediately. Don't release the buffer for the * level buffers immediately. This includes the original page.
* original page, though, because the caller will release that.
*/ */
if (splitinfo->buf != buffer) UnlockReleaseBuffer(splitinfo->buf);
UnlockReleaseBuffer(splitinfo->buf);
downlinks[i++] = splitinfo->downlink; downlinks[i++] = splitinfo->downlink;
} }
/* Insert them into parent. */ /* Insert them into parent. */
parentBuffer = ReadBuffer(buildstate->indexrel, path->parent->blkno); gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
LockBuffer(parentBuffer, GIST_EXCLUSIVE); downlinks, ndownlinks, downlinkoffnum,
gistbufferinginserttuples(buildstate, parentBuffer, InvalidBlockNumber, InvalidOffsetNumber);
downlinks, ndownlinks,
path->downlinkoffnum, path->parent);
UnlockReleaseBuffer(parentBuffer);
list_free_deep(splitinfo); /* we don't need this anymore */ list_free_deep(splitinfo); /* we don't need this anymore */
} }
else
UnlockReleaseBuffer(buffer);
} }
/* /*
* Find correct parent by following rightlinks in buffering index build. This * Find the downlink pointing to a child page.
* method of parent searching is possible because no concurrent activity is *
* possible while index builds. * 'childblkno' indicates the child page to find the parent for. 'level' is
* the level of the child. On entry, *parentblkno and *downlinkoffnum can
* point to a location where the downlink used to be - we will check that
* location first, and save some cycles if it hasn't moved. The function
* returns a buffer containing the downlink, exclusively-locked, and
* *parentblkno and *downlinkoffnum are set to the real location of the
* downlink.
*
* If the child page is a leaf (level == 0), the caller must supply a correct
* parentblkno. Otherwise we use the parent map hash table to find the parent
* block.
*
* This function serves the same purpose as gistFindCorrectParent() during
* normal index inserts, but this is simpler because we don't need to deal
* with concurrent inserts.
*/ */
static void static Buffer
gistBufferingFindCorrectParent(GISTBuildState *buildstate, gistBufferingFindCorrectParent(GISTBuildState *buildstate,
GISTBufferingInsertStack *child) BlockNumber childblkno, int level,
BlockNumber *parentblkno,
OffsetNumber *downlinkoffnum)
{ {
GISTBuildBuffers *gfbb = buildstate->gfbb; BlockNumber parent;
Relation indexrel = buildstate->indexrel;
GISTBufferingInsertStack *parent = child->parent;
OffsetNumber i,
maxoff;
ItemId iid;
IndexTuple idxtuple;
Buffer buffer; Buffer buffer;
Page page; Page page;
bool copied = false; OffsetNumber maxoff;
OffsetNumber off;
buffer = ReadBuffer(indexrel, parent->blkno); if (level > 0)
parent = gistGetParent(buildstate, childblkno);
else
{
/*
* For a leaf page, the caller must supply a correct parent block
* number.
*/
if (*parentblkno == InvalidBlockNumber)
elog(ERROR, "no parent buffer provided of child %d", childblkno);
parent = *parentblkno;
}
buffer = ReadBuffer(buildstate->indexrel, parent);
page = BufferGetPage(buffer); page = BufferGetPage(buffer);
LockBuffer(buffer, GIST_EXCLUSIVE); LockBuffer(buffer, GIST_EXCLUSIVE);
gistcheckpage(indexrel, buffer); gistcheckpage(buildstate->indexrel, buffer);
maxoff = PageGetMaxOffsetNumber(page);
/* Check if it was not moved */ /* Check if it was not moved */
if (child->downlinkoffnum != InvalidOffsetNumber && if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
child->downlinkoffnum <= PageGetMaxOffsetNumber(page)) *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
{ {
iid = PageGetItemId(page, child->downlinkoffnum); ItemId iid = PageGetItemId(page, *downlinkoffnum);
idxtuple = (IndexTuple) PageGetItem(page, iid); IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
{ {
/* Still there */ /* Still there */
UnlockReleaseBuffer(buffer); return buffer;
return;
} }
} }
/* parent has changed, look child in right links until found */ /*
while (true) * Downlink was not at the offset where it used to be. Scan the page
* to find it. During normal gist insertions, it might've moved to another
* page, to the right, but during a buffering build, we keep track of
* the parent of each page in the lookup table so we should always know
* what page it's on.
*/
for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
{ {
/* Search for relevant downlink in the current page */ ItemId iid = PageGetItemId(page, off);
maxoff = PageGetMaxOffsetNumber(page); IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
{
iid = PageGetItemId(page, i);
idxtuple = (IndexTuple) PageGetItem(page, iid);
if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
{
/* yes!!, found */
child->downlinkoffnum = i;
UnlockReleaseBuffer(buffer);
return;
}
}
/*
* We should copy parent path item because some other path items can
* refer to it.
*/
if (!copied)
{ {
parent = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context, /* yes!!, found it */
sizeof(GISTBufferingInsertStack)); *downlinkoffnum = off;
memcpy(parent, child->parent, sizeof(GISTBufferingInsertStack)); return buffer;
if (parent->parent)
parent->parent->refCount++;
gistDecreasePathRefcount(child->parent);
child->parent = parent;
parent->refCount = 1;
copied = true;
} }
/*
* Not found in current page. Move towards rightlink.
*/
parent->blkno = GistPageGetOpaque(page)->rightlink;
UnlockReleaseBuffer(buffer);
if (parent->blkno == InvalidBlockNumber)
{
/*
* End of chain and still didn't find parent. Should not happen
* during index build.
*/
break;
}
/* Get the next page */
buffer = ReadBuffer(indexrel, parent->blkno);
page = BufferGetPage(buffer);
LockBuffer(buffer, GIST_EXCLUSIVE);
gistcheckpage(indexrel, buffer);
} }
elog(ERROR, "failed to re-find parent for block %u", child->blkno); elog(ERROR, "failed to re-find parent for block %u", childblkno);
return InvalidBuffer; /* keep compiler quiet */
} }
/* /*
...@@ -934,7 +967,7 @@ gistProcessEmptyingQueue(GISTBuildState *buildstate) ...@@ -934,7 +967,7 @@ gistProcessEmptyingQueue(GISTBuildState *buildstate)
* threshold, but we might as well keep flushing tuples from it * threshold, but we might as well keep flushing tuples from it
* until we fill a lower-level buffer. * until we fill a lower-level buffer.
*/ */
if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->path)) if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
{ {
/* /*
* A lower level buffer filled up. Stop emptying this buffer, * A lower level buffer filled up. Stop emptying this buffer,
...@@ -1003,45 +1036,11 @@ gistEmptyAllBuffers(GISTBuildState *buildstate) ...@@ -1003,45 +1036,11 @@ gistEmptyAllBuffers(GISTBuildState *buildstate)
gfbb->buffersOnLevels[i] = gfbb->buffersOnLevels[i] =
list_delete_first(gfbb->buffersOnLevels[i]); list_delete_first(gfbb->buffersOnLevels[i]);
} }
elog(DEBUG2, "emptied all buffers at level %d", i);
} }
MemoryContextSwitchTo(oldCtx); MemoryContextSwitchTo(oldCtx);
} }
/*
* Free unreferenced parts of a path stack.
*/
static void
gistFreeUnreferencedPath(GISTBufferingInsertStack *path)
{
while (path->refCount == 0)
{
/*
* Path part is unreferenced. We can free it and decrease reference
* count of parent. If parent becomes unreferenced too procedure
* should be repeated for it.
*/
GISTBufferingInsertStack *tmp = path->parent;
pfree(path);
path = tmp;
if (path)
path->refCount--;
else
break;
}
}
/*
* Decrease reference count of a path part, and free any unreferenced parts of
* the path stack.
*/
void
gistDecreasePathRefcount(GISTBufferingInsertStack *path)
{
path->refCount--;
gistFreeUnreferencedPath(path);
}
/* /*
* Get the depth of the GiST index. * Get the depth of the GiST index.
*/ */
...@@ -1091,9 +1090,114 @@ gistGetMaxLevel(Relation index) ...@@ -1091,9 +1090,114 @@ gistGetMaxLevel(Relation index)
/* /*
* We're going down on the tree. It means that there is yet one more * We're going down on the tree. It means that there is yet one more
* level is the tree. * level in the tree.
*/ */
maxLevel++; maxLevel++;
} }
return maxLevel; return maxLevel;
} }
/*
* Routines for managing the parent map.
*
* Whenever a page is split, we need to insert the downlinks into the parent.
* We need to somehow find the parent page to do that. In normal insertions,
* we keep a stack of nodes visited when we descend the tree. However, in
* buffering build, we can start descending the tree from any internal node,
* when we empty a buffer by cascading tuples to its children. So we don't
* have a full stack up to the root available at that time.
*
* So instead, we maintain a hash table to track the parent of every internal
* page. We don't need to track the parents of leaf nodes, however. Whenever
* we insert to a leaf, we've just descended down from its parent, so we know
* its immediate parent already. This helps a lot to limit the memory used
* by this hash table.
*
* Whenever an internal node is split, the parent map needs to be updated.
* The parent of the new child page needs to be recorded, and the entries
* for all pages whose downlinks are moved to a new page at the split also
* need to be updated.
*
* We also update the parent map whenever we descend the tree. That might seem
* unnecessary, because we maintain the map whenever a downlink is moved or
* created, but it is needed because we switch to buffering mode after
* creating a tree with regular index inserts. Any pages created before
* switching to buffering mode will not be present in the parent map initially,
* but will be added there the first time we visit them.
*/
/*
 * One entry of the parent map hash table: pairs a page's block number with
 * the block number recorded as its parent.
 */
typedef struct
{
BlockNumber childblkno; /* hash key */
/* parent block recorded for childblkno; written by gistMemorizeParent() */
BlockNumber parentblkno;
} ParentMapEntry;
/*
 * Create the parent map hash table and store it in buildstate->parentMap.
 *
 * Entries are ParentMapEntry structs keyed by a BlockNumber and hashed with
 * oid_hash. The table's memory context is set to CurrentMemoryContext.
 */
static void
gistInitParentMap(GISTBuildState *buildstate)
{
	HASHCTL		ctl;
	int			flags = HASH_ELEM | HASH_CONTEXT | HASH_FUNCTION;

	/* Only the fields selected by 'flags' are filled in */
	ctl.hash = oid_hash;
	ctl.hcxt = CurrentMemoryContext;
	ctl.entrysize = sizeof(ParentMapEntry);
	ctl.keysize = sizeof(BlockNumber);

	buildstate->parentMap = hash_create("gistbuild parent map",
										1024,
										&ctl,
										flags);
}
/*
 * Record in the parent map that 'parent' is the parent block of 'child'.
 *
 * The entry keyed by 'child' is entered into the map with HASH_ENTER and
 * its parent block number is set to 'parent'.
 */
static void
gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
{
	bool		existed;
	ParentMapEntry *slot;

	/* The found flag is filled in by hash_search() but not consulted here */
	slot = (ParentMapEntry *) hash_search(buildstate->parentMap,
										  (const void *) &child,
										  HASH_ENTER,
										  &existed);

	slot->parentblkno = parent;
}
/*
 * Walk every index tuple on the page held in 'parentbuf' and memorize, for
 * the block each tuple points to, that this page is its parent.
 *
 * The page must not be a leaf page (asserted below).
 */
static void
gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
{
	Page		parentpage = BufferGetPage(parentbuf);
	BlockNumber selfblkno = BufferGetBlockNumber(parentbuf);
	OffsetNumber last;
	OffsetNumber cur;

	Assert(!GistPageIsLeaf(parentpage));

	last = PageGetMaxOffsetNumber(parentpage);
	cur = FirstOffsetNumber;
	while (cur <= last)
	{
		ItemId		slot = PageGetItemId(parentpage, cur);
		IndexTuple	downlink = (IndexTuple) PageGetItem(parentpage, slot);

		/* The tuple's t_tid carries the block number of the child page */
		gistMemorizeParent(buildstate,
						   ItemPointerGetBlockNumber(&(downlink->t_tid)),
						   selfblkno);
		cur++;
	}
}
/*
 * Look up the parent of a page in the parent map.
 *
 * 'child' is the block number of the page whose parent is wanted. Returns
 * the parent block number stored in the map entry for 'child'. Raises an
 * ERROR if the map has no entry for 'child'.
 */
static BlockNumber
gistGetParent(GISTBuildState *buildstate, BlockNumber child)
{
	ParentMapEntry *entry;
	bool		found;

	/* Find the child's entry in the parent map */
	entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
										   (const void *) &child,
										   HASH_FIND,
										   &found);

	/*
	 * Report the block number with %u, as the other block-number messages
	 * in this file do; %d would print large block numbers as negative.
	 */
	if (!found)
		elog(ERROR, "could not find parent of block %u in lookup table", child);

	return entry->parentblkno;
}
...@@ -107,16 +107,7 @@ gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) ...@@ -107,16 +107,7 @@ gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
sizeof(GISTNodeBuffer *)); sizeof(GISTNodeBuffer *));
gfbb->loadedBuffersCount = 0; gfbb->loadedBuffersCount = 0;
/* gfbb->rootlevel = maxLevel;
* Root path item of the tree. Updated on each root node split.
*/
gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc(
gfbb->context, sizeof(GISTBufferingInsertStack));
gfbb->rootitem->parent = NULL;
gfbb->rootitem->blkno = GIST_ROOT_BLKNO;
gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
gfbb->rootitem->level = maxLevel;
gfbb->rootitem->refCount = 1;
return gfbb; return gfbb;
} }
...@@ -127,9 +118,7 @@ gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) ...@@ -127,9 +118,7 @@ gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
*/ */
GISTNodeBuffer * GISTNodeBuffer *
gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
BlockNumber nodeBlocknum, BlockNumber nodeBlocknum, int level)
OffsetNumber downlinkoffnum,
GISTBufferingInsertStack *parent)
{ {
GISTNodeBuffer *nodeBuffer; GISTNodeBuffer *nodeBuffer;
bool found; bool found;
...@@ -144,8 +133,6 @@ gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, ...@@ -144,8 +133,6 @@ gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
/* /*
* Node buffer wasn't found. Initialize the new buffer as empty. * Node buffer wasn't found. Initialize the new buffer as empty.
*/ */
GISTBufferingInsertStack *path;
int level;
MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
/* nodeBuffer->nodeBlocknum is the hash key and was filled in already */ /* nodeBuffer->nodeBlocknum is the hash key and was filled in already */
...@@ -153,33 +140,12 @@ gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, ...@@ -153,33 +140,12 @@ gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
nodeBuffer->pageBlocknum = InvalidBlockNumber; nodeBuffer->pageBlocknum = InvalidBlockNumber;
nodeBuffer->pageBuffer = NULL; nodeBuffer->pageBuffer = NULL;
nodeBuffer->queuedForEmptying = false; nodeBuffer->queuedForEmptying = false;
nodeBuffer->level = level;
/*
* Create a path stack for the page.
*/
if (nodeBlocknum != GIST_ROOT_BLKNO)
{
path = (GISTBufferingInsertStack *) palloc(
sizeof(GISTBufferingInsertStack));
path->parent = parent;
path->blkno = nodeBlocknum;
path->downlinkoffnum = downlinkoffnum;
path->level = parent->level - 1;
path->refCount = 0; /* initially unreferenced */
parent->refCount++; /* this path references its parent */
Assert(path->level > 0);
}
else
path = gfbb->rootitem;
nodeBuffer->path = path;
path->refCount++;
/* /*
* Add this buffer to the list of buffers on this level. Enlarge * Add this buffer to the list of buffers on this level. Enlarge
* buffersOnLevels array if needed. * buffersOnLevels array if needed.
*/ */
level = path->level;
if (level >= gfbb->buffersOnLevelsLen) if (level >= gfbb->buffersOnLevelsLen)
{ {
int i; int i;
...@@ -210,20 +176,6 @@ gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, ...@@ -210,20 +176,6 @@ gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
MemoryContextSwitchTo(oldcxt); MemoryContextSwitchTo(oldcxt);
} }
else
{
if (parent != nodeBuffer->path->parent)
{
/*
* A different parent path item was provided than we've
* remembered. We trust caller to provide more correct parent than
* we have. Previous parent may be outdated by page split.
*/
gistDecreasePathRefcount(nodeBuffer->path->parent);
nodeBuffer->path->parent = parent;
parent->refCount++;
}
}
return nodeBuffer; return nodeBuffer;
} }
...@@ -585,7 +537,7 @@ typedef struct ...@@ -585,7 +537,7 @@ typedef struct
*/ */
void void
gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
Relation r, GISTBufferingInsertStack *path, Relation r, int level,
Buffer buffer, List *splitinfo) Buffer buffer, List *splitinfo)
{ {
RelocationBufferInfo *relocationBuffersInfos; RelocationBufferInfo *relocationBuffersInfos;
...@@ -601,7 +553,7 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, ...@@ -601,7 +553,7 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
ListCell *lc; ListCell *lc;
/* If the splitted page doesn't have buffers, we have nothing to do. */ /* If the splitted page doesn't have buffers, we have nothing to do. */
if (!LEVEL_HAS_BUFFERS(path->level, gfbb)) if (!LEVEL_HAS_BUFFERS(level, gfbb))
return; return;
/* /*
...@@ -660,14 +612,11 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, ...@@ -660,14 +612,11 @@ gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
/* /*
* Create a node buffer for the page. The leftmost half is on the same * Create a node buffer for the page. The leftmost half is on the same
* block as the old page before split, so for the leftmost half this * block as the old page before split, so for the leftmost half this
* will return the original buffer, which was emptied earlier in this * will return the original buffer. The tuples on the original buffer
* function. * were relinked to the temporary buffer, so the original one is now
* empty.
*/ */
newNodeBuffer = gistGetNodeBuffer(gfbb, newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level);
giststate,
BufferGetBlockNumber(si->buf),
path->downlinkoffnum,
path->parent);
relocationBuffersInfos[i].nodeBuffer = newNodeBuffer; relocationBuffersInfos[i].nodeBuffer = newNodeBuffer;
relocationBuffersInfos[i].splitinfo = si; relocationBuffersInfos[i].splitinfo = si;
......
...@@ -329,7 +329,7 @@ typedef struct ...@@ -329,7 +329,7 @@ typedef struct
/* is this a temporary copy, not in the hash table? */ /* is this a temporary copy, not in the hash table? */
bool isTemp; bool isTemp;
struct GISTBufferingInsertStack *path; int level; /* 0 == leaf */
} GISTNodeBuffer; } GISTNodeBuffer;
/* /*
...@@ -338,7 +338,7 @@ typedef struct ...@@ -338,7 +338,7 @@ typedef struct
*/ */
#define LEVEL_HAS_BUFFERS(nlevel, gfbb) \ #define LEVEL_HAS_BUFFERS(nlevel, gfbb) \
((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \ ((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \
(nlevel) != (gfbb)->rootitem->level) (nlevel) != (gfbb)->rootlevel)
/* Is specified buffer at least half-filled (should be queued for emptying)? */ /* Is specified buffer at least half-filled (should be queued for emptying)? */
#define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \ #define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \
...@@ -352,26 +352,6 @@ typedef struct ...@@ -352,26 +352,6 @@ typedef struct
#define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \ #define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \
((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer) ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer)
/*
* Extended GISTInsertStack for buffering GiST index build.
*/
typedef struct GISTBufferingInsertStack
{
/* current page */
BlockNumber blkno;
/* offset of the downlink in the parent page, that points to this page */
OffsetNumber downlinkoffnum;
/* pointer to parent; NULL for the root item */
struct GISTBufferingInsertStack *parent;
/*
 * Reference count.  Incremented when a child path item is created with this
 * item as its parent, and when a node buffer stores a pointer to this item.
 */
int refCount;
/* level number; a non-root item is created with its parent's level - 1 */
int level;
} GISTBufferingInsertStack;
/* /*
* Data structure with general information about build buffers. * Data structure with general information about build buffers.
*/ */
...@@ -416,8 +396,8 @@ typedef struct GISTBuildBuffers ...@@ -416,8 +396,8 @@ typedef struct GISTBuildBuffers
int loadedBuffersCount; /* # of entries in loadedBuffers */ int loadedBuffersCount; /* # of entries in loadedBuffers */
int loadedBuffersLen; /* allocated size of loadedBuffers */ int loadedBuffersLen; /* allocated size of loadedBuffers */
/* A path item that points to the current root node */ /* Level of the current root node (= height of the index tree - 1) */
GISTBufferingInsertStack *rootitem; int rootlevel;
} GISTBuildBuffers; } GISTBuildBuffers;
/* /*
...@@ -551,15 +531,13 @@ extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup, ...@@ -551,15 +531,13 @@ extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
/* gistbuild.c */ /* gistbuild.c */
extern Datum gistbuild(PG_FUNCTION_ARGS); extern Datum gistbuild(PG_FUNCTION_ARGS);
extern void gistValidateBufferingOption(char *value); extern void gistValidateBufferingOption(char *value);
extern void gistDecreasePathRefcount(GISTBufferingInsertStack *path);
/* gistbuildbuffers.c */ /* gistbuildbuffers.c */
extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep, extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep,
int maxLevel); int maxLevel);
extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb, extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb,
GISTSTATE *giststate, GISTSTATE *giststate,
BlockNumber blkno, OffsetNumber downlinkoffnum, BlockNumber blkno, int level);
GISTBufferingInsertStack *parent);
extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb,
GISTNodeBuffer *nodeBuffer, IndexTuple item); GISTNodeBuffer *nodeBuffer, IndexTuple item);
extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb,
...@@ -567,7 +545,7 @@ extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, ...@@ -567,7 +545,7 @@ extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb,
extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb); extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb);
extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb,
GISTSTATE *giststate, Relation r, GISTSTATE *giststate, Relation r,
GISTBufferingInsertStack *path, Buffer buffer, int level, Buffer buffer,
List *splitinfo); List *splitinfo);
extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb); extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment