Commit 218f5158 authored by Teodor Sigaev

Reduce page locking in GIN vacuum

While cleaning a posting tree, GIN vacuum could lock the whole tree for a long
time by holding LockBufferForCleanup() on its root. This patch changes that in
two ways: first, the cleanup lock is taken only if there is an empty page
(which should be deleted), and second, vacuum tries to lock only the affected
subtree rather than the whole posting tree.

Author: Andrey Borodin, with minor editing by me
Reviewed-by: Jeff Davis, me

https://commitfest.postgresql.org/13/896/
parent 73561013
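
In outline, the reworked ginVacuumPostingTreeLeaves() descends with ordinary
locks and reserves the expensive cleanup lock for the parent of an empty
subtree. The sketch below is a condensed illustration of that control flow,
not the literal patch (which follows): it assumes the PostgreSQL backend
environment, omits the memory-context bookkeeping, only notes where
ginScanToDelete() would run, and the name vacuum_subtree_sketch exists only
for this illustration.

static bool
vacuum_subtree_sketch(GinVacuumState *gvs, BlockNumber blkno, bool isRoot)
{
    Buffer      buffer;
    Page        page;

    buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
                                RBM_NORMAL, gvs->strategy);
    ginTraverseLock(buffer, false);     /* ordinary lock; inserters may run */
    page = BufferGetPage(buffer);

    if (GinPageIsLeaf(page))
    {
        bool        empty;

        ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
        empty = GinDataLeafPageIsEmpty(page);
        UnlockReleaseBuffer(buffer);
        return empty;                   /* report empty leaves upward */
    }
    else
    {
        OffsetNumber i;
        OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
        bool        hasEmptyChild = false;
        bool        hasNonEmptyChild = false;
        BlockNumber *children = palloc(sizeof(BlockNumber) * (maxoff + 1));

        /* remember the children, then drop our lock before recursing */
        for (i = FirstOffsetNumber; i <= maxoff; i++)
            children[i] = PostingItemGetBlockNumber(GinDataPageGetPostingItem(page, i));
        UnlockReleaseBuffer(buffer);

        for (i = FirstOffsetNumber; i <= maxoff; i++)
        {
            if (vacuum_subtree_sketch(gvs, children[i], false))
                hasEmptyChild = true;
            else
                hasNonEmptyChild = true;
        }
        pfree(children);
        vacuum_delay_point();

        /* entirely empty subtree: let our own parent take the cleanup lock */
        if (hasEmptyChild && !hasNonEmptyChild && !isRoot)
            return true;

        if (hasEmptyChild)
        {
            /*
             * Only here does vacuum block inserters, and only for the pages
             * under this one, by taking a cleanup lock before deleting the
             * empty children (ginScanToDelete() in the patch).
             */
            buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
                                        RBM_NORMAL, gvs->strategy);
            LockBufferForCleanup(buffer);
            /* ... ginScanToDelete(gvs, blkno, TRUE, ...) runs here ... */
            UnlockReleaseBuffer(buffer);
        }
        return false;
    }
}

Compared with the old code, which took LockBufferForCleanup() on the
posting-tree root up front and kept the root locked for the rest of the pass
whenever any empty page was found, inserters are now blocked only while an
empty subtree is actually being unlinked.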
@@ -314,10 +314,17 @@ deleted.
 The previous paragraph's reasoning only applies to searches, and only to
 posting trees. To protect from inserters following a downlink to a deleted
 page, vacuum simply locks out all concurrent insertions to the posting tree,
-by holding a super-exclusive lock on the posting tree root. Inserters hold a
-pin on the root page, but searches do not, so while new searches cannot begin
-while root page is locked, any already-in-progress scans can continue
-concurrently with vacuum. In the entry tree, we never delete pages.
+by holding a super-exclusive lock on the parent page of the subtree with
+deletable pages. Inserters hold a pin on the root page, but searches do not,
+so while new searches cannot begin while the root page is locked, any
+already-in-progress scans can continue concurrently with vacuum in the
+corresponding subtree. To avoid interfering with readers, vacuum takes
+exclusive locks in a depth-first scan, in left-to-right order of page tuples.
+The leftmost page is never deleted, so before deleting any page we have
+already locked every page to its left, which excludes deadlock with any
+reader despite taking the parent lock before the current page and the left
+lock after it. We take the left lock not for concurrency reasons, but
+because we need to mark the page dirty. In the entry tree, we never delete
+pages.
 
 (This is quite different from the mechanism the btree indexam uses to make
 page-deletions safe; it stamps the deleted pages with an XID and keeps the
...
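
To make the ordering argument above concrete, here is an illustrative-only
sketch (not part of the patch) of the order in which locks are taken around a
single page deletion under the new scheme. It assumes the backend environment;
the function and parameter names are invented for the illustration.

static void
deletion_lock_order_sketch(Buffer subtreeRoot, Buffer parent,
                           Buffer deletable, Buffer leftSibling)
{
    /*
     * Taken once per empty subtree, in ginVacuumPostingTreeLeaves(): blocks
     * all inserters below this page, but not scans already in flight.
     */
    LockBufferForCleanup(subtreeRoot);

    /*
     * Taken top-down and left-to-right by ginScanToDelete() while it walks
     * the subtree; the parent and the deletable page stay locked across the
     * call to ginDeletePage().  (When the parent is itself the subtree root,
     * the cleanup lock above already covers it.)
     */
    LockBuffer(parent, GIN_EXCLUSIVE);
    LockBuffer(deletable, GIN_EXCLUSIVE);

    /*
     * Taken last, inside ginDeletePage(), only to update the left sibling's
     * rightlink and mark that page dirty.  Readers also move left-to-right
     * and the leftmost page is never deleted, so this late left-lock cannot
     * deadlock with a concurrent scan.
     */
    LockBuffer(leftSibling, GIN_EXCLUSIVE);
}

The only out-of-order acquisition (the left sibling) happens after insertions
have already been fenced off by the cleanup lock, and only to dirty that page.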
@@ -31,7 +31,7 @@ static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack,
 /*
  * Lock buffer by needed method for search.
  */
-static int
+int
 ginTraverseLock(Buffer buffer, bool searchMode)
 {
     Page        page;
...
@@ -109,75 +109,17 @@ xlogVacuumPage(Relation index, Buffer buffer)
     PageSetLSN(page, recptr);
 }
 
-static bool
-ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, Buffer *rootBuffer)
-{
-    Buffer      buffer;
-    Page        page;
-    bool        hasVoidPage = FALSE;
-    MemoryContext oldCxt;
-
-    buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
-                                RBM_NORMAL, gvs->strategy);
-    page = BufferGetPage(buffer);
-
-    /*
-     * We should be sure that we don't concurrent with inserts, insert process
-     * never release root page until end (but it can unlock it and lock
-     * again). New scan can't start but previously started ones work
-     * concurrently.
-     */
-    if (isRoot)
-        LockBufferForCleanup(buffer);
-    else
-        LockBuffer(buffer, GIN_EXCLUSIVE);
-
-    Assert(GinPageIsData(page));
-
-    if (GinPageIsLeaf(page))
-    {
-        oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
-        ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
-        MemoryContextSwitchTo(oldCxt);
-        MemoryContextReset(gvs->tmpCxt);
-
-        /* if root is a leaf page, we don't desire further processing */
-        if (!isRoot && !hasVoidPage && GinDataLeafPageIsEmpty(page))
-            hasVoidPage = TRUE;
-    }
-    else
-    {
-        OffsetNumber i;
-        bool        isChildHasVoid = FALSE;
-
-        for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++)
-        {
-            PostingItem *pitem = GinDataPageGetPostingItem(page, i);
-
-            if (ginVacuumPostingTreeLeaves(gvs, PostingItemGetBlockNumber(pitem), FALSE, NULL))
-                isChildHasVoid = TRUE;
-        }
-
-        if (isChildHasVoid)
-            hasVoidPage = TRUE;
-    }
-
-    /*
-     * if we have root and there are empty pages in tree, then we don't
-     * release lock to go further processing and guarantee that tree is unused
-     */
-    if (!(isRoot && hasVoidPage))
-    {
-        UnlockReleaseBuffer(buffer);
-    }
-    else
-    {
-        Assert(rootBuffer);
-        *rootBuffer = buffer;
-    }
-
-    return hasVoidPage;
-}
+typedef struct DataPageDeleteStack
+{
+    struct DataPageDeleteStack *child;
+    struct DataPageDeleteStack *parent;
+
+    BlockNumber blkno;          /* current block number */
+    BlockNumber leftBlkno;      /* rightest non-deleted page on left */
+    bool        isRoot;
+} DataPageDeleteStack;
 
 /*
  * Delete a posting tree page.
@@ -194,8 +136,13 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
     BlockNumber rightlink;
 
     /*
-     * Lock the pages in the same order as an insertion would, to avoid
-     * deadlocks: left, then right, then parent.
+     * This function MUST be called only if one of the parent pages holds an
+     * exclusive cleanup lock. This guarantees that no insertions currently
+     * happen in this subtree. The caller also holds an exclusive lock on the
+     * deletable page, and has already locked and released the left page
+     * earlier in the scan; thus the parent and this page are locked here. We
+     * acquire the left page lock only to mark that page dirty after changing
+     * its right pointer.
      */
     lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
                                  RBM_NORMAL, gvs->strategy);
@@ -205,10 +152,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
                                 RBM_NORMAL, gvs->strategy);
 
     LockBuffer(lBuffer, GIN_EXCLUSIVE);
-    LockBuffer(dBuffer, GIN_EXCLUSIVE);
-    if (!isParentRoot)          /* parent is already locked by
-                                 * LockBufferForCleanup() */
-        LockBuffer(pBuffer, GIN_EXCLUSIVE);
 
     START_CRIT_SECTION();
@@ -272,26 +215,15 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
         PageSetLSN(BufferGetPage(lBuffer), recptr);
     }
 
-    if (!isParentRoot)
-        LockBuffer(pBuffer, GIN_UNLOCK);
     ReleaseBuffer(pBuffer);
     UnlockReleaseBuffer(lBuffer);
-    UnlockReleaseBuffer(dBuffer);
+    ReleaseBuffer(dBuffer);
 
     END_CRIT_SECTION();
 
     gvs->result->pages_deleted++;
 }
 
-typedef struct DataPageDeleteStack
-{
-    struct DataPageDeleteStack *child;
-    struct DataPageDeleteStack *parent;
-
-    BlockNumber blkno;          /* current block number */
-    BlockNumber leftBlkno;      /* rightest non-deleted page on left */
-    bool        isRoot;
-} DataPageDeleteStack;
-
 /*
  * scans posting tree and deletes empty pages
@@ -325,6 +257,10 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
     buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
                                 RBM_NORMAL, gvs->strategy);
+
+    if (!isRoot)
+        LockBuffer(buffer, GIN_EXCLUSIVE);
+
     page = BufferGetPage(buffer);
 
     Assert(GinPageIsData(page));
@@ -359,6 +295,9 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
         }
     }
 
+    if (!isRoot)
+        LockBuffer(buffer, GIN_UNLOCK);
+
     ReleaseBuffer(buffer);
 
     if (!meDelete)
@@ -367,37 +306,124 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
     return meDelete;
 }
 
-static void
-ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
-{
-    Buffer      rootBuffer = InvalidBuffer;
-    DataPageDeleteStack root,
-               *ptr,
-               *tmp;
-
-    if (ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE, &rootBuffer) == FALSE)
-    {
-        Assert(rootBuffer == InvalidBuffer);
-        return;
-    }
-
-    memset(&root, 0, sizeof(DataPageDeleteStack));
-    root.leftBlkno = InvalidBlockNumber;
-    root.isRoot = TRUE;
-
-    vacuum_delay_point();
-
-    ginScanToDelete(gvs, rootBlkno, TRUE, &root, InvalidOffsetNumber);
-
-    ptr = root.child;
-    while (ptr)
-    {
-        tmp = ptr->child;
-        pfree(ptr);
-        ptr = tmp;
-    }
-
-    UnlockReleaseBuffer(rootBuffer);
-}
+/*
+ * Scan through the posting tree and delete empty tuples from leaf pages.
+ * This function also detects empty subtrees (all leaves empty).  For the
+ * parent of each such subtree a cleanup lock is taken, and then
+ * ginScanToDelete is called.  This is done for every inner page that points
+ * to an empty subtree.
+ */
+static bool
+ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot)
+{
+    Buffer      buffer;
+    Page        page;
+    bool        hasVoidPage = FALSE;
+    MemoryContext oldCxt;
+
+    buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+                                RBM_NORMAL, gvs->strategy);
+    page = BufferGetPage(buffer);
+
+    ginTraverseLock(buffer, false);
+
+    Assert(GinPageIsData(page));
+
+    if (GinPageIsLeaf(page))
+    {
+        oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
+        ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
+        MemoryContextSwitchTo(oldCxt);
+        MemoryContextReset(gvs->tmpCxt);
+
+        /* if root is a leaf page, we don't desire further processing */
+        if (GinDataLeafPageIsEmpty(page))
+            hasVoidPage = TRUE;
+
+        UnlockReleaseBuffer(buffer);
+
+        return hasVoidPage;
+    }
+    else
+    {
+        OffsetNumber i;
+        bool        hasEmptyChild = FALSE;
+        bool        hasNonEmptyChild = FALSE;
+        OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
+        BlockNumber *children = palloc(sizeof(BlockNumber) * (maxoff + 1));
+
+        /*
+         * Read all the children's block numbers.
+         * Not sure it is safe if there are many concurrent vacuums.
+         */
+        for (i = FirstOffsetNumber; i <= maxoff; i++)
+        {
+            PostingItem *pitem = GinDataPageGetPostingItem(page, i);
+
+            children[i] = PostingItemGetBlockNumber(pitem);
+        }
+
+        UnlockReleaseBuffer(buffer);
+
+        for (i = FirstOffsetNumber; i <= maxoff; i++)
+        {
+            if (ginVacuumPostingTreeLeaves(gvs, children[i], FALSE))
+                hasEmptyChild = TRUE;
+            else
+                hasNonEmptyChild = TRUE;
+        }
+
+        pfree(children);
+
+        vacuum_delay_point();
+
+        /*
+         * The whole subtree is empty - just return TRUE so that the parent
+         * does the cleanup, unless we are the root and there is nothing
+         * above us.
+         */
+        if (hasEmptyChild && !hasNonEmptyChild && !isRoot)
+            return TRUE;
+
+        if (hasEmptyChild)
+        {
+            DataPageDeleteStack root,
+                       *ptr,
+                       *tmp;
+
+            buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
+                                        RBM_NORMAL, gvs->strategy);
+            LockBufferForCleanup(buffer);
+
+            memset(&root, 0, sizeof(DataPageDeleteStack));
+            root.leftBlkno = InvalidBlockNumber;
+            root.isRoot = TRUE;
+
+            ginScanToDelete(gvs, blkno, TRUE, &root, InvalidOffsetNumber);
+
+            ptr = root.child;
+            while (ptr)
+            {
+                tmp = ptr->child;
+                pfree(ptr);
+                ptr = tmp;
+            }
+
+            UnlockReleaseBuffer(buffer);
+        }
+
+        /* Here we have deleted all empty subtrees */
+        return FALSE;
+    }
+}
+
+static void
+ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
+{
+    ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE);
+}
 
 /*
...
@@ -471,4 +471,6 @@ ginCompareItemPointers(ItemPointer a, ItemPointer b)
     return -1;
 }
 
+extern int ginTraverseLock(Buffer buffer, bool searchMode);
+
 #endif   /* GIN_PRIVATE_H */