Commit cd8c73a3 authored by Peter Geoghegan's avatar Peter Geoghegan

Refactor nbtree deletion INCOMPLETE_SPLIT check.

Factor out code common to _bt_lock_branch_parent() and _bt_pagedel()
into a new utility function.  This new function is used to check that
the left sibling of a deletion target page does not have the
INCOMPLETE_SPLIT page flag set.  If it is set then deletion is unsafe;
there won't be a usable pivot tuple (with a downlink) in the parent page
that points to the deletion target page.  The page deletion algorithm is
not prepared to deal with that.  Also restructure an existing, related
utility function that checks if the right sibling of the target page has
the ISHALFDEAD page flag set.

This organization highlights the symmetry between the two cases.  The
goal is to make the design of page deletion clearer.  Both functions
involve a sibling page with a flag that indicates that there was an
interrupted operation (a page split or a page deletion) that resulted in
a page pointed to by sibling pages, but not pointed to in the parent.
And, both functions indicate if page deletion is unsafe due to the
absence of a particular downlink in the parent page.
parent db89f0e3
...@@ -1315,20 +1315,87 @@ _bt_xid_horizon(Relation rel, Relation heapRel, Page page, ...@@ -1315,20 +1315,87 @@ _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
} }
/* /*
* Returns true, if the given block has the half-dead flag set. * Check that leftsib page (the btpo_prev of target page) is not marked with
* INCOMPLETE_SPLIT flag.
*
* Returning true indicates that page flag is set in leftsib (which is
* definitely still the left sibling of target). When that happens, the
* target doesn't have a downlink in parent, and the page deletion algorithm
* isn't prepared to handle that. Deletion of the target page (or the whole
* subtree that contains the target page) cannot take place.
*/
static bool
_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
{
Buffer buf;
Page page;
BTPageOpaque opaque;
bool result;
/* Easy case: No left sibling */
if (leftsib == P_NONE)
return false;
buf = _bt_getbuf(rel, leftsib, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/*
* If the left sibling was concurrently split, so that its next-pointer
* doesn't point to the current page anymore, the split that created
* target must be completed. Caller can reasonably expect that there will
* be a downlink to the target page that it can relocate using its stack.
* (We don't allow splitting an incompletely split page again until the
* previous split has been completed.)
*/
result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
_bt_relbuf(rel, buf);
return result;
}
/*
* Check that leafrightsib page (the btpo_next of target leaf page) is not
* marked with ISHALFDEAD flag.
*
* Returning true indicates that page flag is set in leafrightsib, so page
* deletion cannot go ahead. Our caller is not prepared to deal with the case
* where the parent page does not have a pivot tuples whose downlink points to
* leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
* seem worth going to the trouble of teaching our caller to deal with it.
* The situation will be resolved after VACUUM finishes the deletion of the
* half-dead page (when a future VACUUM operation reaches the target page
* again).
*
* _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
* _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
* okay because of the restriction on deleting pages that are the rightmost
* page of their parent (i.e. that such deletions can only take place when the
* entire subtree must be deleted). The leaf level check made here will apply
* to a right "cousin" leaf page rather than a simple right sibling leaf page
* in cases where caller actually goes on to attempt deleting pages that are
* above the leaf page. The right cousin leaf page is representative of the
* left edge of the subtree to the right of the to-be-deleted subtree as a
* whole, which is exactly the condition that our caller cares about.
* (Besides, internal pages are never marked half-dead, so it isn't even
* possible to _directly_ assess if an internal page is part of some other
* to-be-deleted subtree.)
*/ */
static bool static bool
_bt_is_page_halfdead(Relation rel, BlockNumber blk) _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
{ {
Buffer buf; Buffer buf;
Page page; Page page;
BTPageOpaque opaque; BTPageOpaque opaque;
bool result; bool result;
buf = _bt_getbuf(rel, blk, BT_READ); Assert(leafrightsib != P_NONE);
buf = _bt_getbuf(rel, leafrightsib, BT_READ);
page = BufferGetPage(buf); page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
result = P_ISHALFDEAD(opaque); result = P_ISHALFDEAD(opaque);
_bt_relbuf(rel, buf); _bt_relbuf(rel, buf);
...@@ -1374,7 +1441,6 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, ...@@ -1374,7 +1441,6 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
Buffer pbuf; Buffer pbuf;
Page page; Page page;
BTPageOpaque opaque; BTPageOpaque opaque;
BlockNumber leftsib;
/* /*
* Locate the downlink of "child" in the parent, updating the stack entry * Locate the downlink of "child" in the parent, updating the stack entry
...@@ -1399,11 +1465,14 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, ...@@ -1399,11 +1465,14 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
* If the target is the rightmost child of its parent, then we can't * If the target is the rightmost child of its parent, then we can't
* delete, unless it's also the only child. * delete, unless it's also the only child.
*/ */
Assert(poffset <= maxoff);
if (poffset >= maxoff) if (poffset >= maxoff)
{ {
/* It's rightmost child... */ /* It's rightmost child... */
if (poffset == P_FIRSTDATAKEY(opaque)) if (poffset == P_FIRSTDATAKEY(opaque))
{ {
BlockNumber leftsibparent;
/* /*
* It's only child, so safe if parent would itself be removable. * It's only child, so safe if parent would itself be removable.
* We have to check the parent itself, and then recurse to test * We have to check the parent itself, and then recurse to test
...@@ -1418,41 +1487,16 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, ...@@ -1418,41 +1487,16 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
*target = parent; *target = parent;
*rightsib = opaque->btpo_next; *rightsib = opaque->btpo_next;
leftsib = opaque->btpo_prev; leftsibparent = opaque->btpo_prev;
_bt_relbuf(rel, pbuf); _bt_relbuf(rel, pbuf);
/* /*
* Like in _bt_pagedel, check that the left sibling is not marked * Check that the left sibling of parent (if any) is not marked
* with INCOMPLETE_SPLIT flag. That would mean that there is no * with INCOMPLETE_SPLIT flag before proceeding
* downlink to the page to be deleted, and the page deletion
* algorithm isn't prepared to handle that.
*/ */
if (leftsib != P_NONE) if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
{ return false;
Buffer lbuf;
Page lpage;
BTPageOpaque lopaque;
lbuf = _bt_getbuf(rel, leftsib, BT_READ);
lpage = BufferGetPage(lbuf);
lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
/*
* If the left sibling was concurrently split, so that its
* next-pointer doesn't point to the current page anymore, the
* split that created the current page must be completed. (We
* don't allow splitting an incompletely split page again
* until the previous split has been completed)
*/
if (lopaque->btpo_next == parent &&
P_INCOMPLETE_SPLIT(lopaque))
{
_bt_relbuf(rel, lbuf);
return false;
}
_bt_relbuf(rel, lbuf);
}
return _bt_lock_branch_parent(rel, parent, stack->bts_parent, return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
topparent, topoff, target, rightsib); topparent, topoff, target, rightsib);
...@@ -1525,7 +1569,9 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) ...@@ -1525,7 +1569,9 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
* *
* Also, when "stack" is not NULL, we have already checked that the * Also, when "stack" is not NULL, we have already checked that the
* current page is not the right half of an incomplete split, i.e. the * current page is not the right half of an incomplete split, i.e. the
* left sibling does not have its INCOMPLETE_SPLIT flag set. * left sibling does not have its INCOMPLETE_SPLIT flag set, including
* when the current target page is to the right of caller's initial page
* (the scanblkno page).
*/ */
BTStack stack = NULL; BTStack stack = NULL;
...@@ -1589,11 +1635,12 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) ...@@ -1589,11 +1635,12 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
* The INCOMPLETE_SPLIT flag on the page tells us if the page is the * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
* left half of an incomplete split, but ensuring that it's not the * left half of an incomplete split, but ensuring that it's not the
* right half is more complicated. For that, we have to check that * right half is more complicated. For that, we have to check that
* the left sibling doesn't have its INCOMPLETE_SPLIT flag set. On * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
* the first iteration, we temporarily release the lock on the current * _bt_leftsib_splitflag(). On the first iteration, we temporarily
* page, and check the left sibling and also construct a search stack * release the lock on scanblkno/leafbuf, check the left sibling, and
* to. On subsequent iterations, we know we stepped right from a page * construct a search stack to scanblkno. On subsequent iterations,
* that passed these tests, so it's OK. * we know we stepped right from a page that passed these tests, so
* it's OK.
*/ */
if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
...@@ -1628,13 +1675,14 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) ...@@ -1628,13 +1675,14 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
BTScanInsert itup_key; BTScanInsert itup_key;
ItemId itemid; ItemId itemid;
IndexTuple targetkey; IndexTuple targetkey;
BlockNumber leftsib, target;
Buffer lbuf; Buffer lbuf;
BlockNumber leftsib;
itemid = PageGetItemId(page, P_HIKEY); itemid = PageGetItemId(page, P_HIKEY);
targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid)); targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
leftsib = opaque->btpo_prev; leftsib = opaque->btpo_prev;
target = BufferGetBlockNumber(leafbuf);
/* /*
* To avoid deadlocks, we'd better drop the leaf page lock * To avoid deadlocks, we'd better drop the leaf page lock
...@@ -1643,35 +1691,14 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) ...@@ -1643,35 +1691,14 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK); LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
/* /*
* Fetch the left sibling, to check that it's not marked with * Check that the left sibling of leafbuf (if any) is not
* INCOMPLETE_SPLIT flag. That would mean that the page * marked with INCOMPLETE_SPLIT flag before proceeding
* to-be-deleted doesn't have a downlink, and the page
* deletion algorithm isn't prepared to handle that.
*/ */
if (leftsib != P_NONE) Assert(target == scanblkno);
if (_bt_leftsib_splitflag(rel, leftsib, target))
{ {
BTPageOpaque lopaque; ReleaseBuffer(leafbuf);
Page lpage; return ndeleted;
lbuf = _bt_getbuf(rel, leftsib, BT_READ);
lpage = BufferGetPage(lbuf);
lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
/*
* If the left sibling is split again by another backend,
* after we released the lock, we know that the first
* split must have finished, because we don't allow an
* incompletely-split page to be split again. So we don't
* need to walk right here.
*/
if (lopaque->btpo_next == BufferGetBlockNumber(leafbuf) &&
P_INCOMPLETE_SPLIT(lopaque))
{
ReleaseBuffer(leafbuf);
_bt_relbuf(rel, lbuf);
return ndeleted;
}
_bt_relbuf(rel, lbuf);
} }
/* we need an insertion scan key for the search, so build one */ /* we need an insertion scan key for the search, so build one */
...@@ -1679,7 +1706,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) ...@@ -1679,7 +1706,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
/* find the leftmost leaf page with matching pivot/high key */ /* find the leftmost leaf page with matching pivot/high key */
itup_key->pivotsearch = true; itup_key->pivotsearch = true;
stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL); stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
/* don't need a lock or second pin on the page */ /* won't need a second lock or pin on leafbuf */
_bt_relbuf(rel, lbuf); _bt_relbuf(rel, lbuf);
/* /*
...@@ -1804,12 +1831,11 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) ...@@ -1804,12 +1831,11 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
* Before attempting to lock the parent page, check that the right sibling * Before attempting to lock the parent page, check that the right sibling
* is not in half-dead state. A half-dead right sibling would have no * is not in half-dead state. A half-dead right sibling would have no
* downlink in the parent, which would be highly confusing later when we * downlink in the parent, which would be highly confusing later when we
* delete the downlink that follows the current page's downlink. (I * delete the downlink that follows the leafbuf page's downlink. It would
* believe the deletion would work correctly, but it would fail the * fail the "right sibling of target page is also the next child in parent
* cross-check we make that the following downlink points to the right * page" cross-check below.
* sibling of the delete page.)
*/ */
if (_bt_is_page_halfdead(rel, leafrightsib)) if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
{ {
elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead", elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
leafblkno, leafrightsib); leafblkno, leafrightsib);
...@@ -1822,16 +1848,6 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) ...@@ -1822,16 +1848,6 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
* be deleted too, and the same condition applies recursively to it. We * be deleted too, and the same condition applies recursively to it. We
* have to check this condition all the way up before trying to delete, * have to check this condition all the way up before trying to delete,
* and lock the final parent of the to-be-deleted subtree. * and lock the final parent of the to-be-deleted subtree.
*
* However, we won't need to repeat the above _bt_is_page_halfdead() check
* for parent/ancestor pages because of the rightmost restriction. The
* leaf check will apply to a right "cousin" leaf page rather than a
* simple right sibling leaf page in cases where we actually go on to
* perform internal page deletion. The right cousin leaf page is
* representative of the left edge of the subtree to the right of the
* to-be-deleted subtree as a whole. (Besides, internal pages are never
* marked half-dead, so it isn't even possible to directly assess if an
* internal page is part of some other to-be-deleted subtree.)
*/ */
rightsib = leafrightsib; rightsib = leafrightsib;
target = leafblkno; target = leafblkno;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment