Commit c34787f9 authored by Peter Geoghegan's avatar Peter Geoghegan

Harden nbtree page deletion.

Add some additional defensive checks in the second phase of index
deletion to detect and report index corruption during VACUUM, and to
avoid having VACUUM become stuck in more cases.  The code is still not
robust in the presence of a circular chain of sibling links, though it's
not clear whether that really matters.  This is follow-up work to commit
3a01f68e.

The new defensive checks rely on the assumption that there can be no
more than one VACUUM operation running for an index at any given time.
Remove an old comment suggesting that multiple concurrent VACUUMs need
to be considered here.  This concern now seems highly unlikely to have
any real validity, since we clearly rely on the same assumption in
several other places.  For example, there are much more recent comments
that appear in the same function (added by commit efada2b8) that make
the same assumption.

Also add a CHECK_FOR_INTERRUPTS() to the relevant code path.  Contrary
to comments added by commit 3a01f68e, it is actually possible to handle
interrupts here, at least in the common case where processing takes
place at the leaf level.  We only hold a pin on leafbuf/target page when
stepping right at the leaf level.

No backpatch due to the lack of complaints following hardening added to
the same area by commit 3a01f68e.
parent 2f86ab30
...@@ -1978,9 +1978,6 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) ...@@ -1978,9 +1978,6 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
* Then unlink it from its siblings. Each call to * Then unlink it from its siblings. Each call to
* _bt_unlink_halfdead_page unlinks the topmost page from the subtree, * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
* making it shallower. Iterate until the leafbuf page is deleted. * making it shallower. Iterate until the leafbuf page is deleted.
*
* _bt_unlink_halfdead_page should never fail, since we established
* that deletion is generally safe in _bt_mark_page_halfdead.
*/ */
rightsib_empty = false; rightsib_empty = false;
Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)); Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
...@@ -1991,7 +1988,15 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) ...@@ -1991,7 +1988,15 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
&rightsib_empty, oldestBtpoXact, &rightsib_empty, oldestBtpoXact,
&ndeleted)) &ndeleted))
{ {
/* _bt_unlink_halfdead_page failed, released buffer */ /*
* _bt_unlink_halfdead_page should never fail, since we
* established that deletion is generally safe in
* _bt_mark_page_halfdead -- index must be corrupt.
*
* Note that _bt_unlink_halfdead_page already released the
* lock and pin on leafbuf for us.
*/
Assert(false);
return ndeleted; return ndeleted;
} }
} }
...@@ -2355,11 +2360,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, ...@@ -2355,11 +2360,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
* So, first lock the leaf page, if it's not the target. Then find and * So, first lock the leaf page, if it's not the target. Then find and
* write-lock the current left sibling of the target page. The sibling * write-lock the current left sibling of the target page. The sibling
* that was current a moment ago could have split, so we may have to move * that was current a moment ago could have split, so we may have to move
* right. This search could fail if either the sibling or the target page * right.
* was deleted by someone else meanwhile; if so, give up. (Right now,
* that should never happen, since page deletion is only done in VACUUM
* and there shouldn't be multiple VACUUMs concurrently on the same
* table.)
*/ */
if (target != leafblkno) if (target != leafblkno)
_bt_lockbuf(rel, leafbuf, BT_WRITE); _bt_lockbuf(rel, leafbuf, BT_WRITE);
...@@ -2370,23 +2371,26 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, ...@@ -2370,23 +2371,26 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque = (BTPageOpaque) PageGetSpecialPointer(page);
while (P_ISDELETED(opaque) || opaque->btpo_next != target) while (P_ISDELETED(opaque) || opaque->btpo_next != target)
{ {
/* step right one page */ bool leftsibvalid = true;
leftsib = opaque->btpo_next;
_bt_relbuf(rel, lbuf);
/* /*
* It'd be good to check for interrupts here, but it's not easy to * Before we follow the link from the page that was the left
* do so because a lock is always held. This block isn't * sibling mere moments ago, validate its right link. This
* frequently reached, so hopefully the consequences of not * reduces the opportunities for loop to fail to ever make any
* checking interrupts aren't too bad. * progress in the presence of index corruption.
*
* Note: we rely on the assumption that there can only be one
* vacuum process running at a time (against the same index).
*/ */
if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
leftsib == opaque->btpo_next)
leftsibvalid = false;
leftsib = opaque->btpo_next;
_bt_relbuf(rel, lbuf);
if (leftsib == P_NONE) if (!leftsibvalid)
{ {
ereport(LOG,
(errmsg("no left sibling (concurrent deletion?) of block %u in \"%s\"",
target,
RelationGetRelationName(rel))));
if (target != leafblkno) if (target != leafblkno)
{ {
/* we have only a pin on target, but pin+lock on leafbuf */ /* we have only a pin on target, but pin+lock on leafbuf */
...@@ -2398,8 +2402,20 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, ...@@ -2398,8 +2402,20 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
/* we have only a pin on leafbuf */ /* we have only a pin on leafbuf */
ReleaseBuffer(leafbuf); ReleaseBuffer(leafbuf);
} }
ereport(LOG,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("valid left sibling for deletion target could not be located: "
"left sibling %u of target %u with leafblkno %u and scanblkno %u in index \"%s\"",
leftsib, target, leafblkno, scanblkno,
RelationGetRelationName(rel))));
return false; return false;
} }
CHECK_FOR_INTERRUPTS();
/* step right one page */
lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
page = BufferGetPage(lbuf); page = BufferGetPage(lbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque = (BTPageOpaque) PageGetSpecialPointer(page);
...@@ -2408,11 +2424,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, ...@@ -2408,11 +2424,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
else else
lbuf = InvalidBuffer; lbuf = InvalidBuffer;
/* /* Next write-lock the target page itself */
* Next write-lock the target page itself. It's okay to take a write lock
* rather than a superexclusive lock, since no scan will stop on an empty
* page.
*/
_bt_lockbuf(rel, buf, BT_WRITE); _bt_lockbuf(rel, buf, BT_WRITE);
page = BufferGetPage(buf); page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque = (BTPageOpaque) PageGetSpecialPointer(page);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment