Commit d114cc53 authored by Alexander Korotkov's avatar Alexander Korotkov

Improve checking of child pages in contrib/amcheck.

This commit eliminates lossiness in check for missing parent downlinks in
B-tree.  Instead of collecting lossy bitmap, we check for missing downlinks
while visiting child pages referenced by downlinks of target level.  We
traverse from previous child page to the subsequent child page by right links.
Intermediate pages are candidates to have lost parent downlinks.

Also this commit introduces matching of child high key to the pivot key of
it's parent.

Discussion: https://postgr.es/m/CAPpHfduoF-c4RhOyOm%3D4-Y367%2B8txq9Q6iM_ty0OYc8si1Abww%40mail.gmail.com
Author: Alexander Korotkov
Reviewed-by: Peter Geoghegan
parent a88a285c
...@@ -94,16 +94,26 @@ typedef struct BtreeCheckState ...@@ -94,16 +94,26 @@ typedef struct BtreeCheckState
/* Target page's LSN */ /* Target page's LSN */
XLogRecPtr targetlsn; XLogRecPtr targetlsn;
/*
* Low key: high key of left sibling of target page. Used only for child
* verification. So, 'lowkey' is kept only when 'readonly' is set.
*/
IndexTuple lowkey;
/*
* The rightlink and incomplete split flag of block one level down to the
* target page, which was visited last time via downlink from taget page.
* We use it to check for missing downlinks.
*/
BlockNumber prevrightlink;
bool previncompletesplit;
/* /*
* Mutable state, for optional heapallindexed verification: * Mutable state, for optional heapallindexed verification:
*/ */
/* Bloom filter fingerprints B-Tree index */ /* Bloom filter fingerprints B-Tree index */
bloom_filter *filter; bloom_filter *filter;
/* Bloom filter fingerprints downlink blocks within tree */
bloom_filter *downlinkfilter;
/* Right half of incomplete split marker */
bool rightsplit;
/* Debug counter */ /* Debug counter */
int64 heaptuplespresent; int64 heaptuplespresent;
} BtreeCheckState; } BtreeCheckState;
...@@ -137,9 +147,14 @@ static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, ...@@ -137,9 +147,14 @@ static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
BtreeLevel level); BtreeLevel level);
static void bt_target_page_check(BtreeCheckState *state); static void bt_target_page_check(BtreeCheckState *state);
static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state);
static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
BlockNumber childblock); OffsetNumber downlinkoffnum);
static void bt_downlink_missing_check(BtreeCheckState *state); static void bt_child_highkey_check(BtreeCheckState *state,
OffsetNumber target_downlinkoffnum,
Page loaded_child,
uint32 target_level);
static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
BlockNumber targetblock, Page target);
static void bt_tuple_present_callback(Relation index, ItemPointer tid, static void bt_tuple_present_callback(Relation index, ItemPointer tid,
Datum *values, bool *isnull, Datum *values, bool *isnull,
bool tupleIsAlive, void *checkstate); bool tupleIsAlive, void *checkstate);
...@@ -470,20 +485,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, ...@@ -470,20 +485,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
errmsg("index \"%s\" cannot be verified using transaction snapshot", errmsg("index \"%s\" cannot be verified using transaction snapshot",
RelationGetRelationName(rel)))); RelationGetRelationName(rel))));
} }
else
{
/*
* Extra readonly downlink check.
*
* In readonly case, we know that there cannot be a concurrent
* page split or a concurrent page deletion, which gives us the
* opportunity to verify that every non-ignorable page had a
* downlink one level up. We must be tolerant of interrupted page
* splits and page deletions, though. This is taken care of in
* bt_downlink_missing_check().
*/
state->downlinkfilter = bloom_create(total_pages, work_mem, seed);
}
} }
Assert(!state->rootdescend || state->readonly); Assert(!state->rootdescend || state->readonly);
...@@ -532,12 +533,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, ...@@ -532,12 +533,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
current.istruerootlevel = true; current.istruerootlevel = true;
while (current.leftmost != P_NONE) while (current.leftmost != P_NONE)
{ {
/*
* Leftmost page on level cannot be right half of incomplete split.
* This can go stale immediately in !readonly case.
*/
state->rightsplit = false;
/* /*
* Verify this level, and get left most page for next level down, if * Verify this level, and get left most page for next level down, if
* not at leaf level * not at leaf level
...@@ -561,16 +556,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, ...@@ -561,16 +556,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
IndexInfo *indexinfo = BuildIndexInfo(state->rel); IndexInfo *indexinfo = BuildIndexInfo(state->rel);
TableScanDesc scan; TableScanDesc scan;
/* Report on extra downlink checks performed in readonly case */
if (state->readonly)
{
ereport(DEBUG1,
(errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set",
RelationGetRelationName(rel),
100.0 * bloom_prop_bits_set(state->downlinkfilter))));
bloom_free(state->downlinkfilter);
}
/* /*
* Create our own scan for table_index_build_scan(), rather than * Create our own scan for table_index_build_scan(), rather than
* getting it to do so for us. This is required so that we can * getting it to do so for us. This is required so that we can
...@@ -673,6 +658,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) ...@@ -673,6 +658,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
level.istruerootlevel ? level.istruerootlevel ?
" (true root level)" : level.level == 0 ? " (leaf level)" : ""); " (true root level)" : level.level == 0 ? " (leaf level)" : "");
state->prevrightlink = InvalidBlockNumber;
state->previncompletesplit = false;
do do
{ {
/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */ /* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
...@@ -692,9 +680,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) ...@@ -692,9 +680,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
* mode, and since a page has no links within other pages * mode, and since a page has no links within other pages
* (siblings and parent) once it is marked fully deleted, it * (siblings and parent) once it is marked fully deleted, it
* should be impossible to land on a fully deleted page in * should be impossible to land on a fully deleted page in
* readonly mode. See bt_downlink_check() for further details. * readonly mode. See bt_child_check() for further details.
* *
* The bt_downlink_check() P_ISDELETED() check is repeated here so * The bt_child_check() P_ISDELETED() check is repeated here so
* that pages that are only reachable through sibling links get * that pages that are only reachable through sibling links get
* checked. * checked.
*/ */
...@@ -816,21 +804,57 @@ nextpage: ...@@ -816,21 +804,57 @@ nextpage:
errmsg("circular link chain found in block %u of index \"%s\"", errmsg("circular link chain found in block %u of index \"%s\"",
current, RelationGetRelationName(state->rel)))); current, RelationGetRelationName(state->rel))));
leftcurrent = current;
current = opaque->btpo_next;
if (state->lowkey)
{
Assert(state->readonly);
pfree(state->lowkey);
state->lowkey = NULL;
}
/* /*
* Record if page that is about to become target is the right half of * Copy current target high key as the low key of right sibling.
* an incomplete page split. This can go stale immediately in * Allocate memory in upper level context, so it would be cleared
* !readonly case. * after reset of target context.
*
* We only need the low key in corner cases of checking child high
* keys. We use high key only when incomplete split on the child level
* falls to the boundary of pages on the target level. See
* bt_child_highkey_check() for details. So, typically we won't end
* up doing anything with low key, but it's simpler for general case
* high key verification to always have it available.
*
* The correctness of managing low key in the case of concurrent
* splits wasn't investigated yet. Thankfully we only need low key
* for readonly verification and concurrent splits won't happen.
*/ */
state->rightsplit = P_INCOMPLETE_SPLIT(opaque); if (state->readonly && !P_RIGHTMOST(opaque))
{
IndexTuple itup;
ItemId itemid;
leftcurrent = current; itemid = PageGetItemIdCareful(state, state->targetblock,
current = opaque->btpo_next; state->target, P_HIKEY);
itup = (IndexTuple) PageGetItem(state->target, itemid);
state->lowkey = MemoryContextAlloc(oldcontext, IndexTupleSize(itup));
memcpy(state->lowkey, itup, IndexTupleSize(itup));
}
/* Free page and associated memory for this iteration */ /* Free page and associated memory for this iteration */
MemoryContextReset(state->targetcontext); MemoryContextReset(state->targetcontext);
} }
while (current != P_NONE); while (current != P_NONE);
if (state->lowkey)
{
Assert(state->readonly);
pfree(state->lowkey);
state->lowkey = NULL;
}
/* Don't change context for caller */ /* Don't change context for caller */
MemoryContextSwitchTo(oldcontext); MemoryContextSwitchTo(oldcontext);
...@@ -863,7 +887,9 @@ nextpage: ...@@ -863,7 +887,9 @@ nextpage:
* tuple. * tuple.
* *
* - That downlink to block was encountered in parent where that's expected. * - That downlink to block was encountered in parent where that's expected.
* (Limited to heapallindexed readonly callers.) * (Limited to readonly callers.)
*
* - That high keys of child pages matches corresponding pivot keys in parent.
* *
* This is also where heapallindexed callers use their Bloom filter to * This is also where heapallindexed callers use their Bloom filter to
* fingerprint IndexTuples for later table_index_build_scan() verification. * fingerprint IndexTuples for later table_index_build_scan() verification.
...@@ -981,22 +1007,26 @@ bt_target_page_check(BtreeCheckState *state) ...@@ -981,22 +1007,26 @@ bt_target_page_check(BtreeCheckState *state)
(uint32) state->targetlsn))); (uint32) state->targetlsn)));
} }
/* Fingerprint downlink blocks in heapallindexed + readonly case */
if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque))
{
BlockNumber childblock = BTreeTupleGetDownLink(itup);
bloom_add_element(state->downlinkfilter,
(unsigned char *) &childblock,
sizeof(BlockNumber));
}
/* /*
* Don't try to generate scankey using "negative infinity" item on * Don't try to generate scankey using "negative infinity" item on
* internal pages. They are always truncated to zero attributes. * internal pages. They are always truncated to zero attributes.
*/ */
if (offset_is_negative_infinity(topaque, offset)) if (offset_is_negative_infinity(topaque, offset))
{
/*
* We don't call bt_child_check() for "negative infinity" items.
* But if we're performing downlink connectivity check, we do it
* for every item including "negative infinity" one.
*/
if (!P_ISLEAF(topaque) && state->readonly)
{
bt_child_highkey_check(state,
offset,
NULL,
topaque->btpo.level);
}
continue; continue;
}
/* /*
* Readonly callers may optionally verify that non-pivot tuples can * Readonly callers may optionally verify that non-pivot tuples can
...@@ -1340,20 +1370,23 @@ bt_target_page_check(BtreeCheckState *state) ...@@ -1340,20 +1370,23 @@ bt_target_page_check(BtreeCheckState *state)
* because it has no useful value to compare). * because it has no useful value to compare).
*/ */
if (!P_ISLEAF(topaque) && state->readonly) if (!P_ISLEAF(topaque) && state->readonly)
{ bt_child_check(state, skey, offset);
BlockNumber childblock = BTreeTupleGetDownLink(itup);
bt_downlink_check(state, skey, childblock);
}
} }
/* /*
* * Check if page has a downlink in parent * * Special case bt_child_highkey_check() call
* *
* This can only be checked in heapallindexed + readonly case. * We don't pass an real downlink, but we've to finish the level
* processing. If condition is satisfied, we've already processed all the
* downlinks from the target level. But there still might be pages to the
* right of the child page pointer to by our rightmost downlink. And they
* might have missing downlinks. This final call checks for them.
*/ */
if (state->heapallindexed && state->readonly) if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
bt_downlink_missing_check(state); {
bt_child_highkey_check(state, InvalidOffsetNumber,
NULL, topaque->btpo.level);
}
} }
/* /*
...@@ -1497,7 +1530,7 @@ bt_right_page_check_scankey(BtreeCheckState *state) ...@@ -1497,7 +1530,7 @@ bt_right_page_check_scankey(BtreeCheckState *state)
* Top level tree walk caller moves on to next page (makes it the new * Top level tree walk caller moves on to next page (makes it the new
* target) following recovery from this race. (cf. The rationale for * target) following recovery from this race. (cf. The rationale for
* child/downlink verification needing a ShareLock within * child/downlink verification needing a ShareLock within
* bt_downlink_check(), where page deletion is also the main source of * bt_child_check(), where page deletion is also the main source of
* trouble.) * trouble.)
* *
* Note that it doesn't matter if right sibling page here is actually a * Note that it doesn't matter if right sibling page here is actually a
...@@ -1570,6 +1603,297 @@ bt_right_page_check_scankey(BtreeCheckState *state) ...@@ -1570,6 +1603,297 @@ bt_right_page_check_scankey(BtreeCheckState *state)
return bt_mkscankey_pivotsearch(state->rel, firstitup); return bt_mkscankey_pivotsearch(state->rel, firstitup);
} }
/*
* Check if two tuples are binary identical except the block number. So,
* this function is capable to compare pivot keys on different levels.
*/
static bool
bt_pivot_tuple_identical(IndexTuple itup1, IndexTuple itup2)
{
if (IndexTupleSize(itup1) != IndexTupleSize(itup2))
return false;
if (memcmp(&itup1->t_tid.ip_posid, &itup2->t_tid.ip_posid,
IndexTupleSize(itup1) - offsetof(ItemPointerData, ip_posid)) != 0)
return false;
return true;
}
/*---
* Check high keys on the child level. Traverse rightlinks from previous
* downlink to the current one. Check that there are no intermediate pages
* with missing downlinks.
*
* If 'loaded_child' is given, it's assumed to be the page pointed to by the
* downlink referenced by 'downlinkoffnum' of the target page.
*
* Basically this function is called for each target downlink and checks two
* invariants:
*
* 1) You can reach the next child from previous one via rightlinks;
* 2) Each child high key have matching pivot key on target level.
*
* Consider the sample tree picture.
*
* 1
* / \
* 2 <-> 3
* / \ / \
* 4 <> 5 <> 6 <> 7 <> 8
*
* This function will be called for blocks 4, 5, 6 and 8. Consider what is
* happening for each function call.
*
* - The function call for block 4 initializes data structure and matches high
* key of block 4 to downlink's pivot key of block 2.
* - The high key of block 5 is matched to the high key of block 2.
* - The block 6 has an incomplete split flag set, so its high key isn't
* matched to anything.
* - The function call for block 8 checks that block 8 can be found while
* following rightlinks from block 6. The high key of block 7 will be
* matched to downlink's pivot key in block 3.
*
* There is also final call of this function, which checks that there is no
* missing downlinks for children to the right of the child referenced by
* rightmost downlink in target level.
*/
static void
bt_child_highkey_check(BtreeCheckState *state,
OffsetNumber target_downlinkoffnum,
Page loaded_child,
uint32 target_level)
{
BlockNumber blkno = state->prevrightlink;
Page page;
BTPageOpaque opaque;
bool rightsplit = state->previncompletesplit;
bool first = true;
ItemId itemid;
IndexTuple itup;
BlockNumber downlink;
if (OffsetNumberIsValid(target_downlinkoffnum))
{
itemid = PageGetItemIdCareful(state, state->targetblock,
state->target, target_downlinkoffnum);
itup = (IndexTuple) PageGetItem(state->target, itemid);
downlink = BTreeTupleGetDownLink(itup);
}
else
{
downlink = P_NONE;
}
/*
* If no previous rightlink is memorized for current level just below
* target page's level, we are about to start from the leftmost page. We
* can't follow rightlinks from previous page, because there is no
* previous page. But we still can match high key.
*
* So we initialize variables for the loop above like there is previous
* page referencing current child. Also we imply previous page to not
* have incomplete split flag, that would make us require downlink for
* current child. That's correct, because leftmost page on the level
* should always have parent downlink.
*/
if (!BlockNumberIsValid(blkno))
{
blkno = downlink;
rightsplit = false;
}
/* Move to the right on the child level */
while (true)
{
/*
* Did we traverse the whole tree level and this is check for pages to
* the right of rightmost downlink?
*/
if (blkno == P_NONE && downlink == P_NONE)
{
state->prevrightlink = InvalidBlockNumber;
state->previncompletesplit = false;
return;
}
/* Did we traverse the whole tree level and don't find next downlink? */
if (blkno == P_NONE)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"",
state->prevrightlink, downlink,
RelationGetRelationName(state->rel))));
/* Load page contents */
if (blkno == downlink && loaded_child)
page = loaded_child;
else
page = palloc_btree_page(state, blkno);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/* The first page we visit at the level should be leftmost */
if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
state->targetblock, blkno,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
/* Check level for non-ignorable page */
if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
RelationGetRelationName(state->rel)),
errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
blkno, target_level - 1, opaque->btpo.level)));
/* Try to detect circular links */
if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("circular link chain found in block %u of index \"%s\"",
blkno, RelationGetRelationName(state->rel))));
if (blkno != downlink && !P_IGNORE(opaque))
{
/* blkno probably has missing parent downlink */
bt_downlink_missing_check(state, rightsplit, blkno, page);
}
rightsplit = P_INCOMPLETE_SPLIT(opaque);
/*
* If we visit page with high key, check that it is be equal to the
* target key next to corresponding downlink.
*/
if (!rightsplit && !P_RIGHTMOST(opaque))
{
BTPageOpaque topaque;
IndexTuple highkey;
OffsetNumber pivotkey_offset;
/* Get high key */
itemid = PageGetItemIdCareful(state, blkno, page, P_HIKEY);
highkey = (IndexTuple) PageGetItem(page, itemid);
/*
* There might be two situations when we examine high key. If
* current child page is referenced by given target downlink, we
* should look to the next offset number for matching key from
* target page.
*
* Alternatively, we're following rightlinks somewhere in the
* middle between page referenced by previous target's downlink
* and the page referenced by current target's downlink. If
* current child page hasn't incomplete split flag set, then its
* high key should match to the target's key of current offset
* number. This happens when a previous call here (to
* bt_child_highkey_check()) found an incomplete split, and we
* reach a right sibling page without a downlink -- the right
* sibling page's high key still needs to be matched to a
* separator key on the parent/target level.
*
* Don't apply OffsetNumberNext() to target_downlinkoffnum when we
* already had to step right on the child level. Our traversal of
* the child level must try to move in perfect lockstep behind (to
* the left of) the target/parent level traversal.
*/
if (blkno == downlink)
pivotkey_offset = OffsetNumberNext(target_downlinkoffnum);
else
pivotkey_offset = target_downlinkoffnum;
topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
if (!offset_is_negative_infinity(topaque, pivotkey_offset))
{
/*
* If we're looking for the next pivot tuple in target page,
* but there is no more pivot tuples, then we should match to
* high key instead.
*/
if (pivotkey_offset > PageGetMaxOffsetNumber(state->target))
{
if (P_RIGHTMOST(topaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
state->targetblock, blkno,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
pivotkey_offset = P_HIKEY;
}
itemid = PageGetItemIdCareful(state, state->targetblock,
state->target, pivotkey_offset);
itup = (IndexTuple) PageGetItem(state->target, itemid);
}
else
{
/*
* We cannot try to match child's high key to a negative
* infinity key in target, since there is nothing to compare.
* However, it's still possible to match child's high key
* outside of target page. The reason why we're are is that
* bt_child_highkey_check() was previously called for the
* cousin page of 'loaded_child', which is incomplete split.
* So, now we traverse to the right of that cousin page and
* current child level page under consideration still belongs
* to the subtree of target's left sibling. Thus, we need to
* match child's high key to it's left uncle page high key.
* Thankfully we saved it, it's called a "low key" of target
* page.
*/
if (!state->lowkey)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("can't find left sibling high key in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
state->targetblock, blkno,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
itup = state->lowkey;
}
if (!bt_pivot_tuple_identical(highkey, itup))
{
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("mismatch between parent key and child high key in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
state->targetblock, blkno,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
}
/* Exit if we already found next downlink */
if (blkno == downlink)
{
state->prevrightlink = opaque->btpo_next;
state->previncompletesplit = rightsplit;
return;
}
/* Traverse to the next page using rightlink */
blkno = opaque->btpo_next;
/* Free page contents if it's allocated by us */
if (page != loaded_child)
pfree(page);
first = false;
}
}
/* /*
* Checks one of target's downlink against its child page. * Checks one of target's downlink against its child page.
* *
...@@ -1578,15 +1902,28 @@ bt_right_page_check_scankey(BtreeCheckState *state) ...@@ -1578,15 +1902,28 @@ bt_right_page_check_scankey(BtreeCheckState *state)
* The downlink insertion into the target is probably where any problem raised * The downlink insertion into the target is probably where any problem raised
* here arises, and there is no such thing as a parent link, so doing the * here arises, and there is no such thing as a parent link, so doing the
* verification this way around is much more practical. * verification this way around is much more practical.
*
* This function visits child page and it's sequentially called for each
* downlink of target page. Assuming this we also check downlink connectivity
* here in order to save child page visits.
*/ */
static void static void
bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
BlockNumber childblock) OffsetNumber downlinkoffnum)
{ {
ItemId itemid;
IndexTuple itup;
BlockNumber childblock;
OffsetNumber offset; OffsetNumber offset;
OffsetNumber maxoffset; OffsetNumber maxoffset;
Page child; Page child;
BTPageOpaque copaque; BTPageOpaque copaque;
BTPageOpaque topaque;
itemid = PageGetItemIdCareful(state, state->targetblock,
state->target, downlinkoffnum);
itup = (IndexTuple) PageGetItem(state->target, itemid);
childblock = BTreeTupleGetDownLink(itup);
/* /*
* Caller must have ShareLock on target relation, because of * Caller must have ShareLock on target relation, because of
...@@ -1636,10 +1973,18 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, ...@@ -1636,10 +1973,18 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
* Check all items, rather than checking just the first and trusting that * Check all items, rather than checking just the first and trusting that
* the operator class obeys the transitive law. * the operator class obeys the transitive law.
*/ */
topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
child = palloc_btree_page(state, childblock); child = palloc_btree_page(state, childblock);
copaque = (BTPageOpaque) PageGetSpecialPointer(child); copaque = (BTPageOpaque) PageGetSpecialPointer(child);
maxoffset = PageGetMaxOffsetNumber(child); maxoffset = PageGetMaxOffsetNumber(child);
/*
* Since we've already loaded the child block, combine this check with
* check for downlink connectivity.
*/
bt_child_highkey_check(state, downlinkoffnum,
child, topaque->btpo.level);
/* /*
* Since there cannot be a concurrent VACUUM operation in readonly mode, * Since there cannot be a concurrent VACUUM operation in readonly mode,
* and since a page has no links within other pages (siblings and parent) * and since a page has no links within other pages (siblings and parent)
...@@ -1734,23 +2079,27 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, ...@@ -1734,23 +2079,27 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
* be concerned about concurrent page splits or page deletions. * be concerned about concurrent page splits or page deletions.
*/ */
static void static void
bt_downlink_missing_check(BtreeCheckState *state) bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
BlockNumber blkno, Page page)
{ {
BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
ItemId itemid; ItemId itemid;
IndexTuple itup; IndexTuple itup;
Page child; Page child;
BTPageOpaque copaque; BTPageOpaque copaque;
uint32 level; uint32 level;
BlockNumber childblk; BlockNumber childblk;
XLogRecPtr pagelsn;
Assert(state->heapallindexed && state->readonly); Assert(state->readonly);
Assert(!P_IGNORE(topaque)); Assert(!P_IGNORE(opaque));
/* No next level up with downlinks to fingerprint from the true root */ /* No next level up with downlinks to fingerprint from the true root */
if (P_ISROOT(topaque)) if (P_ISROOT(opaque))
return; return;
pagelsn = PageGetLSN(page);
/* /*
* Incomplete (interrupted) page splits can account for the lack of a * Incomplete (interrupted) page splits can account for the lack of a
* downlink. Some inserting transaction should eventually complete the * downlink. Some inserting transaction should eventually complete the
...@@ -1772,54 +2121,47 @@ bt_downlink_missing_check(BtreeCheckState *state) ...@@ -1772,54 +2121,47 @@ bt_downlink_missing_check(BtreeCheckState *state)
* by design, so it shouldn't be taken to indicate corruption. See * by design, so it shouldn't be taken to indicate corruption. See
* _bt_pagedel() for full details. * _bt_pagedel() for full details.
*/ */
if (state->rightsplit) if (rightsplit)
{ {
ereport(DEBUG1, ereport(DEBUG1,
(errcode(ERRCODE_NO_DATA), (errcode(ERRCODE_NO_DATA),
errmsg("harmless interrupted page split detected in index %s", errmsg("harmless interrupted page split detected in index %s",
RelationGetRelationName(state->rel)), RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.", errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
state->targetblock, topaque->btpo.level, blkno, opaque->btpo.level,
topaque->btpo_prev, opaque->btpo_prev,
(uint32) (state->targetlsn >> 32), (uint32) (pagelsn >> 32),
(uint32) state->targetlsn))); (uint32) pagelsn)));
return; return;
} }
/* Target's downlink is typically present in parent/fingerprinted */
if (!bloom_lacks_element(state->downlinkfilter,
(unsigned char *) &state->targetblock,
sizeof(BlockNumber)))
return;
/* /*
* Target is probably the "top parent" of a multi-level page deletion. * Page under check is probably the "top parent" of a multi-level page
* We'll need to descend the subtree to make sure that descendant pages * deletion. We'll need to descend the subtree to make sure that
* are consistent with that, though. * descendant pages are consistent with that, though.
* *
* If the target page (which must be non-ignorable) is a leaf page, then * If the page (which must be non-ignorable) is a leaf page, then clearly
* clearly it can't be the top parent. The lack of a downlink is probably * it can't be the top parent. The lack of a downlink is probably a
* a symptom of a broad problem that could just as easily cause * symptom of a broad problem that could just as easily cause
* inconsistencies anywhere else. * inconsistencies anywhere else.
*/ */
if (P_ISLEAF(topaque)) if (P_ISLEAF(opaque))
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED), (errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("leaf index block lacks downlink in index \"%s\"", errmsg("leaf index block lacks downlink in index \"%s\"",
RelationGetRelationName(state->rel)), RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u page lsn=%X/%X.", errdetail_internal("Block=%u page lsn=%X/%X.",
state->targetblock, blkno,
(uint32) (state->targetlsn >> 32), (uint32) (pagelsn >> 32),
(uint32) state->targetlsn))); (uint32) pagelsn)));
/* Descend from the target page, which is an internal page */ /* Descend from the given page, which is an internal page */
elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"", elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
RelationGetRelationName(state->rel)); RelationGetRelationName(state->rel));
level = topaque->btpo.level; level = opaque->btpo.level;
itemid = PageGetItemIdCareful(state, state->targetblock, state->target, itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
P_FIRSTDATAKEY(topaque)); itup = (IndexTuple) PageGetItem(page, itemid);
itup = (IndexTuple) PageGetItem(state->target, itemid);
childblk = BTreeTupleGetDownLink(itup); childblk = BTreeTupleGetDownLink(itup);
for (;;) for (;;)
{ {
...@@ -1837,8 +2179,8 @@ bt_downlink_missing_check(BtreeCheckState *state) ...@@ -1837,8 +2179,8 @@ bt_downlink_missing_check(BtreeCheckState *state)
(errcode(ERRCODE_INDEX_CORRUPTED), (errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down", errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
RelationGetRelationName(state->rel)), RelationGetRelationName(state->rel)),
errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.", errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
state->targetblock, childblk, blkno, childblk,
level - 1, copaque->btpo.level))); level - 1, copaque->btpo.level)));
level = copaque->btpo.level; level = copaque->btpo.level;
...@@ -1854,12 +2196,12 @@ bt_downlink_missing_check(BtreeCheckState *state) ...@@ -1854,12 +2196,12 @@ bt_downlink_missing_check(BtreeCheckState *state)
* Since there cannot be a concurrent VACUUM operation in readonly mode, * Since there cannot be a concurrent VACUUM operation in readonly mode,
* and since a page has no links within other pages (siblings and parent) * and since a page has no links within other pages (siblings and parent)
* once it is marked fully deleted, it should be impossible to land on a * once it is marked fully deleted, it should be impossible to land on a
* fully deleted page. See bt_downlink_check() for further details. * fully deleted page. See bt_child_check() for further details.
* *
* The bt_downlink_check() P_ISDELETED() check is repeated here because * The bt_child_check() P_ISDELETED() check is repeated here because
* bt_downlink_check() does not visit pages reachable through negative * bt_child_check() does not visit pages reachable through negative
* infinity items. Besides, bt_downlink_check() is unwilling to descend * infinity items. Besides, bt_child_check() is unwilling to descend
* multiple levels. (The similar bt_downlink_check() P_ISDELETED() check * multiple levels. (The similar bt_child_check() P_ISDELETED() check
* within bt_check_level_from_leftmost() won't reach the page either, * within bt_check_level_from_leftmost() won't reach the page either,
* since the leaf's live siblings should have their sibling links updated * since the leaf's live siblings should have their sibling links updated
* to bypass the deletion target page when it is marked fully dead.) * to bypass the deletion target page when it is marked fully dead.)
...@@ -1875,26 +2217,26 @@ bt_downlink_missing_check(BtreeCheckState *state) ...@@ -1875,26 +2217,26 @@ bt_downlink_missing_check(BtreeCheckState *state)
(errcode(ERRCODE_INDEX_CORRUPTED), (errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("downlink to deleted leaf page found in index \"%s\"", errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
RelationGetRelationName(state->rel)), RelationGetRelationName(state->rel)),
errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.", errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X.",
state->targetblock, childblk, blkno, childblk,
(uint32) (state->targetlsn >> 32), (uint32) (pagelsn >> 32),
(uint32) state->targetlsn))); (uint32) pagelsn)));
/* /*
* Iff leaf page is half-dead, its high key top parent link should point * Iff leaf page is half-dead, its high key top parent link should point
* to what VACUUM considered to be the top parent page at the instant it * to what VACUUM considered to be the top parent page at the instant it
* was interrupted. Provided the high key link actually points to the * was interrupted. Provided the high key link actually points to the
* target page, the missing downlink we detected is consistent with there * page under check, the missing downlink we detected is consistent with
* having been an interrupted multi-level page deletion. This means that * there having been an interrupted multi-level page deletion. This means
* the subtree with the target page at its root (a page deletion chain) is * that the subtree with the page under check at its root (a page deletion
* in a consistent state, enabling VACUUM to resume deleting the entire * chain) is in a consistent state, enabling VACUUM to resume deleting the
* chain the next time it encounters the half-dead leaf page. * entire chain the next time it encounters the half-dead leaf page.
*/ */
if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque)) if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque))
{ {
itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY); itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY);
itup = (IndexTuple) PageGetItem(child, itemid); itup = (IndexTuple) PageGetItem(child, itemid);
if (BTreeTupleGetTopParent(itup) == state->targetblock) if (BTreeTupleGetTopParent(itup) == blkno)
return; return;
} }
...@@ -1903,9 +2245,9 @@ bt_downlink_missing_check(BtreeCheckState *state) ...@@ -1903,9 +2245,9 @@ bt_downlink_missing_check(BtreeCheckState *state)
errmsg("internal index block lacks downlink in index \"%s\"", errmsg("internal index block lacks downlink in index \"%s\"",
RelationGetRelationName(state->rel)), RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u page lsn=%X/%X.", errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
state->targetblock, topaque->btpo.level, blkno, opaque->btpo.level,
(uint32) (state->targetlsn >> 32), (uint32) (pagelsn >> 32),
(uint32) state->targetlsn))); (uint32) pagelsn)));
} }
/* /*
...@@ -2146,7 +2488,7 @@ bt_posting_plain_tuple(IndexTuple itup, int n) ...@@ -2146,7 +2488,7 @@ bt_posting_plain_tuple(IndexTuple itup, int n)
* separator key value won't always be available from parent, though, because * separator key value won't always be available from parent, though, because
* the first items of internal pages are negative infinity items, truncated * the first items of internal pages are negative infinity items, truncated
* down to zero attributes during internal page splits. While it's true that * down to zero attributes during internal page splits. While it's true that
* bt_downlink_check() and the high key check can detect most imaginable key * bt_child_check() and the high key check can detect most imaginable key
* space problems, there are remaining problems it won't detect with non-pivot * space problems, there are remaining problems it won't detect with non-pivot
* tuples in cousin leaf pages. Starting a search from the root for every * tuples in cousin leaf pages. Starting a search from the root for every
* existing leaf tuple detects small inconsistencies in upper levels of the * existing leaf tuple detects small inconsistencies in upper levels of the
......
...@@ -125,8 +125,7 @@ ORDER BY c.relpages DESC LIMIT 10; ...@@ -125,8 +125,7 @@ ORDER BY c.relpages DESC LIMIT 10;
Optionally, when the <parameter>heapallindexed</parameter> Optionally, when the <parameter>heapallindexed</parameter>
argument is <literal>true</literal>, the function verifies the argument is <literal>true</literal>, the function verifies the
presence of all heap tuples that should be found within the presence of all heap tuples that should be found within the
index, and that there are no missing downlinks in the index index. When the optional <parameter>rootdescend</parameter>
structure. When the optional <parameter>rootdescend</parameter>
argument is <literal>true</literal>, verification re-finds argument is <literal>true</literal>, verification re-finds
tuples on the leaf level by performing a new search from the tuples on the leaf level by performing a new search from the
root page for each tuple. The checks that can be performed by root page for each tuple. The checks that can be performed by
...@@ -136,7 +135,8 @@ ORDER BY c.relpages DESC LIMIT 10; ...@@ -136,7 +135,8 @@ ORDER BY c.relpages DESC LIMIT 10;
a more thorough variant of <function>bt_index_check</function>: a more thorough variant of <function>bt_index_check</function>:
unlike <function>bt_index_check</function>, unlike <function>bt_index_check</function>,
<function>bt_index_parent_check</function> also checks <function>bt_index_parent_check</function> also checks
invariants that span parent/child relationships. invariants that span parent/child relationships, including checking
that there are no missing downlinks in the index structure.
<function>bt_index_parent_check</function> follows the general <function>bt_index_parent_check</function> follows the general
convention of raising an error if it finds a logical convention of raising an error if it finds a logical
inconsistency or other problem. inconsistency or other problem.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment