Commit c1afd175 authored by Peter Geoghegan's avatar Peter Geoghegan

Allow amcheck to re-find tuples using new search.

Teach contrib/amcheck's bt_index_parent_check() function to take
advantage of the uniqueness property of heapkeyspace indexes in support
of a new verification option: non-pivot tuples (non-highkey tuples on
the leaf level) can optionally be re-found using a new search for each,
that starts from the root page.  If a tuple cannot be re-found, report
that the index is corrupt.

The new "rootdescend" verification option is exhaustive, and can
therefore make a call to bt_index_parent_check() take a lot longer.
Re-finding tuples during verification is mostly intended as an option
for backend developers, since the corruption scenarios that it alone is
uniquely capable of detecting seem fairly far-fetched.

For example, "rootdescend" verification is much more likely to detect
corruption of the least significant byte of a key from a pivot tuple in
the root page of a B-Tree that already has at least three levels.
Typically, only a few tuples on a cousin leaf page are at risk of
"getting overlooked" by index scans in this scenario.  The corrupt key
in the root page is only slightly corrupt: corrupt enough to give wrong
answers to some queries, and yet not corrupt enough to allow the problem
to be detected without verifying agreement between the leaf page and the
root page, skipping at least one internal page level.  The existing
bt_index_parent_check() checks never cross more than a single level.

Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas
Discussion: https://postgr.es/m/CAH2-Wz=yTWnVu+HeHGKb2AGiADL9eprn-cKYAto4MkKOuiGtRQ@mail.gmail.com
parent fab25024
......@@ -4,7 +4,7 @@ MODULE_big = amcheck
OBJS = verify_nbtree.o $(WIN32RES)
EXTENSION = amcheck
DATA = amcheck--1.0--1.1.sql amcheck--1.0.sql
DATA = amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql
PGFILEDESC = "amcheck - function for verifying relation integrity"
REGRESS = check check_btree
......
/* contrib/amcheck/amcheck--1.1--1.2.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.2'" to load this file. \quit
-- In order to avoid issues with dependencies when updating amcheck to 1.2,
-- create new, overloaded version of the 1.1 function signature
--
-- bt_index_parent_check()
--
CREATE FUNCTION bt_index_parent_check(index regclass,
heapallindexed boolean, rootdescend boolean)
RETURNS VOID
AS 'MODULE_PATHNAME', 'bt_index_parent_check'
LANGUAGE C STRICT PARALLEL RESTRICTED;
-- Don't want this to be available to public
REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean, boolean) FROM PUBLIC;
# amcheck extension
comment = 'functions for verifying relation integrity'
default_version = '1.1'
default_version = '1.2'
module_pathname = '$libdir/amcheck'
relocatable = true
......@@ -126,7 +126,8 @@ SELECT bt_index_parent_check('bttest_multi_idx', true);
(1 row)
--
-- Test for multilevel page deletion/downlink present checks
-- Test for multilevel page deletion/downlink present checks, and rootdescend
-- checks
--
INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
......@@ -137,7 +138,7 @@ VACUUM delete_test_table;
-- root"
DELETE FROM delete_test_table WHERE a < 79990;
VACUUM delete_test_table;
SELECT bt_index_parent_check('delete_test_table_pkey', true);
SELECT bt_index_parent_check('delete_test_table_pkey', true, true);
bt_index_parent_check
-----------------------
......
......@@ -78,7 +78,8 @@ INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i;
SELECT bt_index_parent_check('bttest_multi_idx', true);
--
-- Test for multilevel page deletion/downlink present checks
-- Test for multilevel page deletion/downlink present checks, and rootdescend
-- checks
--
INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
......@@ -89,7 +90,7 @@ VACUUM delete_test_table;
-- root"
DELETE FROM delete_test_table WHERE a < 79990;
VACUUM delete_test_table;
SELECT bt_index_parent_check('delete_test_table_pkey', true);
SELECT bt_index_parent_check('delete_test_table_pkey', true, true);
--
-- BUG #15597: must not assume consistent input toasting state when forming
......
......@@ -75,6 +75,8 @@ typedef struct BtreeCheckState
bool readonly;
/* Also verifying heap has no unindexed tuples? */
bool heapallindexed;
/* Also making sure non-pivot tuples can be found by new search? */
bool rootdescend;
/* Per-page context */
MemoryContext targetcontext;
/* Buffer access strategy */
......@@ -124,10 +126,11 @@ PG_FUNCTION_INFO_V1(bt_index_check);
PG_FUNCTION_INFO_V1(bt_index_parent_check);
static void bt_index_check_internal(Oid indrelid, bool parentcheck,
bool heapallindexed);
bool heapallindexed, bool rootdescend);
static inline void btree_index_checkable(Relation rel);
static void bt_check_every_level(Relation rel, Relation heaprel,
bool heapkeyspace, bool readonly, bool heapallindexed);
bool heapkeyspace, bool readonly, bool heapallindexed,
bool rootdescend);
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
BtreeLevel level);
static void bt_target_page_check(BtreeCheckState *state);
......@@ -140,6 +143,7 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup,
bool tupleIsAlive, void *checkstate);
static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
IndexTuple itup);
static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup);
static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
OffsetNumber offset);
static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
......@@ -177,7 +181,7 @@ bt_index_check(PG_FUNCTION_ARGS)
if (PG_NARGS() == 2)
heapallindexed = PG_GETARG_BOOL(1);
bt_index_check_internal(indrelid, false, heapallindexed);
bt_index_check_internal(indrelid, false, heapallindexed, false);
PG_RETURN_VOID();
}
......@@ -196,11 +200,14 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
{
Oid indrelid = PG_GETARG_OID(0);
bool heapallindexed = false;
bool rootdescend = false;
if (PG_NARGS() == 2)
if (PG_NARGS() >= 2)
heapallindexed = PG_GETARG_BOOL(1);
if (PG_NARGS() == 3)
rootdescend = PG_GETARG_BOOL(2);
bt_index_check_internal(indrelid, true, heapallindexed);
bt_index_check_internal(indrelid, true, heapallindexed, rootdescend);
PG_RETURN_VOID();
}
......@@ -209,7 +216,8 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
* Helper for bt_index_[parent_]check, coordinating the bulk of the work.
*/
static void
bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
bool rootdescend)
{
Oid heapid;
Relation indrel;
......@@ -267,7 +275,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
/* Check index, possibly against table it is an index on */
heapkeyspace = _bt_heapkeyspace(indrel);
bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
heapallindexed);
heapallindexed, rootdescend);
/*
* Release locks early. That's ok here because nothing in the called
......@@ -338,7 +346,7 @@ btree_index_checkable(Relation rel)
*/
static void
bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
bool readonly, bool heapallindexed)
bool readonly, bool heapallindexed, bool rootdescend)
{
BtreeCheckState *state;
Page metapage;
......@@ -362,6 +370,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
state->heapkeyspace = heapkeyspace;
state->readonly = readonly;
state->heapallindexed = heapallindexed;
state->rootdescend = rootdescend;
if (state->heapallindexed)
{
......@@ -430,6 +439,14 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
}
}
Assert(!state->rootdescend || state->readonly);
if (state->rootdescend && !state->heapkeyspace)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search",
RelationGetRelationName(rel)),
errhint("Only B-Tree version 4 indexes support rootdescend verification.")));
/* Create context for page */
state->targetcontext = AllocSetContextCreate(CurrentMemoryContext,
"amcheck context",
......@@ -922,6 +939,31 @@ bt_target_page_check(BtreeCheckState *state)
if (offset_is_negative_infinity(topaque, offset))
continue;
/*
* Readonly callers may optionally verify that non-pivot tuples can
* each be found by an independent search that starts from the root
*/
if (state->rootdescend && P_ISLEAF(topaque) &&
!bt_rootdescend(state, itup))
{
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
htid = psprintf("(%u,%u)",
ItemPointerGetBlockNumber(&(itup->t_tid)),
ItemPointerGetOffsetNumber(&(itup->t_tid)));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("could not find tuple using search from root page in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.",
itid, htid,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
/* Build insertion scankey for current page offset */
skey = bt_mkscankey_pivotsearch(state->rel, itup);
......@@ -1526,6 +1568,9 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
* internal pages. In more general terms, a negative infinity item is
* only negative infinity with respect to the subtree that the page is
* at the root of.
*
* See also: bt_rootdescend(), which can even detect transitive
* inconsistencies on cousin leaf pages.
*/
if (offset_is_negative_infinity(copaque, offset))
continue;
......@@ -1926,6 +1971,81 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
return reformed;
}
/*
* Search for itup in index, starting from fast root page. itup must be a
* non-pivot tuple. This is only supported with heapkeyspace indexes, since
* we rely on having fully unique keys to find a match with only a signle
* visit to a leaf page, barring an interrupted page split, where we may have
* to move right. (A concurrent page split is impossible because caller must
* be readonly caller.)
*
* This routine can detect very subtle transitive consistency issues across
* more than one level of the tree. Leaf pages all have a high key (even the
* rightmost page has a conceptual positive infinity high key), but not a low
* key. Their downlink in parent is a lower bound, which along with the high
* key is almost enough to detect every possible inconsistency. A downlink
* separator key value won't always be available from parent, though, because
* the first items of internal pages are negative infinity items, truncated
* down to zero attributes during internal page splits. While it's true that
* bt_downlink_check() and the high key check can detect most imaginable key
* space problems, there are remaining problems it won't detect with non-pivot
* tuples in cousin leaf pages. Starting a search from the root for every
* existing leaf tuple detects small inconsistencies in upper levels of the
* tree that cannot be detected any other way. (Besides all this, this is
* probably also useful as a direct test of the code used by index scans
* themselves.)
*/
static bool
bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
{
BTScanInsert key;
BTStack stack;
Buffer lbuf;
bool exists;
key = _bt_mkscankey(state->rel, itup);
Assert(key->heapkeyspace && key->scantid != NULL);
/*
* Search from root.
*
* Ideally, we would arrange to only move right within _bt_search() when
* an interrupted page split is detected (i.e. when the incomplete split
* bit is found to be set), but for now we accept the possibility that
* that could conceal an inconsistency.
*/
Assert(state->readonly && state->rootdescend);
exists = false;
stack = _bt_search(state->rel, key, &lbuf, BT_READ, NULL);
if (BufferIsValid(lbuf))
{
BTInsertStateData insertstate;
OffsetNumber offnum;
Page page;
insertstate.itup = itup;
insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
insertstate.itup_key = key;
insertstate.bounds_valid = false;
insertstate.buf = lbuf;
/* Get matching tuple on leaf page */
offnum = _bt_binsrch_insert(state->rel, &insertstate);
/* Compare first >= matching item on leaf page, if any */
page = BufferGetPage(lbuf);
if (offnum <= PageGetMaxOffsetNumber(page) &&
_bt_compare(state->rel, key, page, offnum) == 0)
exists = true;
_bt_relbuf(state->rel, lbuf);
}
_bt_freestack(stack);
pfree(key);
return exists;
}
/*
* Is particular offset within page (whose special state is passed by caller)
* the page negative-infinity item?
......
......@@ -112,7 +112,7 @@ ORDER BY c.relpages DESC LIMIT 10;
<varlistentry>
<term>
<function>bt_index_parent_check(index regclass, heapallindexed boolean) returns void</function>
<function>bt_index_parent_check(index regclass, heapallindexed boolean, rootdescend boolean) returns void</function>
<indexterm>
<primary>bt_index_parent_check</primary>
</indexterm>
......@@ -126,7 +126,10 @@ ORDER BY c.relpages DESC LIMIT 10;
argument is <literal>true</literal>, the function verifies the
presence of all heap tuples that should be found within the
index, and that there are no missing downlinks in the index
structure. The checks that can be performed by
structure. When the optional <parameter>rootdescend</parameter>
argument is <literal>true</literal>, verification re-finds
tuples on the leaf level by performing a new search from the
root page for each tuple. The checks that can be performed by
<function>bt_index_parent_check</function> are a superset of the
checks that can be performed by <function>bt_index_check</function>.
<function>bt_index_parent_check</function> can be thought of as
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment