Commit 7f563c09 authored by Andres Freund

Add amcheck verification of heap relations belonging to btree indexes.

Add a new, optional, capability to bt_index_check() and
bt_index_parent_check():  check that each heap tuple that should have an
index entry does in fact have one.  The extra checking is performed at
the end of the existing nbtree checks.

This is implemented by using a Bloom filter data structure.  The
implementation performs set membership tests within a callback (the same
type of callback that each index AM registers for CREATE INDEX).  The
Bloom filter is populated during the initial index verification scan.

Reusing the CREATE INDEX infrastructure allows the new verification
option to automatically benefit from the heap consistency checks that
CREATE INDEX already performs.  CREATE INDEX does thorough sanity
checking of HOT chains, so the new check actually manages to detect
problems in heap-only tuples.

Author: Peter Geoghegan
Reviewed-By: Pavan Deolasee, Andres Freund
Discussion: https://postgr.es/m/CAH2-Wzm5VmG7cu1N-H=nnS57wZThoSDQU+F5dewx3o84M+jY=g@mail.gmail.com
parent 51bc2717
......@@ -4,7 +4,7 @@ MODULE_big = amcheck
OBJS = verify_nbtree.o $(WIN32RES)
EXTENSION = amcheck
DATA = amcheck--1.0.sql
DATA = amcheck--1.0--1.1.sql amcheck--1.0.sql
PGFILEDESC = "amcheck - function for verifying relation integrity"
REGRESS = check check_btree
......
/* contrib/amcheck/amcheck--1.0--1.1.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.1'" to load this file. \quit
-- In order to avoid issues with dependencies when updating amcheck to 1.1,
-- create new, overloaded versions of the 1.0 functions
--
-- bt_index_check()
--
CREATE FUNCTION bt_index_check(index regclass,
heapallindexed boolean)
RETURNS VOID
AS 'MODULE_PATHNAME', 'bt_index_check'
LANGUAGE C STRICT PARALLEL RESTRICTED;
--
-- bt_index_parent_check()
--
CREATE FUNCTION bt_index_parent_check(index regclass,
heapallindexed boolean)
RETURNS VOID
AS 'MODULE_PATHNAME', 'bt_index_parent_check'
LANGUAGE C STRICT PARALLEL RESTRICTED;
-- Don't want these to be available to public
REVOKE ALL ON FUNCTION bt_index_check(regclass, boolean) FROM PUBLIC;
REVOKE ALL ON FUNCTION bt_index_parent_check(regclass, boolean) FROM PUBLIC;
# amcheck extension
comment = 'functions for verifying relation integrity'
default_version = '1.0'
default_version = '1.1'
module_pathname = '$libdir/amcheck'
relocatable = true
......@@ -18,6 +18,8 @@ RESET ROLE;
-- above explicit permission has to be granted for that.
GRANT EXECUTE ON FUNCTION bt_index_check(regclass) TO bttest_role;
GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass) TO bttest_role;
GRANT EXECUTE ON FUNCTION bt_index_check(regclass, boolean) TO bttest_role;
GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass, boolean) TO bttest_role;
SET ROLE bttest_role;
SELECT bt_index_check('bttest_a_idx');
bt_index_check
......@@ -56,8 +58,14 @@ SELECT bt_index_check('bttest_a_idx');
(1 row)
-- more expansive test
SELECT bt_index_parent_check('bttest_b_idx');
-- more expansive tests
SELECT bt_index_check('bttest_a_idx', true);
bt_index_check
----------------
(1 row)
SELECT bt_index_parent_check('bttest_b_idx', true);
bt_index_parent_check
-----------------------
......
......@@ -21,6 +21,8 @@ RESET ROLE;
-- above explicit permission has to be granted for that.
GRANT EXECUTE ON FUNCTION bt_index_check(regclass) TO bttest_role;
GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass) TO bttest_role;
GRANT EXECUTE ON FUNCTION bt_index_check(regclass, boolean) TO bttest_role;
GRANT EXECUTE ON FUNCTION bt_index_parent_check(regclass, boolean) TO bttest_role;
SET ROLE bttest_role;
SELECT bt_index_check('bttest_a_idx');
SELECT bt_index_parent_check('bttest_a_idx');
......@@ -42,8 +44,9 @@ ROLLBACK;
-- normal check outside of xact
SELECT bt_index_check('bttest_a_idx');
-- more expansive test
SELECT bt_index_parent_check('bttest_b_idx');
-- more expansive tests
SELECT bt_index_check('bttest_a_idx', true);
SELECT bt_index_parent_check('bttest_b_idx', true);
BEGIN;
SELECT bt_index_check('bttest_a_idx');
......
......@@ -8,6 +8,11 @@
* (the insertion scankey sort-wise NULL semantics are needed for
* verification).
*
* When index-to-heap verification is requested, a Bloom filter is used to
* fingerprint all tuples in the target index, as the index is traversed to
* verify its structure. A heap scan later uses Bloom filter probes to verify
* that every visible heap tuple has a matching index tuple.
*
*
* Copyright (c) 2017-2018, PostgreSQL Global Development Group
*
......@@ -18,11 +23,14 @@
*/
#include "postgres.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/index.h"
#include "catalog/pg_am.h"
#include "commands/tablecmds.h"
#include "lib/bloomfilter.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "utils/memutils.h"
......@@ -44,8 +52,9 @@ PG_MODULE_MAGIC;
*
* Other B-Tree pages may be allocated, but those are always auxiliary (e.g.,
* they are current target's child pages). Conceptually, problems are only
* ever found in the current target page. Each page found by verification's
* left/right, top/bottom scan becomes the target exactly once.
* ever found in the current target page (or for a particular heap tuple during
* heapallindexed verification). Each page found by verification's left/right,
* top/bottom scan becomes the target exactly once.
*/
typedef struct BtreeCheckState
{
......@@ -53,10 +62,13 @@ typedef struct BtreeCheckState
* Unchanging state, established at start of verification:
*/
/* B-Tree Index Relation */
/* B-Tree Index Relation and associated heap relation */
Relation rel;
Relation heaprel;
/* ShareLock held on heap/index, rather than AccessShareLock? */
bool readonly;
/* Also verifying heap has no unindexed tuples? */
bool heapallindexed;
/* Per-page context */
MemoryContext targetcontext;
/* Buffer access strategy */
......@@ -72,6 +84,15 @@ typedef struct BtreeCheckState
BlockNumber targetblock;
/* Target page's LSN */
XLogRecPtr targetlsn;
/*
* Mutable state, for optional heapallindexed verification:
*/
/* Bloom filter fingerprints B-Tree index */
bloom_filter *filter;
/* Debug counter */
int64 heaptuplespresent;
} BtreeCheckState;
/*
......@@ -92,15 +113,20 @@ typedef struct BtreeLevel
PG_FUNCTION_INFO_V1(bt_index_check);
PG_FUNCTION_INFO_V1(bt_index_parent_check);
static void bt_index_check_internal(Oid indrelid, bool parentcheck);
static void bt_index_check_internal(Oid indrelid, bool parentcheck,
bool heapallindexed);
static inline void btree_index_checkable(Relation rel);
static void bt_check_every_level(Relation rel, bool readonly);
static void bt_check_every_level(Relation rel, Relation heaprel,
bool readonly, bool heapallindexed);
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
BtreeLevel level);
static void bt_target_page_check(BtreeCheckState *state);
static ScanKey bt_right_page_check_scankey(BtreeCheckState *state);
static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
ScanKey targetkey);
static void bt_tuple_present_callback(Relation index, HeapTuple htup,
Datum *values, bool *isnull,
bool tupleIsAlive, void *checkstate);
static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
OffsetNumber offset);
static inline bool invariant_leq_offset(BtreeCheckState *state,
......@@ -116,37 +142,47 @@ static inline bool invariant_leq_nontarget_offset(BtreeCheckState *state,
static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum);
/*
* bt_index_check(index regclass)
* bt_index_check(index regclass, heapallindexed boolean)
*
* Verify integrity of B-Tree index.
*
* Acquires AccessShareLock on heap & index relations. Does not consider
* invariants that exist between parent/child pages.
* invariants that exist between parent/child pages. Optionally verifies
* that heap does not contain any unindexed or incorrectly indexed tuples.
*/
Datum
bt_index_check(PG_FUNCTION_ARGS)
{
Oid indrelid = PG_GETARG_OID(0);
bool heapallindexed = false;
if (PG_NARGS() == 2)
heapallindexed = PG_GETARG_BOOL(1);
bt_index_check_internal(indrelid, false);
bt_index_check_internal(indrelid, false, heapallindexed);
PG_RETURN_VOID();
}
/*
* bt_index_parent_check(index regclass)
* bt_index_parent_check(index regclass, heapallindexed boolean)
*
* Verify integrity of B-Tree index.
*
* Acquires ShareLock on heap & index relations. Verifies that downlinks in
* parent pages are valid lower bounds on child pages.
* parent pages are valid lower bounds on child pages. Optionally verifies
* that heap does not contain any unindexed or incorrectly indexed tuples.
*/
Datum
bt_index_parent_check(PG_FUNCTION_ARGS)
{
Oid indrelid = PG_GETARG_OID(0);
bool heapallindexed = false;
if (PG_NARGS() == 2)
heapallindexed = PG_GETARG_BOOL(1);
bt_index_check_internal(indrelid, true);
bt_index_check_internal(indrelid, true, heapallindexed);
PG_RETURN_VOID();
}
......@@ -155,7 +191,7 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
* Helper for bt_index_[parent_]check, coordinating the bulk of the work.
*/
static void
bt_index_check_internal(Oid indrelid, bool parentcheck)
bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
{
Oid heapid;
Relation indrel;
......@@ -185,15 +221,20 @@ bt_index_check_internal(Oid indrelid, bool parentcheck)
* Open the target index relations separately (like relation_openrv(), but
* with heap relation locked first to prevent deadlocking). In hot
* standby mode this will raise an error when parentcheck is true.
*
* There is no need for the usual indcheckxmin usability horizon test here,
* even in the heapallindexed case, because index undergoing verification
* only needs to have entries for a new transaction snapshot. (If this is
* a parentcheck verification, there is no question about committed or
* recently dead heap tuples lacking index entries due to concurrent
* activity.)
*/
indrel = index_open(indrelid, lockmode);
/*
* Since we did the IndexGetRelation call above without any lock, it's
* barely possible that a race against an index drop/recreation could have
* netted us the wrong table. Although the table itself won't actually be
* examined during verification currently, a recheck still seems like a
* good idea.
* netted us the wrong table.
*/
if (heaprel == NULL || heapid != IndexGetRelation(indrelid, false))
ereport(ERROR,
......@@ -204,8 +245,8 @@ bt_index_check_internal(Oid indrelid, bool parentcheck)
/* Relation suitable for checking as B-Tree? */
btree_index_checkable(indrel);
/* Check index */
bt_check_every_level(indrel, parentcheck);
/* Check index, possibly against table it is an index on */
bt_check_every_level(indrel, heaprel, parentcheck, heapallindexed);
/*
* Release locks early. That's ok here because nothing in the called
......@@ -253,11 +294,14 @@ btree_index_checkable(Relation rel)
/*
* Main entry point for B-Tree SQL-callable functions. Walks the B-Tree in
* logical order, verifying invariants as it goes.
* logical order, verifying invariants as it goes. Optionally, verification
* checks if the heap relation contains any tuples that are not represented in
* the index but should be.
*
* It is the caller's responsibility to acquire appropriate heavyweight lock on
* the index relation, and advise us if extra checks are safe when a ShareLock
* is held.
* is held. (A lock of the same type must also have been acquired on the heap
* relation.)
*
* A ShareLock is generally assumed to prevent any kind of physical
* modification to the index structure, including modifications that VACUUM may
......@@ -272,13 +316,15 @@ btree_index_checkable(Relation rel)
* parent/child check cannot be affected.)
*/
static void
bt_check_every_level(Relation rel, bool readonly)
bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
bool heapallindexed)
{
BtreeCheckState *state;
Page metapage;
BTMetaPageData *metad;
uint32 previouslevel;
BtreeLevel current;
Snapshot snapshot = SnapshotAny;
/*
* RecentGlobalXmin assertion matches index_getnext_tid(). See note on
......@@ -291,7 +337,57 @@ bt_check_every_level(Relation rel, bool readonly)
*/
state = palloc(sizeof(BtreeCheckState));
state->rel = rel;
state->heaprel = heaprel;
state->readonly = readonly;
state->heapallindexed = heapallindexed;
if (state->heapallindexed)
{
int64 total_elems;
uint64 seed;
/* Size Bloom filter based on estimated number of tuples in index */
total_elems = (int64) state->rel->rd_rel->reltuples;
/* Random seed relies on backend srandom() call to avoid repetition */
seed = random();
/* Create Bloom filter to fingerprint index */
state->filter = bloom_create(total_elems, maintenance_work_mem, seed);
state->heaptuplespresent = 0;
/*
* Register our own snapshot in !readonly case, rather than asking
* IndexBuildHeapScan() to do this for us later. This needs to happen
* before index fingerprinting begins, so we can later be certain that
* index fingerprinting should have reached all tuples returned by
* IndexBuildHeapScan().
*/
if (!state->readonly)
{
snapshot = RegisterSnapshot(GetTransactionSnapshot());
/*
* GetTransactionSnapshot() always acquires a new MVCC snapshot in
* READ COMMITTED mode. A new snapshot is guaranteed to have all
* the entries it requires in the index.
*
* We must defend against the possibility that an old xact snapshot
* was returned at higher isolation levels when that snapshot is
* not safe for index scans of the target index. This is possible
* when the snapshot sees tuples that are before the index's
* indcheckxmin horizon. Throwing an error here should be very
* rare. It doesn't seem worth using a secondary snapshot to avoid
* this.
*/
if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin &&
!TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data),
snapshot->xmin))
ereport(ERROR,
(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
errmsg("index \"%s\" cannot be verified using transaction snapshot",
RelationGetRelationName(rel))));
}
}
/* Create context for page */
state->targetcontext = AllocSetContextCreate(CurrentMemoryContext,
"amcheck context",
......@@ -345,6 +441,69 @@ bt_check_every_level(Relation rel, bool readonly)
previouslevel = current.level;
}
/*
* * Check whether heap contains unindexed/malformed tuples *
*/
if (state->heapallindexed)
{
IndexInfo *indexinfo = BuildIndexInfo(state->rel);
HeapScanDesc scan;
/*
* Create our own scan for IndexBuildHeapScan(), rather than getting it
* to do so for us. This is required so that we can actually use the
* MVCC snapshot registered earlier in !readonly case.
*
* Note that IndexBuildHeapScan() calls heap_endscan() for us.
*/
scan = heap_beginscan_strat(state->heaprel, /* relation */
snapshot, /* snapshot */
0, /* number of keys */
NULL, /* scan key */
true, /* buffer access strategy OK */
true); /* syncscan OK? */
/*
* Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY
* behaves in !readonly case.
*
* It's okay that we don't actually use the same lock strength for the
* heap relation as any other ii_Concurrent caller would in !readonly
* case. We have no reason to care about a concurrent VACUUM
* operation, since there isn't going to be a second scan of the heap
* that needs to be sure that there was no concurrent recycling of
* TIDs.
*/
indexinfo->ii_Concurrent = !state->readonly;
/*
* Don't wait for uncommitted tuple xact commit/abort when index is a
* unique index on a catalog (or an index used by an exclusion
* constraint). This could otherwise happen in the readonly case.
*/
indexinfo->ii_Unique = false;
indexinfo->ii_ExclusionOps = NULL;
indexinfo->ii_ExclusionProcs = NULL;
indexinfo->ii_ExclusionStrats = NULL;
elog(DEBUG1, "verifying that tuples from index \"%s\" are present in \"%s\"",
RelationGetRelationName(state->rel),
RelationGetRelationName(state->heaprel));
IndexBuildHeapScan(state->heaprel, state->rel, indexinfo, true,
bt_tuple_present_callback, (void *) state, scan);
ereport(DEBUG1,
(errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples from table \"%s\" with bitset %.2f%% set",
state->heaptuplespresent, RelationGetRelationName(heaprel),
100.0 * bloom_prop_bits_set(state->filter))));
if (snapshot != SnapshotAny)
UnregisterSnapshot(snapshot);
bloom_free(state->filter);
}
/* Be tidy: */
MemoryContextDelete(state->targetcontext);
}
......@@ -497,7 +656,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
current, level.level, opaque->btpo.level)));
/* Verify invariants for page -- all important checks occur here */
/* Verify invariants for page */
bt_target_page_check(state);
nextpage:
......@@ -544,6 +703,9 @@ nextpage:
*
* - That all child pages respect downlinks lower bound.
*
* This is also where heapallindexed callers use their Bloom filter to
* fingerprint IndexTuples.
*
* Note: Memory allocated in this routine is expected to be released by caller
* resetting state->targetcontext.
*/
......@@ -572,21 +734,46 @@ bt_target_page_check(BtreeCheckState *state)
ItemId itemid;
IndexTuple itup;
ScanKey skey;
size_t tupsize;
CHECK_FOR_INTERRUPTS();
itemid = PageGetItemId(state->target, offset);
itup = (IndexTuple) PageGetItem(state->target, itemid);
tupsize = IndexTupleSize(itup);
/*
* lp_len should match the IndexTuple reported length exactly, since
* lp_len is completely redundant in indexes, and both sources of tuple
* length are MAXALIGN()'d. nbtree does not use lp_len all that
* frequently, and is surprisingly tolerant of corrupt lp_len fields.
*/
if (tupsize != ItemIdGetLength(itemid))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("index tuple size does not equal lp_len in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%X.",
state->targetblock, offset,
tupsize, ItemIdGetLength(itemid),
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn),
errhint("This could be a torn page problem")));
/*
* Don't try to generate scankey using "negative infinity" garbage
* data
* data on internal pages
*/
if (offset_is_negative_infinity(topaque, offset))
continue;
/* Build insertion scankey for current page offset */
itemid = PageGetItemId(state->target, offset);
itup = (IndexTuple) PageGetItem(state->target, itemid);
skey = _bt_mkscankey(state->rel, itup);
/* Fingerprint leaf page tuples (those that point to the heap) */
if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid))
bloom_add_element(state->filter, (unsigned char *) itup, tupsize);
/*
* * High key check *
*
......@@ -680,8 +867,10 @@ bt_target_page_check(BtreeCheckState *state)
* * Last item check *
*
* Check last item against next/right page's first data item's when
* last item on page is reached. This additional check can detect
* transposed pages.
* last item on page is reached. This additional check will detect
* transposed pages iff the supposed right sibling page happens to
* belong before target in the key space. (Otherwise, a subsequent
* heap verification will probably detect the problem.)
*
* This check is similar to the item order check that will have
* already been performed for every other "real" item on target page
......@@ -1059,6 +1248,106 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
pfree(child);
}
/*
* Per-tuple callback from IndexBuildHeapScan, used to determine if index has
* all the entries that definitely should have been observed in leaf pages of
* the target index (that is, all IndexTuples that were fingerprinted by our
* Bloom filter). All heapallindexed checks occur here.
*
* The redundancy between an index and the table it indexes provides a good
* opportunity to detect corruption, especially corruption within the table.
* The high level principle behind the verification performed here is that any
* IndexTuple that should be in an index following a fresh CREATE INDEX (based
* on the same index definition) should also have been in the original,
* existing index, which should have used exactly the same representation.
*
* Since the overall structure of the index has already been verified, the most
* likely explanation for error here is a corrupt heap page (could be logical
* or physical corruption). Index corruption may still be detected here,
* though. Only readonly callers will have verified that left links and right
* links are in agreement, and so it's possible that a leaf page transposition
* within index is actually the source of corruption detected here (for
* !readonly callers). The checks performed only for readonly callers might
* more accurately frame the problem as a cross-page invariant issue (this
* could even be due to recovery not replaying all WAL records). The !readonly
* ERROR message raised here includes a HINT about retrying with readonly
* verification, just in case it's a cross-page invariant issue, though that
* isn't particularly likely.
*
* IndexBuildHeapScan() expects to be able to find the root tuple when a
* heap-only tuple (the live tuple at the end of some HOT chain) needs to be
* indexed, in order to replace the actual tuple's TID with the root tuple's
* TID (which is what we're actually passed back here). The index build heap
* scan code will raise an error when a tuple that claims to be the root of the
* heap-only tuple's HOT chain cannot be located. This catches cases where the
* original root item offset/root tuple for a HOT chain indicates (for whatever
* reason) that the entire HOT chain is dead, despite the fact that the latest
* heap-only tuple should be indexed. When this happens, sequential scans may
* always give correct answers, and all indexes may be considered structurally
* consistent (i.e. the nbtree structural checks would not detect corruption).
* It may be the case that only index scans give wrong answers, and yet heap or
* SLRU corruption is the real culprit. (While it's true that LP_DEAD bit
* setting will probably also leave the index in a corrupt state before too
* long, the problem is nonetheless that there is heap corruption.)
*
* Heap-only tuple handling within IndexBuildHeapScan() works in a way that
* helps us to detect index tuples that contain the wrong values (values that
* don't match the latest tuple in the HOT chain). This can happen when there
* is no superseding index tuple due to a faulty assessment of HOT safety,
* perhaps during the original CREATE INDEX. Because the latest tuple's
* contents are used with the root TID, an error will be raised when a tuple
* with the same TID but non-matching attribute values is passed back to us.
* Faulty assessment of HOT-safety was behind at least two distinct CREATE
* INDEX CONCURRENTLY bugs that made it into stable releases, one of which was
* undetected for many years. In short, the same principle that allows a
* REINDEX to repair corruption when there was an (undetected) broken HOT chain
* also allows us to detect the corruption in many cases.
*/
static void
bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
						  bool *isnull, bool tupleIsAlive, void *checkstate)
{
	BtreeCheckState *cstate = (BtreeCheckState *) checkstate;
	IndexTuple	probe;
	size_t		probelen;
	bool		missing;

	/* This callback is only ever registered for heapallindexed verification */
	Assert(cstate->heapallindexed);

	/*
	 * Form the index tuple that corresponds to this heap tuple, so that it
	 * can be looked up in the fingerprint of the existing index.
	 *
	 * The comparison is on the raw bytes of the IndexTuple, so it depends on
	 * index_form_tuple() producing the same representation every time it is
	 * given the same input, and on IndexTuples never being modified in place.
	 * The LP_DEAD bit can change on leaf pages, but it is kept in the ItemId
	 * rather than in the tuple, and so was never part of what got
	 * fingerprinted.  (Dead-to-everyone IndexTuples may well be in the
	 * filter too; that is harmless, since only the absence of a required
	 * tuple is reported.)
	 *
	 * This includes relying on deterministic TOAST compression within
	 * index_form_tuple().  Should compression ever be applied differently
	 * from one call to the next (or datums be compressed out-of-line), the
	 * probe below would start reporting corruption that does not exist.  No
	 * attempt is made to decompress/normalize toasted values here.
	 */
	probe = index_form_tuple(RelationGetDescr(index), values, isnull);
	probe->t_tid = htup->t_self;
	probelen = IndexTupleSize(probe);

	/* A tuple that should be indexed must be in the Bloom filter */
	missing = bloom_lacks_element(cstate->filter, (unsigned char *) probe,
								  probelen);

	if (missing)
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
						ItemPointerGetBlockNumber(&(probe->t_tid)),
						ItemPointerGetOffsetNumber(&(probe->t_tid)),
						RelationGetRelationName(cstate->heaprel),
						RelationGetRelationName(cstate->rel)),
		/* Only !readonly callers are pointed at the more thorough check */
				 cstate->readonly
				 ? 0
				 : errhint("Retrying verification using the function bt_index_parent_check() might provide a more specific error.")));

	/* Tuple accounted for; bump the debug counter and release the probe */
	cstate->heaptuplespresent++;
	pfree(probe);
}
/*
* Is particular offset within page (whose special state is passed by caller)
* the page negative-infinity item?
......
......@@ -9,13 +9,13 @@
<para>
The <filename>amcheck</filename> module provides functions that allow you to
verify the logical consistency of the structure of indexes. If the
verify the logical consistency of the structure of relations. If the
structure appears to be valid, no error is raised.
</para>
<para>
The functions verify various <emphasis>invariants</emphasis> in the
structure of the representation of particular indexes. The
structure of the representation of particular relations. The
correctness of the access method functions behind index scans and
other important operations relies on these invariants always
holding. For example, certain functions verify, among other things,
......@@ -44,7 +44,7 @@
<variablelist>
<varlistentry>
<term>
<function>bt_index_check(index regclass) returns void</function>
<function>bt_index_check(index regclass, heapallindexed boolean) returns void</function>
<indexterm>
<primary>bt_index_check</primary>
</indexterm>
......@@ -55,7 +55,9 @@
<function>bt_index_check</function> tests that its target, a
B-Tree index, respects a variety of invariants. Example usage:
<screen>
test=# SELECT bt_index_check(c.oid), c.relname, c.relpages
test=# SELECT bt_index_check(index =&gt; c.oid, heapallindexed =&gt; i.indisunique),
c.relname,
c.relpages
FROM pg_index i
JOIN pg_opclass op ON i.indclass[0] = op.oid
JOIN pg_am am ON op.opcmethod = am.oid
......@@ -83,9 +85,11 @@ ORDER BY c.relpages DESC LIMIT 10;
</screen>
This example shows a session that performs verification of every
catalog index in the database <quote>test</quote>. Details of just
the 10 largest indexes verified are displayed. Since no error
is raised, all indexes tested appear to be logically consistent.
Naturally, this query could easily be changed to call
the 10 largest indexes verified are displayed. Verification of
the presence of heap tuples as index tuples is requested for
unique indexes only. Since no error is raised, all indexes
tested appear to be logically consistent. Naturally, this query
could easily be changed to call
<function>bt_index_check</function> for every index in the
database where verification is supported.
</para>
......@@ -95,10 +99,11 @@ ORDER BY c.relpages DESC LIMIT 10;
is the same lock mode acquired on relations by simple
<literal>SELECT</literal> statements.
<function>bt_index_check</function> does not verify invariants
that span child/parent relationships, nor does it verify that
the target index is consistent with its heap relation. When a
routine, lightweight test for corruption is required in a live
production environment, using
that span child/parent relationships, but will verify the
presence of all heap tuples as index tuples within the index
when <parameter>heapallindexed</parameter> is
<literal>true</literal>. When a routine, lightweight test for
corruption is required in a live production environment, using
<function>bt_index_check</function> often provides the best
trade-off between thoroughness of verification and limiting the
impact on application performance and availability.
......@@ -108,7 +113,7 @@ ORDER BY c.relpages DESC LIMIT 10;
<varlistentry>
<term>
<function>bt_index_parent_check(index regclass) returns void</function>
<function>bt_index_parent_check(index regclass, heapallindexed boolean) returns void</function>
<indexterm>
<primary>bt_index_parent_check</primary>
</indexterm>
......@@ -117,19 +122,21 @@ ORDER BY c.relpages DESC LIMIT 10;
<listitem>
<para>
<function>bt_index_parent_check</function> tests that its
target, a B-Tree index, respects a variety of invariants. The
checks performed by <function>bt_index_parent_check</function>
are a superset of the checks performed by
<function>bt_index_check</function>.
target, a B-Tree index, respects a variety of invariants.
Optionally, when the <parameter>heapallindexed</parameter>
argument is <literal>true</literal>, the function verifies the
presence of all heap tuples that should be found within the
index. The checks that can be performed by
<function>bt_index_parent_check</function> are a superset of the
checks that can be performed by <function>bt_index_check</function>.
<function>bt_index_parent_check</function> can be thought of as
a more thorough variant of <function>bt_index_check</function>:
unlike <function>bt_index_check</function>,
<function>bt_index_parent_check</function> also checks
invariants that span parent/child relationships. However, it
does not verify that the target index is consistent with its
heap relation. <function>bt_index_parent_check</function>
follows the general convention of raising an error if it finds a
logical inconsistency or other problem.
invariants that span parent/child relationships.
<function>bt_index_parent_check</function> follows the general
convention of raising an error if it finds a logical
inconsistency or other problem.
</para>
<para>
A <literal>ShareLock</literal> is required on the target index by
......@@ -158,6 +165,47 @@ ORDER BY c.relpages DESC LIMIT 10;
</variablelist>
</sect2>
<sect2>
<title>Optional <parameter>heapallindexed</parameter> verification</title>
<para>
When the <parameter>heapallindexed</parameter> argument to
verification functions is <literal>true</literal>, an additional
phase of verification is performed against the table associated with
the target index relation. This consists of a <quote>dummy</quote>
<command>CREATE INDEX</command> operation, which checks for the
presence of all hypothetical new index tuples against a temporary,
in-memory summarizing structure (this is built when needed during
the basic first phase of verification). The summarizing structure
<quote>fingerprints</quote> every tuple found within the target
index. The high level principle behind
<parameter>heapallindexed</parameter> verification is that a new
index that is equivalent to the existing, target index must only
have entries that can be found in the existing structure.
</para>
<para>
The additional <parameter>heapallindexed</parameter> phase adds
significant overhead: verification will typically take several times
longer. However, there is no change to the relation-level locks
acquired when <parameter>heapallindexed</parameter> verification is
performed.
</para>
<para>
The summarizing structure is bounded in size by
<varname>maintenance_work_mem</varname>. In order to ensure that
there is no more than a 2% probability of failure to detect an
inconsistency for each heap tuple that should be represented in the
index, approximately 2 bytes of memory are needed per tuple. As
less memory is made available per tuple, the probability of missing
an inconsistency slowly increases. This approach limits the
overhead of verification significantly, while only slightly reducing
the probability of detecting a problem, especially for installations
where verification is treated as a routine maintenance task. Any
single absent or malformed tuple has a new opportunity to be
detected with each new verification attempt.
</para>
</sect2>
<sect2>
<title>Using <filename>amcheck</filename> effectively</title>
......@@ -197,18 +245,31 @@ ORDER BY c.relpages DESC LIMIT 10;
operating system locales and collations.
</para>
</listitem>
<listitem>
<para>
Structural inconsistencies between indexes and the heap relations
that are indexed (when <parameter>heapallindexed</parameter>
verification is performed).
</para>
<para>
There is no cross-checking of indexes against their heap relation
during normal operation. Symptoms of heap corruption can be subtle.
</para>
</listitem>
<listitem>
<para>
Corruption caused by hypothetical undiscovered bugs in the
underlying <productname>PostgreSQL</productname> access method code or sort
code.
underlying <productname>PostgreSQL</productname> access method
code, sort code, or transaction management code.
</para>
<para>
Automatic verification of the structural integrity of indexes
plays a role in the general testing of new or proposed
<productname>PostgreSQL</productname> features that could plausibly allow a
logical inconsistency to be introduced. One obvious testing
strategy is to call <filename>amcheck</filename> functions continuously
logical inconsistency to be introduced. Verification of table
structure and associated visibility and transaction status
information plays a similar role. One obvious testing strategy
is to call <filename>amcheck</filename> functions continuously
when running the standard regression tests. See <xref
linkend="regress-run"/> for details on running the tests.
</para>
......@@ -242,6 +303,12 @@ ORDER BY c.relpages DESC LIMIT 10;
<emphasis>absolute</emphasis> protection against failures that
result in memory corruption.
</para>
<para>
When <parameter>heapallindexed</parameter> verification is
performed, there is generally a greatly increased chance of
detecting single-bit errors, since strict binary equality is
tested, and the indexed attributes within the heap are tested.
</para>
</listitem>
</itemizedlist>
In general, <filename>amcheck</filename> can only prove the presence of
......@@ -253,11 +320,10 @@ ORDER BY c.relpages DESC LIMIT 10;
<title>Repairing corruption</title>
<para>
No error concerning corruption raised by <filename>amcheck</filename> should
ever be a false positive. In practice, <filename>amcheck</filename> is more
likely to find software bugs than problems with hardware.
<filename>amcheck</filename> raises errors in the event of conditions that,
by definition, should never happen, and so careful analysis of
<filename>amcheck</filename> errors is often required.
ever be a false positive. <filename>amcheck</filename> raises
errors in the event of conditions that, by definition, should never
happen, and so careful analysis of <filename>amcheck</filename>
errors is often required.
</para>
<para>
There is no general method of repairing problems that
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment