Commit a507b869 authored by Robert Haas

Add WAL consistency checking facility.

When the new GUC wal_consistency_checking is set to a non-empty value,
it triggers recording of additional full-page images, which are
compared on the standby against the results of applying the WAL record
(without regard to those full-page images).  Allowable differences
such as hints are masked out, and the resulting pages are compared;
any difference results in a FATAL error on the standby.

Kuntal Ghosh, based on earlier patches by Michael Paquier and Heikki
Linnakangas.  Extensively reviewed and revised by Michael Paquier and
by me, with additional reviews and comments from Amit Kapila, Álvaro
Herrera, Simon Riggs, and Peter Eisentraut.
parent 115cb315
......@@ -8184,6 +8184,38 @@ LOG: CleanUpLock: deleting: lock(0xb7acd844) id(24688,24696,0,0,0,1)
</listitem>
</varlistentry>
<varlistentry id="guc-wal-consistency-checking" xreflabel="wal_consistency_checking">
<term><varname>wal_consistency_checking</varname> (<type>string</type>)
<indexterm>
<primary><varname>wal_consistency_checking</> configuration parameter</primary>
</indexterm>
</term>
<listitem>
<para>
This parameter is intended to be used to check for bugs in the WAL
redo routines. When enabled, full-page images of any buffers modified
in conjunction with the WAL record are added to the record.
If the record is subsequently replayed, the system will first apply
each record and then test whether the buffers modified by the record
match the stored images. In certain cases (such as hint bits), minor
variations are acceptable, and will be ignored. Any unexpected
differences will result in a fatal error, terminating recovery.
</para>
<para>
The default value of this setting is the empty string, which disables
the feature. It can be set to <literal>all</literal> to check all
records, or to a comma-separated list of resource managers to check
only records originating from those resource managers. Currently,
the supported resource managers are <literal>heap</>,
<literal>heap2</>, <literal>btree</>, <literal>gin</>,
<literal>gist</>, <literal>sequence</>, <literal>spgist</>,
<literal>brin</>, and <literal>generic</>. Only
superusers can change this setting.
</para>
</listitem>
</varlistentry>
<varlistentry id="guc-wal-debug" xreflabel="wal_debug">
<term><varname>wal_debug</varname> (<type>boolean</type>)
<indexterm>
......
......@@ -13,6 +13,7 @@
#include "access/brin_page.h"
#include "access/brin_pageops.h"
#include "access/brin_xlog.h"
#include "access/bufmask.h"
#include "access/xlogutils.h"
......@@ -279,3 +280,22 @@ brin_redo(XLogReaderState *record)
elog(PANIC, "brin_redo: unknown op code %u", info);
}
}
/*
 * brin_mask
 *
 * Prepare a BRIN page image for a WAL consistency comparison.  The page LSN
 * and the page-header hint bits are always masked; the unused space is
 * masked only when the page is a regular BRIN page.
 *
 * blkno is part of the common mask-routine signature and is not used here.
 */
void
brin_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;

	mask_page_lsn(page);
	mask_page_hint_bits(page);

	/* Nothing more to do unless this is a regular BRIN page. */
	if (!BRIN_IS_REGULAR_PAGE(page))
		return;

	/* Regular pages carry unused space, which must not be compared. */
	mask_unused_space(page);
}
......@@ -12,7 +12,7 @@ subdir = src/backend/access/common
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = heaptuple.o indextuple.o printsimple.o printtup.o reloptions.o \
scankey.o tupconvert.o tupdesc.o
OBJS = bufmask.o heaptuple.o indextuple.o printsimple.o printtup.o \
reloptions.o scankey.o tupconvert.o tupdesc.o
include $(top_srcdir)/src/backend/common.mk
/*-------------------------------------------------------------------------
*
* bufmask.c
* Routines for buffer masking. Used to mask certain bits
* in a page which can be different when the WAL is generated
* and when the WAL is applied.
*
* Portions Copyright (c) 2016, PostgreSQL Global Development Group
*
* Contains common routines required for masking a page.
*
* IDENTIFICATION
* src/backend/access/common/bufmask.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/bufmask.h"
/*
 * mask_page_lsn
 *
 * In consistency checks, the LSN of the two pages compared will likely be
 * different because of concurrent operations when the WAL is generated
 * and the state of the page when WAL is applied.
 *
 * The page checksum is masked along with the LSN.  The checksum is only
 * recomputed when a page is written out, so the two in-memory images can
 * carry different stale values; and once any other part of the page has
 * been masked the stored checksum no longer describes the image anyway.
 * Leaving it in place would produce false mismatches when data checksums
 * are enabled.
 */
void
mask_page_lsn(Page page)
{
	PageHeader	phdr = (PageHeader) page;

	PageXLogRecPtrSet(phdr->pd_lsn, (uint64) MASK_MARKER);
	phdr->pd_checksum = MASK_MARKER;
}
/*
 * mask_page_hint_bits
 *
 * Neutralize the hint-like fields of the page header.  None of them is
 * guaranteed to be WAL-logged, so they must not take part in a consistency
 * comparison.
 */
void
mask_page_hint_bits(Page page)
{
	/*
	 * The all-visible flag is not set during replay when the page LSN has
	 * already advanced past the record's LSN; see heap_xlog_visible().
	 */
	PageClearAllVisible(page);

	/* PD_HAS_FREE_LINES and PD_PAGE_FULL are nothing more than hints. */
	PageClearHasFreeLinePointers(page);
	PageClearFull(page);

	/* prune_xid behaves like a hint bit as well. */
	((PageHeader) page)->pd_prune_xid = MASK_MARKER;
}
/*
 * mask_unused_space
 *
 * Mask the unused space of a page between pd_lower and pd_upper.
 *
 * The header offsets are validated first.  If pd_lower, pd_upper and
 * pd_special are not in ascending order, or pd_lower lies inside the page
 * header, or pd_special lies beyond BLCKSZ, an ERROR is raised instead of
 * letting the memset run over an arbitrary range.
 */
void
mask_unused_space(Page page)
{
	int			pd_lower = ((PageHeader) page)->pd_lower;
	int			pd_upper = ((PageHeader) page)->pd_upper;
	int			pd_special = ((PageHeader) page)->pd_special;

	/* Sanity check */
	if (pd_lower > pd_upper || pd_special < pd_upper ||
		pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ)
	{
		/*
		 * No trailing newline: the error reporting machinery terminates the
		 * message itself.  The locals are plain ints, hence %d.
		 */
		elog(ERROR, "invalid page pd_lower %d pd_upper %d pd_special %d",
			 pd_lower, pd_upper, pd_special);
	}

	memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower);
}
/*
 * mask_lp_flags
 *
 * Some index AMs change line pointer flags on the master without emitting
 * a WAL record.  To keep those changes out of the comparison, every line
 * pointer that is currently in use gets its lp_flags reset to LP_UNUSED.
 */
void
mask_lp_flags(Page page)
{
	OffsetNumber lastoff = PageGetMaxOffsetNumber(page);
	OffsetNumber curoff = FirstOffsetNumber;

	while (curoff <= lastoff)
	{
		ItemId		lp = PageGetItemId(page, curoff);

		if (ItemIdIsUsed(lp))
			lp->lp_flags = LP_UNUSED;

		curoff = OffsetNumberNext(curoff);
	}
}
/*
 * mask_page_content
 *
 * For deleted pages of some index AMs nearly the whole page has to be
 * ignored.  Everything after the page header is overwritten with the mask
 * marker, and so are the pd_lower and pd_upper header fields.
 */
void
mask_page_content(Page page)
{
	PageHeader	phdr = (PageHeader) page;

	/* Blank out the page body, i.e. all bytes following the header. */
	memset(page + SizeOfPageHeaderData, MASK_MARKER,
		   BLCKSZ - SizeOfPageHeaderData);

	/* pd_lower and pd_upper are masked as well. */
	memset(&phdr->pd_lower, MASK_MARKER, sizeof(uint16));
	memset(&phdr->pd_upper, MASK_MARKER, sizeof(uint16));
}
......@@ -13,6 +13,7 @@
*/
#include "postgres.h"
#include "access/bufmask.h"
#include "access/gin_private.h"
#include "access/xlogutils.h"
#include "utils/memutils.h"
......@@ -758,3 +759,34 @@ gin_xlog_cleanup(void)
MemoryContextDelete(opCtx);
opCtx = NULL;
}
/*
 * gin_mask
 *
 * Prepare a GIN page image for a WAL consistency comparison.  The page LSN
 * and header hint bits are always masked.  What happens next depends on
 * the flags stored in the GIN opaque area.
 *
 * blkno is part of the common mask-routine signature and is not used here.
 */
void
gin_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	GinPageOpaque ginopaque = GinPageGetOpaque(page);

	mask_page_lsn(page);
	mask_page_hint_bits(page);

	/*
	 * The GIN metapage does not use pd_lower/pd_upper, so there is no
	 * unused space to mask on it.
	 */
	if (ginopaque->flags == GIN_META)
		return;

	/*
	 * A GIN_DELETED page is initialized to empty, so its content is masked
	 * wholesale.  Any other page only has its unused space masked.
	 */
	if (ginopaque->flags & GIN_DELETED)
		mask_page_content(page);
	else
		mask_unused_space(page);
}
......@@ -13,6 +13,7 @@
*/
#include "postgres.h"
#include "access/bufmask.h"
#include "access/gist_private.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
......@@ -342,6 +343,48 @@ gist_xlog_cleanup(void)
MemoryContextDelete(opCtx);
}
/*
 * gist_mask
 *
 * Prepare a GiST page image for a WAL consistency comparison by
 * neutralizing everything that may legitimately differ between the image
 * logged on the master and the page produced by replay.
 *
 * blkno is part of the common mask-routine signature and is not used here.
 */
void
gist_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;

	mask_page_lsn(page);
	mask_page_hint_bits(page);
	mask_unused_space(page);

	/*
	 * The NSN is just a special-purpose LSN, so it is masked for the same
	 * reason the page LSN is.
	 */
	GistPageSetNSN(page, (uint64) MASK_MARKER);

	/*
	 * F_FOLLOW_RIGHT is updated on the left child only after the WAL record
	 * has been written (see gistplacetopage()), so put the flag into the
	 * same state on both images.
	 */
	GistMarkFollowRight(page);

	/*
	 * GiST redo never marks a page as having garbage; drop the flag so that
	 * it cannot cause a mismatch.
	 */
	GistClearPageHasGarbage(page);

	/*
	 * On leaf pages the line pointer flags can change without any WAL
	 * record being emitted; see gistkillitems().
	 */
	if (GistPageIsLeaf(page))
		mask_lp_flags(page);
}
/*
* Write WAL record of a page split.
*/
......
......@@ -38,6 +38,7 @@
*/
#include "postgres.h"
#include "access/bufmask.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/hio.h"
......@@ -9142,3 +9143,81 @@ heap_sync(Relation rel)
heap_close(toastrel, AccessShareLock);
}
}
/*
 * heap_mask
 *
 * Prepare a heap page image for a WAL consistency comparison.  Besides the
 * generic header masking, every line pointer is visited: normal tuples get
 * their hint bits, command id and (for speculative tuples) ctid
 * neutralized, and any alignment padding behind a stored item is masked.
 *
 * blkno is the block number of the page; it is used to rebuild t_ctid for
 * speculative tuples.
 */
void
heap_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	OffsetNumber off;

	mask_page_lsn(page);
	mask_page_hint_bits(page);
	mask_unused_space(page);

	for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
	{
		ItemId		lp = PageGetItemId(page, off);
		char	   *item = (char *) (page + ItemIdGetOffset(lp));
		int			itemlen;
		int			padding;

		if (ItemIdIsNormal(lp))
		{
			HeapTupleHeader htup = (HeapTupleHeader) item;

			if (HeapTupleHeaderXminFrozen(htup))
			{
				/*
				 * xmin is frozen, so only the xmax hint bits still need to
				 * be masked.
				 */
				htup->t_infomask &= ~(HEAP_XMAX_INVALID | HEAP_XMAX_COMMITTED);
			}
			else
			{
				/*
				 * xmin is not frozen yet: hint bits can be set without
				 * emitting WAL, so ignore all of them.
				 */
				htup->t_infomask &= ~HEAP_XACT_MASK;
			}

			/*
			 * Replay sets the command id to FirstCommandId (see
			 * heap_xlog_insert()), so the field cannot be compared.
			 */
			htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;

			/*
			 * For a speculative tuple, heap_insert() leaves a speculative
			 * token (a per-backend, monotonically increasing identifier)
			 * in the ctid field, and ctid is never WAL-logged.  During
			 * redo, heap_xlog_insert() sets t_ctid to the current block
			 * number and the tuple's own offset regardless of any
			 * speculative insertion on the master.  Apply the same value
			 * here so the difference is ignored.
			 */
			if (HeapTupleHeaderIsSpeculative(htup))
				ItemPointerSet(&htup->t_ctid, blkno, off);
		}

		/* Line pointers without storage have no padding to mask. */
		if (!ItemIdHasStorage(lp))
			continue;

		/*
		 * When the item length is not MAXALIGNed, the bytes up to the next
		 * alignment boundary are padding and must be ignored.
		 */
		itemlen = ItemIdGetLength(lp);
		padding = MAXALIGN(itemlen) - itemlen;
		if (padding > 0)
			memset(item + itemlen, MASK_MARKER, padding);
	}
}
......@@ -14,6 +14,7 @@
*/
#include "postgres.h"
#include "access/bufmask.h"
#include "access/heapam_xlog.h"
#include "access/nbtree.h"
#include "access/transam.h"
......@@ -1028,3 +1029,52 @@ btree_redo(XLogReaderState *record)
elog(PANIC, "btree_redo: unknown op code %u", info);
}
}
/*
 * btree_mask
 *
 * Prepare a btree page image for a WAL consistency comparison.
 *
 * blkno is part of the common mask-routine signature and is not used here.
 */
void
btree_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	BTPageOpaque opaque;

	mask_page_lsn(page);
	mask_page_hint_bits(page);
	mask_unused_space(page);

	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	if (P_ISDELETED(opaque))
	{
		/*
		 * A DELETED page is re-initialized during replay (see
		 * btree_xlog_unlink_page()), so its content is masked entirely.
		 */
		mask_page_content(page);
	}
	else if (P_ISLEAF(opaque))
	{
		/*
		 * On leaf pages the LP_FLAGS can be modified without emitting any
		 * WAL record; see _bt_killitems() and _bt_check_unique().
		 */
		mask_lp_flags(page);
	}

	/*
	 * BTP_HAS_GARBAGE is an un-logged hint bit (see _bt_killitems() and
	 * _bt_check_unique()).  BTP_SPLIT_END is not set on the right sibling
	 * when a page split is replayed (see btree_xlog_split()).  Clear both.
	 */
	opaque->btpo_flags &= ~(BTP_HAS_GARBAGE | BTP_SPLIT_END);

	/* Split replay also initializes the cycle id of that page to 0. */
	opaque->btpo_cycleid = 0;
}
......@@ -105,7 +105,12 @@ gin_desc(StringInfo buf, XLogReaderState *record)
leftChildBlkno, rightChildBlkno);
}
if (XLogRecHasBlockImage(record, 0))
{
if (XLogRecBlockImageApply(record, 0))
appendStringInfoString(buf, " (full page image)");
else
appendStringInfoString(buf, " (full page image, for WAL verification)");
}
else
{
char *payload = XLogRecGetBlockData(record, 0, NULL);
......@@ -145,7 +150,12 @@ gin_desc(StringInfo buf, XLogReaderState *record)
case XLOG_GIN_VACUUM_DATA_LEAF_PAGE:
{
if (XLogRecHasBlockImage(record, 0))
{
if (XLogRecBlockImageApply(record, 0))
appendStringInfoString(buf, " (full page image)");
else
appendStringInfoString(buf, " (full page image, for WAL verification)");
}
else
{
ginxlogVacuumDataLeafPage *xlrec =
......
......@@ -14,6 +14,7 @@
*/
#include "postgres.h"
#include "access/bufmask.h"
#include "access/spgist_private.h"
#include "access/transam.h"
#include "access/xlog.h"
......@@ -1023,3 +1024,23 @@ spg_xlog_cleanup(void)
MemoryContextDelete(opCtx);
opCtx = NULL;
}
/*
 * spg_mask
 *
 * Prepare an SP-GiST page image for a WAL consistency comparison.  The
 * page LSN and header hint bits are always masked; the unused space is
 * masked on every page except the metapage.
 *
 * blkno is part of the common mask-routine signature and is not used here.
 */
void
spg_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;

	mask_page_lsn(page);
	mask_page_hint_bits(page);

	/* The metapage is the only page type left as it is from here on. */
	if (SpGistPageIsMeta(page))
		return;

	/* All other SP-GiST pages contain unused space that needs masking. */
	mask_unused_space(page);
}
......@@ -13,6 +13,7 @@
*/
#include "postgres.h"
#include "access/bufmask.h"
#include "access/generic_xlog.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
......@@ -533,3 +534,14 @@ generic_redo(XLogReaderState *record)
UnlockReleaseBuffer(buffers[block_id]);
}
}
/*
 * generic_mask
 *
 * Mask a generic page before performing consistency checks on it.
 *
 * Only the page LSN and the unused space between pd_lower and pd_upper are
 * masked; unlike most other mask routines, the page-header hint bits are
 * left untouched here.
 *
 * page is the page image to mask in place.  blkno belongs to the common
 * mask-routine signature and is not used by this routine.
 */
void
generic_mask(char *page, BlockNumber blkno)
{
mask_page_lsn(page);
mask_unused_space(page);
}
......@@ -30,8 +30,8 @@
#include "utils/relmapper.h"
/* must be kept in sync with RmgrData definition in xlog_internal.h */
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \
{ name, redo, desc, identify, startup, cleanup },
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
{ name, redo, desc, identify, startup, cleanup, mask },
const RmgrData RmgrTable[RM_MAX_ID + 1] = {
#include "access/rmgrlist.h"
......
......@@ -95,6 +95,8 @@ bool EnableHotStandby = false;
bool fullPageWrites = true;
bool wal_log_hints = false;
bool wal_compression = false;
char *wal_consistency_checking_string = NULL;
bool *wal_consistency_checking = NULL;
bool log_checkpoints = false;
int sync_method = DEFAULT_SYNC_METHOD;
int wal_level = WAL_LEVEL_MINIMAL;
......@@ -245,6 +247,10 @@ bool InArchiveRecovery = false;
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;
/* Buffers dedicated to consistency checks of size BLCKSZ */
static char *replay_image_masked = NULL;
static char *master_image_masked = NULL;
/* options taken from recovery.conf for archive recovery */
char *recoveryRestoreCommand = NULL;
static char *recoveryEndCommand = NULL;
......@@ -903,6 +909,7 @@ static char *GetXLogBuffer(XLogRecPtr ptr);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
static void checkXLogConsistency(XLogReaderState *record);
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
......@@ -1314,6 +1321,103 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
return true;
}
/*
 * checkXLogConsistency
 *
 * Checks whether the current buffer page and the backup page stored in the
 * WAL record are consistent.  Before the two pages are compared, the
 * resource manager's masking routine (if it has one) is applied to both so
 * that areas such as hint bits or the unused space between pd_lower and
 * pd_upper are ignored.  This function should be called once WAL replay has
 * been completed for a given record.
 *
 * For every block referenced by the record, the replayed page is copied
 * into replay_image_masked and the image stored in the record is restored
 * into master_image_masked.  Any difference remaining after masking is
 * reported with elog(FATAL).
 */
static void
checkXLogConsistency(XLogReaderState *record)
{
RmgrId rmid = XLogRecGetRmid(record);
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blkno;
int block_id;
/* Records with no backup blocks have no need for consistency checks. */
if (!XLogRecHasAnyBlockRefs(record))
return;
/* The caller only invokes us for records flagged for checking. */
Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
for (block_id = 0; block_id <= record->max_block_id; block_id++)
{
Buffer buf;
Page page;
if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
{
/*
 * WAL record doesn't contain a block reference with the given id.
 * Do nothing.
 */
continue;
}
/* A record flagged for checking must carry an image for each block. */
Assert(XLogRecHasBlockImage(record, block_id));
/*
 * Read the contents from the current buffer and store it in a
 * temporary page.  If no valid buffer comes back, there is nothing to
 * compare against, so the block is skipped.
 */
buf = XLogReadBufferExtended(rnode, forknum, blkno,
RBM_NORMAL_NO_LOG);
if (!BufferIsValid(buf))
continue;
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buf);
/*
 * Take a copy of the local page where WAL has been applied, to have a
 * comparison base before masking it.  Masking is done on the copy, not
 * on the buffer itself.
 */
memcpy(replay_image_masked, page, BLCKSZ);
/* No need for this page anymore now that a copy is in. */
UnlockReleaseBuffer(buf);
/*
 * If the block LSN is already ahead of this WAL record, we can't
 * expect contents to match. This can happen if recovery is restarted.
 */
if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
continue;
/*
 * Read the contents from the backup copy stored in the WAL record and
 * store it in a temporary page. There is no need to allocate a new
 * page here; a local buffer is fine to hold its contents and a mask
 * can be directly applied on it.
 */
if (!RestoreBlockImage(record, block_id, master_image_masked))
elog(ERROR, "failed to restore block image");
/*
 * If a masking function is defined for this resource manager, mask
 * both the master and replay images.  Without one, the raw images are
 * compared as they are.
 */
if (RmgrTable[rmid].rm_mask != NULL)
{
RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
RmgrTable[rmid].rm_mask(master_image_masked, blkno);
}
/* Time to compare the master and replay images. */
if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
{
elog(FATAL,
"inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
rnode.spcNode, rnode.dbNode, rnode.relNode,
forknum, blkno);
}
}
}
/*
* Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved
* area in the WAL.
......@@ -6200,6 +6304,13 @@ StartupXLOG(void)
errdetail("Failed while allocating an XLog reading processor.")));
xlogreader->system_identifier = ControlFile->system_identifier;
/*
* Allocate pages dedicated to WAL consistency checks, those had better
* be aligned.
*/
replay_image_masked = (char *) palloc(BLCKSZ);
master_image_masked = (char *) palloc(BLCKSZ);
if (read_backup_label(&checkPointLoc, &backupEndRequired,
&backupFromStandby))
{
......@@ -7000,6 +7111,15 @@ StartupXLOG(void)
/* Now apply the WAL record itself */
RmgrTable[record->xl_rmid].rm_redo(xlogreader);
/*
* After redo, check whether the backup pages associated with
* the WAL record are consistent with the existing pages. This
* check is done only if consistency check is enabled for this
* record.
*/
if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
checkXLogConsistency(xlogreader);
/* Pop the error context stack */
error_context_stack = errcallback.previous;
......
......@@ -421,10 +421,12 @@ XLogInsert(RmgrId rmid, uint8 info)
elog(ERROR, "XLogBeginInsert was not called");
/*
* The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are
* reserved for use by me.
* The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and
* XLR_CHECK_CONSISTENCY; the rest are reserved for use by me.
*/
if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE)) != 0)
if ((info & ~(XLR_RMGR_INFO_MASK |
XLR_SPECIAL_REL_UPDATE |
XLR_CHECK_CONSISTENCY)) != 0)
elog(PANIC, "invalid xlog info mask %02X", info);
TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
......@@ -504,6 +506,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
rdt_datas_last = &hdr_rdt;
hdr_rdt.data = hdr_scratch;
/*
* Enforce consistency checks for this record if user is looking for
* it. Do this at the beginning of this routine to give the
* possibility for callers of XLogInsert() to pass XLR_CHECK_CONSISTENCY
* directly for a record.
*/
if (wal_consistency_checking[rmid])
info |= XLR_CHECK_CONSISTENCY;
/*
* Make an rdata chain containing all the data portions of all block
* references. This includes the data for full-page images. Also append
......@@ -520,6 +531,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
XLogRecordBlockCompressHeader cbimg = {0};
bool samerel;
bool is_compressed = false;
bool include_image;
if (!regbuf->in_use)
continue;
......@@ -563,7 +575,14 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT)
bkpb.fork_flags |= BKPBLOCK_WILL_INIT;
if (needs_backup)
/*
* If needs_backup is true or WAL checking is enabled for
* current resource manager, log a full-page write for the current
* block.
*/
include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0;
if (include_image)
{
Page page = regbuf->page;
uint16 compressed_len;
......@@ -625,6 +644,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE;
/*
* If WAL consistency checking is enabled for the resource manager of
* this WAL record, a full-page image is included in the record
* for the block modified. During redo, the full-page is replayed
* only if BKPIMAGE_APPLY is set.
*/
if (needs_backup)
bimg.bimg_info |= BKPIMAGE_APPLY;
if (is_compressed)
{
bimg.length = compressed_len;
......@@ -687,7 +715,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
/* Ok, copy the header to the scratch buffer */
memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader);
scratch += SizeOfXLogRecordBlockHeader;
if (needs_backup)
if (include_image)
{
memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader);
scratch += SizeOfXLogRecordBlockImageHeader;
......
......@@ -997,6 +997,7 @@ ResetDecoder(XLogReaderState *state)
state->blocks[block_id].in_use = false;
state->blocks[block_id].has_image = false;
state->blocks[block_id].has_data = false;
state->blocks[block_id].apply_image = false;
}
state->max_block_id = -1;
}
......@@ -1089,6 +1090,7 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
blk = &state->blocks[block_id];
blk->in_use = true;
blk->apply_image = false;
COPY_HEADER_FIELD(&fork_flags, sizeof(uint8));
blk->forknum = fork_flags & BKPBLOCK_FORK_MASK;
......@@ -1120,6 +1122,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16));
COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16));
COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8));
blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0);
if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED)
{
if (blk->bimg_info & BKPIMAGE_HAS_HOLE)
......@@ -1243,6 +1248,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
if (!blk->in_use)
continue;
Assert(blk->has_image || !blk->apply_image);
if (blk->has_image)
{
blk->bkp_image = ptr;
......
......@@ -275,9 +275,9 @@ XLogCheckInvalidPages(void)
* will complain if we don't have the lock. In hot standby mode it's
* definitely necessary.)
*
* Note: when a backup block is available in XLOG, we restore it
* unconditionally, even if the page in the database appears newer. This is
* to protect ourselves against database pages that were partially or
* Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag
* set, we restore it, even if the page in the database appears newer. This
* is to protect ourselves against database pages that were partially or
* incorrectly written during a crash. We assume that the XLOG data must be
* good because it has passed a CRC check, while the database page might not
* be. This will force us to replay all subsequent modifications of the page
......@@ -352,9 +352,10 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
if (!willinit && zeromode)
elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record");
/* If it's a full-page image, restore it. */
if (XLogRecHasBlockImage(record, block_id))
/* If it has a full-page image and it should be restored, do it. */
if (XLogRecBlockImageApply(record, block_id))
{
Assert(XLogRecHasBlockImage(record, block_id));
*buf = XLogReadBufferExtended(rnode, forknum, blkno,
get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
page = BufferGetPage(*buf);
......
......@@ -14,6 +14,7 @@
*/
#include "postgres.h"
#include "access/bufmask.h"
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/transam.h"
......@@ -1740,3 +1741,14 @@ ResetSequenceCaches(void)
last_used_seq = NULL;
}
/*
 * seq_mask
 *
 * Mask a Sequence page before performing consistency checks on it.
 *
 * Only the page LSN and the unused space between pd_lower and pd_upper are
 * masked; the page-header hint bits are not touched by this routine.
 *
 * page is the page image to mask in place.  blkno belongs to the common
 * mask-routine signature and is not used by this routine.
 */
void
seq_mask(char *page, BlockNumber blkno)
{
mask_page_lsn(page);
mask_unused_space(page);
}
......@@ -28,9 +28,11 @@
#include "access/commit_ts.h"
#include "access/gin.h"
#include "access/rmgr.h"
#include "access/transam.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "catalog/namespace.h"
#include "commands/async.h"
#include "commands/prepare.h"
......@@ -147,6 +149,10 @@ static bool call_enum_check_hook(struct config_enum * conf, int *newval,
static bool check_log_destination(char **newval, void **extra, GucSource source);
static void assign_log_destination(const char *newval, void *extra);
static bool check_wal_consistency_checking(char **newval, void **extra,
GucSource source);
static void assign_wal_consistency_checking(const char *newval, void *extra);
#ifdef HAVE_SYSLOG
static int syslog_facility = LOG_LOCAL0;
#else
......@@ -3572,6 +3578,17 @@ static struct config_string ConfigureNamesString[] =
check_cluster_name, NULL, NULL
},
{
{"wal_consistency_checking", PGC_SUSET, DEVELOPER_OPTIONS,
gettext_noop("Sets the WAL resource managers for which WAL consistency checks are done."),
gettext_noop("Full-page images will be logged for all data blocks and cross-checked against the results of WAL replay."),
GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE
},
&wal_consistency_checking_string,
"",
check_wal_consistency_checking, assign_wal_consistency_checking, NULL
},
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL
......@@ -9888,6 +9905,86 @@ call_enum_check_hook(struct config_enum * conf, int *newval, void **extra,
* check_hook, assign_hook and show_hook subroutines
*/
/*
 * GUC check hook for wal_consistency_checking.
 *
 * Parses *newval as a comma-separated identifier list.  Each element must
 * be either "all" or the name of a resource manager that provides a
 * masking routine; matching is case-insensitive.  On success, *extra
 * receives a guc_malloc'ed array of RM_MAX_ID + 1 booleans with one flag
 * per resource manager, and true is returned.  On a syntax error or an
 * unrecognized element, an error detail is recorded and false is returned.
 */
static bool
check_wal_consistency_checking(char **newval, void **extra, GucSource source)
{
	char	   *valcopy;
	List	   *tokens;
	ListCell   *cell;
	bool		enabled[RM_MAX_ID + 1];
	bool		valid = true;

	/* Start out with checking disabled for every resource manager. */
	MemSet(enabled, 0, sizeof(enabled));

	/* The splitter scribbles on its input, so work on a private copy. */
	valcopy = pstrdup(*newval);

	if (!SplitIdentifierString(valcopy, ',', &tokens))
	{
		/* syntax error in list */
		GUC_check_errdetail("List syntax is invalid.");
		valid = false;
	}
	else
	{
		foreach(cell, tokens)
		{
			char	   *tok = (char *) lfirst(cell);
			bool		matched = false;
			RmgrId		rmid;

			if (pg_strcasecmp(tok, "all") == 0)
			{
				/* "all" enables every resource manager that can mask. */
				for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
				{
					if (RmgrTable[rmid].rm_mask != NULL)
						enabled[rmid] = true;
				}
				matched = true;
			}
			else
			{
				/* Otherwise the token must name a maskable rmgr. */
				for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
				{
					if (pg_strcasecmp(tok, RmgrTable[rmid].rm_name) != 0)
						continue;
					if (RmgrTable[rmid].rm_mask == NULL)
						continue;

					enabled[rmid] = true;
					matched = true;
				}
			}

			if (!matched)
			{
				/*
				 * tok points into valcopy, so the detail must be recorded
				 * before the copy is released below.
				 */
				GUC_check_errdetail("Unrecognized key word: \"%s\".", tok);
				valid = false;
				break;
			}
		}
	}

	pfree(valcopy);
	list_free(tokens);

	if (!valid)
		return false;

	/* Hand the parsed flags over to the assign hook. */
	*extra = guc_malloc(ERROR, sizeof(enabled));
	memcpy(*extra, enabled, sizeof(enabled));
	return true;
}
static void
assign_wal_consistency_checking(const char *newval, void *extra)
{
wal_consistency_checking = (bool *) extra;
}
static bool
check_log_destination(char **newval, void **extra, GucSource source)
{
......
......@@ -29,7 +29,7 @@
* RmgrNames is an array of resource manager names, to make error messages
* a bit nicer.
*/
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
name,
static const char *RmgrNames[RM_MAX_ID + 1] = {
......
......@@ -465,7 +465,12 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record)
rnode.spcNode, rnode.dbNode, rnode.relNode,
blk);
if (XLogRecHasBlockImage(record, block_id))
{
if (XLogRecBlockImageApply(record, block_id))
printf(" FPW");
else
printf(" FPW for WAL verification");
}
}
putchar('\n');
}
......@@ -489,7 +494,10 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record)
if (record->blocks[block_id].bimg_info &
BKPIMAGE_IS_COMPRESSED)
{
printf(" (FPW); hole: offset: %u, length: %u, compression saved: %u\n",
printf(" (FPW%s); hole: offset: %u, length: %u, "
"compression saved: %u\n",
XLogRecBlockImageApply(record, block_id) ?
"" : " for WAL verification",
record->blocks[block_id].hole_offset,
record->blocks[block_id].hole_length,
BLCKSZ -
......@@ -498,7 +506,9 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record)
}
else
{
printf(" (FPW); hole: offset: %u, length: %u\n",
printf(" (FPW%s); hole: offset: %u, length: %u\n",
XLogRecBlockImageApply(record, block_id) ?
"" : " for WAL verification",
record->blocks[block_id].hole_offset,
record->blocks[block_id].hole_length);
}
......
......@@ -32,7 +32,7 @@
#include "storage/standbydefs.h"
#include "utils/relmapper.h"
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
{ name, desc, identify},
const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = {
......
......@@ -128,5 +128,6 @@ typedef struct xl_brin_revmap_extend
extern void brin_redo(XLogReaderState *record);
extern void brin_desc(StringInfo buf, XLogReaderState *record);
extern const char *brin_identify(uint8 info);
extern void brin_mask(char *pagedata, BlockNumber blkno);
#endif /* BRIN_XLOG_H */
/*-------------------------------------------------------------------------
*
* bufmask.h
* Definitions for buffer masking routines, used to mask certain bits
* in a page which can be different when the WAL is generated
* and when the WAL is applied. This is really the job of each
* individual rmgr, but we make things easier by providing some
* common routines to handle cases which occur in multiple rmgrs.
*
* Portions Copyright (c) 2016, PostgreSQL Global Development Group
*
* src/include/access/bufmask.h
*
*-------------------------------------------------------------------------
*/
#ifndef BUFMASK_H
#define BUFMASK_H
#include "postgres.h"
#include "storage/block.h"
#include "storage/bufmgr.h"
/* Marker used to mask pages consistently */
#define MASK_MARKER 0
extern void mask_page_lsn(Page page);
extern void mask_page_hint_bits(Page page);
extern void mask_unused_space(Page page);
extern void mask_lp_flags(Page page);
extern void mask_page_content(Page page);
#endif
......@@ -40,5 +40,6 @@ extern void GenericXLogAbort(GenericXLogState *state);
extern void generic_redo(XLogReaderState *record);
extern const char *generic_identify(uint8 info);
extern void generic_desc(StringInfo buf, XLogReaderState *record);
extern void generic_mask(char *pagedata, BlockNumber blkno);
#endif /* GENERIC_XLOG_H */
......@@ -79,5 +79,6 @@ extern void gin_desc(StringInfo buf, XLogReaderState *record);
extern const char *gin_identify(uint8 info);
extern void gin_xlog_startup(void);
extern void gin_xlog_cleanup(void);
extern void gin_mask(char *pagedata, BlockNumber blkno);
#endif /* GIN_H */
......@@ -459,6 +459,7 @@ extern void gist_desc(StringInfo buf, XLogReaderState *record);
extern const char *gist_identify(uint8 info);
extern void gist_xlog_startup(void);
extern void gist_xlog_cleanup(void);
extern void gist_mask(char *pagedata, BlockNumber blkno);
extern XLogRecPtr gistXLogUpdate(Buffer buffer,
OffsetNumber *todelete, int ntodelete,
......
......@@ -373,6 +373,7 @@ extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
extern void heap_redo(XLogReaderState *record);
extern void heap_desc(StringInfo buf, XLogReaderState *record);
extern const char *heap_identify(uint8 info);
extern void heap_mask(char *pagedata, BlockNumber blkno);
extern void heap2_redo(XLogReaderState *record);
extern void heap2_desc(StringInfo buf, XLogReaderState *record);
extern const char *heap2_identify(uint8 info);
......
......@@ -774,5 +774,6 @@ extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
extern void btree_redo(XLogReaderState *record);
extern void btree_desc(StringInfo buf, XLogReaderState *record);
extern const char *btree_identify(uint8 info);
extern void btree_mask(char *pagedata, BlockNumber blkno);
#endif /* NBTREE_H */
......@@ -19,7 +19,7 @@ typedef uint8 RmgrId;
* Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG
* file format.
*/
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \
#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
symname,
typedef enum RmgrIds
......
......@@ -25,25 +25,25 @@
*/
/* symbol name, textual name, redo, desc, identify, startup, cleanup, mask */
PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL)
PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL)
PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL)
PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL)
PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL)
PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL)
PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL)
PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL)
PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL)
PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL)
PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL)
PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL)
PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL)
PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup)
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup)
PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL)
PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup)
PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL)
PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL)
PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL)
PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL)
PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL)
PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL)
PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL)
PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL)
PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL)
PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL)
PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL)
PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL)
PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL)
PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask)
PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, NULL)
PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask)
PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask)
PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask)
PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL)
PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
......@@ -219,5 +219,6 @@ extern void spg_desc(StringInfo buf, XLogReaderState *record);
extern const char *spg_identify(uint8 info);
extern void spg_xlog_startup(void);
extern void spg_xlog_cleanup(void);
extern void spg_mask(char *pagedata, BlockNumber blkno);
#endif /* SPGIST_H */
......@@ -105,6 +105,8 @@ extern bool EnableHotStandby;
extern bool fullPageWrites;
extern bool wal_log_hints;
extern bool wal_compression;
extern bool *wal_consistency_checking;
extern char *wal_consistency_checking_string;
extern bool log_checkpoints;
extern int CheckPointSegments;
......
......@@ -31,7 +31,7 @@
/*
* Each page of XLOG file has a header like this:
*/
#define XLOG_PAGE_MAGIC 0xD094 /* can be used as WAL version indicator */
#define XLOG_PAGE_MAGIC 0xD095 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
......@@ -266,6 +266,9 @@ typedef enum
* "VACUUM". rm_desc can then be called to obtain additional detail for the
* record, if available (e.g. the last block).
*
* rm_mask takes as input a page modified by the resource manager and masks
* out bits that shouldn't be flagged by wal_consistency_checking.
*
* RmgrTable[] is indexed by RmgrId values (see rmgrlist.h).
*/
typedef struct RmgrData
......@@ -276,6 +279,7 @@ typedef struct RmgrData
const char *(*rm_identify) (uint8 info);
void (*rm_startup) (void);
void (*rm_cleanup) (void);
void (*rm_mask) (char *pagedata, BlockNumber blkno);
} RmgrData;
extern const RmgrData RmgrTable[];
......
......@@ -51,7 +51,8 @@ typedef struct
uint8 flags;
/* Information on full-page image, if any */
bool has_image;
bool has_image; /* has image, even for consistency checking */
bool apply_image; /* has image that should be restored */
char *bkp_image;
uint16 hole_offset;
uint16 hole_length;
......@@ -205,6 +206,8 @@ extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record,
((decoder)->blocks[block_id].in_use)
#define XLogRecHasBlockImage(decoder, block_id) \
((decoder)->blocks[block_id].has_image)
#define XLogRecBlockImageApply(decoder, block_id) \
((decoder)->blocks[block_id].apply_image)
/*
 * RestoreBlockImage
 *
 * Takes the reader state of a decoded record, the id of a block reference
 * within that record, and a destination buffer; returns a bool.
 *
 * NOTE(review): presumably copies the block's full-page image into dst and
 * returns false on failure -- the definition is not visible here, confirm.
 */
extern bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *dst);
extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len);
......
......@@ -56,8 +56,8 @@ typedef struct XLogRecord
/*
* The high 4 bits in xl_info may be used freely by rmgr. The
* XLR_SPECIAL_REL_UPDATE bit can be passed by XLogInsert caller. The rest
* are set internally by XLogInsert.
* XLR_SPECIAL_REL_UPDATE and XLR_CHECK_CONSISTENCY bits can be passed by
* XLogInsert caller. The rest are set internally by XLogInsert.
*/
#define XLR_INFO_MASK 0x0F
#define XLR_RMGR_INFO_MASK 0xF0
......@@ -70,6 +70,15 @@ typedef struct XLogRecord
*/
#define XLR_SPECIAL_REL_UPDATE 0x01
/*
 * Enforces consistency checks of replayed WAL at recovery. If enabled,
 * each record will log a full-page image for each block modified by the
 * record, and replay will compare the resulting page against that image.
 * The caller of XLogInsert can set this flag if necessary, but if
 * wal_consistency_checking is enabled for an rmgr it is set unconditionally.
 */
#define XLR_CHECK_CONSISTENCY 0x02
/*
* Header info for block data appended to an XLOG record.
*
......@@ -137,6 +146,7 @@ typedef struct XLogRecordBlockImageHeader
/* Information stored in bimg_info */
#define BKPIMAGE_HAS_HOLE 0x01 /* page image has "hole" */
#define BKPIMAGE_IS_COMPRESSED 0x02 /* page image is compressed */
#define BKPIMAGE_APPLY 0x04 /* page image should be restored during replay */
/*
* Extra header information used when page image has "hole" and
......
......@@ -62,5 +62,6 @@ extern void ResetSequenceCaches(void);
extern void seq_redo(XLogReaderState *rptr);
extern void seq_desc(StringInfo buf, XLogReaderState *rptr);
extern const char *seq_identify(uint8 info);
extern void seq_mask(char *pagedata, BlockNumber blkno);
#endif /* SEQUENCE_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment