Commit 061b079f authored by Tom Lane's avatar Tom Lane

Fix multiple bugs in index page locking during hot-standby WAL replay.

In ordinary operation, VACUUM must be careful to take a cleanup lock on
each leaf page of a btree index; this ensures that no indexscans could
still be "in flight" to heap tuples due to be deleted.  (Because of
possible index-tuple motion due to concurrent page splits, it's not enough
to lock only the pages we're deleting index tuples from.)  In Hot Standby,
the WAL replay process must likewise lock every leaf page.  There were
several bugs in the code for that:

* The replay scan might come across unused, all-zero pages in the index.
While btree_xlog_vacuum itself did the right thing (ie, nothing) with
such pages, xlogutils.c supposed that such pages must be corrupt and
would throw an error.  This accounts for various reports of replication
failures with "PANIC: WAL contains references to invalid pages".  To
fix, add a ReadBufferMode value that instructs XLogReadBufferExtended
not to complain when we're doing this.

* btree_xlog_vacuum performed the extra locking if standbyState ==
STANDBY_SNAPSHOT_READY, but that's not the correct test: we won't open up
for hot standby queries until the database has reached consistency, and
we don't want to do the extra locking till then either, for fear of reading
corrupted pages (which bufmgr.c would complain about).  Fix by exporting a
new function from xlog.c that will report whether we're actually in hot
standby replay mode.

* To ensure full coverage of the index in the replay scan, btvacuumscan
would emit a dummy WAL record for the last page of the index, if no
vacuuming work had been done on that page.  However, if the last page
of the index is all-zero, that would result in corruption of said page,
since the functions called on it weren't prepared to handle that case.
There's no need to lock any such pages, so change the logic to target
the last normal leaf page instead.

The first two of these bugs were diagnosed by Andres Freund, the other one
by me.  Fixes based on ideas from Heikki Linnakangas and myself.

This has been wrong since Hot Standby was introduced, so back-patch to 9.0.
parent 16cad3e8
...@@ -56,8 +56,8 @@ typedef struct ...@@ -56,8 +56,8 @@ typedef struct
IndexBulkDeleteCallback callback; IndexBulkDeleteCallback callback;
void *callback_state; void *callback_state;
BTCycleId cycleid; BTCycleId cycleid;
BlockNumber lastBlockVacuumed; /* last blkno reached by Vacuum scan */ BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
BlockNumber lastUsedPage; /* blkno of last non-recyclable page */ BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
BlockNumber totFreePages; /* true total # of free pages */ BlockNumber totFreePages; /* true total # of free pages */
MemoryContext pagedelcontext; MemoryContext pagedelcontext;
} BTVacState; } BTVacState;
...@@ -763,7 +763,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, ...@@ -763,7 +763,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
vstate.callback_state = callback_state; vstate.callback_state = callback_state;
vstate.cycleid = cycleid; vstate.cycleid = cycleid;
vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
vstate.lastUsedPage = BTREE_METAPAGE; vstate.lastBlockLocked = BTREE_METAPAGE;
vstate.totFreePages = 0; vstate.totFreePages = 0;
/* Create a temporary memory context to run _bt_pagedel in */ /* Create a temporary memory context to run _bt_pagedel in */
...@@ -819,27 +819,30 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, ...@@ -819,27 +819,30 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
} }
/* /*
* InHotStandby we need to scan right up to the end of the index for * If the WAL is replayed in hot standby, the replay process needs to get
* correct locking, so we may need to write a WAL record for the final * cleanup locks on all index leaf pages, just as we've been doing here.
* block in the index if it was not vacuumed. It's possible that VACUUMing * However, we won't issue any WAL records about pages that have no items
* has actually removed zeroed pages at the end of the index so we need to * to be deleted. For pages between pages we've vacuumed, the replay code
* take care to issue the record for last actual block and not for the * will take locks under the direction of the lastBlockVacuumed fields in
* last block that was scanned. Ignore empty indexes. * the XLOG_BTREE_VACUUM WAL records. To cover pages after the last one
* we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
* against the last leaf page in the index, if that one wasn't vacuumed.
*/ */
if (XLogStandbyInfoActive() && if (XLogStandbyInfoActive() &&
num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1)) vstate.lastBlockVacuumed < vstate.lastBlockLocked)
{ {
Buffer buf; Buffer buf;
/* /*
* We can't use _bt_getbuf() here because it always applies * The page should be valid, but we can't use _bt_getbuf() because we
* _bt_checkpage(), which will barf on an all-zero page. We want to * want to use a nondefault buffer access strategy. Since we aren't
* recycle all-zero pages, not fail. Also, we want to use a * going to delete any items, getting cleanup lock again is probably
* nondefault buffer access strategy. * overkill, but for consistency do that anyway.
*/ */
buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL, buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
info->strategy); RBM_NORMAL, info->strategy);
LockBufferForCleanup(buf); LockBufferForCleanup(buf);
_bt_checkpage(rel, buf);
_bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
_bt_relbuf(rel, buf); _bt_relbuf(rel, buf);
} }
...@@ -914,10 +917,6 @@ restart: ...@@ -914,10 +917,6 @@ restart:
} }
} }
/* If the page is in use, update lastUsedPage */
if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno)
vstate->lastUsedPage = blkno;
/* Page is valid, see what to do with it */ /* Page is valid, see what to do with it */
if (_bt_page_recyclable(page)) if (_bt_page_recyclable(page))
{ {
...@@ -953,6 +952,13 @@ restart: ...@@ -953,6 +952,13 @@ restart:
LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBufferForCleanup(buf); LockBufferForCleanup(buf);
/*
* Remember highest leaf page number we've taken cleanup lock on; see
* notes in btvacuumscan
*/
if (blkno > vstate->lastBlockLocked)
vstate->lastBlockLocked = blkno;
/* /*
* Check whether we need to recurse back to earlier pages. What we * Check whether we need to recurse back to earlier pages. What we
* are concerned about is a page split that happened since we started * are concerned about is a page split that happened since we started
...@@ -1019,19 +1025,26 @@ restart: ...@@ -1019,19 +1025,26 @@ restart:
*/ */
if (ndeletable > 0) if (ndeletable > 0)
{ {
BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf); /*
* Notice that the issued XLOG_BTREE_VACUUM WAL record includes an
* instruction to the replay code to get cleanup lock on all pages
* between the previous lastBlockVacuumed and this page. This
* ensures that WAL replay locks all leaf pages at some point.
*
* Since we can visit leaf pages out-of-order when recursing,
* replay might end up locking such pages an extra time, but it
* doesn't seem worth the amount of bookkeeping it'd take to avoid
* that.
*/
_bt_delitems_vacuum(rel, buf, deletable, ndeletable, _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
vstate->lastBlockVacuumed); vstate->lastBlockVacuumed);
/* /*
* Keep track of the block number of the lastBlockVacuumed, so we * Remember highest leaf page number we've issued a
* can scan those blocks as well during WAL replay. This then * XLOG_BTREE_VACUUM WAL record for.
* provides concurrency protection and allows btrees to be used
* while in recovery.
*/ */
if (lastBlockVacuumed > vstate->lastBlockVacuumed) if (blkno > vstate->lastBlockVacuumed)
vstate->lastBlockVacuumed = lastBlockVacuumed; vstate->lastBlockVacuumed = blkno;
stats->tuples_removed += ndeletable; stats->tuples_removed += ndeletable;
/* must recompute maxoff */ /* must recompute maxoff */
......
...@@ -482,28 +482,47 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) ...@@ -482,28 +482,47 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
BTPageOpaque opaque; BTPageOpaque opaque;
/* /*
* If queries might be active then we need to ensure every block is * If queries might be active then we need to ensure every leaf page is
* unpinned between the lastBlockVacuumed and the current block, if there * unpinned between the lastBlockVacuumed and the current block, if there
* are any. This ensures that every block in the index is touched during * are any. This prevents replay of the VACUUM from reaching the stage of
* VACUUM as required to ensure scans work correctly. * removing heap tuples while there could still be indexscans "in flight"
* to those particular tuples (see nbtree/README).
*
* It might be worth checking if there are actually any backends running;
* if not, we could just skip this.
*
* Since VACUUM can visit leaf pages out-of-order, it might issue records
* with lastBlockVacuumed >= block; that's not an error, it just means
* nothing to do now.
*
* Note: since we touch all pages in the range, we will lock non-leaf
* pages, and also any empty (all-zero) pages that may be in the index. It
* doesn't seem worth the complexity to avoid that. But it's important
* that HotStandbyActiveInReplay() will not return true if the database
* isn't yet consistent; so we need not fear reading still-corrupt blocks
* here during crash recovery.
*/ */
if (standbyState == STANDBY_SNAPSHOT_READY && if (HotStandbyActiveInReplay())
(xlrec->lastBlockVacuumed + 1) != xlrec->block)
{ {
BlockNumber blkno = xlrec->lastBlockVacuumed + 1; BlockNumber blkno;
for (; blkno < xlrec->block; blkno++) for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
{ {
/* /*
* We use RBM_NORMAL_NO_LOG mode because it's not an error
* condition to see all-zero pages. The original btvacuumpage
* scan would have skipped over all-zero pages, noting them in FSM
* but not bothering to initialize them just yet; so we mustn't
* throw an error here. (We could skip acquiring the cleanup lock
* if PageIsNew, but it's probably not worth the cycles to test.)
*
* XXX we don't actually need to read the block, we just need to * XXX we don't actually need to read the block, we just need to
* confirm it is unpinned. If we had a special call into the * confirm it is unpinned. If we had a special call into the
* buffer manager we could optimise this so that if the block is * buffer manager we could optimise this so that if the block is
* not in shared_buffers we confirm it as unpinned. * not in shared_buffers we confirm it as unpinned.
*
* Another simple optimization would be to check if there's any
* backends running; if not, we could just skip this.
*/ */
buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, RBM_NORMAL); buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
RBM_NORMAL_NO_LOG);
if (BufferIsValid(buffer)) if (BufferIsValid(buffer))
{ {
LockBufferForCleanup(buffer); LockBufferForCleanup(buffer);
......
...@@ -7562,7 +7562,8 @@ RecoveryInProgress(void) ...@@ -7562,7 +7562,8 @@ RecoveryInProgress(void)
* true. Postmaster knows this by way of signal, not via shared memory. * true. Postmaster knows this by way of signal, not via shared memory.
* *
* Unlike testing standbyState, this works in any process that's connected to * Unlike testing standbyState, this works in any process that's connected to
* shared memory. * shared memory. (And note that standbyState alone doesn't tell the truth
* anyway.)
*/ */
bool bool
HotStandbyActive(void) HotStandbyActive(void)
...@@ -7588,6 +7589,17 @@ HotStandbyActive(void) ...@@ -7588,6 +7589,17 @@ HotStandbyActive(void)
} }
} }
/*
* Like HotStandbyActive(), but to be used only in WAL replay code,
* where we don't need to ask any other process what the state is.
*/
bool
HotStandbyActiveInReplay(void)
{
Assert(AmStartupProcess());
return LocalHotStandbyActive;
}
/* /*
* Is this process allowed to insert new WAL records? * Is this process allowed to insert new WAL records?
* *
......
...@@ -288,6 +288,10 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) ...@@ -288,6 +288,10 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
* *
* In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
* relation is extended with all-zeroes pages up to the given block number. * relation is extended with all-zeroes pages up to the given block number.
*
* In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
* exist, and we don't check for all-zeroes. Thus, no log entry is made
* to imply that the page should be dropped or truncated later.
*/ */
Buffer Buffer
XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
...@@ -328,6 +332,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, ...@@ -328,6 +332,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
log_invalid_page(rnode, forknum, blkno, false); log_invalid_page(rnode, forknum, blkno, false);
return InvalidBuffer; return InvalidBuffer;
} }
if (mode == RBM_NORMAL_NO_LOG)
return InvalidBuffer;
/* OK to extend the file */ /* OK to extend the file */
/* we do this in recovery only - no rel-extension lock needed */ /* we do this in recovery only - no rel-extension lock needed */
Assert(InRecovery); Assert(InRecovery);
......
...@@ -207,7 +207,8 @@ ReadBuffer(Relation reln, BlockNumber blockNum) ...@@ -207,7 +207,8 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
* Assume when this function is called, that reln has been opened already. * Assume when this function is called, that reln has been opened already.
* *
* In RBM_NORMAL mode, the page is read from disk, and the page header is * In RBM_NORMAL mode, the page is read from disk, and the page header is
* validated. An error is thrown if the page header is not valid. * validated. An error is thrown if the page header is not valid. (But
* note that an all-zero page is considered "valid"; see PageIsVerified().)
* *
* RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
* valid, the page is zeroed instead of throwing an error. This is intended * valid, the page is zeroed instead of throwing an error. This is intended
...@@ -221,6 +222,8 @@ ReadBuffer(Relation reln, BlockNumber blockNum) ...@@ -221,6 +222,8 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
* current physical EOF; that is likely to cause problems in md.c when * current physical EOF; that is likely to cause problems in md.c when
* the page is modified and written out. P_NEW is OK, though. * the page is modified and written out. P_NEW is OK, though.
* *
* RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
*
* If strategy is not NULL, a nondefault buffer access strategy is used. * If strategy is not NULL, a nondefault buffer access strategy is used.
* See buffer/README for details. * See buffer/README for details.
*/ */
......
...@@ -300,6 +300,7 @@ extern void issue_xlog_fsync(int fd, XLogSegNo segno); ...@@ -300,6 +300,7 @@ extern void issue_xlog_fsync(int fd, XLogSegNo segno);
extern bool RecoveryInProgress(void); extern bool RecoveryInProgress(void);
extern bool HotStandbyActive(void); extern bool HotStandbyActive(void);
extern bool HotStandbyActiveInReplay(void);
extern bool XLogInsertAllowed(void); extern bool XLogInsertAllowed(void);
extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream); extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream);
extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);
......
...@@ -38,7 +38,9 @@ typedef enum ...@@ -38,7 +38,9 @@ typedef enum
RBM_NORMAL, /* Normal read */ RBM_NORMAL, /* Normal read */
RBM_ZERO, /* Don't read from disk, caller will RBM_ZERO, /* Don't read from disk, caller will
* initialize */ * initialize */
RBM_ZERO_ON_ERROR /* Read, but return an all-zeros page on error */ RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
* replay; otherwise same as RBM_NORMAL */
} ReadBufferMode; } ReadBufferMode;
/* in globals.c ... this duplicates miscadmin.h */ /* in globals.c ... this duplicates miscadmin.h */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment