Commit d59ff6c1 authored by Heikki Linnakangas's avatar Heikki Linnakangas

Fix bug in determining when recovery has reached consistency.

When starting WAL replay from an online checkpoint, the last replayed WAL
record variable was initialized using the checkpoint record's location, even
though the records between the REDO location and the checkpoint record had
not been replayed yet. That was noted as "slightly confusing" but harmless
in the comment, but in some cases, it fooled CheckRecoveryConsistency to
incorrectly conclude that we had already reached a consistent state
immediately at the beginning of WAL replay. That caused the system to accept
read-only connections in hot standby mode too early, and also PANICs with
message "WAL contains references to invalid pages".

Fix by initializing the variables to the REDO location instead.

In 9.2 and above, change CheckRecoveryConsistency() to use
lastReplayedEndRecPtr variable when checking if backup end location has
been reached. It was inconsistently using EndRecPtr for that check, but
lastReplayedEndRecPtr when checking min recovery point. It made no
difference before this patch, because in all the places where
CheckRecoveryConsistency was called the two variables were the same, but
it was always an accident waiting to happen, and would have been wrong
after this patch anyway.

Report and analysis by Tomonari Katsumata, bug #8686. Backpatch to 9.0,
where hot standby was introduced.
parent ca607b15
...@@ -6684,21 +6684,13 @@ StartupXLOG(void) ...@@ -6684,21 +6684,13 @@ StartupXLOG(void)
} }
/* /*
* Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and * Initialize shared variables for tracking progress of WAL replay,
* recoveryLastXTime. * as if we had just replayed the record before the REDO location.
*
* This is slightly confusing if we're starting from an online
* checkpoint; we've just read and replayed the checkpoint record, but
* we're going to start replay from its redo pointer, which precedes
* the location of the checkpoint record itself. So even though the
* last record we've replayed is indeed ReadRecPtr, we haven't
* replayed all the preceding records yet. That's OK for the current
* use of these variables.
*/ */
SpinLockAcquire(&xlogctl->info_lck); SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replayEndRecPtr = ReadRecPtr; xlogctl->replayEndRecPtr = checkPoint.redo;
xlogctl->replayEndTLI = ThisTimeLineID; xlogctl->replayEndTLI = ThisTimeLineID;
xlogctl->lastReplayedEndRecPtr = EndRecPtr; xlogctl->lastReplayedEndRecPtr = checkPoint.redo;
xlogctl->lastReplayedTLI = ThisTimeLineID; xlogctl->lastReplayedTLI = ThisTimeLineID;
xlogctl->recoveryLastXTime = 0; xlogctl->recoveryLastXTime = 0;
xlogctl->currentChunkStartTime = 0; xlogctl->currentChunkStartTime = 0;
...@@ -7388,6 +7380,8 @@ StartupXLOG(void) ...@@ -7388,6 +7380,8 @@ StartupXLOG(void)
static void static void
CheckRecoveryConsistency(void) CheckRecoveryConsistency(void)
{ {
XLogRecPtr lastReplayedEndRecPtr;
/* /*
* During crash recovery, we don't reach a consistent state until we've * During crash recovery, we don't reach a consistent state until we've
* replayed all the WAL. * replayed all the WAL.
...@@ -7395,11 +7389,17 @@ CheckRecoveryConsistency(void) ...@@ -7395,11 +7389,17 @@ CheckRecoveryConsistency(void)
if (XLogRecPtrIsInvalid(minRecoveryPoint)) if (XLogRecPtrIsInvalid(minRecoveryPoint))
return; return;
/*
* assume that we are called in the startup process, and hence don't need
* a lock to read lastReplayedEndRecPtr
*/
lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
/* /*
* Have we reached the point where our base backup was completed? * Have we reached the point where our base backup was completed?
*/ */
if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) && if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
ControlFile->backupEndPoint <= EndRecPtr) ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
{ {
/* /*
* We have reached the end of base backup, as indicated by pg_control. * We have reached the end of base backup, as indicated by pg_control.
...@@ -7412,8 +7412,8 @@ CheckRecoveryConsistency(void) ...@@ -7412,8 +7412,8 @@ CheckRecoveryConsistency(void)
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (ControlFile->minRecoveryPoint < EndRecPtr) if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
ControlFile->minRecoveryPoint = EndRecPtr; ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
ControlFile->backupStartPoint = InvalidXLogRecPtr; ControlFile->backupStartPoint = InvalidXLogRecPtr;
ControlFile->backupEndPoint = InvalidXLogRecPtr; ControlFile->backupEndPoint = InvalidXLogRecPtr;
...@@ -7431,7 +7431,7 @@ CheckRecoveryConsistency(void) ...@@ -7431,7 +7431,7 @@ CheckRecoveryConsistency(void)
* consistent yet. * consistent yet.
*/ */
if (!reachedConsistency && !ControlFile->backupEndRequired && if (!reachedConsistency && !ControlFile->backupEndRequired &&
minRecoveryPoint <= XLogCtl->lastReplayedEndRecPtr && minRecoveryPoint <= lastReplayedEndRecPtr &&
XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
{ {
/* /*
...@@ -7443,8 +7443,8 @@ CheckRecoveryConsistency(void) ...@@ -7443,8 +7443,8 @@ CheckRecoveryConsistency(void)
reachedConsistency = true; reachedConsistency = true;
ereport(LOG, ereport(LOG,
(errmsg("consistent recovery state reached at %X/%X", (errmsg("consistent recovery state reached at %X/%X",
(uint32) (XLogCtl->lastReplayedEndRecPtr >> 32), (uint32) (lastReplayedEndRecPtr >> 32),
(uint32) XLogCtl->lastReplayedEndRecPtr))); (uint32) lastReplayedEndRecPtr)));
} }
/* /*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment