Commit 7e48b77b authored by Heikki Linnakangas's avatar Heikki Linnakangas

Fix some serious bugs in archive recovery, now that bgwriter is active

during it:

When bgwriter is active, the startup process can't perform mdsync() correctly
because it won't see the fsync requests accumulated in bgwriter's private
pendingOpsTable. Therefore make bgwriter responsible for the end-of-recovery
checkpoint as well, when it's active.

When bgwriter is active (= archive recovery), the startup process must not
accumulate fsync requests to its own pendingOpsTable, since bgwriter won't
see them there when it performs restartpoints. Make startup process drop its
pendingOpsTable when bgwriter is launched to avoid that.

Update minimum recovery point one last time when leaving archive recovery.
It won't be updated by the end-of-recovery checkpoint because XLogFlush()
sees us as out of recovery already.

This fixes bug #4879 reported by Fujii Masao.
parent e6b8f470
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.343 2009/06/11 14:48:54 momjian Exp $ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.344 2009/06/25 21:36:00 heikki Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -4912,12 +4912,18 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg) ...@@ -4912,12 +4912,18 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
{ {
char recoveryPath[MAXPGPATH]; char recoveryPath[MAXPGPATH];
char xlogpath[MAXPGPATH]; char xlogpath[MAXPGPATH];
XLogRecPtr InvalidXLogRecPtr = {0, 0};
/* /*
* We are no longer in archive recovery state. * We are no longer in archive recovery state.
*/ */
InArchiveRecovery = false; InArchiveRecovery = false;
/*
* Update min recovery point one last time.
*/
UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
/* /*
* We should have the ending log segment currently open. Verify, and then * We should have the ending log segment currently open. Verify, and then
* close it (to avoid problems on Windows with trying to rename or delete * close it (to avoid problems on Windows with trying to rename or delete
...@@ -5156,6 +5162,7 @@ StartupXLOG(void) ...@@ -5156,6 +5162,7 @@ StartupXLOG(void)
XLogRecord *record; XLogRecord *record;
uint32 freespace; uint32 freespace;
TransactionId oldestActiveXID; TransactionId oldestActiveXID;
bool bgwriterLaunched = false;
XLogCtl->SharedRecoveryInProgress = true; XLogCtl->SharedRecoveryInProgress = true;
...@@ -5472,7 +5479,11 @@ StartupXLOG(void) ...@@ -5472,7 +5479,11 @@ StartupXLOG(void)
* process in addition to postmaster! * process in addition to postmaster!
*/ */
if (InArchiveRecovery && IsUnderPostmaster) if (InArchiveRecovery && IsUnderPostmaster)
{
SetForwardFsyncRequests();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
bgwriterLaunched = true;
}
/* /*
* main redo apply loop * main redo apply loop
...@@ -5709,12 +5720,6 @@ StartupXLOG(void) ...@@ -5709,12 +5720,6 @@ StartupXLOG(void)
/* Pre-scan prepared transactions to find out the range of XIDs present */ /* Pre-scan prepared transactions to find out the range of XIDs present */
oldestActiveXID = PrescanPreparedTransactions(); oldestActiveXID = PrescanPreparedTransactions();
/*
* Allow writing WAL for us, so that we can create a checkpoint record.
* But not yet for other backends!
*/
LocalRecoveryInProgress = false;
if (InRecovery) if (InRecovery)
{ {
int rmid; int rmid;
...@@ -5743,7 +5748,12 @@ StartupXLOG(void) ...@@ -5743,7 +5748,12 @@ StartupXLOG(void)
* the rule that TLI only changes in shutdown checkpoints, which * the rule that TLI only changes in shutdown checkpoints, which
* allows some extra error checking in xlog_redo. * allows some extra error checking in xlog_redo.
*/ */
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); if (bgwriterLaunched)
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_IMMEDIATE |
CHECKPOINT_WAIT);
else
CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
/* /*
* And finally, execute the recovery_end_command, if any. * And finally, execute the recovery_end_command, if any.
...@@ -5806,7 +5816,7 @@ StartupXLOG(void) ...@@ -5806,7 +5816,7 @@ StartupXLOG(void)
} }
/* /*
* All done. Allow others to write WAL. * All done. Allow backends to write WAL.
*/ */
XLogCtl->SharedRecoveryInProgress = false; XLogCtl->SharedRecoveryInProgress = false;
} }
...@@ -6123,12 +6133,13 @@ LogCheckpointStart(int flags, bool restartpoint) ...@@ -6123,12 +6133,13 @@ LogCheckpointStart(int flags, bool restartpoint)
* the main message, but what about all the flags? * the main message, but what about all the flags?
*/ */
if (restartpoint) if (restartpoint)
msg = "restartpoint starting:%s%s%s%s%s%s"; msg = "restartpoint starting:%s%s%s%s%s%s%s";
else else
msg = "checkpoint starting:%s%s%s%s%s%s"; msg = "checkpoint starting:%s%s%s%s%s%s%s";
elog(LOG, msg, elog(LOG, msg,
(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
(flags & CHECKPOINT_FORCE) ? " force" : "", (flags & CHECKPOINT_FORCE) ? " force" : "",
(flags & CHECKPOINT_WAIT) ? " wait" : "", (flags & CHECKPOINT_WAIT) ? " wait" : "",
...@@ -6190,10 +6201,12 @@ LogCheckpointEnd(bool restartpoint) ...@@ -6190,10 +6201,12 @@ LogCheckpointEnd(bool restartpoint)
* *
* flags is a bitwise OR of the following: * flags is a bitwise OR of the following:
* CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
* CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
* CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
* ignoring checkpoint_completion_target parameter. * ignoring checkpoint_completion_target parameter.
* CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
* since the last one (implied by CHECKPOINT_IS_SHUTDOWN). * since the last one (implied by CHECKPOINT_IS_SHUTDOWN and
* CHECKPOINT_END_OF_RECOVERY).
* *
* Note: flags contains other bits, of interest here only for logging purposes. * Note: flags contains other bits, of interest here only for logging purposes.
* In particular note that this routine is synchronous and does not pay * In particular note that this routine is synchronous and does not pay
...@@ -6202,7 +6215,7 @@ LogCheckpointEnd(bool restartpoint) ...@@ -6202,7 +6215,7 @@ LogCheckpointEnd(bool restartpoint)
void void
CreateCheckPoint(int flags) CreateCheckPoint(int flags)
{ {
bool shutdown = (flags & CHECKPOINT_IS_SHUTDOWN) != 0; bool shutdown;
CheckPoint checkPoint; CheckPoint checkPoint;
XLogRecPtr recptr; XLogRecPtr recptr;
XLogCtlInsert *Insert = &XLogCtl->Insert; XLogCtlInsert *Insert = &XLogCtl->Insert;
...@@ -6212,35 +6225,53 @@ CreateCheckPoint(int flags) ...@@ -6212,35 +6225,53 @@ CreateCheckPoint(int flags)
uint32 _logSeg; uint32 _logSeg;
TransactionId *inCommitXids; TransactionId *inCommitXids;
int nInCommit; int nInCommit;
bool OldInRecovery = InRecovery;
/* shouldn't happen */ /*
if (RecoveryInProgress()) * An end-of-recovery checkpoint is really a shutdown checkpoint, just
elog(ERROR, "can't create a checkpoint during recovery"); * issued at a different time.
*/
if (flags & ((CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY) != 0))
shutdown = true;
else
shutdown = false;
/* /*
* Acquire CheckpointLock to ensure only one checkpoint happens at a time. * A startup checkpoint is created before anyone else is allowed to
* During normal operation, bgwriter is the only process that creates * write WAL. To allow us to write the checkpoint record, set
* checkpoints, but at the end of archive recovery, the bgwriter can be * LocalRecoveryInProgress to false. This lets us write WAL, but others
* busy creating a restartpoint while the startup process tries to perform * are still not allowed to do so.
* the startup checkpoint.
*/ */
if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE)) if (flags & CHECKPOINT_END_OF_RECOVERY)
{ {
Assert(InRecovery); Assert(RecoveryInProgress());
LocalRecoveryInProgress = false;
InitXLOGAccess();
/* /*
* A restartpoint is in progress. Wait until it finishes. This can * Before 8.4, end-of-recovery checkpoints were always performed by
* cause an extra restartpoint to be performed, but that's OK because * the startup process, and InRecovery was set true. InRecovery is not
* we're just about to perform a checkpoint anyway. Flushing the * normally set in bgwriter, but we set it here temporarily to avoid
* buffers in this restartpoint can take some time, but that time is * confusing old code in the end-of-recovery checkpoint code path that
* saved from the upcoming checkpoint so the net effect is zero. * rely on it.
*/ */
ereport(DEBUG2, (errmsg("hurrying in-progress restartpoint"))); InRecovery = true;
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT); }
else
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE); {
/* shouldn't happen */
if (RecoveryInProgress())
elog(ERROR, "can't create a checkpoint during recovery");
} }
/*
* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
* (This is just pro forma, since in the present system structure there is
* only one process that is allowed to issue checkpoints at any given
* time.)
*/
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
/* /*
* Prepare to accumulate statistics. * Prepare to accumulate statistics.
* *
...@@ -6298,7 +6329,8 @@ CreateCheckPoint(int flags) ...@@ -6298,7 +6329,8 @@ CreateCheckPoint(int flags)
* the end of the last checkpoint record, and its redo pointer must point * the end of the last checkpoint record, and its redo pointer must point
* to itself. * to itself.
*/ */
if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0) if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_FORCE)) == 0)
{ {
XLogRecPtr curInsert; XLogRecPtr curInsert;
...@@ -6542,6 +6574,9 @@ CreateCheckPoint(int flags) ...@@ -6542,6 +6574,9 @@ CreateCheckPoint(int flags)
CheckpointStats.ckpt_segs_recycled); CheckpointStats.ckpt_segs_recycled);
LWLockRelease(CheckpointLock); LWLockRelease(CheckpointLock);
/* Restore old value */
InRecovery = OldInRecovery;
} }
/* /*
......
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.60 2009/06/11 14:49:01 momjian Exp $ * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.61 2009/06/25 21:36:00 heikki Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -448,6 +448,13 @@ BackgroundWriterMain(void) ...@@ -448,6 +448,13 @@ BackgroundWriterMain(void)
bgs->ckpt_started++; bgs->ckpt_started++;
SpinLockRelease(&bgs->ckpt_lck); SpinLockRelease(&bgs->ckpt_lck);
/*
* The end-of-recovery checkpoint is a real checkpoint that's
* performed while we're still in recovery.
*/
if (flags & CHECKPOINT_END_OF_RECOVERY)
do_restartpoint = false;
/* /*
* We will warn if (a) too soon since last checkpoint (whatever * We will warn if (a) too soon since last checkpoint (whatever
* caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
...@@ -895,10 +902,12 @@ BgWriterShmemInit(void) ...@@ -895,10 +902,12 @@ BgWriterShmemInit(void)
* *
* flags is a bitwise OR of the following: * flags is a bitwise OR of the following:
* CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
* CHECKPOINT_END_OF_RECOVERY: checkpoint is to finish WAL recovery.
* CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
* ignoring checkpoint_completion_target parameter. * ignoring checkpoint_completion_target parameter.
* CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
* since the last one (implied by CHECKPOINT_IS_SHUTDOWN). * since the last one (implied by CHECKPOINT_IS_SHUTDOWN and
* CHECKPOINT_END_OF_RECOVERY).
* CHECKPOINT_WAIT: wait for completion before returning (otherwise, * CHECKPOINT_WAIT: wait for completion before returning (otherwise,
* just signal bgwriter to do it, and return). * just signal bgwriter to do it, and return).
* CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.146 2009/06/11 14:49:02 momjian Exp $ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.147 2009/06/25 21:36:00 heikki Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -203,6 +203,21 @@ mdinit(void) ...@@ -203,6 +203,21 @@ mdinit(void)
} }
} }
/*
* In archive recovery, we rely on bgwriter to do fsyncs(), but we don't
* know that we do archive recovery at process startup when pendingOpsTable
* has already been created. Calling this function drops pendingOpsTable
* and causes any subsequent requests to be forwarded to bgwriter.
*/
void
SetForwardFsyncRequests(void)
{
/* Perform any pending ops we may have queued up */
if (pendingOpsTable)
mdsync();
pendingOpsTable = NULL;
}
/* /*
* mdexists() -- Does the physical file exist? * mdexists() -- Does the physical file exist?
* *
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.91 2009/02/18 15:58:41 heikki Exp $ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.92 2009/06/25 21:36:00 heikki Exp $
*/ */
#ifndef XLOG_H #ifndef XLOG_H
#define XLOG_H #define XLOG_H
...@@ -166,6 +166,8 @@ extern bool XLOG_DEBUG; ...@@ -166,6 +166,8 @@ extern bool XLOG_DEBUG;
/* These indicate the cause of a checkpoint request */ /* These indicate the cause of a checkpoint request */
#define CHECKPOINT_CAUSE_XLOG 0x0010 /* XLOG consumption */ #define CHECKPOINT_CAUSE_XLOG 0x0010 /* XLOG consumption */
#define CHECKPOINT_CAUSE_TIME 0x0020 /* Elapsed time */ #define CHECKPOINT_CAUSE_TIME 0x0020 /* Elapsed time */
#define CHECKPOINT_END_OF_RECOVERY 0x0040 /* Like shutdown checkpoint, but
* issued at end of WAL recovery */
/* Checkpoint statistics */ /* Checkpoint statistics */
typedef struct CheckpointStatsData typedef struct CheckpointStatsData
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.67 2009/06/11 14:49:12 momjian Exp $ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.68 2009/06/25 21:36:00 heikki Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -109,6 +109,7 @@ extern void mdpreckpt(void); ...@@ -109,6 +109,7 @@ extern void mdpreckpt(void);
extern void mdsync(void); extern void mdsync(void);
extern void mdpostckpt(void); extern void mdpostckpt(void);
extern void SetForwardFsyncRequests(void);
extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
BlockNumber segno); BlockNumber segno);
extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum); extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment