Commit 361bd166 authored by Heikki Linnakangas's avatar Heikki Linnakangas

Allow Hot Standby to begin from a shutdown checkpoint.

Patch by Simon Riggs & me
parent ea9c1032
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.59 2010/02/26 02:00:34 momjian Exp $ * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.60 2010/04/13 14:17:46 heikki Exp $
* *
* NOTES * NOTES
* Each global transaction is associated with a global transaction * Each global transaction is associated with a global transaction
...@@ -1718,6 +1718,89 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) ...@@ -1718,6 +1718,89 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
return result; return result;
} }
/*
* StandbyRecoverPreparedTransactions
*
* Scan the pg_twophase directory and setup all the required information to
* allow standby queries to treat prepared transactions as still active.
* This is never called at the end of recovery - we use
* RecoverPreparedTransactions() at that point.
*
* Currently we simply call SubTransSetParent() for any subxids of prepared
* transactions. If overwriteOK is true, it's OK if some XIDs have already
* been marked in pg_subtrans.
*/
void
StandbyRecoverPreparedTransactions(bool overwriteOK)
{
DIR *cldir;
struct dirent *clde;
cldir = AllocateDir(TWOPHASE_DIR);
while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
{
if (strlen(clde->d_name) == 8 &&
strspn(clde->d_name, "0123456789ABCDEF") == 8)
{
TransactionId xid;
char *buf;
TwoPhaseFileHeader *hdr;
TransactionId *subxids;
int i;
xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
/* Already processed? */
if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
{
ereport(WARNING,
(errmsg("removing stale two-phase state file \"%s\"",
clde->d_name)));
RemoveTwoPhaseFile(xid, true);
continue;
}
/* Read and validate file */
buf = ReadTwoPhaseFile(xid, true);
if (buf == NULL)
{
ereport(WARNING,
(errmsg("removing corrupt two-phase state file \"%s\"",
clde->d_name)));
RemoveTwoPhaseFile(xid, true);
continue;
}
/* Deconstruct header */
hdr = (TwoPhaseFileHeader *) buf;
if (!TransactionIdEquals(hdr->xid, xid))
{
ereport(WARNING,
(errmsg("removing corrupt two-phase state file \"%s\"",
clde->d_name)));
RemoveTwoPhaseFile(xid, true);
pfree(buf);
continue;
}
/*
* Examine subtransaction XIDs ... they should all follow main
* XID.
*/
subxids = (TransactionId *)
(buf + MAXALIGN(sizeof(TwoPhaseFileHeader)));
for (i = 0; i < hdr->nsubxacts; i++)
{
TransactionId subxid = subxids[i];
Assert(TransactionIdFollows(subxid, xid));
SubTransSetParent(xid, subxid, overwriteOK);
}
}
}
FreeDir(cldir);
}
/* /*
* RecoverPreparedTransactions * RecoverPreparedTransactions
* *
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.393 2010/04/12 10:40:42 heikki Exp $ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.394 2010/04/13 14:17:46 heikki Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -496,6 +496,7 @@ static TimeLineID lastPageTLI = 0; ...@@ -496,6 +496,7 @@ static TimeLineID lastPageTLI = 0;
static XLogRecPtr minRecoveryPoint; /* local copy of static XLogRecPtr minRecoveryPoint; /* local copy of
* ControlFile->minRecoveryPoint */ * ControlFile->minRecoveryPoint */
static bool updateMinRecoveryPoint = true; static bool updateMinRecoveryPoint = true;
static bool reachedMinRecoveryPoint = false;
static bool InRedo = false; static bool InRedo = false;
...@@ -551,6 +552,7 @@ static void ValidateXLOGDirectoryStructure(void); ...@@ -551,6 +552,7 @@ static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void); static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force); static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt); static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
static void CheckRecoveryConsistency(void);
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode); static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt); static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
static List *readTimeLineHistory(TimeLineID targetTLI); static List *readTimeLineHistory(TimeLineID targetTLI);
...@@ -5591,7 +5593,6 @@ StartupXLOG(void) ...@@ -5591,7 +5593,6 @@ StartupXLOG(void)
uint32 freespace; uint32 freespace;
TransactionId oldestActiveXID; TransactionId oldestActiveXID;
bool bgwriterLaunched = false; bool bgwriterLaunched = false;
bool backendsAllowed = false;
/* /*
* Read control file and check XLOG status looks valid. * Read control file and check XLOG status looks valid.
...@@ -5838,6 +5839,8 @@ StartupXLOG(void) ...@@ -5838,6 +5839,8 @@ StartupXLOG(void)
if (InRecovery) if (InRecovery)
{ {
int rmid; int rmid;
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
/* /*
* Update pg_control to show that we are recovering and to show the * Update pg_control to show that we are recovering and to show the
...@@ -5930,6 +5933,33 @@ StartupXLOG(void) ...@@ -5930,6 +5933,33 @@ StartupXLOG(void)
StartupMultiXact(); StartupMultiXact();
ProcArrayInitRecoveryInfo(oldestActiveXID); ProcArrayInitRecoveryInfo(oldestActiveXID);
/*
* If we're beginning at a shutdown checkpoint, we know that
* nothing was running on the master at this point. So fake-up
* an empty running-xacts record and use that here and now.
* Recover additional standby state for prepared transactions.
*/
if (wasShutdown)
{
RunningTransactionsData running;
/*
* Construct a RunningTransactions snapshot representing a shut
* down server, with only prepared transactions still alive.
* We're never overflowed at this point because all subxids
* are listed with their parent prepared transactions.
*/
running.xcnt = nxids;
running.subxid_overflow = false;
running.nextXid = checkPoint.nextXid;
running.oldestRunningXid = oldestActiveXID;
running.xids = xids;
ProcArrayApplyRecoveryInfo(&running);
StandbyRecoverPreparedTransactions(false);
}
} }
/* Initialize resource managers */ /* Initialize resource managers */
...@@ -5939,6 +5969,46 @@ StartupXLOG(void) ...@@ -5939,6 +5969,46 @@ StartupXLOG(void)
RmgrTable[rmid].rm_startup(); RmgrTable[rmid].rm_startup();
} }
/*
* Initialize shared replayEndRecPtr and recoveryLastRecPtr.
*
* This is slightly confusing if we're starting from an online
* checkpoint; we've just read and replayed the chekpoint record,
* but we're going to start replay from its redo pointer, which
* precedes the location of the checkpoint record itself. So even
* though the last record we've replayed is indeed ReadRecPtr, we
* haven't replayed all the preceding records yet. That's OK for
* the current use of these variables.
*/
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replayEndRecPtr = ReadRecPtr;
xlogctl->recoveryLastRecPtr = ReadRecPtr;
SpinLockRelease(&xlogctl->info_lck);
/*
* Let postmaster know we've started redo now, so that it can
* launch bgwriter to perform restartpoints. We don't bother
* during crash recovery as restartpoints can only be performed
* during archive recovery. And we'd like to keep crash recovery
* simple, to avoid introducing bugs that could you from
* recovering after crash.
*
* After this point, we can no longer assume that we're the only
* process in addition to postmaster! Also, fsync requests are
* subsequently to be handled by the bgwriter, not locally.
*/
if (InArchiveRecovery && IsUnderPostmaster)
{
SetForwardFsyncRequests();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
bgwriterLaunched = true;
}
/*
* Allow read-only connections immediately if we're consistent already.
*/
CheckRecoveryConsistency();
/* /*
* Find the first record that logically follows the checkpoint --- it * Find the first record that logically follows the checkpoint --- it
* might physically precede it, though. * might physically precede it, though.
...@@ -5958,43 +6028,14 @@ StartupXLOG(void) ...@@ -5958,43 +6028,14 @@ StartupXLOG(void)
{ {
bool recoveryContinue = true; bool recoveryContinue = true;
bool recoveryApply = true; bool recoveryApply = true;
bool reachedMinRecoveryPoint = false;
ErrorContextCallback errcontext; ErrorContextCallback errcontext;
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
/* initialize shared replayEndRecPtr and recoveryLastRecPtr */
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replayEndRecPtr = ReadRecPtr;
xlogctl->recoveryLastRecPtr = ReadRecPtr;
SpinLockRelease(&xlogctl->info_lck);
InRedo = true; InRedo = true;
ereport(LOG, ereport(LOG,
(errmsg("redo starts at %X/%X", (errmsg("redo starts at %X/%X",
ReadRecPtr.xlogid, ReadRecPtr.xrecoff))); ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
/*
* Let postmaster know we've started redo now, so that it can
* launch bgwriter to perform restartpoints. We don't bother
* during crash recovery as restartpoints can only be performed
* during archive recovery. And we'd like to keep crash recovery
* simple, to avoid introducing bugs that could you from
* recovering after crash.
*
* After this point, we can no longer assume that we're the only
* process in addition to postmaster! Also, fsync requests are
* subsequently to be handled by the bgwriter, not locally.
*/
if (InArchiveRecovery && IsUnderPostmaster)
{
SetForwardFsyncRequests();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
bgwriterLaunched = true;
}
/* /*
* main redo apply loop * main redo apply loop
*/ */
...@@ -6024,32 +6065,8 @@ StartupXLOG(void) ...@@ -6024,32 +6065,8 @@ StartupXLOG(void)
/* Handle interrupt signals of startup process */ /* Handle interrupt signals of startup process */
HandleStartupProcInterrupts(); HandleStartupProcInterrupts();
/* /* Allow read-only connections if we're consistent now */
* Have we passed our safe starting point? CheckRecoveryConsistency();
*/
if (!reachedMinRecoveryPoint &&
XLByteLE(minRecoveryPoint, EndRecPtr) &&
XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
{
reachedMinRecoveryPoint = true;
ereport(LOG,
(errmsg("consistent recovery state reached at %X/%X",
EndRecPtr.xlogid, EndRecPtr.xrecoff)));
}
/*
* Have we got a valid starting snapshot that will allow
* queries to be run? If so, we can tell postmaster that the
* database is consistent now, enabling connections.
*/
if (standbyState == STANDBY_SNAPSHOT_READY &&
!backendsAllowed &&
reachedMinRecoveryPoint &&
IsUnderPostmaster)
{
backendsAllowed = true;
SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
}
/* /*
* Have we reached our recovery target? * Have we reached our recovery target?
...@@ -6398,6 +6415,44 @@ StartupXLOG(void) ...@@ -6398,6 +6415,44 @@ StartupXLOG(void)
} }
} }
/*
* Checks if recovery has reached a consistent state. When consistency is
* reached and we have a valid starting standby snapshot, tell postmaster
* that it can start accepting read-only connections.
*/
static void
CheckRecoveryConsistency(void)
{
static bool backendsAllowed = false;
/*
* Have we passed our safe starting point?
*/
if (!reachedMinRecoveryPoint &&
XLByteLE(minRecoveryPoint, EndRecPtr) &&
XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
{
reachedMinRecoveryPoint = true;
ereport(LOG,
(errmsg("consistent recovery state reached at %X/%X",
EndRecPtr.xlogid, EndRecPtr.xrecoff)));
}
/*
* Have we got a valid starting snapshot that will allow
* queries to be run? If so, we can tell postmaster that the
* database is consistent now, enabling connections.
*/
if (standbyState == STANDBY_SNAPSHOT_READY &&
!backendsAllowed &&
reachedMinRecoveryPoint &&
IsUnderPostmaster)
{
backendsAllowed = true;
SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
}
}
/* /*
* Is the system still in recovery? * Is the system still in recovery?
* *
...@@ -7657,13 +7712,36 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) ...@@ -7657,13 +7712,36 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
if (standbyState != STANDBY_DISABLED) if (standbyState != STANDBY_DISABLED)
CheckRequiredParameterValues(checkPoint); CheckRequiredParameterValues(checkPoint);
/*
* If we see a shutdown checkpoint, we know that nothing was
* running on the master at this point. So fake-up an empty
* running-xacts record and use that here and now. Recover
* additional standby state for prepared transactions.
*/
if (standbyState >= STANDBY_INITIALIZED) if (standbyState >= STANDBY_INITIALIZED)
{ {
TransactionId *xids;
int nxids;
TransactionId oldestActiveXID;
RunningTransactionsData running;
oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
/* /*
* Remove stale transactions, if any. * Construct a RunningTransactions snapshot representing a shut
* down server, with only prepared transactions still alive.
* We're never overflowed at this point because all subxids
* are listed with their parent prepared transactions.
*/ */
ExpireOldKnownAssignedTransactionIds(checkPoint.nextXid); running.xcnt = nxids;
StandbyReleaseOldLocks(checkPoint.nextXid); running.subxid_overflow = false;
running.nextXid = checkPoint.nextXid;
running.oldestRunningXid = oldestActiveXID;
running.xids = xids;
ProcArrayApplyRecoveryInfo(&running);
StandbyRecoverPreparedTransactions(true);
} }
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */ /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.14 2010/01/02 16:58:00 momjian Exp $ * $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.15 2010/04/13 14:17:46 heikki Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -44,6 +44,7 @@ extern bool StandbyTransactionIdIsPrepared(TransactionId xid); ...@@ -44,6 +44,7 @@ extern bool StandbyTransactionIdIsPrepared(TransactionId xid);
extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p, extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p,
int *nxids_p); int *nxids_p);
extern void StandbyRecoverPreparedTransactions(bool overwriteOK);
extern void RecoverPreparedTransactions(void); extern void RecoverPreparedTransactions(void);
extern void RecreateTwoPhaseFile(TransactionId xid, void *content, int len); extern void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment