Commit 361bd166 authored by Heikki Linnakangas's avatar Heikki Linnakangas

Allow Hot Standby to begin from a shutdown checkpoint.

Patch by Simon Riggs & me
parent ea9c1032
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.59 2010/02/26 02:00:34 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.60 2010/04/13 14:17:46 heikki Exp $
*
* NOTES
* Each global transaction is associated with a global transaction
......@@ -1718,6 +1718,89 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
return result;
}
/*
 * StandbyRecoverPreparedTransactions
 *
 * Scan the pg_twophase directory and setup all the required information to
 * allow standby queries to treat prepared transactions as still active.
 * This is never called at the end of recovery - we use
 * RecoverPreparedTransactions() at that point.
 *
 * Currently we simply call SubTransSetParent() for any subxids of prepared
 * transactions. If overwriteOK is true, it's OK if some XIDs have already
 * been marked in pg_subtrans.
 *
 * State files whose transaction has already committed or aborted, that
 * cannot be read, or whose header XID does not match the file name are
 * reported with a WARNING and removed.
 */
void
StandbyRecoverPreparedTransactions(bool overwriteOK)
{
	DIR		   *cldir;
	struct dirent *clde;

	cldir = AllocateDir(TWOPHASE_DIR);
	while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
	{
		/* Only consider entries named as exactly 8 upper-case hex digits */
		if (strlen(clde->d_name) == 8 &&
			strspn(clde->d_name, "0123456789ABCDEF") == 8)
		{
			TransactionId xid;
			char	   *buf;
			TwoPhaseFileHeader *hdr;
			TransactionId *subxids;
			int			i;

			/* The file name is the XID in hexadecimal */
			xid = (TransactionId) strtoul(clde->d_name, NULL, 16);

			/* Already processed? */
			if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
			{
				ereport(WARNING,
						(errmsg("removing stale two-phase state file \"%s\"",
								clde->d_name)));
				RemoveTwoPhaseFile(xid, true);
				continue;
			}

			/* Read and validate file */
			buf = ReadTwoPhaseFile(xid, true);
			if (buf == NULL)
			{
				ereport(WARNING,
					  (errmsg("removing corrupt two-phase state file \"%s\"",
							  clde->d_name)));
				RemoveTwoPhaseFile(xid, true);
				continue;
			}

			/* Deconstruct header */
			hdr = (TwoPhaseFileHeader *) buf;
			if (!TransactionIdEquals(hdr->xid, xid))
			{
				ereport(WARNING,
					  (errmsg("removing corrupt two-phase state file \"%s\"",
							  clde->d_name)));
				RemoveTwoPhaseFile(xid, true);
				pfree(buf);
				continue;
			}

			/*
			 * Examine subtransaction XIDs ... they should all follow main
			 * XID.  The subxid array starts right after the max-aligned
			 * file header.
			 */
			subxids = (TransactionId *)
				(buf + MAXALIGN(sizeof(TwoPhaseFileHeader)));
			for (i = 0; i < hdr->nsubxacts; i++)
			{
				TransactionId subxid = subxids[i];

				Assert(TransactionIdFollows(subxid, xid));
				SubTransSetParent(xid, subxid, overwriteOK);
			}

			/*
			 * Done with this state file.  Release the buffer here too, just
			 * as the header-mismatch path above does; otherwise one buffer
			 * is leaked for every valid prepared transaction scanned.
			 */
			pfree(buf);
		}
	}
	FreeDir(cldir);
}
/*
* RecoverPreparedTransactions
*
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.393 2010/04/12 10:40:42 heikki Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.394 2010/04/13 14:17:46 heikki Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -496,6 +496,7 @@ static TimeLineID lastPageTLI = 0;
static XLogRecPtr minRecoveryPoint; /* local copy of
* ControlFile->minRecoveryPoint */
static bool updateMinRecoveryPoint = true;
static bool reachedMinRecoveryPoint = false;
static bool InRedo = false;
......@@ -551,6 +552,7 @@ static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
static void CheckRecoveryConsistency(void);
static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
static List *readTimeLineHistory(TimeLineID targetTLI);
......@@ -5591,7 +5593,6 @@ StartupXLOG(void)
uint32 freespace;
TransactionId oldestActiveXID;
bool bgwriterLaunched = false;
bool backendsAllowed = false;
/*
* Read control file and check XLOG status looks valid.
......@@ -5838,6 +5839,8 @@ StartupXLOG(void)
if (InRecovery)
{
int rmid;
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
/*
* Update pg_control to show that we are recovering and to show the
......@@ -5930,6 +5933,33 @@ StartupXLOG(void)
StartupMultiXact();
ProcArrayInitRecoveryInfo(oldestActiveXID);
/*
* If we're beginning at a shutdown checkpoint, we know that
* nothing was running on the master at this point. So fake-up
* an empty running-xacts record and use that here and now.
* Recover additional standby state for prepared transactions.
*/
if (wasShutdown)
{
RunningTransactionsData running;
/*
* Construct a RunningTransactions snapshot representing a shut
* down server, with only prepared transactions still alive.
* We're never overflowed at this point because all subxids
* are listed with their parent prepared transactions.
*/
running.xcnt = nxids;
running.subxid_overflow = false;
running.nextXid = checkPoint.nextXid;
running.oldestRunningXid = oldestActiveXID;
running.xids = xids;
ProcArrayApplyRecoveryInfo(&running);
StandbyRecoverPreparedTransactions(false);
}
}
/* Initialize resource managers */
......@@ -5939,6 +5969,46 @@ StartupXLOG(void)
RmgrTable[rmid].rm_startup();
}
/*
* Initialize shared replayEndRecPtr and recoveryLastRecPtr.
*
* This is slightly confusing if we're starting from an online
* checkpoint; we've just read and replayed the checkpoint record,
* but we're going to start replay from its redo pointer, which
* precedes the location of the checkpoint record itself. So even
* though the last record we've replayed is indeed ReadRecPtr, we
* haven't replayed all the preceding records yet. That's OK for
* the current use of these variables.
*/
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replayEndRecPtr = ReadRecPtr;
xlogctl->recoveryLastRecPtr = ReadRecPtr;
SpinLockRelease(&xlogctl->info_lck);
/*
* Let postmaster know we've started redo now, so that it can
* launch bgwriter to perform restartpoints. We don't bother
* during crash recovery as restartpoints can only be performed
* during archive recovery. And we'd like to keep crash recovery
* simple, to avoid introducing bugs that could prevent you from
* recovering after crash.
*
* After this point, we can no longer assume that we're the only
* process in addition to postmaster! Also, fsync requests are
* subsequently to be handled by the bgwriter, not locally.
*/
if (InArchiveRecovery && IsUnderPostmaster)
{
SetForwardFsyncRequests();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
bgwriterLaunched = true;
}
/*
* Allow read-only connections immediately if we're consistent already.
*/
CheckRecoveryConsistency();
/*
* Find the first record that logically follows the checkpoint --- it
* might physically precede it, though.
......@@ -5958,43 +6028,14 @@ StartupXLOG(void)
{
bool recoveryContinue = true;
bool recoveryApply = true;
bool reachedMinRecoveryPoint = false;
ErrorContextCallback errcontext;
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
/* initialize shared replayEndRecPtr and recoveryLastRecPtr */
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replayEndRecPtr = ReadRecPtr;
xlogctl->recoveryLastRecPtr = ReadRecPtr;
SpinLockRelease(&xlogctl->info_lck);
InRedo = true;
ereport(LOG,
(errmsg("redo starts at %X/%X",
ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
/*
* Let postmaster know we've started redo now, so that it can
* launch bgwriter to perform restartpoints. We don't bother
* during crash recovery as restartpoints can only be performed
* during archive recovery. And we'd like to keep crash recovery
* simple, to avoid introducing bugs that could prevent you from
* recovering after crash.
*
* After this point, we can no longer assume that we're the only
* process in addition to postmaster! Also, fsync requests are
* subsequently to be handled by the bgwriter, not locally.
*/
if (InArchiveRecovery && IsUnderPostmaster)
{
SetForwardFsyncRequests();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
bgwriterLaunched = true;
}
/*
* main redo apply loop
*/
......@@ -6024,32 +6065,8 @@ StartupXLOG(void)
/* Handle interrupt signals of startup process */
HandleStartupProcInterrupts();
/*
* Have we passed our safe starting point?
*/
if (!reachedMinRecoveryPoint &&
XLByteLE(minRecoveryPoint, EndRecPtr) &&
XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
{
reachedMinRecoveryPoint = true;
ereport(LOG,
(errmsg("consistent recovery state reached at %X/%X",
EndRecPtr.xlogid, EndRecPtr.xrecoff)));
}
/*
* Have we got a valid starting snapshot that will allow
* queries to be run? If so, we can tell postmaster that the
* database is consistent now, enabling connections.
*/
if (standbyState == STANDBY_SNAPSHOT_READY &&
!backendsAllowed &&
reachedMinRecoveryPoint &&
IsUnderPostmaster)
{
backendsAllowed = true;
SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
}
/* Allow read-only connections if we're consistent now */
CheckRecoveryConsistency();
/*
* Have we reached our recovery target?
......@@ -6398,6 +6415,44 @@ StartupXLOG(void)
}
}
/*
 * CheckRecoveryConsistency
 *
 * Test whether replay has reached a consistent state and, once it has and
 * a usable standby snapshot is available, signal the postmaster so that
 * read-only connections can be accepted.  The signal is sent at most once
 * per process; that is remembered in a function-local static flag.
 */
static void
CheckRecoveryConsistency(void)
{
	static bool backendsAllowed = false;

	/*
	 * First see whether we have just passed our safe starting point:
	 * replay must have reached minRecoveryPoint, and the control file must
	 * show no backup start point.
	 */
	if (!reachedMinRecoveryPoint)
	{
		if (XLByteLE(minRecoveryPoint, EndRecPtr) &&
			XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
		{
			reachedMinRecoveryPoint = true;
			ereport(LOG,
					(errmsg("consistent recovery state reached at %X/%X",
							EndRecPtr.xlogid, EndRecPtr.xrecoff)));
		}
	}

	/* Nothing more to do if already signalled, or not yet consistent */
	if (backendsAllowed || !reachedMinRecoveryPoint)
		return;

	/*
	 * Connections can only be enabled once we hold a valid starting
	 * snapshot for queries, and only when there is a postmaster to tell.
	 */
	if (standbyState != STANDBY_SNAPSHOT_READY || !IsUnderPostmaster)
		return;

	backendsAllowed = true;
	SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
}
/*
* Is the system still in recovery?
*
......@@ -7657,13 +7712,36 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
if (standbyState != STANDBY_DISABLED)
CheckRequiredParameterValues(checkPoint);
/*
* If we see a shutdown checkpoint, we know that nothing was
* running on the master at this point. So fake-up an empty
* running-xacts record and use that here and now. Recover
* additional standby state for prepared transactions.
*/
if (standbyState >= STANDBY_INITIALIZED)
{
TransactionId *xids;
int nxids;
TransactionId oldestActiveXID;
RunningTransactionsData running;
oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
/*
* Remove stale transactions, if any.
* Construct a RunningTransactions snapshot representing a shut
* down server, with only prepared transactions still alive.
* We're never overflowed at this point because all subxids
* are listed with their parent prepared transactions.
*/
ExpireOldKnownAssignedTransactionIds(checkPoint.nextXid);
StandbyReleaseOldLocks(checkPoint.nextXid);
running.xcnt = nxids;
running.subxid_overflow = false;
running.nextXid = checkPoint.nextXid;
running.oldestRunningXid = oldestActiveXID;
running.xids = xids;
ProcArrayApplyRecoveryInfo(&running);
StandbyRecoverPreparedTransactions(true);
}
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.14 2010/01/02 16:58:00 momjian Exp $
* $PostgreSQL: pgsql/src/include/access/twophase.h,v 1.15 2010/04/13 14:17:46 heikki Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -44,6 +44,7 @@ extern bool StandbyTransactionIdIsPrepared(TransactionId xid);
extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p,
int *nxids_p);
extern void StandbyRecoverPreparedTransactions(bool overwriteOK);
extern void RecoverPreparedTransactions(void);
extern void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment