Commit 9c9b6194 authored by Tom Lane's avatar Tom Lane

Remove the CheckpointStartLock in favor of having backends show whether they

are in their commit critical sections via flags in the ProcArray.  Checkpoint
can watch the ProcArray to determine when it's safe to proceed.  This is
a considerably better solution to the original problem of race conditions
between checkpoint and transaction commit: it speeds up commit, since there's
one less lock to fool with, and it prevents the problem of checkpoint being
delayed indefinitely when there's a constant flow of commits.  Heikki, with
some kibitzing from Tom.
parent fb4279e9
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.28 2007/02/13 19:39:42 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.29 2007/04/03 16:34:35 tgl Exp $
*
* NOTES
* Each global transaction is associated with a global transaction
......@@ -279,6 +279,7 @@ MarkAsPreparing(TransactionId xid, const char *gid,
gxact->proc.pid = 0;
gxact->proc.databaseId = databaseid;
gxact->proc.roleId = owner;
gxact->proc.inCommit = false;
gxact->proc.inVacuum = false;
gxact->proc.isAutovacuum = false;
gxact->proc.lwWaiting = false;
......@@ -934,18 +935,18 @@ EndPrepare(GlobalTransaction gxact)
* odds of a PANIC actually occurring should be very tiny given that we
* were able to write the bogus CRC above.
*
* We have to lock out checkpoint start here, too; otherwise a checkpoint
* We have to set inCommit here, too; otherwise a checkpoint
* starting immediately after the WAL record is inserted could complete
* without fsync'ing our state file. (This is essentially the same kind
* of race condition as the COMMIT-to-clog-write case that
* RecordTransactionCommit uses CheckpointStartLock for; see notes there.)
* RecordTransactionCommit uses inCommit for; see notes there.)
*
* We save the PREPARE record's location in the gxact for later use by
* CheckPointTwoPhase.
*/
START_CRIT_SECTION();
LWLockAcquire(CheckpointStartLock, LW_SHARED);
MyProc->inCommit = true;
gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
records.head);
......@@ -982,10 +983,11 @@ EndPrepare(GlobalTransaction gxact)
MarkAsPrepared(gxact);
/*
* Now we can release the checkpoint start lock: a checkpoint starting
* after this will certainly see the gxact as a candidate for fsyncing.
* Now we can mark ourselves as out of the commit critical section:
* a checkpoint starting after this will certainly see the gxact as a
* candidate for fsyncing.
*/
LWLockRelease(CheckpointStartLock);
MyProc->inCommit = false;
END_CRIT_SECTION();
......@@ -1649,7 +1651,7 @@ RecoverPreparedTransactions(void)
* RecordTransactionCommitPrepared
*
* This is basically the same as RecordTransactionCommit: in particular,
* we must take the CheckpointStartLock to avoid a race condition.
* we must set the inCommit flag to avoid a race condition.
*
* We know the transaction made at least one XLOG entry (its PREPARE),
* so it is never possible to optimize out the commit record.
......@@ -1669,7 +1671,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
START_CRIT_SECTION();
/* See notes in RecordTransactionCommit */
LWLockAcquire(CheckpointStartLock, LW_SHARED);
MyProc->inCommit = true;
/* Emit the XLOG commit record */
xlrec.xid = xid;
......@@ -1713,8 +1715,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
/* to avoid race conditions, the parent must commit first */
TransactionIdCommitTree(nchildren, children);
/* Checkpoint is allowed again */
LWLockRelease(CheckpointStartLock);
/* Checkpoint can proceed now */
MyProc->inCommit = false;
END_CRIT_SECTION();
}
......
......@@ -10,7 +10,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.238 2007/03/22 19:55:04 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.239 2007/04/03 16:34:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -716,35 +716,37 @@ RecordTransactionCommit(void)
START_CRIT_SECTION();
/*
* If our transaction made any transaction-controlled XLOG entries, we
* need to lock out checkpoint start between writing our XLOG record
* and updating pg_clog. Otherwise it is possible for the checkpoint
* to set REDO after the XLOG record but fail to flush the pg_clog
* update to disk, leading to loss of the transaction commit if we
* crash a little later. Slightly klugy fix for problem discovered
* 2004-08-10.
*
* (If it made no transaction-controlled XLOG entries, its XID appears
* nowhere in permanent storage, so no one else will ever care if it
* committed; so it doesn't matter if we lose the commit flag.)
*
* Note we only need a shared lock.
*/
madeTCentries = (MyLastRecPtr.xrecoff != 0);
if (madeTCentries)
LWLockAcquire(CheckpointStartLock, LW_SHARED);
/*
* We only need to log the commit in XLOG if the transaction made any
* transaction-controlled XLOG entries or will delete files.
*/
madeTCentries = (MyLastRecPtr.xrecoff != 0);
if (madeTCentries || nrels > 0)
{
XLogRecData rdata[3];
int lastrdata = 0;
xl_xact_commit xlrec;
/*
* Mark ourselves as within our "commit critical section". This
* forces any concurrent checkpoint to wait until we've updated
* pg_clog. Without this, it is possible for the checkpoint to
* set REDO after the XLOG record but fail to flush the pg_clog
* update to disk, leading to loss of the transaction commit if
* the system crashes a little later.
*
* Note: we could, but don't bother to, set this flag in
* RecordTransactionAbort. That's because loss of a transaction
* abort is noncritical; the presumption would be that it aborted,
* anyway.
*
* It's safe to change the inCommit flag of our own backend
* without holding the ProcArrayLock, since we're the only one
* modifying it. This makes checkpoint's determination of which
* xacts are inCommit a bit fuzzy, but it doesn't matter.
*/
MyProc->inCommit = true;
xlrec.xtime = time(NULL);
xlrec.nrels = nrels;
xlrec.nsubxacts = nchildren;
......@@ -825,9 +827,8 @@ RecordTransactionCommit(void)
TransactionIdCommitTree(nchildren, children);
}
/* Unlock checkpoint lock if we acquired it */
if (madeTCentries)
LWLockRelease(CheckpointStartLock);
/* Checkpoint can proceed now */
MyProc->inCommit = false;
END_CRIT_SECTION();
}
......@@ -1961,6 +1962,7 @@ AbortTransaction(void)
MyProc->xid = InvalidTransactionId;
MyProc->xmin = InvalidTransactionId;
MyProc->inVacuum = false; /* must be cleared with xid/xmin */
MyProc->inCommit = false; /* be sure this gets cleared */
/* Clear the subtransaction-XID cache too while holding the lock */
MyProc->subxids.nxids = 0;
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.266 2007/04/03 04:14:26 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.267 2007/04/03 16:34:35 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -5366,6 +5366,8 @@ CreateCheckPoint(bool shutdown, bool force)
int nsegsadded = 0;
int nsegsremoved = 0;
int nsegsrecycled = 0;
TransactionId *inCommitXids;
int nInCommit;
/*
* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
......@@ -5392,14 +5394,9 @@ CreateCheckPoint(bool shutdown, bool force)
checkPoint.time = time(NULL);
/*
* We must hold CheckpointStartLock while determining the checkpoint REDO
* pointer. This ensures that any concurrent transaction commits will be
* either not yet logged, or logged and recorded in pg_clog. See notes in
* RecordTransactionCommit().
* We must hold WALInsertLock while examining insert state to determine
* the checkpoint REDO pointer.
*/
LWLockAcquire(CheckpointStartLock, LW_EXCLUSIVE);
/* And we need WALInsertLock too */
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
/*
......@@ -5431,7 +5428,6 @@ CreateCheckPoint(bool shutdown, bool force)
ControlFile->checkPointCopy.redo.xrecoff)
{
LWLockRelease(WALInsertLock);
LWLockRelease(CheckpointStartLock);
LWLockRelease(CheckpointLock);
END_CRIT_SECTION();
return;
......@@ -5476,17 +5472,48 @@ CreateCheckPoint(bool shutdown, bool force)
}
/*
* Now we can release insert lock and checkpoint start lock, allowing
* other xacts to proceed even while we are flushing disk buffers.
* Now we can release WAL insert lock, allowing other xacts to proceed
* while we are flushing disk buffers.
*/
LWLockRelease(WALInsertLock);
LWLockRelease(CheckpointStartLock);
if (!shutdown)
ereport(DEBUG2,
(errmsg("checkpoint starting")));
/*
* Before flushing data, we must wait for any transactions that are
* currently in their commit critical sections. If an xact inserted its
* commit record into XLOG just before the REDO point, then a crash
* restart from the REDO point would not replay that record, which means
* that our flushing had better include the xact's update of pg_clog. So
* we wait till he's out of his commit critical section before proceeding.
* See notes in RecordTransactionCommit().
*
* Because we've already released WALInsertLock, this test is a bit fuzzy:
* it is possible that we will wait for xacts we didn't really need to
* wait for. But the delay should be short and it seems better to make
* checkpoint take a bit longer than to hold locks longer than necessary.
* (In fact, the whole reason we have this issue is that xact.c does
* commit record XLOG insertion and clog update as two separate steps
* protected by different locks, but again that seems best on grounds
* of minimizing lock contention.)
*
* A transaction that has not yet set inCommit when we look cannot be
* at risk, since he's not inserted his commit record yet; and one that's
* already cleared it is not at risk either, since he's done fixing clog
* and we will correctly flush the update below. So we cannot miss any
* xacts we need to wait for.
*/
nInCommit = GetTransactionsInCommit(&inCommitXids);
if (nInCommit > 0)
{
do {
pg_usleep(10000L); /* wait for 10 msec */
} while (HaveTransactionsInCommit(inCommitXids, nInCommit));
}
pfree(inCommitXids);
/*
* Get the other info we need for the checkpoint record.
*/
......
......@@ -23,7 +23,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.23 2007/03/25 19:45:14 tgl Exp $
* $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.24 2007/04/03 16:34:36 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -739,6 +739,98 @@ restart:
return (num != 0);
}
/*
* GetTransactionsInCommit -- Get the XIDs of transactions that are committing
*
* Constructs an array of XIDs of transactions that are currently in commit
* critical sections, as shown by having inCommit set in their PGPROC entries.
*
* *xids_p is set to a palloc'd array that should be freed by the caller.
* The return value is the number of valid entries.
*
* Note that because backends set or clear inCommit without holding any lock,
* the result is somewhat indeterminate, but we don't really care. Even in
* a multiprocessor with delayed writes to shared memory, it should be certain
* that setting of inCommit will propagate to shared memory when the backend
* takes the WALInsertLock, so we cannot fail to see an xact as inCommit if
* it's already inserted its commit record. Whether it takes a little while
* for clearing of inCommit to propagate is unimportant for correctness.
*/
int
GetTransactionsInCommit(TransactionId **xids_p)
{
ProcArrayStruct *arrayP = procArray;
TransactionId *xids;
int nxids;
int index;
xids = (TransactionId *) palloc(arrayP->maxProcs * sizeof(TransactionId));
nxids = 0;
LWLockAcquire(ProcArrayLock, LW_SHARED);
for (index = 0; index < arrayP->numProcs; index++)
{
PGPROC *proc = arrayP->procs[index];
/* Fetch xid just once - see GetNewTransactionId */
TransactionId pxid = proc->xid;
if (proc->inCommit && TransactionIdIsValid(pxid))
xids[nxids++] = pxid;
}
LWLockRelease(ProcArrayLock);
*xids_p = xids;
return nxids;
}
/*
* HaveTransactionsInCommit -- Are any of the specified XIDs in commit?
*
* This is used with the results of GetTransactionsInCommit to see if any
* of the specified XIDs are still in their commit critical sections.
*
* Note: this is O(N^2) in the number of xacts that are/were in commit, but
* those numbers should be small enough for it not to be a problem.
*/
bool
HaveTransactionsInCommit(TransactionId *xids, int nxids)
{
bool result = false;
ProcArrayStruct *arrayP = procArray;
int index;
LWLockAcquire(ProcArrayLock, LW_SHARED);
for (index = 0; index < arrayP->numProcs; index++)
{
PGPROC *proc = arrayP->procs[index];
/* Fetch xid just once - see GetNewTransactionId */
TransactionId pxid = proc->xid;
if (proc->inCommit && TransactionIdIsValid(pxid))
{
int i;
for (i = 0; i < nxids; i++)
{
if (xids[i] == pxid)
{
result = true;
break;
}
}
if (result)
break;
}
}
LWLockRelease(ProcArrayLock);
return result;
}
/*
* BackendPidGetProc -- get a backend's PGPROC given its PID
*
......
......@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.186 2007/03/07 13:35:03 alvherre Exp $
* $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.187 2007/04/03 16:34:36 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -259,6 +259,7 @@ InitProcess(void)
/* databaseId and roleId will be filled in later */
MyProc->databaseId = InvalidOid;
MyProc->roleId = InvalidOid;
MyProc->inCommit = false;
MyProc->inVacuum = false;
MyProc->isAutovacuum = IsAutoVacuumWorkerProcess();
MyProc->lwWaiting = false;
......@@ -392,6 +393,7 @@ InitAuxiliaryProcess(void)
MyProc->xmin = InvalidTransactionId;
MyProc->databaseId = InvalidOid;
MyProc->roleId = InvalidOid;
MyProc->inCommit = false;
MyProc->inVacuum = false;
MyProc->isAutovacuum = IsAutoVacuumLauncherProcess(); /* is this needed? */
MyProc->lwWaiting = false;
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.34 2007/02/15 23:23:23 alvherre Exp $
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.35 2007/04/03 16:34:36 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -49,7 +49,6 @@ typedef enum LWLockId
WALWriteLock,
ControlFileLock,
CheckpointLock,
CheckpointStartLock,
CLogControlLock,
SubtransControlLock,
MultiXactGenLock,
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.96 2007/03/07 13:35:03 alvherre Exp $
* $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.97 2007/04/03 16:34:36 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -74,6 +74,8 @@ struct PGPROC
Oid databaseId; /* OID of database this backend is using */
Oid roleId; /* OID of role using this backend */
bool inCommit; /* true if within commit critical section */
bool inVacuum; /* true if current xact is a LAZY VACUUM */
bool isAutovacuum; /* true if it's autovacuum */
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.12 2007/01/16 13:28:57 alvherre Exp $
* $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.13 2007/04/03 16:34:36 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -26,6 +26,9 @@ extern bool TransactionIdIsInProgress(TransactionId xid);
extern bool TransactionIdIsActive(TransactionId xid);
extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum);
extern int GetTransactionsInCommit(TransactionId **xids_p);
extern bool HaveTransactionsInCommit(TransactionId *xids, int nxids);
extern PGPROC *BackendPidGetProc(int pid);
extern int BackendXidGetPid(TransactionId xid);
extern bool IsBackendPid(int pid);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment