Commit 3fdf649f authored by Tom Lane

Fix failure to guarantee that a checkpoint will write out pg_clog updates
for transaction commits that occurred just before the checkpoint.  This is
an EXTREMELY serious bug --- kudos to Satoshi Okada for creating a
reproducible test case to prove its existence.
parent bc8a1fc2
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.177 2004/08/03 15:57:26 tgl Exp $ * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.178 2004/08/11 04:07:15 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -574,13 +574,28 @@ RecordTransactionCommit(void) ...@@ -574,13 +574,28 @@ RecordTransactionCommit(void)
START_CRIT_SECTION(); START_CRIT_SECTION();
/* /*
* We only need to log the commit in XLOG if the transaction made * If our transaction made any transaction-controlled XLOG entries,
* any transaction-controlled XLOG entries or will delete files. * we need to lock out checkpoint start between writing our XLOG
* record and updating pg_clog. Otherwise it is possible for the
* checkpoint to set REDO after the XLOG record but fail to flush the
* pg_clog update to disk, leading to loss of the transaction commit
* if we crash a little later. Slightly klugy fix for problem
* discovered 2004-08-10.
*
* (If it made no transaction-controlled XLOG entries, its XID * (If it made no transaction-controlled XLOG entries, its XID
* appears nowhere in permanent storage, so no one else will ever care * appears nowhere in permanent storage, so no one else will ever care
* if it committed.) * if it committed; so it doesn't matter if we lose the commit flag.)
*
* Note we only need a shared lock.
*/ */
madeTCentries = (MyLastRecPtr.xrecoff != 0); madeTCentries = (MyLastRecPtr.xrecoff != 0);
if (madeTCentries)
LWLockAcquire(CheckpointStartLock, LW_SHARED);
/*
* We only need to log the commit in XLOG if the transaction made
* any transaction-controlled XLOG entries or will delete files.
*/
if (madeTCentries || nrels > 0) if (madeTCentries || nrels > 0)
{ {
XLogRecData rdata[3]; XLogRecData rdata[3];
...@@ -668,6 +683,10 @@ RecordTransactionCommit(void) ...@@ -668,6 +683,10 @@ RecordTransactionCommit(void)
TransactionIdCommitTree(nchildren, children); TransactionIdCommitTree(nchildren, children);
} }
/* Unlock checkpoint lock if we acquired it */
if (madeTCentries)
LWLockRelease(CheckpointStartLock);
END_CRIT_SECTION(); END_CRIT_SECTION();
} }
...@@ -850,6 +869,8 @@ RecordTransactionAbort(void) ...@@ -850,6 +869,8 @@ RecordTransactionAbort(void)
* *
* We do not flush XLOG to disk unless deleting files, since the * We do not flush XLOG to disk unless deleting files, since the
* default assumption after a crash would be that we aborted, anyway. * default assumption after a crash would be that we aborted, anyway.
* For the same reason, we don't need to worry about interlocking
* against checkpoint start.
*/ */
if (MyLastRecPtr.xrecoff != 0 || nrels > 0) if (MyLastRecPtr.xrecoff != 0 || nrels > 0)
{ {
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.158 2004/08/09 16:26:01 tgl Exp $ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.159 2004/08/11 04:07:15 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -4699,6 +4699,15 @@ CreateCheckPoint(bool shutdown, bool force) ...@@ -4699,6 +4699,15 @@ CreateCheckPoint(bool shutdown, bool force)
checkPoint.ThisTimeLineID = ThisTimeLineID; checkPoint.ThisTimeLineID = ThisTimeLineID;
checkPoint.time = time(NULL); checkPoint.time = time(NULL);
/*
* We must hold CheckpointStartLock while determining the checkpoint
* REDO pointer. This ensures that any concurrent transaction commits
* will be either not yet logged, or logged and recorded in pg_clog.
* See notes in RecordTransactionCommit().
*/
LWLockAcquire(CheckpointStartLock, LW_EXCLUSIVE);
/* And we need WALInsertLock too */
LWLockAcquire(WALInsertLock, LW_EXCLUSIVE); LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
/* /*
...@@ -4731,6 +4740,7 @@ CreateCheckPoint(bool shutdown, bool force) ...@@ -4731,6 +4740,7 @@ CreateCheckPoint(bool shutdown, bool force)
ControlFile->checkPointCopy.redo.xrecoff) ControlFile->checkPointCopy.redo.xrecoff)
{ {
LWLockRelease(WALInsertLock); LWLockRelease(WALInsertLock);
LWLockRelease(CheckpointStartLock);
LWLockRelease(CheckpointLock); LWLockRelease(CheckpointLock);
END_CRIT_SECTION(); END_CRIT_SECTION();
return; return;
...@@ -4789,6 +4799,9 @@ CreateCheckPoint(bool shutdown, bool force) ...@@ -4789,6 +4799,9 @@ CreateCheckPoint(bool shutdown, bool force)
* GetSnapshotData needs to get XidGenLock while holding SInvalLock, * GetSnapshotData needs to get XidGenLock while holding SInvalLock,
* so there's a risk of deadlock. Need to find a better solution. See * so there's a risk of deadlock. Need to find a better solution. See
* pgsql-hackers discussion of 17-Dec-01. * pgsql-hackers discussion of 17-Dec-01.
*
* XXX actually, the whole UNDO code is dead code and unlikely to ever
* be revived, so the lack of a good solution here is not troubling.
*/ */
#ifdef NOT_USED #ifdef NOT_USED
checkPoint.undo = GetUndoRecPtr(); checkPoint.undo = GetUndoRecPtr();
...@@ -4798,11 +4811,13 @@ CreateCheckPoint(bool shutdown, bool force) ...@@ -4798,11 +4811,13 @@ CreateCheckPoint(bool shutdown, bool force)
#endif #endif
/* /*
* Now we can release insert lock, allowing other xacts to proceed * Now we can release insert lock and checkpoint start lock, allowing
* even while we are flushing disk buffers. * other xacts to proceed even while we are flushing disk buffers.
*/ */
LWLockRelease(WALInsertLock); LWLockRelease(WALInsertLock);
LWLockRelease(CheckpointStartLock);
/* /*
* Get the other info we need for the checkpoint record. * Get the other info we need for the checkpoint record.
*/ */
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.12 2004/06/11 16:43:24 tgl Exp $ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.13 2004/08/11 04:07:16 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -36,6 +36,7 @@ typedef enum LWLockId ...@@ -36,6 +36,7 @@ typedef enum LWLockId
WALWriteLock, WALWriteLock,
ControlFileLock, ControlFileLock,
CheckpointLock, CheckpointLock,
CheckpointStartLock,
RelCacheInitLock, RelCacheInitLock,
BgWriterCommLock, BgWriterCommLock,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment