Commit bbace569 authored by Robert Haas

Fix possible recovery trouble if TRUNCATE overlaps a checkpoint.

If TRUNCATE causes some buffers to be invalidated and thus the
checkpoint does not flush them, TRUNCATE must also ensure that the
corresponding files are truncated on disk. Otherwise, a replay
from the checkpoint might find that the buffers exist but have
the wrong contents, which may cause replay to fail.

Report by Teja Mupparti. Patch by Kyotaro Horiguchi, per a design
suggestion from Heikki Linnakangas, with some changes to the
comments by me. Review of this and a prior patch that approached
the issue differently by Heikki Linnakangas, Andres Freund, Álvaro
Herrera, Masahiko Sawada, and Tom Lane.

Discussion: http://postgr.es/m/BYAPR06MB6373BF50B469CA393C614257ABF00@BYAPR06MB6373.namprd06.prod.outlook.com
parent 81045e1e
...@@ -3075,8 +3075,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) ...@@ -3075,8 +3075,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
* crash/basebackup, even though the state of the data directory would * crash/basebackup, even though the state of the data directory would
* require it. * require it.
*/ */
Assert(!MyProc->delayChkpt); Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
MyProc->delayChkpt = true; MyProc->delayChkpt |= DELAY_CHKPT_START;
/* WAL log truncation */ /* WAL log truncation */
WriteMTruncateXlogRec(newOldestMultiDB, WriteMTruncateXlogRec(newOldestMultiDB,
...@@ -3102,7 +3102,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) ...@@ -3102,7 +3102,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
/* Then offsets */ /* Then offsets */
PerformOffsetsTruncation(oldestMulti, newOldestMulti); PerformOffsetsTruncation(oldestMulti, newOldestMulti);
MyProc->delayChkpt = false; MyProc->delayChkpt &= ~DELAY_CHKPT_START;
END_CRIT_SECTION(); END_CRIT_SECTION();
LWLockRelease(MultiXactTruncationLock); LWLockRelease(MultiXactTruncationLock);
......
...@@ -474,7 +474,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, ...@@ -474,7 +474,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
} }
proc->xid = xid; proc->xid = xid;
Assert(proc->xmin == InvalidTransactionId); Assert(proc->xmin == InvalidTransactionId);
proc->delayChkpt = false; proc->delayChkpt = 0;
proc->statusFlags = 0; proc->statusFlags = 0;
proc->pid = 0; proc->pid = 0;
proc->databaseId = databaseid; proc->databaseId = databaseid;
...@@ -1165,7 +1165,8 @@ EndPrepare(GlobalTransaction gxact) ...@@ -1165,7 +1165,8 @@ EndPrepare(GlobalTransaction gxact)
START_CRIT_SECTION(); START_CRIT_SECTION();
MyProc->delayChkpt = true; Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
MyProc->delayChkpt |= DELAY_CHKPT_START;
XLogBeginInsert(); XLogBeginInsert();
for (record = records.head; record != NULL; record = record->next) for (record = records.head; record != NULL; record = record->next)
...@@ -1208,7 +1209,7 @@ EndPrepare(GlobalTransaction gxact) ...@@ -1208,7 +1209,7 @@ EndPrepare(GlobalTransaction gxact)
* checkpoint starting after this will certainly see the gxact as a * checkpoint starting after this will certainly see the gxact as a
* candidate for fsyncing. * candidate for fsyncing.
*/ */
MyProc->delayChkpt = false; MyProc->delayChkpt &= ~DELAY_CHKPT_START;
/* /*
* Remember that we have this GlobalTransaction entry locked for us. If * Remember that we have this GlobalTransaction entry locked for us. If
...@@ -2275,7 +2276,8 @@ RecordTransactionCommitPrepared(TransactionId xid, ...@@ -2275,7 +2276,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
START_CRIT_SECTION(); START_CRIT_SECTION();
/* See notes in RecordTransactionCommit */ /* See notes in RecordTransactionCommit */
MyProc->delayChkpt = true; Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
MyProc->delayChkpt |= DELAY_CHKPT_START;
/* /*
* Emit the XLOG commit record. Note that we mark 2PC commits as * Emit the XLOG commit record. Note that we mark 2PC commits as
...@@ -2323,7 +2325,7 @@ RecordTransactionCommitPrepared(TransactionId xid, ...@@ -2323,7 +2325,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
TransactionIdCommitTree(xid, nchildren, children); TransactionIdCommitTree(xid, nchildren, children);
/* Checkpoint can proceed now */ /* Checkpoint can proceed now */
MyProc->delayChkpt = false; MyProc->delayChkpt &= ~DELAY_CHKPT_START;
END_CRIT_SECTION(); END_CRIT_SECTION();
......
...@@ -1335,8 +1335,9 @@ RecordTransactionCommit(void) ...@@ -1335,8 +1335,9 @@ RecordTransactionCommit(void)
* This makes checkpoint's determination of which xacts are delayChkpt * This makes checkpoint's determination of which xacts are delayChkpt
* a bit fuzzy, but it doesn't matter. * a bit fuzzy, but it doesn't matter.
*/ */
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
START_CRIT_SECTION(); START_CRIT_SECTION();
MyProc->delayChkpt = true; MyProc->delayChkpt |= DELAY_CHKPT_START;
SetCurrentTransactionStopTimestamp(); SetCurrentTransactionStopTimestamp();
...@@ -1437,7 +1438,7 @@ RecordTransactionCommit(void) ...@@ -1437,7 +1438,7 @@ RecordTransactionCommit(void)
*/ */
if (markXidCommitted) if (markXidCommitted)
{ {
MyProc->delayChkpt = false; MyProc->delayChkpt &= ~DELAY_CHKPT_START;
END_CRIT_SECTION(); END_CRIT_SECTION();
} }
......
...@@ -9228,18 +9228,30 @@ CreateCheckPoint(int flags) ...@@ -9228,18 +9228,30 @@ CreateCheckPoint(int flags)
* and we will correctly flush the update below. So we cannot miss any * and we will correctly flush the update below. So we cannot miss any
* xacts we need to wait for. * xacts we need to wait for.
*/ */
vxids = GetVirtualXIDsDelayingChkpt(&nvxids); vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
if (nvxids > 0) if (nvxids > 0)
{ {
do do
{ {
pg_usleep(10000L); /* wait for 10 msec */ pg_usleep(10000L); /* wait for 10 msec */
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids)); } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
DELAY_CHKPT_START));
} }
pfree(vxids); pfree(vxids);
CheckPointGuts(checkPoint.redo, flags); CheckPointGuts(checkPoint.redo, flags);
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
if (nvxids > 0)
{
do
{
pg_usleep(10000L); /* wait for 10 msec */
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
DELAY_CHKPT_COMPLETE));
}
pfree(vxids);
/* /*
* Take a snapshot of running transactions and write this to WAL. This * Take a snapshot of running transactions and write this to WAL. This
* allows us to reconstruct the state of running transactions during * allows us to reconstruct the state of running transactions during
......
...@@ -925,7 +925,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) ...@@ -925,7 +925,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
/* /*
* Ensure no checkpoint can change our view of RedoRecPtr. * Ensure no checkpoint can change our view of RedoRecPtr.
*/ */
Assert(MyProc->delayChkpt); Assert((MyProc->delayChkpt & DELAY_CHKPT_START) != 0);
/* /*
* Update RedoRecPtr so that we can make the right decision * Update RedoRecPtr so that we can make the right decision
......
...@@ -325,6 +325,22 @@ RelationTruncate(Relation rel, BlockNumber nblocks) ...@@ -325,6 +325,22 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
RelationPreTruncate(rel); RelationPreTruncate(rel);
/*
* Make sure that a concurrent checkpoint can't complete while truncation
* is in progress.
*
* The truncation operation might drop buffers that the checkpoint
* otherwise would have flushed. If it does, then it's essential that
* the files actually get truncated on disk before the checkpoint record
* is written. Otherwise, if replay begins from that checkpoint, the
* to-be-truncated blocks might still exist on disk but have older
* contents than expected, which can cause replay to fail. It's OK for
* the blocks to not exist on disk at all, but not for them to have the
* wrong contents.
*/
Assert((MyProc->delayChkpt & DELAY_CHKPT_COMPLETE) == 0);
MyProc->delayChkpt |= DELAY_CHKPT_COMPLETE;
/* /*
* We WAL-log the truncation before actually truncating, which means * We WAL-log the truncation before actually truncating, which means
* trouble if the truncation fails. If we then crash, the WAL replay * trouble if the truncation fails. If we then crash, the WAL replay
...@@ -363,13 +379,24 @@ RelationTruncate(Relation rel, BlockNumber nblocks) ...@@ -363,13 +379,24 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
XLogFlush(lsn); XLogFlush(lsn);
} }
/* Do the real work to truncate relation forks */ /*
* This will first remove any buffers from the buffer pool that should no
* longer exist after truncation is complete, and then truncate the
* corresponding files on disk.
*/
smgrtruncate(rel->rd_smgr, forks, nforks, blocks); smgrtruncate(rel->rd_smgr, forks, nforks, blocks);
/* We've done all the critical work, so checkpoints are OK now. */
MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
/* /*
* Update upper-level FSM pages to account for the truncation. This is * Update upper-level FSM pages to account for the truncation. This is
* important because the just-truncated pages were likely marked as * important because the just-truncated pages were likely marked as
* all-free, and would be preferentially selected. * all-free, and would be preferentially selected.
*
* NB: There's no point in delaying checkpoints until this is done.
* Because the FSM is not WAL-logged, we have to be prepared for the
* possibility of corruption after a crash anyway.
*/ */
if (need_fsm_vacuum) if (need_fsm_vacuum)
FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber); FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
......
...@@ -3946,7 +3946,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) ...@@ -3946,7 +3946,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
* essential that CreateCheckpoint waits for virtual transactions * essential that CreateCheckpoint waits for virtual transactions
* rather than full transactionids. * rather than full transactionids.
*/ */
MyProc->delayChkpt = delayChkpt = true; Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
MyProc->delayChkpt |= DELAY_CHKPT_START;
delayChkpt = true;
lsn = XLogSaveBufferForHint(buffer, buffer_std); lsn = XLogSaveBufferForHint(buffer, buffer_std);
} }
...@@ -3979,7 +3981,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) ...@@ -3979,7 +3981,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
UnlockBufHdr(bufHdr, buf_state); UnlockBufHdr(bufHdr, buf_state);
if (delayChkpt) if (delayChkpt)
MyProc->delayChkpt = false; MyProc->delayChkpt &= ~DELAY_CHKPT_START;
if (dirtied) if (dirtied)
{ {
......
...@@ -689,7 +689,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) ...@@ -689,7 +689,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
proc->lxid = InvalidLocalTransactionId; proc->lxid = InvalidLocalTransactionId;
proc->xmin = InvalidTransactionId; proc->xmin = InvalidTransactionId;
proc->delayChkpt = false; /* be sure this is cleared in abort */
/* be sure this is cleared in abort */
proc->delayChkpt = 0;
proc->recoveryConflictPending = false; proc->recoveryConflictPending = false;
/* must be cleared with xid/xmin: */ /* must be cleared with xid/xmin: */
...@@ -728,7 +731,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) ...@@ -728,7 +731,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
proc->xid = InvalidTransactionId; proc->xid = InvalidTransactionId;
proc->lxid = InvalidLocalTransactionId; proc->lxid = InvalidLocalTransactionId;
proc->xmin = InvalidTransactionId; proc->xmin = InvalidTransactionId;
proc->delayChkpt = false; /* be sure this is cleared in abort */
/* be sure this is cleared in abort */
proc->delayChkpt = 0;
proc->recoveryConflictPending = false; proc->recoveryConflictPending = false;
/* must be cleared with xid/xmin: */ /* must be cleared with xid/xmin: */
...@@ -3043,7 +3049,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) ...@@ -3043,7 +3049,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
* delaying checkpoint because they have critical actions in progress. * delaying checkpoint because they have critical actions in progress.
* *
* Constructs an array of VXIDs of transactions that are currently in commit * Constructs an array of VXIDs of transactions that are currently in commit
* critical sections, as shown by having delayChkpt set in their PGPROC. * critical sections, as shown by having specified delayChkpt bits set in their
* PGPROC.
* *
* Returns a palloc'd array that should be freed by the caller. * Returns a palloc'd array that should be freed by the caller.
* *nvxids is the number of valid entries. * *nvxids is the number of valid entries.
...@@ -3057,13 +3064,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) ...@@ -3057,13 +3064,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
* for clearing of delayChkpt to propagate is unimportant for correctness. * for clearing of delayChkpt to propagate is unimportant for correctness.
*/ */
VirtualTransactionId * VirtualTransactionId *
GetVirtualXIDsDelayingChkpt(int *nvxids) GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
{ {
VirtualTransactionId *vxids; VirtualTransactionId *vxids;
ProcArrayStruct *arrayP = procArray; ProcArrayStruct *arrayP = procArray;
int count = 0; int count = 0;
int index; int index;
Assert(type != 0);
/* allocate what's certainly enough result space */ /* allocate what's certainly enough result space */
vxids = (VirtualTransactionId *) vxids = (VirtualTransactionId *)
palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
...@@ -3075,7 +3084,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids) ...@@ -3075,7 +3084,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
int pgprocno = arrayP->pgprocnos[index]; int pgprocno = arrayP->pgprocnos[index];
PGPROC *proc = &allProcs[pgprocno]; PGPROC *proc = &allProcs[pgprocno];
if (proc->delayChkpt) if ((proc->delayChkpt & type) != 0)
{ {
VirtualTransactionId vxid; VirtualTransactionId vxid;
...@@ -3101,12 +3110,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids) ...@@ -3101,12 +3110,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
* those numbers should be small enough for it not to be a problem. * those numbers should be small enough for it not to be a problem.
*/ */
bool bool
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids) HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
{ {
bool result = false; bool result = false;
ProcArrayStruct *arrayP = procArray; ProcArrayStruct *arrayP = procArray;
int index; int index;
Assert(type != 0);
LWLockAcquire(ProcArrayLock, LW_SHARED); LWLockAcquire(ProcArrayLock, LW_SHARED);
for (index = 0; index < arrayP->numProcs; index++) for (index = 0; index < arrayP->numProcs; index++)
...@@ -3117,7 +3128,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids) ...@@ -3117,7 +3128,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
GET_VXID_FROM_PGPROC(vxid, *proc); GET_VXID_FROM_PGPROC(vxid, *proc);
if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid)) if ((proc->delayChkpt & type) != 0 &&
VirtualTransactionIdIsValid(vxid))
{ {
int i; int i;
......
...@@ -394,7 +394,7 @@ InitProcess(void) ...@@ -394,7 +394,7 @@ InitProcess(void)
MyProc->roleId = InvalidOid; MyProc->roleId = InvalidOid;
MyProc->tempNamespaceId = InvalidOid; MyProc->tempNamespaceId = InvalidOid;
MyProc->isBackgroundWorker = IsBackgroundWorker; MyProc->isBackgroundWorker = IsBackgroundWorker;
MyProc->delayChkpt = false; MyProc->delayChkpt = 0;
MyProc->statusFlags = 0; MyProc->statusFlags = 0;
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */ /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
if (IsAutoVacuumWorkerProcess()) if (IsAutoVacuumWorkerProcess())
...@@ -579,7 +579,7 @@ InitAuxiliaryProcess(void) ...@@ -579,7 +579,7 @@ InitAuxiliaryProcess(void)
MyProc->roleId = InvalidOid; MyProc->roleId = InvalidOid;
MyProc->tempNamespaceId = InvalidOid; MyProc->tempNamespaceId = InvalidOid;
MyProc->isBackgroundWorker = IsBackgroundWorker; MyProc->isBackgroundWorker = IsBackgroundWorker;
MyProc->delayChkpt = false; MyProc->delayChkpt = 0;
MyProc->statusFlags = 0; MyProc->statusFlags = 0;
MyProc->lwWaiting = false; MyProc->lwWaiting = false;
MyProc->lwWaitMode = 0; MyProc->lwWaitMode = 0;
......
...@@ -86,6 +86,41 @@ struct XidCache ...@@ -86,6 +86,41 @@ struct XidCache
*/ */
#define INVALID_PGPROCNO PG_INT32_MAX #define INVALID_PGPROCNO PG_INT32_MAX
/*
* Flags for PGPROC.delayChkpt
*
* These flags can be used to delay the start or completion of a checkpoint
* for short periods. A flag is in effect if the corresponding bit is set in
* the PGPROC of any backend.
*
* For our purposes here, a checkpoint has three phases: (1) determine the
* location to which the redo pointer will be moved, (2) write all the
* data durably to disk, and (3) WAL-log the checkpoint.
*
* Setting DELAY_CHKPT_START prevents the system from moving from phase 1
* to phase 2. This is useful when we are performing a WAL-logged modification
* of data that will be flushed to disk in phase 2. By setting this flag
* before writing WAL and clearing it after we've both written WAL and
* performed the corresponding modification, we ensure that if the WAL record
* is inserted prior to the new redo point, the corresponding data changes will
* also be flushed to disk before the checkpoint can complete. (In the
* extremely common case where the data being modified is in shared buffers
* and we acquire an exclusive content lock on the relevant buffers before
* writing WAL, this mechanism is not needed, because phase 2 will block
* until we release the content lock and then flush the modified data to
* disk.)
*
* Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
* to phase 3. This is useful if we are performing a WAL-logged operation that
* might invalidate buffers, such as relation truncation. In this case, we need
* to ensure that any buffers which were invalidated and thus not flushed by
* the checkpoint are actually destroyed on disk. Replay can cope with a file
* or block that doesn't exist, but not with a block that has the wrong
* contents.
*/
#define DELAY_CHKPT_START (1<<0)
#define DELAY_CHKPT_COMPLETE (1<<1)
typedef enum typedef enum
{ {
PROC_WAIT_STATUS_OK, PROC_WAIT_STATUS_OK,
...@@ -191,7 +226,7 @@ struct PGPROC ...@@ -191,7 +226,7 @@ struct PGPROC
pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition
* started */ * started */
bool delayChkpt; /* true if this proc delays checkpoint start */ int delayChkpt; /* for DELAY_CHKPT_* flags */
uint8 statusFlags; /* this backend's status flags, see PROC_* uint8 statusFlags; /* this backend's status flags, see PROC_*
* above. mirrored in * above. mirrored in
......
...@@ -59,8 +59,9 @@ extern TransactionId GetOldestActiveTransactionId(void); ...@@ -59,8 +59,9 @@ extern TransactionId GetOldestActiveTransactionId(void);
extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin); extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin);
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids); extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type);
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids); extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
int nvxids, int type);
extern PGPROC *BackendPidGetProc(int pid); extern PGPROC *BackendPidGetProc(int pid);
extern PGPROC *BackendPidGetProcWithLock(int pid); extern PGPROC *BackendPidGetProcWithLock(int pid);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment