Commit 4f627f89 authored by Andres Freund's avatar Andres Freund

Rework the way multixact truncations work.

The fact that multixact truncations are not WAL logged has caused a fair
share of problems. Among other things, it requires doing computations
during recovery while the database is not in a consistent state, delaying
truncations until checkpoints, and handling the case where members have
been truncated but offsets have not.

We tried to put bandaids on lots of these issues over the last years,
but it seems time to change course. Thus this patch introduces WAL
logging for multixact truncations.

This allows:
1) to perform the truncation directly during VACUUM, instead of delaying it
   to the checkpoint.
2) to avoid looking at the offsets SLRU for truncation during recovery,
   we can just use the master's values.
3) to simplify a fair amount of the logic that keeps the in-memory limits
   straight; this has gotten much easier.

During the course of fixing this a bunch of additional bugs had to be
fixed:
1) Data was not purged from the in-memory buffers of the members SLRU
   before deleting segments. This happened to be hard or impossible to hit
   due to the interlock between checkpoints and truncation.
2) find_multixact_start() relied on SimpleLruDoesPhysicalPageExist - but
   that doesn't work for offsets that haven't yet been flushed to
   disk. Add code to flush the SLRUs to fix. Not pretty, but it feels
   slightly safer to only make decisions based on actual on-disk state.
3) find_multixact_start() could be called concurrently with a truncation
   and thus fail. Via SetOffsetVacuumLimit() that could lead to a round
   of emergency vacuuming. The problem remains in
   pg_get_multixact_members(), but that's quite harmless.

For now this is going to only get applied to 9.5+, leaving the issues in
the older branches in place. It is quite possible that we need to
backpatch at a later point though.

In case this gets backpatched, we need to handle the situation where an
updated standby may be replaying WAL from a not-yet-upgraded primary. We
have to recognize that situation and use "old style" truncation (i.e.
looking at the SLRUs) during WAL replay. In contrast to before, this now
happens in the startup process, when replaying a checkpoint record, instead
of in the checkpointer. Doing truncation in the restartpoint is incorrect:
restartpoints can happen much later than the original checkpoint, thereby
leading to wraparound. To avoid "multixact_redo: unknown op code 48" errors,
standbys would have to be upgraded before primaries.

A later patch will bump the WAL page magic, and remove the legacy
truncation codepaths. Legacy truncation support is just included to make
a possible future backpatch easier.

Discussion: 20150621192409.GA4797@alap3.anarazel.de
Reviewed-By: Robert Haas, Alvaro Herrera, Thomas Munro
Backpatch: 9.5 for now
parent 2abfd9d5
...@@ -70,6 +70,14 @@ multixact_desc(StringInfo buf, XLogReaderState *record) ...@@ -70,6 +70,14 @@ multixact_desc(StringInfo buf, XLogReaderState *record)
for (i = 0; i < xlrec->nmembers; i++) for (i = 0; i < xlrec->nmembers; i++)
out_member(buf, &xlrec->members[i]); out_member(buf, &xlrec->members[i]);
} }
else if (info == XLOG_MULTIXACT_TRUNCATE_ID)
{
xl_multixact_truncate *xlrec = (xl_multixact_truncate *) rec;
appendStringInfo(buf, "offsets [%u, %u), members [%u, %u)",
xlrec->startTruncOff, xlrec->endTruncOff,
xlrec->startTruncMemb, xlrec->endTruncMemb);
}
} }
const char * const char *
...@@ -88,6 +96,9 @@ multixact_identify(uint8 info) ...@@ -88,6 +96,9 @@ multixact_identify(uint8 info)
case XLOG_MULTIXACT_CREATE_ID: case XLOG_MULTIXACT_CREATE_ID:
id = "CREATE_ID"; id = "CREATE_ID";
break; break;
case XLOG_MULTIXACT_TRUNCATE_ID:
id = "TRUNCATE_ID";
break;
} }
return id; return id;
......
This diff is collapsed.
...@@ -134,6 +134,7 @@ static int SlruSelectLRUPage(SlruCtl ctl, int pageno); ...@@ -134,6 +134,7 @@ static int SlruSelectLRUPage(SlruCtl ctl, int pageno);
static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
int segpage, void *data); int segpage, void *data);
static void SlruInternalDeleteSegment(SlruCtl ctl, char *filename);
/* /*
* Initialization of shared memory * Initialization of shared memory
...@@ -1075,7 +1076,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) ...@@ -1075,7 +1076,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
* Flush dirty pages to disk during checkpoint or database shutdown * Flush dirty pages to disk during checkpoint or database shutdown
*/ */
void void
SimpleLruFlush(SlruCtl ctl, bool checkpoint) SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
{ {
SlruShared shared = ctl->shared; SlruShared shared = ctl->shared;
SlruFlushData fdata; SlruFlushData fdata;
...@@ -1096,11 +1097,11 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint) ...@@ -1096,11 +1097,11 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
SlruInternalWritePage(ctl, slotno, &fdata); SlruInternalWritePage(ctl, slotno, &fdata);
/* /*
* When called during a checkpoint, we cannot assert that the slot is * In some places (e.g. checkpoints), we cannot assert that the slot
* clean now, since another process might have re-dirtied it already. * is clean now, since another process might have re-dirtied it
* That's okay. * already. That's okay.
*/ */
Assert(checkpoint || Assert(allow_redirtied ||
shared->page_status[slotno] == SLRU_PAGE_EMPTY || shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
(shared->page_status[slotno] == SLRU_PAGE_VALID && (shared->page_status[slotno] == SLRU_PAGE_VALID &&
!shared->page_dirty[slotno])); !shared->page_dirty[slotno]));
...@@ -1210,8 +1211,14 @@ restart:; ...@@ -1210,8 +1211,14 @@ restart:;
(void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage); (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
} }
void /*
SlruDeleteSegment(SlruCtl ctl, char *filename) * Delete an individual SLRU segment, identified by the filename.
*
* NB: This does not touch the SLRU buffers themselves, callers have to ensure
* they either can't yet contain anything, or have already been cleaned out.
*/
static void
SlruInternalDeleteSegment(SlruCtl ctl, char *filename)
{ {
char path[MAXPGPATH]; char path[MAXPGPATH];
...@@ -1221,6 +1228,64 @@ SlruDeleteSegment(SlruCtl ctl, char *filename) ...@@ -1221,6 +1228,64 @@ SlruDeleteSegment(SlruCtl ctl, char *filename)
unlink(path); unlink(path);
} }
/*
* Delete an individual SLRU segment, identified by the segment number.
*
* Before the file is removed, every buffer slot that holds a page belonging
* to the segment is dealt with: clean valid pages are marked SLRU_PAGE_EMPTY,
* dirty valid pages are handed to SlruInternalWritePage(), and for slots in
* any other non-empty state SimpleLruWaitIO() is called. The scan over the
* slots is repeated until one full pass completes without a write or wait.
*
* ctl: the SLRU whose segment is to be removed
* segno: segment number; the file removed is ctl->Dir "/" segno, with segno
* formatted as %04X
*
* The control lock is acquired in exclusive mode on entry and is released
* only after the unlink() call. The result of unlink() is not checked.
*/
void
SlruDeleteSegment(SlruCtl ctl, int segno)
{
SlruShared shared = ctl->shared;
int slotno;
char path[MAXPGPATH];
bool did_write;
/* Clean out any possibly existing references to the segment. */
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
restart:
/* Tracks whether this pass had to write a page or wait for I/O. */
did_write = false;
for (slotno = 0; slotno < shared->num_slots; slotno++)
{
/* Segment number that the page cached in this slot belongs to. */
int pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
/* Unused slot, nothing to clean out. */
if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
continue;
/* not the segment we're looking for */
if (pagesegno != segno)
continue;
/* If page is clean, just change state to EMPTY (expected case). */
if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
!shared->page_dirty[slotno])
{
shared->page_status[slotno] = SLRU_PAGE_EMPTY;
continue;
}
/*
* Same logic as SimpleLruTruncate(): a valid page reaching this point is
* dirty and gets written; for any other status we wait via
* SimpleLruWaitIO().
*/
if (shared->page_status[slotno] == SLRU_PAGE_VALID)
SlruInternalWritePage(ctl, slotno, NULL);
else
SimpleLruWaitIO(ctl, slotno);
did_write = true;
}
/*
* Be extra careful and re-check. The IO functions release the control
* lock, so new pages could have been read in.
*/
if (did_write)
goto restart;
/* Build the segment's path and remove the file while still holding the lock. */
snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno);
ereport(DEBUG2,
(errmsg("removing file \"%s\"", path)));
unlink(path);
LWLockRelease(shared->ControlLock);
}
/* /*
* SlruScanDirectory callback * SlruScanDirectory callback
* This callback reports true if there's any segment prior to the one * This callback reports true if there's any segment prior to the one
...@@ -1249,7 +1314,7 @@ SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data) ...@@ -1249,7 +1314,7 @@ SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data)
int cutoffPage = *(int *) data; int cutoffPage = *(int *) data;
if (ctl->PagePrecedes(segpage, cutoffPage)) if (ctl->PagePrecedes(segpage, cutoffPage))
SlruDeleteSegment(ctl, filename); SlruInternalDeleteSegment(ctl, filename);
return false; /* keep going */ return false; /* keep going */
} }
...@@ -1261,7 +1326,7 @@ SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data) ...@@ -1261,7 +1326,7 @@ SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data)
bool bool
SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data) SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data)
{ {
SlruDeleteSegment(ctl, filename); SlruInternalDeleteSegment(ctl, filename);
return false; /* keep going */ return false; /* keep going */
} }
......
...@@ -6330,7 +6330,6 @@ StartupXLOG(void) ...@@ -6330,7 +6330,6 @@ StartupXLOG(void)
SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
SetCommitTsLimit(checkPoint.oldestCommitTs, SetCommitTsLimit(checkPoint.oldestCommitTs,
checkPoint.newestCommitTs); checkPoint.newestCommitTs);
MultiXactSetSafeTruncate(checkPoint.oldestMulti);
XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch; XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
XLogCtl->ckptXid = checkPoint.nextXid; XLogCtl->ckptXid = checkPoint.nextXid;
...@@ -6347,10 +6346,8 @@ StartupXLOG(void) ...@@ -6347,10 +6346,8 @@ StartupXLOG(void)
StartupReorderBuffer(); StartupReorderBuffer();
/* /*
* Startup MultiXact. We need to do this early for two reasons: one is * Startup MultiXact. We need to do this early to be able to replay
* that we might try to access multixacts when we do tuple freezing, and * truncations.
* the other is we need its state initialized because we attempt
* truncation during restartpoints.
*/ */
StartupMultiXact(); StartupMultiXact();
...@@ -8507,12 +8504,6 @@ CreateCheckPoint(int flags) ...@@ -8507,12 +8504,6 @@ CreateCheckPoint(int flags)
*/ */
END_CRIT_SECTION(); END_CRIT_SECTION();
/*
* Now that the checkpoint is safely on disk, we can update the point to
* which multixact can be truncated.
*/
MultiXactSetSafeTruncate(checkPoint.oldestMulti);
/* /*
* Let smgr do post-checkpoint cleanup (eg, deleting old files). * Let smgr do post-checkpoint cleanup (eg, deleting old files).
*/ */
...@@ -8552,11 +8543,6 @@ CreateCheckPoint(int flags) ...@@ -8552,11 +8543,6 @@ CreateCheckPoint(int flags)
if (!RecoveryInProgress()) if (!RecoveryInProgress())
TruncateSUBTRANS(GetOldestXmin(NULL, false)); TruncateSUBTRANS(GetOldestXmin(NULL, false));
/*
* Truncate pg_multixact too.
*/
TruncateMultiXact();
/* Real work is done, but log and update stats before releasing lock. */ /* Real work is done, but log and update stats before releasing lock. */
LogCheckpointEnd(false); LogCheckpointEnd(false);
...@@ -8886,21 +8872,6 @@ CreateRestartPoint(int flags) ...@@ -8886,21 +8872,6 @@ CreateRestartPoint(int flags)
ThisTimeLineID = 0; ThisTimeLineID = 0;
} }
/*
* Due to a historical accident multixact truncations are not WAL-logged,
* but just performed everytime the mxact horizon is increased. So, unless
* we explicitly execute truncations on a standby it will never clean out
* /pg_multixact which obviously is bad, both because it uses space and
* because we can wrap around into pre-existing data...
*
* We can only do the truncation here, after the UpdateControlFile()
* above, because we've now safely established a restart point. That
* guarantees we will not need to access those multis.
*
* It's probably worth improving this.
*/
TruncateMultiXact();
/* /*
* Truncate pg_subtrans if possible. We can throw away all data before * Truncate pg_subtrans if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will * the oldest XMIN of any running transaction. No future transaction will
...@@ -9261,9 +9232,14 @@ xlog_redo(XLogReaderState *record) ...@@ -9261,9 +9232,14 @@ xlog_redo(XLogReaderState *record)
LWLockRelease(OidGenLock); LWLockRelease(OidGenLock);
MultiXactSetNextMXact(checkPoint.nextMulti, MultiXactSetNextMXact(checkPoint.nextMulti,
checkPoint.nextMultiOffset); checkPoint.nextMultiOffset);
/*
* NB: This may perform multixact truncation when replaying WAL
* generated by an older primary.
*/
MultiXactAdvanceOldest(checkPoint.oldestMulti,
checkPoint.oldestMultiDB);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
MultiXactSetSafeTruncate(checkPoint.oldestMulti);
/* /*
* If we see a shutdown checkpoint while waiting for an end-of-backup * If we see a shutdown checkpoint while waiting for an end-of-backup
...@@ -9353,14 +9329,17 @@ xlog_redo(XLogReaderState *record) ...@@ -9353,14 +9329,17 @@ xlog_redo(XLogReaderState *record)
LWLockRelease(OidGenLock); LWLockRelease(OidGenLock);
MultiXactAdvanceNextMXact(checkPoint.nextMulti, MultiXactAdvanceNextMXact(checkPoint.nextMulti,
checkPoint.nextMultiOffset); checkPoint.nextMultiOffset);
/*
* NB: This may perform multixact truncation when replaying WAL
* generated by an older primary.
*/
MultiXactAdvanceOldest(checkPoint.oldestMulti,
checkPoint.oldestMultiDB);
if (TransactionIdPrecedes(ShmemVariableCache->oldestXid, if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
checkPoint.oldestXid)) checkPoint.oldestXid))
SetTransactionIdLimit(checkPoint.oldestXid, SetTransactionIdLimit(checkPoint.oldestXid,
checkPoint.oldestXidDB); checkPoint.oldestXidDB);
MultiXactAdvanceOldest(checkPoint.oldestMulti,
checkPoint.oldestMultiDB);
MultiXactSetSafeTruncate(checkPoint.oldestMulti);
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */ /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch; ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
......
...@@ -1137,11 +1137,11 @@ vac_truncate_clog(TransactionId frozenXID, ...@@ -1137,11 +1137,11 @@ vac_truncate_clog(TransactionId frozenXID,
return; return;
/* /*
* Truncate CLOG and CommitTs to the oldest computed value. Note we don't * Truncate CLOG, multixact and CommitTs to the oldest computed value.
* truncate multixacts; that will be done by the next checkpoint.
*/ */
TruncateCLOG(frozenXID); TruncateCLOG(frozenXID);
TruncateCommitTs(frozenXID, true); TruncateCommitTs(frozenXID, true);
TruncateMultiXact(minMulti, minmulti_datoid, false);
/* /*
* Update the wrap limit for GetNewTransactionId and creation of new * Update the wrap limit for GetNewTransactionId and creation of new
......
...@@ -45,3 +45,4 @@ ReplicationSlotControlLock 37 ...@@ -45,3 +45,4 @@ ReplicationSlotControlLock 37
CommitTsControlLock 38 CommitTsControlLock 38
CommitTsLock 39 CommitTsLock 39
ReplicationOriginLock 40 ReplicationOriginLock 40
MultiXactTruncationLock 41
...@@ -71,6 +71,7 @@ typedef struct MultiXactMember ...@@ -71,6 +71,7 @@ typedef struct MultiXactMember
#define XLOG_MULTIXACT_ZERO_OFF_PAGE 0x00 #define XLOG_MULTIXACT_ZERO_OFF_PAGE 0x00
#define XLOG_MULTIXACT_ZERO_MEM_PAGE 0x10 #define XLOG_MULTIXACT_ZERO_MEM_PAGE 0x10
#define XLOG_MULTIXACT_CREATE_ID 0x20 #define XLOG_MULTIXACT_CREATE_ID 0x20
#define XLOG_MULTIXACT_TRUNCATE_ID 0x30
typedef struct xl_multixact_create typedef struct xl_multixact_create
{ {
...@@ -82,6 +83,21 @@ typedef struct xl_multixact_create ...@@ -82,6 +83,21 @@ typedef struct xl_multixact_create
#define SizeOfMultiXactCreate (offsetof(xl_multixact_create, members)) #define SizeOfMultiXactCreate (offsetof(xl_multixact_create, members))
/*
* Record data for XLOG_MULTIXACT_TRUNCATE_ID. multixact_desc() prints the two
* ranges below as half-open intervals, "offsets [start, end)" and
* "members [start, end)".
*/
typedef struct xl_multixact_truncate
{
/* NOTE(review): presumably the database holding the oldest multixact -- confirm */
Oid oldestMultiDB;
/* to-be-truncated range of multixact offsets */
MultiXactId startTruncOff; /* just for completeness' sake */
MultiXactId endTruncOff;
/* to-be-truncated range of multixact members */
MultiXactOffset startTruncMemb;
MultiXactOffset endTruncMemb;
} xl_multixact_truncate;
#define SizeOfMultiXactTruncate (sizeof(xl_multixact_truncate))
extern MultiXactId MultiXactIdCreate(TransactionId xid1, extern MultiXactId MultiXactIdCreate(TransactionId xid1,
MultiXactStatus status1, TransactionId xid2, MultiXactStatus status1, TransactionId xid2,
...@@ -119,13 +135,12 @@ extern void MultiXactGetCheckptMulti(bool is_shutdown, ...@@ -119,13 +135,12 @@ extern void MultiXactGetCheckptMulti(bool is_shutdown,
Oid *oldestMultiDB); Oid *oldestMultiDB);
extern void CheckPointMultiXact(void); extern void CheckPointMultiXact(void);
extern MultiXactId GetOldestMultiXactId(void); extern MultiXactId GetOldestMultiXactId(void);
extern void TruncateMultiXact(void); extern void TruncateMultiXact(MultiXactId oldestMulti, Oid oldestMultiDB, bool in_recovery);
extern void MultiXactSetNextMXact(MultiXactId nextMulti, extern void MultiXactSetNextMXact(MultiXactId nextMulti,
MultiXactOffset nextMultiOffset); MultiXactOffset nextMultiOffset);
extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, extern void MultiXactAdvanceNextMXact(MultiXactId minMulti,
MultiXactOffset minMultiOffset); MultiXactOffset minMultiOffset);
extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB); extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB);
extern void MultiXactSetSafeTruncate(MultiXactId safeTruncateMulti);
extern int MultiXactMemberFreezeThreshold(void); extern int MultiXactMemberFreezeThreshold(void);
extern void multixact_twophase_recover(TransactionId xid, uint16 info, extern void multixact_twophase_recover(TransactionId xid, uint16 info,
......
...@@ -143,14 +143,14 @@ extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, ...@@ -143,14 +143,14 @@ extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno,
TransactionId xid); TransactionId xid);
extern void SimpleLruWritePage(SlruCtl ctl, int slotno); extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint); extern void SimpleLruFlush(SlruCtl ctl, bool allow_redirtied);
extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage); extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno); extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno);
typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage, typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage,
void *data); void *data);
extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data); extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data);
extern void SlruDeleteSegment(SlruCtl ctl, char *filename); extern void SlruDeleteSegment(SlruCtl ctl, int segno);
/* SlruScanDirectory public callbacks */ /* SlruScanDirectory public callbacks */
extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename,
......
...@@ -2750,6 +2750,7 @@ xl_invalid_page ...@@ -2750,6 +2750,7 @@ xl_invalid_page
xl_invalid_page_key xl_invalid_page_key
xl_multi_insert_tuple xl_multi_insert_tuple
xl_multixact_create xl_multixact_create
xl_multixact_truncate
xl_parameter_change xl_parameter_change
xl_relmap_update xl_relmap_update
xl_replorigin_drop xl_replorigin_drop
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment