Commit 3b97e682 authored by Alvaro Herrera's avatar Alvaro Herrera

Rework tuple freezing protocol

Tuple freezing was broken in connection to MultiXactIds; commit
8e53ae025de9 tried to fix it, but didn't go far enough.  As noted by
Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted
update might cause locks in the multi to go ignored by later
transactions.  This is because the code depended on a multixact above
their cutoff point not having any lock-only member older than the cutoff
point for Xids, which is easily defeated in READ COMMITTED transactions.

The fix for this involves creating a new MultiXactId when necessary.
But this cannot be done during WAL replay, and moreover multixact
examination requires using CLOG access routines which are not supposed
to be used during WAL replay either; so tuple freezing cannot be done
with the old freeze WAL record.  Therefore, separate the freezing
computation from its execution, and change the WAL record to carry all
necessary information.  At WAL replay time, it's easy to re-execute
freezing because we don't need to re-compute the new infomask/Xmax
values but just take them from the WAL record.

While at it, restructure the coding to ensure all page changes occur in
a single critical section without much room for failures.  The previous
coding wasn't using a critical section, without any explanation as to
why this was acceptable.

In replication scenarios using the 9.3 branch, standby servers must be
upgraded before their master, so that they are prepared to deal with the
new WAL record once the master is upgraded; failure to do so will cause
WAL replay to die with a PANIC message.  Later upgrade of the standby
will allow the process to continue where it left off, so there's no
disruption of the data in the standby in any case.  Standbys know how to
deal with the old WAL record, so it's okay to keep the master running
the old code for a while.

In master, the old freeze WAL record is gone, for cleanliness' sake;
there's no compatibility concern there.

Backpatch to 9.3, where the original bug was introduced and where the
previous fix was backpatched.

Álvaro Herrera and Andres Freund
parent 30b96549
This diff is collapsed.
......@@ -131,23 +131,23 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
uint8 info = xl_info & ~XLR_INFO_MASK;
info &= XLOG_HEAP_OPMASK;
if (info == XLOG_HEAP2_FREEZE)
if (info == XLOG_HEAP2_CLEAN)
{
xl_heap_freeze *xlrec = (xl_heap_freeze *) rec;
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff xid %u multi %u",
appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode, xlrec->block,
xlrec->cutoff_xid, xlrec->cutoff_multi);
xlrec->latestRemovedXid);
}
else if (info == XLOG_HEAP2_CLEAN)
else if (info == XLOG_HEAP2_FREEZE_PAGE)
{
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) rec;
appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u",
appendStringInfo(buf, "freeze_page: rel %u/%u/%u; blk %u; cutoff xid %u ntuples %u",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode, xlrec->block,
xlrec->latestRemovedXid);
xlrec->cutoff_xid, xlrec->ntuples);
}
else if (info == XLOG_HEAP2_CLEANUP_INFO)
{
......
......@@ -289,7 +289,6 @@ static MemoryContext MXactContext = NULL;
/* internal MultiXactId management */
static void MultiXactIdSetOldestVisible(void);
static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members);
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
int nmembers, MultiXactMember *members);
static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
......@@ -336,6 +335,9 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
/* MultiXactIdSetOldestMember() must have been called already. */
Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
/*
* Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
* are still running. In typical usage, xid2 will be our own XID and the
......@@ -347,7 +349,7 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
members[1].xid = xid2;
members[1].status = status2;
newMulti = CreateMultiXactId(2, members);
newMulti = MultiXactIdCreateFromMembers(2, members);
debug_elog3(DEBUG2, "Create: %s",
mxid_to_string(newMulti, 2, members));
......@@ -387,6 +389,9 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
AssertArg(MultiXactIdIsValid(multi));
AssertArg(TransactionIdIsValid(xid));
/* MultiXactIdSetOldestMember() must have been called already. */
Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
multi, xid, mxstatus_to_string(status));
......@@ -410,7 +415,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
*/
member.xid = xid;
member.status = status;
newMulti = CreateMultiXactId(1, &member);
newMulti = MultiXactIdCreateFromMembers(1, &member);
debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
multi, newMulti);
......@@ -462,7 +467,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
newMembers[j].xid = xid;
newMembers[j++].status = status;
newMulti = CreateMultiXactId(j, newMembers);
newMulti = MultiXactIdCreateFromMembers(j, newMembers);
pfree(members);
pfree(newMembers);
......@@ -667,16 +672,16 @@ ReadNextMultiXactId(void)
}
/*
* CreateMultiXactId
* Make a new MultiXactId
* MultiXactIdCreateFromMembers
* Make a new MultiXactId from the specified set of members
*
* Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
* given TransactionIds as members. Returns the newly created MultiXactId.
*
* NB: the passed members[] array will be sorted in-place.
*/
static MultiXactId
CreateMultiXactId(int nmembers, MultiXactMember *members)
MultiXactId
MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
{
MultiXactId multi;
MultiXactOffset offset;
......@@ -707,6 +712,13 @@ CreateMultiXactId(int nmembers, MultiXactMember *members)
* Assign the MXID and offsets range to use, and make sure there is space
* in the OFFSETs and MEMBERs files. NB: this routine does
* START_CRIT_SECTION().
*
* Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
* that we've called MultiXactIdSetOldestMember here. This is because
* this routine is used in some places to create new MultiXactIds of which
* the current backend is not a member, notably during freezing of multis
* in vacuum. During vacuum, in particular, it would be unacceptable to
* keep OldestMulti set, in case it runs for long.
*/
multi = GetNewMultiXactId(nmembers, &offset);
......@@ -763,7 +775,8 @@ CreateMultiXactId(int nmembers, MultiXactMember *members)
* RecordNewMultiXact
* Write info about a new multixact into the offsets and members files
*
* This is broken out of CreateMultiXactId so that xlog replay can use it.
* This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
* use it.
*/
static void
RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
......@@ -867,9 +880,6 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
/* MultiXactIdSetOldestMember() must have been called already */
Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
/* safety check, we should never get this far in a HS slave */
if (RecoveryInProgress())
elog(ERROR, "cannot assign MultiXactIds during recovery");
......
......@@ -424,6 +424,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
Buffer vmbuffer = InvalidBuffer;
BlockNumber next_not_all_visible_block;
bool skipping_all_visible_blocks;
xl_heap_freeze_tuple *frozen;
pg_rusage_init(&ru0);
......@@ -446,6 +447,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
vacrelstats->latestRemovedXid = InvalidTransactionId;
lazy_space_alloc(vacrelstats, nblocks);
frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
/*
* We want to skip pages that don't require vacuuming according to the
......@@ -500,7 +502,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
bool tupgone,
hastup;
int prev_dead_count;
OffsetNumber frozen[MaxOffsetNumber];
int nfrozen;
Size freespace;
bool all_visible_according_to_vm;
......@@ -890,9 +891,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
* Each non-removable tuple must be checked to see if it needs
* freezing. Note we already have exclusive buffer lock.
*/
if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
MultiXactCutoff))
frozen[nfrozen++] = offnum;
if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit,
MultiXactCutoff, &frozen[nfrozen]))
frozen[nfrozen++].offset = offnum;
}
} /* scan along page */
......@@ -903,15 +904,33 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
*/
if (nfrozen > 0)
{
START_CRIT_SECTION();
MarkBufferDirty(buf);
/* execute collected freezes */
for (i = 0; i < nfrozen; i++)
{
ItemId itemid;
HeapTupleHeader htup;
itemid = PageGetItemId(page, frozen[i].offset);
htup = (HeapTupleHeader) PageGetItem(page, itemid);
heap_execute_freeze_tuple(htup, &frozen[i]);
}
/* Now WAL-log freezing if neccessary */
if (RelationNeedsWAL(onerel))
{
XLogRecPtr recptr;
recptr = log_heap_freeze(onerel, buf, FreezeLimit,
MultiXactCutoff, frozen, nfrozen);
frozen, nfrozen);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
}
/*
......@@ -1012,6 +1031,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
RecordPageWithFreeSpace(onerel, blkno, freespace);
}
pfree(frozen);
/* save stats for use later */
vacrelstats->scanned_tuples = num_tuples;
vacrelstats->tuples_deleted = tups_vacuumed;
......
......@@ -48,9 +48,9 @@
* the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to
* these, too.
*/
#define XLOG_HEAP2_FREEZE 0x00
/* 0x00 is free, was XLOG_HEAP2_FREEZE */
#define XLOG_HEAP2_CLEAN 0x10
/* 0x20 is free, was XLOG_HEAP2_CLEAN_MOVE */
#define XLOG_HEAP2_FREEZE_PAGE 0x20
#define XLOG_HEAP2_CLEANUP_INFO 0x30
#define XLOG_HEAP2_VISIBLE 0x40
#define XLOG_HEAP2_MULTI_INSERT 0x50
......@@ -270,17 +270,36 @@ typedef struct xl_heap_inplace
#define SizeOfHeapInplace (offsetof(xl_heap_inplace, target) + SizeOfHeapTid)
/* This is what we need to know about tuple freezing during vacuum */
typedef struct xl_heap_freeze
/*
* This struct represents a 'freeze plan', which is what we need to know about
* a single tuple being frozen during vacuum.
*/
#define XLH_FREEZE_XMIN 0x01
#define XLH_FREEZE_XVAC 0x02
#define XLH_INVALID_XVAC 0x04
typedef struct xl_heap_freeze_tuple
{
TransactionId xmax;
OffsetNumber offset;
uint16 t_infomask2;
uint16 t_infomask;
uint8 frzflags;
} xl_heap_freeze_tuple;
/*
* This is what we need to know about a block being frozen during vacuum
*/
typedef struct xl_heap_freeze_page
{
RelFileNode node;
BlockNumber block;
TransactionId cutoff_xid;
MultiXactId cutoff_multi;
/* TUPLE OFFSET NUMBERS FOLLOW AT THE END */
} xl_heap_freeze;
uint16 ntuples;
xl_heap_freeze_tuple tuples[FLEXIBLE_ARRAY_MEMBER];
} xl_heap_freeze_page;
#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_multi) + sizeof(MultiXactId))
#define SizeOfHeapFreezePage offsetof(xl_heap_freeze_page, tuples)
/* This is what we need to know about setting a visibility map bit */
typedef struct xl_heap_visible
......@@ -331,8 +350,14 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
OffsetNumber *nowunused, int nunused,
TransactionId latestRemovedXid);
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
TransactionId cutoff_xid, MultiXactId cutoff_multi,
OffsetNumber *offsets, int offcnt);
TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples,
int ntuples);
extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
TransactionId cutoff_xid,
TransactionId cutoff_multi,
xl_heap_freeze_tuple *frz);
extern void heap_execute_freeze_tuple(HeapTupleHeader tuple,
xl_heap_freeze_tuple *xlrec_tp);
extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer,
Buffer vm_buffer, TransactionId cutoff_xid);
extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
......
......@@ -81,6 +81,9 @@ extern MultiXactId MultiXactIdCreate(TransactionId xid1,
MultiXactStatus status2);
extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid,
MultiXactStatus status);
extern MultiXactId MultiXactIdCreateFromMembers(int nmembers,
MultiXactMember *members);
extern MultiXactId ReadNextMultiXactId(void);
extern bool MultiXactIdIsRunning(MultiXactId multi);
extern void MultiXactIdSetOldestMember(void);
......
......@@ -55,7 +55,7 @@ typedef struct BkpBlock
/*
* Each page of XLOG file has a header like this:
*/
#define XLOG_PAGE_MAGIC 0xD079 /* can be used as WAL version indicator */
#define XLOG_PAGE_MAGIC 0xD07A /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment