Commit 3b97e682 authored by Alvaro Herrera

Rework tuple freezing protocol

Tuple freezing was broken in connection to MultiXactIds; commit
8e53ae025de9 tried to fix it, but didn't go far enough.  As noted by
Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted
update might cause locks in the multi to go ignored by later
transactions.  This is because the code depended on a multixact above
their cutoff point not having any lock-only member older than the cutoff
point for Xids, which is easily defeated in READ COMMITTED transactions.

The fix for this involves creating a new MultiXactId when necessary.
But this cannot be done during WAL replay, and moreover multixact
examination requires using CLOG access routines which are not supposed
to be used during WAL replay either; so tuple freezing cannot be done
with the old freeze WAL record.  Therefore, separate the freezing
computation from its execution, and change the WAL record to carry all
necessary information.  At WAL replay time, it's easy to re-execute
freezing because we don't need to re-compute the new infomask/Xmax
values but just take them from the WAL record.

While at it, restructure the coding to ensure all page changes occur in
a single critical section without much room for failures.  The previous
coding wasn't using a critical section, without any explanation as to
why this was acceptable.

In replication scenarios using the 9.3 branch, standby servers must be
upgraded before their master, so that they are prepared to deal with the
new WAL record once the master is upgraded; failure to do so will cause
WAL replay to die with a PANIC message.  Later upgrade of the standby
will allow the process to continue where it left off, so there's no
disruption of the data in the standby in any case.  Standbys know how to
deal with the old WAL record, so it's okay to keep the master running
the old code for a while.

In master, the old freeze WAL record is gone, for cleanliness' sake;
there's no compatibility concern there.

Backpatch to 9.3, where the original bug was introduced and where the
previous fix was backpatched.

Álvaro Herrera and Andres Freund
parent 30b96549
......@@ -5409,14 +5409,282 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
CacheInvalidateHeapTuple(relation, tuple, NULL);
}
/*
 * Flag bits reported by FreezeMultiXactId through its "flags" output
 * argument.  Each one is described in that function's header comment.
 */
#define		FRM_NOOP				0x0001
#define		FRM_INVALIDATE_XMAX		0x0002
#define		FRM_RETURN_IS_XID		0x0004
#define		FRM_RETURN_IS_MULTI		0x0008
#define		FRM_MARK_COMMITTED		0x0010

/*
 * FreezeMultiXactId
 *		Determine what to do during freezing when a tuple is marked by a
 *		MultiXactId.
 *
 * NB -- this might have the side-effect of creating a new MultiXactId!
 *
 * "multi" is the tuple's raw Xmax and "t_infomask" its infomask, which must
 * have HEAP_XMAX_IS_MULTI set.  "cutoff_xid" and "cutoff_multi" are the
 * freeze cutoffs for plain Xids and for MultiXactIds respectively.
 *
 * "flags" is an output value; it's used to tell caller what to do on return.
 * Possible flags are:
 * FRM_NOOP
 *		don't do anything -- keep existing Xmax
 * FRM_INVALIDATE_XMAX
 *		mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
 * FRM_RETURN_IS_XID
 *		The Xid return value is a single update Xid to set as xmax.
 * FRM_MARK_COMMITTED
 *		Xmax can be marked as HEAP_XMAX_COMMITTED
 * FRM_RETURN_IS_MULTI
 *		The return value is a new MultiXactId to set as new Xmax.
 *		(caller must obtain proper infomask bits using GetMultiXactIdHintBits)
 *
 * The return value is InvalidTransactionId whenever FRM_NOOP or
 * FRM_INVALIDATE_XMAX is reported.
 */
static TransactionId
FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
				  TransactionId cutoff_xid, MultiXactId cutoff_multi,
				  uint16 *flags)
{
	TransactionId xid = InvalidTransactionId;
	int			i;
	MultiXactMember *members;
	int			nmembers;
	bool		need_replace;
	int			nnewmembers;
	MultiXactMember *newmembers;
	bool		has_lockers;
	TransactionId update_xid;
	bool		update_committed;

	*flags = 0;

	/* We should only be called in Multis */
	Assert(t_infomask & HEAP_XMAX_IS_MULTI);

	if (!MultiXactIdIsValid(multi))
	{
		/* Ensure infomask bits are appropriately set/reset */
		*flags |= FRM_INVALIDATE_XMAX;
		return InvalidTransactionId;
	}
	else if (MultiXactIdPrecedes(multi, cutoff_multi))
	{
		/*
		 * This old multi cannot possibly have members still running.  If it
		 * was a locker only, it can be removed without any further
		 * consideration; but if it contained an update, we might need to
		 * preserve it.
		 */
		Assert(!MultiXactIdIsRunning(multi));
		if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
		{
			*flags |= FRM_INVALIDATE_XMAX;
			xid = InvalidTransactionId; /* not strictly necessary */
		}
		else
		{
			/* replace multi by update xid */
			xid = MultiXactIdGetUpdateXid(multi, t_infomask);

			/* wasn't only a lock, xid needs to be valid */
			Assert(TransactionIdIsValid(xid));

			/*
			 * If the xid is older than the cutoff, it has to have aborted,
			 * otherwise the tuple would have gotten pruned away.
			 */
			if (TransactionIdPrecedes(xid, cutoff_xid))
			{
				Assert(!TransactionIdDidCommit(xid));
				*flags |= FRM_INVALIDATE_XMAX;
				xid = InvalidTransactionId; /* not strictly necessary */
			}
			else
			{
				*flags |= FRM_RETURN_IS_XID;
			}
		}

		return xid;
	}

	/*
	 * This multixact might have or might not have members still running, but
	 * we know it's valid and is newer than the cutoff point for multis.
	 * However, some member(s) of it may be below the cutoff for Xids, so we
	 * need to walk the whole members array to figure out what to do, if
	 * anything.
	 */
	nmembers = GetMultiXactIdMembers(multi, &members, false);
	if (nmembers <= 0)
	{
		/* Nothing worth keeping */
		*flags |= FRM_INVALIDATE_XMAX;
		return InvalidTransactionId;
	}

	/* is there anything older than the cutoff? */
	need_replace = false;
	for (i = 0; i < nmembers; i++)
	{
		if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
		{
			need_replace = true;
			break;
		}
	}

	/*
	 * In the simplest case, there is no member older than the cutoff; we can
	 * keep the existing MultiXactId as is.
	 */
	if (!need_replace)
	{
		*flags |= FRM_NOOP;
		pfree(members);
		return InvalidTransactionId;
	}

	/*
	 * If the multi needs to be updated, figure out which members we need to
	 * keep.  The new array is sized for the worst case (every member kept).
	 */
	nnewmembers = 0;
	newmembers = palloc(sizeof(MultiXactMember) * nmembers);
	has_lockers = false;
	update_xid = InvalidTransactionId;
	update_committed = false;

	for (i = 0; i < nmembers; i++)
	{
		/*
		 * Determine whether to keep this member or ignore it.
		 */
		if (ISUPDATE_from_mxstatus(members[i].status))
		{
			/* named distinctly so it does not shadow the function-level xid */
			TransactionId member_xid = members[i].xid;

			/*
			 * It's an update; should we keep it?  If the transaction is known
			 * aborted then it's okay to ignore it, otherwise not.  However,
			 * if the Xid is older than the cutoff_xid, we must remove it.
			 * Note that such an old updater cannot possibly be committed,
			 * because HeapTupleSatisfiesVacuum would have returned
			 * HEAPTUPLE_DEAD and we would not be trying to freeze the tuple.
			 *
			 * Note the TransactionIdDidAbort() test is just an optimization
			 * and not strictly necessary for correctness.
			 *
			 * As with all tuple visibility routines, it's critical to test
			 * TransactionIdIsInProgress before the transam.c routines,
			 * because of race conditions explained in detail in tqual.c.
			 */
			if (TransactionIdIsCurrentTransactionId(member_xid) ||
				TransactionIdIsInProgress(member_xid))
			{
				Assert(!TransactionIdIsValid(update_xid));
				update_xid = member_xid;
			}
			else if (!TransactionIdDidAbort(member_xid))
			{
				/*
				 * Test whether to tell caller to set HEAP_XMAX_COMMITTED
				 * while we have the Xid still in cache.  Note this can only
				 * be done if the transaction is known not running.
				 */
				if (TransactionIdDidCommit(member_xid))
					update_committed = true;
				Assert(!TransactionIdIsValid(update_xid));
				update_xid = member_xid;
			}

			/*
			 * If we determined that it's an Xid corresponding to an update
			 * that must be retained, additionally add it to the list of
			 * members of the new Multi, in case we end up using that.  (We
			 * might still decide to use only an update Xid and not a multi,
			 * but it's easier to maintain the list as we walk the old members
			 * list.)
			 *
			 * It is possible to end up with a very old updater Xid that
			 * crashed and thus did not mark itself as aborted in pg_clog.
			 * That would manifest as a pre-cutoff Xid.  Make sure to ignore
			 * it.
			 */
			if (TransactionIdIsValid(update_xid))
			{
				if (!TransactionIdPrecedes(update_xid, cutoff_xid))
				{
					newmembers[nnewmembers++] = members[i];
				}
				else
				{
					/* cannot have committed: would be HEAPTUPLE_DEAD */
					Assert(!TransactionIdDidCommit(update_xid));
					update_xid = InvalidTransactionId;
					update_committed = false;
				}
			}
		}
		else
		{
			/* We only keep lockers if they are still running */
			if (TransactionIdIsCurrentTransactionId(members[i].xid) ||
				TransactionIdIsInProgress(members[i].xid))
			{
				/* running locker cannot possibly be older than the cutoff */
				Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid));
				newmembers[nnewmembers++] = members[i];
				has_lockers = true;
			}
		}
	}

	pfree(members);

	if (nnewmembers == 0)
	{
		/* nothing worth keeping!? Tell caller to remove the whole thing */
		*flags |= FRM_INVALIDATE_XMAX;
		xid = InvalidTransactionId;
	}
	else if (TransactionIdIsValid(update_xid) && !has_lockers)
	{
		/*
		 * If there's a single member and it's an update, pass it back alone
		 * without creating a new Multi.  (XXX we could do this when there's a
		 * single remaining locker, too, but that would complicate the API too
		 * much; moreover, the case with the single updater is more
		 * interesting, because those are longer-lived.)
		 */
		Assert(nnewmembers == 1);
		*flags |= FRM_RETURN_IS_XID;
		if (update_committed)
			*flags |= FRM_MARK_COMMITTED;
		xid = update_xid;
	}
	else
	{
		/*
		 * Create a new multixact with the surviving members of the previous
		 * one, to set as new Xmax in the tuple.
		 */
		xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
		*flags |= FRM_RETURN_IS_MULTI;
	}

	pfree(newmembers);

	return xid;
}
/*
* heap_prepare_freeze_tuple
*
* Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
* are older than the specified cutoff XID. If so, replace them with
* FrozenTransactionId or InvalidTransactionId as appropriate, and return
* TRUE. Return FALSE if nothing was changed.
* are older than the specified cutoff XID and cutoff MultiXactId. If so,
* setup enough state (in the *frz output argument) to later execute and
* WAL-log what we would need to do, and return TRUE. Return FALSE if nothing
* is to be changed.
*
* Caller is responsible for setting the offset field, if appropriate.
*
* It is assumed that the caller has checked the tuple with
* HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
......@@ -5425,54 +5693,44 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
* NB: cutoff_xid *must* be <= the current global xmin, to ensure that any
* XID older than it could neither be running nor seen as running by any
* open transaction. This ensures that the replacement will not change
* anyone's idea of the tuple state. Also, since we assume the tuple is
* not HEAPTUPLE_DEAD, the fact that an XID is not still running allows us
* to assume that it is either committed good or aborted, as appropriate;
* so we need no external state checks to decide what to do. (This is good
* because this function is applied during WAL recovery, when we don't have
* access to any such state, and can't depend on the hint bits to be set.)
* There is an exception we make which is to assume GetMultiXactIdMembers can
* be called during recovery.
*
* anyone's idea of the tuple state.
* Similarly, cutoff_multi must be less than or equal to the smallest
* MultiXactId used by any transaction currently open.
*
* If the tuple is in a shared buffer, caller must hold an exclusive lock on
* that buffer.
*
* Note: it might seem we could make the changes without exclusive lock, since
* TransactionId read/write is assumed atomic anyway. However there is a race
* condition: someone who just fetched an old XID that we overwrite here could
* conceivably not finish checking the XID against pg_clog before we finish
* the VACUUM and perhaps truncate off the part of pg_clog he needs. Getting
* exclusive lock ensures no other backend is in process of checking the
* tuple status. Also, getting exclusive lock makes it safe to adjust the
* infomask bits.
*
* NB: Cannot rely on hint bits here, they might not be set after a crash or
* on a standby.
* NB: It is not enough to set hint bits to indicate something is
* committed/invalid -- they might not be set on a standby, or after crash
* recovery. We really need to remove old xids.
*/
bool
heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
MultiXactId cutoff_multi)
heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
TransactionId cutoff_multi,
xl_heap_freeze_tuple *frz)
{
bool changed = false;
bool freeze_xmax = false;
TransactionId xid;
frz->frzflags = 0;
frz->t_infomask2 = tuple->t_infomask2;
frz->t_infomask = tuple->t_infomask;
frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
/* Process xmin */
xid = HeapTupleHeaderGetXmin(tuple);
if (TransactionIdIsNormal(xid) &&
TransactionIdPrecedes(xid, cutoff_xid))
{
HeapTupleHeaderSetXmin(tuple, FrozenTransactionId);
frz->frzflags |= XLH_FREEZE_XMIN;
/*
* Might as well fix the hint bits too; usually XMIN_COMMITTED will
* already be set here, but there's a small chance not.
*/
Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
tuple->t_infomask |= HEAP_XMIN_COMMITTED;
frz->t_infomask |= HEAP_XMIN_COMMITTED;
changed = true;
}
......@@ -5489,91 +5747,53 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
{
if (!MultiXactIdIsValid(xid))
{
/* no xmax set, ignore */
;
}
else if (MultiXactIdPrecedes(xid, cutoff_multi))
TransactionId newxmax;
uint16 flags;
newxmax = FreezeMultiXactId(xid, tuple->t_infomask,
cutoff_xid, cutoff_multi, &flags);
if (flags & FRM_INVALIDATE_XMAX)
freeze_xmax = true;
else if (flags & FRM_RETURN_IS_XID)
{
/*
* This old multi cannot possibly be running. If it was a locker
* only, it can be removed without much further thought; but if it
* contained an update, we need to preserve it.
* NB -- some of these transformations are only valid because
* we know the return Xid is a tuple updater (i.e. not merely a
* locker.) Also note that the only reason we don't explicitly
* worry about HEAP_KEYS_UPDATED is because it lives in t_infomask2
* rather than t_infomask.
*/
if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
freeze_xmax = true;
else
frz->t_infomask &= ~HEAP_XMAX_BITS;
frz->xmax = newxmax;
/*
 * Set the committed hint bit on top of the bits computed so far.  This must
 * be an OR: AND-ing with HEAP_XMAX_COMMITTED would wipe every other
 * infomask bit recorded in the freeze plan.
 */
if (flags & FRM_MARK_COMMITTED)
	frz->t_infomask |= HEAP_XMAX_COMMITTED;
changed = true;
}
else if (flags & FRM_RETURN_IS_MULTI)
{
TransactionId update_xid;
update_xid = HeapTupleGetUpdateXid(tuple);
uint16 newbits;
uint16 newbits2;
/*
* The multixact has an update hidden within. Get rid of it.
*
* If the update_xid is below the cutoff_xid, it necessarily
* must be an aborted transaction. In a primary server, such
* an Xmax would have gotten marked invalid by
* HeapTupleSatisfiesVacuum, but in a replica that is not
* called before we are, so deal with it in the same way.
*
* If not below the cutoff_xid, then the tuple would have been
* pruned by vacuum, if the update committed long enough ago,
* and we wouldn't be freezing it; so it's either recently
* committed, or in-progress. Deal with this by setting the
* Xmax to the update Xid directly and remove the IS_MULTI
* bit. (We know there cannot be running lockers in this
* multi, because it's below the cutoff_multi value.)
* We can't use GetMultiXactIdHintBits directly on the new multi
* here; that routine initializes the masks to all zeroes, which
* would lose other bits we need. Doing it this way ensures all
* unrelated bits remain untouched.
*/
frz->t_infomask &= ~HEAP_XMAX_BITS;
frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
frz->t_infomask |= newbits;
frz->t_infomask2 |= newbits2;
frz->xmax = newxmax;
if (TransactionIdPrecedes(update_xid, cutoff_xid))
{
Assert(InRecovery || TransactionIdDidAbort(update_xid));
freeze_xmax = true;
}
else
{
Assert(InRecovery || !TransactionIdIsInProgress(update_xid));
tuple->t_infomask &= ~HEAP_XMAX_BITS;
HeapTupleHeaderSetXmax(tuple, update_xid);
changed = true;
}
}
}
else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
{
/* newer than the cutoff, so don't touch it */
;
}
else
{
TransactionId update_xid;
/*
* This is a multixact which is not marked LOCK_ONLY, but which
* is newer than the cutoff_multi. If the update_xid is below the
* cutoff_xid point, then we can just freeze the Xmax in the
* tuple, removing it altogether. This seems simple, but there
* are several underlying assumptions:
*
* 1. A tuple marked by an multixact containing a very old
* committed update Xid would have been pruned away by vacuum; we
* wouldn't be freezing this tuple at all.
*
* 2. There cannot possibly be any live locking members remaining
* in the multixact. This is because if they were alive, the
* update's Xid would had been considered, via the lockers'
* snapshot's Xmin, as part the cutoff_xid.
*
* 3. We don't create new MultiXacts via MultiXactIdExpand() that
* include a very old aborted update Xid: in that function we only
* include update Xids corresponding to transactions that are
* committed or in-progress.
*/
update_xid = HeapTupleGetUpdateXid(tuple);
if (TransactionIdPrecedes(update_xid, cutoff_xid))
freeze_xmax = true;
Assert(flags & FRM_NOOP);
}
}
else if (TransactionIdIsNormal(xid) &&
......@@ -5584,17 +5804,17 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
if (freeze_xmax)
{
HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
frz->xmax = InvalidTransactionId;
/*
* The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
* LOCKED. Normalize to INVALID just to be sure no one gets confused.
* Also get rid of the HEAP_KEYS_UPDATED bit.
*/
tuple->t_infomask &= ~HEAP_XMAX_BITS;
tuple->t_infomask |= HEAP_XMAX_INVALID;
HeapTupleHeaderClearHotUpdated(tuple);
tuple->t_infomask2 &= ~HEAP_KEYS_UPDATED;
frz->t_infomask &= ~HEAP_XMAX_BITS;
frz->t_infomask |= HEAP_XMAX_INVALID;
frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
changed = true;
}
......@@ -5614,16 +5834,16 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
* xvac transaction succeeded.
*/
if (tuple->t_infomask & HEAP_MOVED_OFF)
HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
frz->frzflags |= XLH_INVALID_XVAC;
else
HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
frz->frzflags |= XLH_FREEZE_XVAC;
/*
* Might as well fix the hint bits too; usually XMIN_COMMITTED
* will already be set here, but there's a small chance not.
*/
Assert(!(tuple->t_infomask & HEAP_XMIN_INVALID));
tuple->t_infomask |= HEAP_XMIN_COMMITTED;
frz->t_infomask |= HEAP_XMIN_COMMITTED;
changed = true;
}
}
......@@ -5631,6 +5851,70 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
return changed;
}
/*
 * heap_execute_freeze_tuple
 *		Execute the prepared freezing of a tuple.
 *
 * "tuple" is the tuple header to modify in place; "frz" is the freeze plan
 * previously filled in by heap_prepare_freeze_tuple (or read back from a
 * freeze-page WAL record).
 *
 * Caller is responsible for ensuring that no other backend can access the
 * storage underlying this tuple, either by holding an exclusive lock on the
 * buffer containing it (which is what lazy VACUUM does), or by having it be
 * in private storage (which is what CLUSTER and friends do).
 *
 * Note: it might seem we could make the changes without exclusive lock, since
 * TransactionId read/write is assumed atomic anyway.  However there is a race
 * condition: someone who just fetched an old XID that we overwrite here could
 * conceivably not finish checking the XID against pg_clog before we finish
 * the VACUUM and perhaps truncate off the part of pg_clog he needs.  Getting
 * exclusive lock ensures no other backend is in process of checking the
 * tuple status.  Also, getting exclusive lock makes it safe to adjust the
 * infomask bits.
 *
 * NB: All code in here must be safe to execute during crash recovery!
 */
void
heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
{
/* xmin is only touched when the plan asks for it */
if (frz->frzflags & XLH_FREEZE_XMIN)
HeapTupleHeaderSetXmin(tuple, FrozenTransactionId);
/*
 * xmax is always written from the plan; heap_prepare_freeze_tuple seeds
 * frz->xmax with the tuple's current raw xmax, so an unchanged xmax is
 * simply rewritten with the same value.
 */
HeapTupleHeaderSetXmax(tuple, frz->xmax);
/* xvac is either frozen or invalidated, depending on the flag recorded */
if (frz->frzflags & XLH_FREEZE_XVAC)
HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
if (frz->frzflags & XLH_INVALID_XVAC)
HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
/* both infomask words are replaced wholesale by the precomputed values */
tuple->t_infomask = frz->t_infomask;
tuple->t_infomask2 = frz->t_infomask2;
}
/*
 * heap_freeze_tuple
 *		Compute and immediately apply the freeze plan for one tuple, in
 *		place and without emitting any WAL.
 *
 * Intended for callers such as CLUSTER that take care of WAL logging on
 * their own.  Returns true if the tuple was modified, false if there was
 * nothing to freeze.
 */
bool
heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
				  TransactionId cutoff_multi)
{
	xl_heap_freeze_tuple plan;

	/* Nothing to do unless the prepare step found something to change. */
	if (!heap_prepare_freeze_tuple(tuple, cutoff_xid, cutoff_multi, &plan))
		return false;

	/*
	 * The plan never reaches a WAL record here, so its offset field can be
	 * left unset.
	 */
	heap_execute_freeze_tuple(tuple, &plan);

	return true;
}
/*
* For a given MultiXactId, return the hint bits that should be set in the
* tuple's infomask.
......@@ -5934,18 +6218,28 @@ heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
}
else if (MultiXactIdPrecedes(multi, cutoff_multi))
return true;
else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
{
/* only-locker multis don't need internal examination */
;
}
else
{
if (TransactionIdPrecedes(HeapTupleGetUpdateXid(tuple),
cutoff_xid))
MultiXactMember *members;
int nmembers;
int i;
/* need to check whether any member of the mxact is too old */
nmembers = GetMultiXactIdMembers(multi, &members, false);
for (i = 0; i < nmembers; i++)
{
if (TransactionIdPrecedes(members[i].xid, cutoff_xid))
{
pfree(members);
return true;
}
}
if (nmembers > 0)
pfree(members);
}
}
else
{
xid = HeapTupleHeaderGetRawXmax(tuple);
......@@ -6193,45 +6487,44 @@ log_heap_clean(Relation reln, Buffer buffer,
}
/*
* Perform XLogInsert for a heap-freeze operation. Caller must already
* have modified the buffer and marked it dirty.
* Perform XLogInsert for a heap-freeze operation. Caller must have already
* modified the buffer and marked it dirty.
*/
XLogRecPtr
log_heap_freeze(Relation reln, Buffer buffer,
TransactionId cutoff_xid, MultiXactId cutoff_multi,
OffsetNumber *offsets, int offcnt)
log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
xl_heap_freeze_tuple *tuples, int ntuples)
{
xl_heap_freeze xlrec;
xl_heap_freeze_page xlrec;
XLogRecPtr recptr;
XLogRecData rdata[2];
/* Caller should not call me on a non-WAL-logged relation */
Assert(RelationNeedsWAL(reln));
/* nor when there are no tuples to freeze */
Assert(offcnt > 0);
Assert(ntuples > 0);
xlrec.node = reln->rd_node;
xlrec.block = BufferGetBlockNumber(buffer);
xlrec.cutoff_xid = cutoff_xid;
xlrec.cutoff_multi = cutoff_multi;
xlrec.ntuples = ntuples;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfHeapFreeze;
rdata[0].len = SizeOfHeapFreezePage;
rdata[0].buffer = InvalidBuffer;
rdata[0].next = &(rdata[1]);
/*
* The tuple-offsets array is not actually in the buffer, but pretend that
* it is. When XLogInsert stores the whole buffer, the offsets array need
* The freeze plan array is not actually in the buffer, but pretend that
* it is. When XLogInsert stores the whole buffer, the freeze plan need
* not be stored too.
*/
rdata[1].data = (char *) offsets;
rdata[1].len = offcnt * sizeof(OffsetNumber);
rdata[1].data = (char *) tuples;
rdata[1].len = ntuples * sizeof(xl_heap_freeze_tuple);
rdata[1].buffer = buffer;
rdata[1].buffer_std = true;
rdata[1].next = NULL;
recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE, rdata);
recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE, rdata);
return recptr;
}
......@@ -6848,64 +7141,6 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
XLogRecordPageWithFreeSpace(xlrec->node, xlrec->block, freespace);
}
/*
 * Replay an XLOG_HEAP2_FREEZE record (dispatched from heap2_redo).
 *
 * The record carries the freeze cutoffs followed by an array of tuple
 * offsets; the freezing decision for each listed tuple is recomputed here
 * by calling heap_freeze_tuple with those cutoffs.
 *
 * NOTE(review): as far as this file shows, heap_freeze_tuple goes through
 * heap_prepare_freeze_tuple and FreezeMultiXactId, which may call
 * MultiXactIdCreateFromMembers; GetNewMultiXactId raises an error during
 * recovery.  Replaying through this path therefore looks unsafe -- confirm
 * whether this routine is still meant to be reachable.
 */
static void
heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
{
xl_heap_freeze *xlrec = (xl_heap_freeze *) XLogRecGetData(record);
TransactionId cutoff_xid = xlrec->cutoff_xid;
MultiXactId cutoff_multi = xlrec->cutoff_multi;
Buffer buffer;
Page page;
/*
 * In Hot Standby mode, ensure that there are no queries running which
 * still consider the frozen xids as running.
 */
if (InHotStandby)
ResolveRecoveryConflictWithSnapshot(cutoff_xid, xlrec->node);
/* If we have a full-page image, restore it and we're done */
if (record->xl_info & XLR_BKP_BLOCK(0))
{
(void) RestoreBackupBlock(lsn, record, 0, false, false);
return;
}
buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
/* no valid buffer for the block: nothing to replay */
if (!BufferIsValid(buffer))
return;
page = (Page) BufferGetPage(buffer);
/* page LSN is already at or past this record: change was applied before */
if (lsn <= PageGetLSN(page))
{
UnlockReleaseBuffer(buffer);
return;
}
/* anything beyond the fixed-size header is the offsets array */
if (record->xl_len > SizeOfHeapFreeze)
{
OffsetNumber *offsets;
OffsetNumber *offsets_end;
offsets = (OffsetNumber *) ((char *) xlrec + SizeOfHeapFreeze);
offsets_end = (OffsetNumber *) ((char *) xlrec + record->xl_len);
while (offsets < offsets_end)
{
/* offsets[] entries are one-based */
ItemId lp = PageGetItemId(page, *offsets);
HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp);
/* result ignored: the page is stamped and dirtied below regardless */
(void) heap_freeze_tuple(tuple, cutoff_xid, cutoff_multi);
offsets++;
}
}
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
UnlockReleaseBuffer(buffer);
}
/*
* Replay XLOG_HEAP2_VISIBLE record.
*
......@@ -7020,6 +7255,63 @@ heap_xlog_visible(XLogRecPtr lsn, XLogRecord *record)
}
}
/*
 * Redo routine for XLOG_HEAP2_FREEZE_PAGE records.
 *
 * The record holds one precomputed freeze plan per tuple, so replay only
 * has to apply each plan to the tuple at the recorded offset; nothing is
 * recomputed here.
 */
static void
heap_xlog_freeze_page(XLogRecPtr lsn, XLogRecord *record)
{
	xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
	Buffer		buf;
	Page		pg;
	int			idx;

	/*
	 * In Hot Standby mode, make sure no query is still running that
	 * considers the xids being frozen as running.
	 */
	if (InHotStandby)
		ResolveRecoveryConflictWithSnapshot(xlrec->cutoff_xid, xlrec->node);

	/* A full-page image supersedes the per-tuple plans: restore and stop. */
	if (record->xl_info & XLR_BKP_BLOCK(0))
	{
		(void) RestoreBackupBlock(lsn, record, 0, false, false);
		return;
	}

	buf = XLogReadBuffer(xlrec->node, xlrec->block, false);
	if (!BufferIsValid(buf))
		return;

	pg = (Page) BufferGetPage(buf);

	/* Apply the record only if the page has not already seen it. */
	if (lsn > PageGetLSN(pg))
	{
		/* execute the freeze plan of every tuple listed in the record */
		for (idx = 0; idx < xlrec->ntuples; idx++)
		{
			xl_heap_freeze_tuple *plan = &xlrec->tuples[idx];
			ItemId		itemid;
			HeapTupleHeader htup;

			/* offsets are one-based */
			itemid = PageGetItemId(pg, plan->offset);
			htup = (HeapTupleHeader) PageGetItem(pg, itemid);

			heap_execute_freeze_tuple(htup, plan);
		}

		PageSetLSN(pg, lsn);
		MarkBufferDirty(buf);
	}

	UnlockReleaseBuffer(buf);
}
static void
heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
{
......@@ -7883,12 +8175,12 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
switch (info & XLOG_HEAP_OPMASK)
{
case XLOG_HEAP2_FREEZE:
heap_xlog_freeze(lsn, record);
break;
case XLOG_HEAP2_CLEAN:
heap_xlog_clean(lsn, record);
break;
case XLOG_HEAP2_FREEZE_PAGE:
heap_xlog_freeze_page(lsn, record);
break;
case XLOG_HEAP2_CLEANUP_INFO:
heap_xlog_cleanup_info(lsn, record);
break;
......
......@@ -131,23 +131,23 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
uint8 info = xl_info & ~XLR_INFO_MASK;
info &= XLOG_HEAP_OPMASK;
if (info == XLOG_HEAP2_FREEZE)
if (info == XLOG_HEAP2_CLEAN)
{
xl_heap_freeze *xlrec = (xl_heap_freeze *) rec;
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
appendStringInfo(buf, "freeze: rel %u/%u/%u; blk %u; cutoff xid %u multi %u",
appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode, xlrec->block,
xlrec->cutoff_xid, xlrec->cutoff_multi);
xlrec->latestRemovedXid);
}
else if (info == XLOG_HEAP2_CLEAN)
else if (info == XLOG_HEAP2_FREEZE_PAGE)
{
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) rec;
appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u",
appendStringInfo(buf, "freeze_page: rel %u/%u/%u; blk %u; cutoff xid %u ntuples %u",
xlrec->node.spcNode, xlrec->node.dbNode,
xlrec->node.relNode, xlrec->block,
xlrec->latestRemovedXid);
xlrec->cutoff_xid, xlrec->ntuples);
}
else if (info == XLOG_HEAP2_CLEANUP_INFO)
{
......
......@@ -289,7 +289,6 @@ static MemoryContext MXactContext = NULL;
/* internal MultiXactId management */
static void MultiXactIdSetOldestVisible(void);
static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members);
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
int nmembers, MultiXactMember *members);
static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
......@@ -336,6 +335,9 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
/* MultiXactIdSetOldestMember() must have been called already. */
Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
/*
* Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
* are still running. In typical usage, xid2 will be our own XID and the
......@@ -347,7 +349,7 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
members[1].xid = xid2;
members[1].status = status2;
newMulti = CreateMultiXactId(2, members);
newMulti = MultiXactIdCreateFromMembers(2, members);
debug_elog3(DEBUG2, "Create: %s",
mxid_to_string(newMulti, 2, members));
......@@ -387,6 +389,9 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
AssertArg(MultiXactIdIsValid(multi));
AssertArg(TransactionIdIsValid(xid));
/* MultiXactIdSetOldestMember() must have been called already. */
Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
multi, xid, mxstatus_to_string(status));
......@@ -410,7 +415,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
*/
member.xid = xid;
member.status = status;
newMulti = CreateMultiXactId(1, &member);
newMulti = MultiXactIdCreateFromMembers(1, &member);
debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
multi, newMulti);
......@@ -462,7 +467,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
newMembers[j].xid = xid;
newMembers[j++].status = status;
newMulti = CreateMultiXactId(j, newMembers);
newMulti = MultiXactIdCreateFromMembers(j, newMembers);
pfree(members);
pfree(newMembers);
......@@ -667,16 +672,16 @@ ReadNextMultiXactId(void)
}
/*
* CreateMultiXactId
* Make a new MultiXactId
* MultiXactIdCreateFromMembers
* Make a new MultiXactId from the specified set of members
*
* Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
* given TransactionIds as members. Returns the newly created MultiXactId.
*
* NB: the passed members[] array will be sorted in-place.
*/
static MultiXactId
CreateMultiXactId(int nmembers, MultiXactMember *members)
MultiXactId
MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
{
MultiXactId multi;
MultiXactOffset offset;
......@@ -707,6 +712,13 @@ CreateMultiXactId(int nmembers, MultiXactMember *members)
* Assign the MXID and offsets range to use, and make sure there is space
* in the OFFSETs and MEMBERs files. NB: this routine does
* START_CRIT_SECTION().
*
* Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check
* that we've called MultiXactIdSetOldestMember here. This is because
* this routine is used in some places to create new MultiXactIds of which
* the current backend is not a member, notably during freezing of multis
* in vacuum. During vacuum, in particular, it would be unacceptable to
* keep OldestMulti set, in case it runs for long.
*/
multi = GetNewMultiXactId(nmembers, &offset);
......@@ -763,7 +775,8 @@ CreateMultiXactId(int nmembers, MultiXactMember *members)
* RecordNewMultiXact
* Write info about a new multixact into the offsets and members files
*
* This is broken out of CreateMultiXactId so that xlog replay can use it.
* This is broken out of MultiXactIdCreateFromMembers so that xlog replay can
* use it.
*/
static void
RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
......@@ -867,9 +880,6 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
/* MultiXactIdSetOldestMember() must have been called already */
Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
/* safety check, we should never get this far in a HS slave */
if (RecoveryInProgress())
elog(ERROR, "cannot assign MultiXactIds during recovery");
......
......@@ -424,6 +424,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
Buffer vmbuffer = InvalidBuffer;
BlockNumber next_not_all_visible_block;
bool skipping_all_visible_blocks;
xl_heap_freeze_tuple *frozen;
pg_rusage_init(&ru0);
......@@ -446,6 +447,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
vacrelstats->latestRemovedXid = InvalidTransactionId;
lazy_space_alloc(vacrelstats, nblocks);
frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
/*
* We want to skip pages that don't require vacuuming according to the
......@@ -500,7 +502,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
bool tupgone,
hastup;
int prev_dead_count;
OffsetNumber frozen[MaxOffsetNumber];
int nfrozen;
Size freespace;
bool all_visible_according_to_vm;
......@@ -890,9 +891,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
* Each non-removable tuple must be checked to see if it needs
* freezing. Note we already have exclusive buffer lock.
*/
if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
MultiXactCutoff))
frozen[nfrozen++] = offnum;
if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit,
MultiXactCutoff, &frozen[nfrozen]))
frozen[nfrozen++].offset = offnum;
}
} /* scan along page */
......@@ -903,15 +904,33 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
*/
if (nfrozen > 0)
{
START_CRIT_SECTION();
MarkBufferDirty(buf);
/* execute collected freezes */
for (i = 0; i < nfrozen; i++)
{
ItemId itemid;
HeapTupleHeader htup;
itemid = PageGetItemId(page, frozen[i].offset);
htup = (HeapTupleHeader) PageGetItem(page, itemid);
heap_execute_freeze_tuple(htup, &frozen[i]);
}
/* Now WAL-log freezing if necessary */
if (RelationNeedsWAL(onerel))
{
XLogRecPtr recptr;
recptr = log_heap_freeze(onerel, buf, FreezeLimit,
MultiXactCutoff, frozen, nfrozen);
frozen, nfrozen);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
}
/*
......@@ -1012,6 +1031,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
RecordPageWithFreeSpace(onerel, blkno, freespace);
}
pfree(frozen);
/* save stats for use later */
vacrelstats->scanned_tuples = num_tuples;
vacrelstats->tuples_deleted = tups_vacuumed;
......
......@@ -48,9 +48,9 @@
* the ones above associated with RM_HEAP_ID. XLOG_HEAP_OPMASK applies to
* these, too.
*/
#define XLOG_HEAP2_FREEZE 0x00
/* 0x00 is free, was XLOG_HEAP2_FREEZE */
#define XLOG_HEAP2_CLEAN 0x10
/* 0x20 is free, was XLOG_HEAP2_CLEAN_MOVE */
#define XLOG_HEAP2_FREEZE_PAGE 0x20
#define XLOG_HEAP2_CLEANUP_INFO 0x30
#define XLOG_HEAP2_VISIBLE 0x40
#define XLOG_HEAP2_MULTI_INSERT 0x50
......@@ -270,17 +270,36 @@ typedef struct xl_heap_inplace
#define SizeOfHeapInplace (offsetof(xl_heap_inplace, target) + SizeOfHeapTid)
/* This is what we need to know about tuple freezing during vacuum */
typedef struct xl_heap_freeze
/*
* This struct represents a 'freeze plan', which is what we need to know about
* a single tuple being frozen during vacuum.
*/
/*
 * Flag bits, presumably the values carried in xl_heap_freeze_tuple.frzflags
 * (TODO confirm against heap_prepare_freeze_tuple/heap_execute_freeze_tuple,
 * which are not visible here).  Judging by the names only: freeze Xmin,
 * freeze Xvac, and set Xvac to invalid.
 */
#define XLH_FREEZE_XMIN 0x01
#define XLH_FREEZE_XVAC 0x02
#define XLH_INVALID_XVAC 0x04
/*
 * Per-tuple freeze plan.  lazy_scan_heap fills one entry per tuple that
 * heap_prepare_freeze_tuple reports as needing freezing, applies each entry
 * with heap_execute_freeze_tuple, and hands the array to log_heap_freeze.
 * An array of these trails xl_heap_freeze_page, so the field layout is part
 * of the WAL record format.
 */
typedef struct xl_heap_freeze_tuple
{
TransactionId xmax; /* precomputed Xmax for the tuple (presumably stored as-is on execute/replay) */
OffsetNumber offset; /* item offset of the tuple on its page; used with PageGetItemId */
uint16 t_infomask2; /* precomputed t_infomask2 value -- presumably copied into the tuple header */
uint16 t_infomask; /* precomputed t_infomask value -- presumably copied into the tuple header */
uint8 frzflags; /* presumably a mask of the XLH_* bits above -- TODO confirm */
} xl_heap_freeze_tuple;
/*
* This is what we need to know about a block being frozen during vacuum
*/
typedef struct xl_heap_freeze_page
{
RelFileNode node;
BlockNumber block;
TransactionId cutoff_xid;
MultiXactId cutoff_multi;
/* TUPLE OFFSET NUMBERS FOLLOW AT THE END */
} xl_heap_freeze;
uint16 ntuples;
xl_heap_freeze_tuple tuples[FLEXIBLE_ARRAY_MEMBER];
} xl_heap_freeze_page;
#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_multi) + sizeof(MultiXactId))
#define SizeOfHeapFreezePage offsetof(xl_heap_freeze_page, tuples)
/* This is what we need to know about setting a visibility map bit */
typedef struct xl_heap_visible
......@@ -331,8 +350,14 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
OffsetNumber *nowunused, int nunused,
TransactionId latestRemovedXid);
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
TransactionId cutoff_xid, MultiXactId cutoff_multi,
OffsetNumber *offsets, int offcnt);
TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples,
int ntuples);
/*
 * Freezing is split in two steps.  heap_prepare_freeze_tuple examines a
 * tuple and fills *frz with a freeze plan; lazy_scan_heap keeps the plan only
 * when the function returns true.  The caller is responsible for setting
 * frz->offset.
 *
 * NOTE(review): cutoff_multi is declared as TransactionId here, whereas
 * xl_heap_freeze_page declares its cutoff_multi as MultiXactId -- confirm
 * whether this prototype should use MultiXactId as well.
 */
extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
TransactionId cutoff_xid,
TransactionId cutoff_multi,
xl_heap_freeze_tuple *frz);
/*
 * Apply a previously prepared freeze plan to the given tuple header.
 * lazy_scan_heap calls this inside its critical section, after marking the
 * buffer dirty; per the commit description it is also meant to be
 * re-executed at WAL replay using the plan stored in the WAL record.
 */
extern void heap_execute_freeze_tuple(HeapTupleHeader tuple,
xl_heap_freeze_tuple *xlrec_tp);
extern XLogRecPtr log_heap_visible(RelFileNode rnode, Buffer heap_buffer,
Buffer vm_buffer, TransactionId cutoff_xid);
extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum,
......
......@@ -81,6 +81,9 @@ extern MultiXactId MultiXactIdCreate(TransactionId xid1,
MultiXactStatus status2);
extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid,
MultiXactStatus status);
extern MultiXactId MultiXactIdCreateFromMembers(int nmembers,
MultiXactMember *members);
extern MultiXactId ReadNextMultiXactId(void);
extern bool MultiXactIdIsRunning(MultiXactId multi);
extern void MultiXactIdSetOldestMember(void);
......
......@@ -55,7 +55,7 @@ typedef struct BkpBlock
/*
* Each page of XLOG file has a header like this:
*/
#define XLOG_PAGE_MAGIC 0xD079 /* can be used as WAL version indicator */
#define XLOG_PAGE_MAGIC 0xD07A /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment