Commit 1986ca5c authored by Tom Lane's avatar Tom Lane

Fix race condition in multixact code: it's possible to try to read a

multixact's starting offset before the offset has been stored into the
SLRU file.  A simple fix would be to hold the MultiXactGenLock until the
offset has been stored, but that looks like a big concurrency hit.  Instead
rely on knowledge that unset offsets will be zero, and loop when we see
a zero.  This requires a little extra hacking to ensure that zero is never
a valid value for the offset.  Problem reported by Matteo Beccati, fix
ideas from Martijn van Oosterhout, Alvaro Herrera, and Tom Lane.
parent 21b748e7
...@@ -42,7 +42,7 @@ ...@@ -42,7 +42,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.9 2005/10/15 02:49:09 momjian Exp $ * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.10 2005/10/28 17:27:29 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -631,7 +631,16 @@ CreateMultiXactId(int nxids, TransactionId *xids) ...@@ -631,7 +631,16 @@ CreateMultiXactId(int nxids, TransactionId *xids)
} }
/* /*
* OK, assign the MXID and offsets range to use * Critical section from here until we've written the data; we don't
* want to error out with a partly written MultiXact structure.
* (In particular, failing to write our start offset after advancing
* nextMXact would effectively corrupt the previous MultiXact.)
*/
START_CRIT_SECTION();
/*
* Assign the MXID and offsets range to use, and make sure there is
* space in the OFFSETs and MEMBERs files.
*/ */
multi = GetNewMultiXactId(nxids, &offset); multi = GetNewMultiXactId(nxids, &offset);
...@@ -668,6 +677,9 @@ CreateMultiXactId(int nxids, TransactionId *xids) ...@@ -668,6 +677,9 @@ CreateMultiXactId(int nxids, TransactionId *xids)
/* Now enter the information into the OFFSETs and MEMBERs logs */ /* Now enter the information into the OFFSETs and MEMBERs logs */
RecordNewMultiXact(multi, offset, nxids, xids); RecordNewMultiXact(multi, offset, nxids, xids);
/* Done with critical section */
END_CRIT_SECTION();
/* Store the new MultiXactId in the local cache, too */ /* Store the new MultiXactId in the local cache, too */
mXactCachePut(multi, nxids, xids); mXactCachePut(multi, nxids, xids);
...@@ -761,6 +773,7 @@ static MultiXactId ...@@ -761,6 +773,7 @@ static MultiXactId
GetNewMultiXactId(int nxids, MultiXactOffset *offset) GetNewMultiXactId(int nxids, MultiXactOffset *offset)
{ {
MultiXactId result; MultiXactId result;
MultiXactOffset nextOffset;
debug_elog3(DEBUG2, "GetNew: for %d xids", nxids); debug_elog3(DEBUG2, "GetNew: for %d xids", nxids);
...@@ -784,19 +797,28 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset) ...@@ -784,19 +797,28 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset)
* Advance counter. As in GetNewTransactionId(), this must not happen * Advance counter. As in GetNewTransactionId(), this must not happen
* until after ExtendMultiXactOffset has succeeded! * until after ExtendMultiXactOffset has succeeded!
* *
* We don't care about MultiXactId wraparound here; it will be handled by the * We don't care about MultiXactId wraparound here; it will be handled by
* next iteration. But note that nextMXact may be InvalidMultiXactId * the next iteration. But note that nextMXact may be InvalidMultiXactId
* after this routine exits, so anyone else looking at the variable must * after this routine exits, so anyone else looking at the variable must
* be prepared to deal with that. * be prepared to deal with that.
*/ */
(MultiXactState->nextMXact)++; (MultiXactState->nextMXact)++;
/* /*
* Reserve the members space. Same considerations as above. * Reserve the members space. Same considerations as above. Also, be
* careful not to return zero as the starting offset for any multixact.
* See GetMultiXactIdMembers() for motivation.
*/ */
*offset = MultiXactState->nextOffset; nextOffset = MultiXactState->nextOffset;
if (nextOffset == 0)
{
*offset = 1;
nxids++; /* allocate member slot 0 too */
}
else
*offset = nextOffset;
ExtendMultiXactMember(*offset, nxids); ExtendMultiXactMember(nextOffset, nxids);
MultiXactState->nextOffset += nxids; MultiXactState->nextOffset += nxids;
...@@ -824,6 +846,7 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) ...@@ -824,6 +846,7 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
MultiXactOffset *offptr; MultiXactOffset *offptr;
MultiXactOffset offset; MultiXactOffset offset;
int length; int length;
int truelength;
int i; int i;
MultiXactId nextMXact; MultiXactId nextMXact;
MultiXactId tmpMXact; MultiXactId tmpMXact;
...@@ -849,13 +872,13 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) ...@@ -849,13 +872,13 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
/* /*
* We check known limits on MultiXact before resorting to the SLRU area. * We check known limits on MultiXact before resorting to the SLRU area.
* *
* An ID older than our OldestVisibleMXactId[] entry can't possibly still be * An ID older than our OldestVisibleMXactId[] entry can't possibly still
* running, and we'd run the risk of trying to read already-truncated SLRU * be running, and we'd run the risk of trying to read already-truncated
* data if we did try to examine it. * SLRU data if we did try to examine it.
* *
* Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is seen, * Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is
* it implies undetected ID wraparound has occurred. We just silently * seen, it implies undetected ID wraparound has occurred. We just
* assume that such an ID is no longer running. * silently assume that such an ID is no longer running.
* *
* Shared lock is enough here since we aren't modifying any global state. * Shared lock is enough here since we aren't modifying any global state.
* Also, we can examine our own OldestVisibleMXactId without the lock, * Also, we can examine our own OldestVisibleMXactId without the lock,
...@@ -868,27 +891,58 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) ...@@ -868,27 +891,58 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
return -1; return -1;
} }
/*
* Acquire the shared lock just long enough to grab the current counter
* values. We may need both nextMXact and nextOffset; see below.
*/
LWLockAcquire(MultiXactGenLock, LW_SHARED); LWLockAcquire(MultiXactGenLock, LW_SHARED);
if (!MultiXactIdPrecedes(multi, MultiXactState->nextMXact)) nextMXact = MultiXactState->nextMXact;
{ nextOffset = MultiXactState->nextOffset;
LWLockRelease(MultiXactGenLock); LWLockRelease(MultiXactGenLock);
if (!MultiXactIdPrecedes(multi, nextMXact))
{
debug_elog2(DEBUG2, "GetMembers: it's too new!"); debug_elog2(DEBUG2, "GetMembers: it's too new!");
*xids = NULL; *xids = NULL;
return -1; return -1;
} }
/* /*
* Before releasing the lock, save the current counter values, because the * Find out the offset at which we need to start reading MultiXactMembers
* target MultiXactId may be just one less than nextMXact. We will need * and the number of members in the multixact. We determine the latter
* to use nextOffset as the endpoint if so. * as the difference between this multixact's starting offset and the
* next one's. However, there are some corner cases to worry about:
*
* 1. This multixact may be the latest one created, in which case there
* is no next one to look at. In this case the nextOffset value we just
* saved is the correct endpoint.
*
* 2. The next multixact may still be in process of being filled in:
* that is, another process may have done GetNewMultiXactId but not yet
* written the offset entry for that ID. In that scenario, it is
* guaranteed that the offset entry for that multixact exists (because
* GetNewMultiXactId won't release MultiXactGenLock until it does)
* but contains zero (because we are careful to pre-zero offset pages).
* Because GetNewMultiXactId will never return zero as the starting offset
* for a multixact, when we read zero as the next multixact's offset, we
* know we have this case. We sleep for a bit and try again.
*
* 3. Because GetNewMultiXactId increments offset zero to offset one
* to handle case #2, there is an ambiguity near the point of offset
* wraparound. If we see next multixact's offset is one, is that our
* multixact's actual endpoint, or did it end at zero with a subsequent
* increment? We handle this using the knowledge that if the zero'th
* member slot wasn't filled, it'll contain zero, and zero isn't a valid
* transaction ID so it can't be a multixact member. Therefore, if we
* read a zero from the members array, just ignore it.
*
* This is all pretty messy, but the mess occurs only in infrequent corner
* cases, so it seems better than holding the MultiXactGenLock for a long
* time on every multixact creation.
*/ */
nextMXact = MultiXactState->nextMXact; retry:
nextOffset = MultiXactState->nextOffset;
LWLockRelease(MultiXactGenLock);
/* Get the offset at which we need to start reading MultiXactMembers */
LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
pageno = MultiXactIdToOffsetPage(multi); pageno = MultiXactIdToOffsetPage(multi);
...@@ -899,20 +953,23 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) ...@@ -899,20 +953,23 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
offptr += entryno; offptr += entryno;
offset = *offptr; offset = *offptr;
Assert(offset != 0);
/* /*
* How many members do we need to read? If we are at the end of the * Use the same increment rule as GetNewMultiXactId(), that is, don't
* assigned MultiXactIds, use the offset just saved above. Else we need * handle wraparound explicitly until needed.
* to check the MultiXactId following ours.
*
* Use the same increment rule as GetNewMultiXactId(), that is, don't handle
* wraparound explicitly until needed.
*/ */
tmpMXact = multi + 1; tmpMXact = multi + 1;
if (nextMXact == tmpMXact) if (nextMXact == tmpMXact)
{
/* Corner case 1: there is no next multixact */
length = nextOffset - offset; length = nextOffset - offset;
}
else else
{ {
MultiXactOffset nextMXOffset;
/* handle wraparound if needed */ /* handle wraparound if needed */
if (tmpMXact < FirstMultiXactId) if (tmpMXact < FirstMultiXactId)
tmpMXact = FirstMultiXactId; tmpMXact = FirstMultiXactId;
...@@ -927,7 +984,17 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) ...@@ -927,7 +984,17 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
offptr += entryno; offptr += entryno;
length = *offptr - offset; nextMXOffset = *offptr;
if (nextMXOffset == 0)
{
/* Corner case 2: next multixact is still being filled in */
LWLockRelease(MultiXactOffsetControlLock);
pg_usleep(1000L);
goto retry;
}
length = nextMXOffset - offset;
} }
LWLockRelease(MultiXactOffsetControlLock); LWLockRelease(MultiXactOffsetControlLock);
...@@ -938,6 +1005,7 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) ...@@ -938,6 +1005,7 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
/* Now get the members themselves. */ /* Now get the members themselves. */
LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
truelength = 0;
prev_pageno = -1; prev_pageno = -1;
for (i = 0; i < length; i++, offset++) for (i = 0; i < length; i++, offset++)
{ {
...@@ -956,7 +1024,14 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) ...@@ -956,7 +1024,14 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
MultiXactMemberCtl->shared->page_buffer[slotno]; MultiXactMemberCtl->shared->page_buffer[slotno];
xactptr += entryno; xactptr += entryno;
ptr[i] = *xactptr; if (!TransactionIdIsValid(*xactptr))
{
/* Corner case 3: we must be looking at unused slot zero */
Assert(offset == 0);
continue;
}
ptr[truelength++] = *xactptr;
} }
LWLockRelease(MultiXactMemberControlLock); LWLockRelease(MultiXactMemberControlLock);
...@@ -964,11 +1039,11 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) ...@@ -964,11 +1039,11 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
/* /*
* Copy the result into the local cache. * Copy the result into the local cache.
*/ */
mXactCachePut(multi, length, ptr); mXactCachePut(multi, truelength, ptr);
debug_elog3(DEBUG2, "GetMembers: no cache for %s", debug_elog3(DEBUG2, "GetMembers: no cache for %s",
mxid_to_string(multi, length, ptr)); mxid_to_string(multi, truelength, ptr));
return length; return truelength;
} }
/* /*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment