Commit f332241a authored by Tom Lane's avatar Tom Lane

Fix race conditions in synchronous standby management.

We have repeatedly seen the buildfarm reach the Assert(false) in
SyncRepGetSyncStandbysPriority.  This apparently is due to failing to
consider the possibility that the sync_standby_priority values in
shared memory might be inconsistent; but they will be whenever only
some of the walsenders have updated their values after a change in
the synchronous_standby_names setting.  That function is vastly too
complex for what it does, anyway, so rewriting it seems better than
trying to apply a band-aid fix.

Furthermore, the API of SyncRepGetSyncStandbys is broken by design:
it returns a list of WalSnd array indexes, but there is nothing
guaranteeing that the contents of the WalSnd array remain stable.
Thus, if some walsender exits and then a new walsender process
takes over that WalSnd array slot, a caller might make use of
WAL position data that it should not, potentially leading to
incorrect decisions about whether to release transactions that
are waiting for synchronous commit.

To fix, replace SyncRepGetSyncStandbys with a new function
SyncRepGetCandidateStandbys that copies all the required data
from shared memory while holding the relevant mutexes.  If the
associated walsender process then exits, this data is still safe to
make release decisions with, since we know that that much WAL *was*
sent to a valid standby server.  This incidentally means that we no
longer need to treat sync_standby_priority as protected by the
SyncRepLock rather than the per-walsender mutex.

SyncRepGetSyncStandbys is no longer used by the core code, so remove
it entirely in HEAD.  However, it seems possible that external code is
relying on that function, so do not remove it from the back branches.
Instead, just remove the known-incorrect Assert.  When the bug occurs,
the function will return a too-short list, which callers should treat
as meaning there are not enough sync standbys, which seems like a
reasonably safe fallback until the inconsistent state is resolved.
Moreover it's bug-compatible with what has been happening in non-assert
builds.  We cannot do anything about the walsender-replacement race
condition without an API/ABI break.

The bogus assertion exists back to 9.6, but 9.6 is sufficiently
different from the later branches that the patch doesn't apply at all.
I chose to just remove the bogus assertion in 9.6, feeling that the
probability of a bad outcome from the walsender-replacement race
condition is too low to justify rewriting the whole patch for 9.6.

Discussion: https://postgr.es/m/21519.1585272409@sss.pgh.pa.us
parent 3cb02e30
This diff is collapsed.
......@@ -2375,14 +2375,16 @@ InitWalSenderSlot(void)
* Found a free slot. Reserve it for us.
*/
walsnd->pid = MyProcPid;
walsnd->state = WALSNDSTATE_STARTUP;
walsnd->sentPtr = InvalidXLogRecPtr;
walsnd->needreload = false;
walsnd->write = InvalidXLogRecPtr;
walsnd->flush = InvalidXLogRecPtr;
walsnd->apply = InvalidXLogRecPtr;
walsnd->writeLag = -1;
walsnd->flushLag = -1;
walsnd->applyLag = -1;
walsnd->state = WALSNDSTATE_STARTUP;
walsnd->sync_standby_priority = 0;
walsnd->latch = &MyProc->procLatch;
walsnd->replyTime = 0;
walsnd->spillTxns = 0;
......@@ -3235,7 +3237,8 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
Tuplestorestate *tupstore;
MemoryContext per_query_ctx;
MemoryContext oldcontext;
List *sync_standbys;
SyncRepStandbyData *sync_standbys;
int num_standbys;
int i;
/* check to see if caller supports us returning a tuplestore */
......@@ -3263,11 +3266,10 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
MemoryContextSwitchTo(oldcontext);
/*
* Get the currently active synchronous standbys.
* Get the currently active synchronous standbys. This could be out of
* date before we're done, but we'll use the data anyway.
*/
LWLockAcquire(SyncRepLock, LW_SHARED);
sync_standbys = SyncRepGetSyncStandbys(NULL);
LWLockRelease(SyncRepLock);
num_standbys = SyncRepGetCandidateStandbys(&sync_standbys);
for (i = 0; i < max_wal_senders; i++)
{
......@@ -3286,9 +3288,12 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
int64 spillTxns;
int64 spillCount;
int64 spillBytes;
bool is_sync_standby;
Datum values[PG_STAT_GET_WAL_SENDERS_COLS];
bool nulls[PG_STAT_GET_WAL_SENDERS_COLS];
int j;
/* Collect data from shared memory */
SpinLockAcquire(&walsnd->mutex);
if (walsnd->pid == 0)
{
......@@ -3311,6 +3316,22 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
spillBytes = walsnd->spillBytes;
SpinLockRelease(&walsnd->mutex);
/*
* Detect whether walsender is/was considered synchronous. We can
* provide some protection against stale data by checking the PID
* along with walsnd_index.
*/
is_sync_standby = false;
for (j = 0; j < num_standbys; j++)
{
if (sync_standbys[j].walsnd_index == i &&
sync_standbys[j].pid == pid)
{
is_sync_standby = true;
break;
}
}
memset(nulls, 0, sizeof(nulls));
values[0] = Int32GetDatum(pid);
......@@ -3380,7 +3401,7 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
*/
if (priority == 0)
values[10] = CStringGetTextDatum("async");
else if (list_member_int(sync_standbys, i))
else if (is_sync_standby)
values[10] = SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY ?
CStringGetTextDatum("sync") : CStringGetTextDatum("quorum");
else
......
......@@ -36,6 +36,24 @@
#define SYNC_REP_PRIORITY 0
#define SYNC_REP_QUORUM 1
/*
* SyncRepGetCandidateStandbys returns an array of these structs,
* one per candidate synchronous walsender.
*/
typedef struct SyncRepStandbyData
{
/* Copies of relevant fields from WalSnd shared-memory struct */
pid_t pid;
XLogRecPtr write;
XLogRecPtr flush;
XLogRecPtr apply;
int sync_standby_priority;
/* Index of this walsender in the WalSnd shared-memory array */
int walsnd_index;
/* This flag indicates whether this struct is about our own process */
bool is_me;
} SyncRepStandbyData;
/*
* Struct for the configuration of synchronous replication.
*
......@@ -74,7 +92,7 @@ extern void SyncRepInitConfig(void);
extern void SyncRepReleaseWaiters(void);
/* called by wal sender and user backend */
extern List *SyncRepGetSyncStandbys(bool *am_sync);
extern int SyncRepGetCandidateStandbys(SyncRepStandbyData **standbys);
/* called by checkpointer */
extern void SyncRepUpdateSyncStandbysDefined(void);
......
......@@ -31,8 +31,7 @@ typedef enum WalSndState
/*
* Each walsender has a WalSnd struct in shared memory.
*
* This struct is protected by 'mutex', with two exceptions: one is
* sync_standby_priority as noted below. The other exception is that some
* This struct is protected by its 'mutex' spinlock field, except that some
* members are only written by the walsender process itself, and thus that
* process is free to read those members without holding spinlock. pid and
* needreload always require the spinlock to be held for all accesses.
......@@ -60,6 +59,12 @@ typedef struct WalSnd
TimeOffset flushLag;
TimeOffset applyLag;
/*
* The priority order of the standby managed by this WALSender, as listed
* in synchronous_standby_names, or 0 if not-listed.
*/
int sync_standby_priority;
/* Protects shared variables shown above. */
slock_t mutex;
......@@ -69,13 +74,6 @@ typedef struct WalSnd
*/
Latch *latch;
/*
* The priority order of the standby managed by this WALSender, as listed
* in synchronous_standby_names, or 0 if not-listed. Protected by
* SyncRepLock.
*/
int sync_standby_priority;
/*
* Timestamp of the last message received from standby.
*/
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment