Commit b2c95a37 authored by Alvaro Herrera's avatar Alvaro Herrera

Fix replication origin-related race conditions

Similar to what was fixed in commit 9915de6c for replication slots,
but this time it's related to replication origins: DROP SUBSCRIPTION
attempts to drop the replication origin, but that fails if the
replication worker process hasn't yet marked it unused.  This causes
failures in the buildfarm:
ERROR:  could not drop replication origin with OID 1, in use by PID 34069

Like the aforementioned commit, fix by having the process running DROP
SUBSCRIPTION sleep until the worker marks the the replication origin
struct as free.  This uses a condition variable on each replication
origin shmem state struct, so that the session trying to drop can sleep
and expect to be awakened by the process keeping the origin open.

Also fix a SGML markup in the previous commit.

Discussion: https://postgr.es/m/20170808001433.rozlseaf4m2wkw3n@alvherre.pgsql
parent 030273b7
......@@ -1222,11 +1222,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
</row>
<row>
<entry><literal>LibPQWalReceiverConnect</></entry>
<entry>Waiting in WAL receiver to establish connection to remote server.<entry>
<entry>Waiting in WAL receiver to establish connection to remote server.</entry>
</row>
<row>
<entry><literal>LibPQWalReceiverReceive</></entry>
<entry>Waiting in WAL receiver to receive data from remote server.<entry>
<entry>Waiting in WAL receiver to receive data from remote server.</entry>
</row>
<row>
<entry><literal>SSLOpenServer</></entry>
......@@ -1302,6 +1302,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry><literal>ProcArrayGroupUpdate</></entry>
<entry>Waiting for group leader to clear transaction id at transaction end.</entry>
</row>
<row>
<entry><literal>ReplicationOriginDrop</></entry>
<entry>Waiting for a replication origin to become inactive to be dropped.</entry>
</row>
<row>
<entry><literal>ReplicationSlotDrop</></entry>
<entry>Waiting for a replication slot to become inactive to be dropped.</entry>
......
......@@ -939,7 +939,7 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel)
snprintf(originname, sizeof(originname), "pg_%u", subid);
originid = replorigin_by_name(originname, true);
if (originid != InvalidRepOriginId)
replorigin_drop(originid);
replorigin_drop(originid, false);
/*
* If there is no slot associated with the subscription, we can finish
......
......@@ -3609,6 +3609,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
case WAIT_EVENT_PROCARRAY_GROUP_UPDATE:
event_name = "ProcArrayGroupUpdate";
break;
case WAIT_EVENT_REPLICATION_ORIGIN_DROP:
event_name = "ReplicationOriginDrop";
break;
case WAIT_EVENT_REPLICATION_SLOT_DROP:
event_name = "ReplicationSlotDrop";
break;
......
......@@ -79,15 +79,15 @@
#include "access/xact.h"
#include "catalog/indexing.h"
#include "nodes/execnodes.h"
#include "replication/origin.h"
#include "replication/logical.h"
#include "pgstat.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/condition_variable.h"
#include "storage/copydir.h"
#include "utils/builtins.h"
......@@ -124,6 +124,11 @@ typedef struct ReplicationState
*/
int acquired_by;
/*
* Condition variable that's signalled when acquired_by changes.
*/
ConditionVariable origin_cv;
/*
* Lock protecting remote_lsn and local_lsn.
*/
......@@ -324,9 +329,9 @@ replorigin_create(char *roname)
* Needs to be called in a transaction.
*/
void
replorigin_drop(RepOriginId roident)
replorigin_drop(RepOriginId roident, bool nowait)
{
HeapTuple tuple = NULL;
HeapTuple tuple;
Relation rel;
int i;
......@@ -334,6 +339,8 @@ replorigin_drop(RepOriginId roident)
rel = heap_open(ReplicationOriginRelationId, ExclusiveLock);
restart:
tuple = NULL;
/* cleanup the slot state info */
LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
......@@ -346,11 +353,21 @@ replorigin_drop(RepOriginId roident)
{
if (state->acquired_by != 0)
{
ereport(ERROR,
(errcode(ERRCODE_OBJECT_IN_USE),
errmsg("could not drop replication origin with OID %d, in use by PID %d",
state->roident,
state->acquired_by)));
ConditionVariable *cv;
if (nowait)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_IN_USE),
errmsg("could not drop replication origin with OID %d, in use by PID %d",
state->roident,
state->acquired_by)));
cv = &state->origin_cv;
LWLockRelease(ReplicationOriginLock);
ConditionVariablePrepareToSleep(cv);
ConditionVariableSleep(cv, WAIT_EVENT_REPLICATION_ORIGIN_DROP);
ConditionVariableCancelSleep();
goto restart;
}
/* first WAL log */
......@@ -382,7 +399,7 @@ replorigin_drop(RepOriginId roident)
CommandCounterIncrement();
/* now release lock again, */
/* now release lock again */
heap_close(rel, ExclusiveLock);
}
......@@ -476,8 +493,11 @@ ReplicationOriginShmemInit(void)
MemSet(replication_states, 0, ReplicationOriginShmemSize());
for (i = 0; i < max_replication_slots; i++)
{
LWLockInitialize(&replication_states[i].lock,
replication_states_ctl->tranche_id);
ConditionVariableInit(&replication_states[i].origin_cv);
}
}
LWLockRegisterTranche(replication_states_ctl->tranche_id,
......@@ -957,16 +977,23 @@ replorigin_get_progress(RepOriginId node, bool flush)
static void
ReplicationOriginExitCleanup(int code, Datum arg)
{
ConditionVariable *cv = NULL;
LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
if (session_replication_state != NULL &&
session_replication_state->acquired_by == MyProcPid)
{
cv = &session_replication_state->origin_cv;
session_replication_state->acquired_by = 0;
session_replication_state = NULL;
}
LWLockRelease(ReplicationOriginLock);
if (cv)
ConditionVariableBroadcast(cv);
}
/*
......@@ -1056,6 +1083,9 @@ replorigin_session_setup(RepOriginId node)
session_replication_state->acquired_by = MyProcPid;
LWLockRelease(ReplicationOriginLock);
/* probably this one is pointless */
ConditionVariableBroadcast(&session_replication_state->origin_cv);
}
/*
......@@ -1067,6 +1097,8 @@ replorigin_session_setup(RepOriginId node)
void
replorigin_session_reset(void)
{
ConditionVariable *cv;
Assert(max_replication_slots != 0);
if (session_replication_state == NULL)
......@@ -1077,9 +1109,12 @@ replorigin_session_reset(void)
LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
session_replication_state->acquired_by = 0;
cv = &session_replication_state->origin_cv;
session_replication_state = NULL;
LWLockRelease(ReplicationOriginLock);
ConditionVariableBroadcast(cv);
}
/*
......@@ -1170,7 +1205,7 @@ pg_replication_origin_drop(PG_FUNCTION_ARGS)
roident = replorigin_by_name(name, false);
Assert(OidIsValid(roident));
replorigin_drop(roident);
replorigin_drop(roident, false);
pfree(name);
......
......@@ -812,6 +812,7 @@ typedef enum
WAIT_EVENT_PARALLEL_FINISH,
WAIT_EVENT_PARALLEL_BITMAP_SCAN,
WAIT_EVENT_PROCARRAY_GROUP_UPDATE,
WAIT_EVENT_REPLICATION_ORIGIN_DROP,
WAIT_EVENT_REPLICATION_SLOT_DROP,
WAIT_EVENT_SAFE_SNAPSHOT,
WAIT_EVENT_SYNC_REP
......
......@@ -41,7 +41,7 @@ extern PGDLLIMPORT TimestampTz replorigin_session_origin_timestamp;
/* API for querying & manipulating replication origins */
extern RepOriginId replorigin_by_name(char *name, bool missing_ok);
extern RepOriginId replorigin_create(char *name);
extern void replorigin_drop(RepOriginId roident);
extern void replorigin_drop(RepOriginId roident, bool nowait);
extern bool replorigin_by_oid(RepOriginId roident, bool missing_ok,
char **roname);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment