Commit 086221cf authored by Peter Eisentraut

Prevent panic during shutdown checkpoint

When the checkpointer writes the shutdown checkpoint, it checks
afterwards whether any WAL has been written since it started and throws
a PANIC if so.  At that point, only walsenders are still active, so one
might think this could not happen, but walsenders can also generate WAL,
for instance in BASE_BACKUP and certain variants of
CREATE_REPLICATION_SLOT.  So they can trigger this panic if such a
command is run while the shutdown checkpoint is being written.

To fix this, divide the walsender shutdown into two phases.  First, the
postmaster sends a SIGUSR2 signal to all walsenders.  The walsenders
then put themselves into the "stopping" state.  In this state, they
reject any new commands.  (For simplicity, we reject all new commands,
so that in the future we do not have to track meticulously which
commands might generate WAL.)  The checkpointer waits for all walsenders
to reach this state before proceeding with the shutdown checkpoint.
After the shutdown checkpoint is done, the postmaster sends
SIGINT (previously unused) to the walsenders.  This triggers the
existing shutdown behavior of sending out the shutdown checkpoint record
and then terminating.

Author: Michael Paquier <michael.paquier@gmail.com>
Reported-by: Fujii Masao <masao.fujii@gmail.com>
parent 499ae5f5
...@@ -1690,6 +1690,11 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i ...@@ -1690,6 +1690,11 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
<literal>backup</>: This WAL sender is sending a backup. <literal>backup</>: This WAL sender is sending a backup.
</para> </para>
</listitem> </listitem>
<listitem>
<para>
<literal>stopping</>: This WAL sender is stopping.
</para>
</listitem>
</itemizedlist> </itemizedlist>
</entry> </entry>
</row> </row>
......
...@@ -8325,6 +8325,12 @@ ShutdownXLOG(int code, Datum arg) ...@@ -8325,6 +8325,12 @@ ShutdownXLOG(int code, Datum arg)
ereport(IsPostmasterEnvironment ? LOG : NOTICE, ereport(IsPostmasterEnvironment ? LOG : NOTICE,
(errmsg("shutting down"))); (errmsg("shutting down")));
/*
* Wait for WAL senders to be in stopping state. This prevents commands
* from writing new WAL.
*/
WalSndWaitStopping();
if (RecoveryInProgress()) if (RecoveryInProgress())
CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
else else
......
...@@ -2918,7 +2918,7 @@ reaper(SIGNAL_ARGS) ...@@ -2918,7 +2918,7 @@ reaper(SIGNAL_ARGS)
* Waken walsenders for the last time. No regular backends * Waken walsenders for the last time. No regular backends
* should be around anymore. * should be around anymore.
*/ */
SignalChildren(SIGUSR2); SignalChildren(SIGINT);
pmState = PM_SHUTDOWN_2; pmState = PM_SHUTDOWN_2;
...@@ -3656,7 +3656,9 @@ PostmasterStateMachine(void) ...@@ -3656,7 +3656,9 @@ PostmasterStateMachine(void)
/* /*
* If we get here, we are proceeding with normal shutdown. All * If we get here, we are proceeding with normal shutdown. All
* the regular children are gone, and it's time to tell the * the regular children are gone, and it's time to tell the
* checkpointer to do a shutdown checkpoint. * checkpointer to do a shutdown checkpoint. All WAL senders
* are told to switch to a stopping state so that the shutdown
* checkpoint can go ahead.
*/ */
Assert(Shutdown > NoShutdown); Assert(Shutdown > NoShutdown);
/* Start the checkpointer if not running */ /* Start the checkpointer if not running */
...@@ -3665,6 +3667,7 @@ PostmasterStateMachine(void) ...@@ -3665,6 +3667,7 @@ PostmasterStateMachine(void)
/* And tell it to shut down */ /* And tell it to shut down */
if (CheckpointerPID != 0) if (CheckpointerPID != 0)
{ {
SignalSomeChildren(SIGUSR2, BACKEND_TYPE_WALSND);
signal_child(CheckpointerPID, SIGUSR2); signal_child(CheckpointerPID, SIGUSR2);
pmState = PM_SHUTDOWN; pmState = PM_SHUTDOWN;
} }
......
...@@ -24,11 +24,14 @@ ...@@ -24,11 +24,14 @@
* are treated as not a crash but approximately normal termination; * are treated as not a crash but approximately normal termination;
* the walsender will exit quickly without sending any more XLOG records. * the walsender will exit quickly without sending any more XLOG records.
* *
* If the server is shut down, postmaster sends us SIGUSR2 after all * If the server is shut down, postmaster sends us SIGUSR2 after all regular
* regular backends have exited and the shutdown checkpoint has been written. * backends have exited. This causes the walsender to switch to the "stopping"
* This instructs walsender to send any outstanding WAL, including the * state. In this state, the walsender will reject any replication command
* shutdown checkpoint record, wait for it to be replicated to the standby, * that may generate WAL activity. The checkpointer begins the shutdown
* and then exit. * checkpoint once all walsenders are confirmed as stopping. When the shutdown
* checkpoint finishes, the postmaster sends us SIGINT. This instructs
* walsender to send any outstanding WAL, including the shutdown checkpoint
* record, wait for it to be replicated to the standby, and then exit.
* *
* *
* Portions Copyright (c) 2010-2017, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2017, PostgreSQL Global Development Group
...@@ -177,13 +180,14 @@ static bool WalSndCaughtUp = false; ...@@ -177,13 +180,14 @@ static bool WalSndCaughtUp = false;
/* Flags set by signal handlers for later service in main loop */ /* Flags set by signal handlers for later service in main loop */
static volatile sig_atomic_t got_SIGHUP = false; static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t walsender_ready_to_stop = false; static volatile sig_atomic_t got_SIGINT = false;
static volatile sig_atomic_t got_SIGUSR2 = false;
/* /*
* This is set while we are streaming. When not set, SIGUSR2 signal will be * This is set while we are streaming. When not set, SIGINT signal will be
* handled like SIGTERM. When set, the main loop is responsible for checking * handled like SIGTERM. When set, the main loop is responsible for checking
* walsender_ready_to_stop and terminating when it's set (after streaming any * got_SIGINT and terminating when it's set (after streaming any remaining
* remaining WAL). * WAL).
*/ */
static volatile sig_atomic_t replication_active = false; static volatile sig_atomic_t replication_active = false;
...@@ -213,6 +217,7 @@ static struct ...@@ -213,6 +217,7 @@ static struct
/* Signal handlers */ /* Signal handlers */
static void WalSndSigHupHandler(SIGNAL_ARGS); static void WalSndSigHupHandler(SIGNAL_ARGS);
static void WalSndXLogSendHandler(SIGNAL_ARGS); static void WalSndXLogSendHandler(SIGNAL_ARGS);
static void WalSndSwitchStopping(SIGNAL_ARGS);
static void WalSndLastCycleHandler(SIGNAL_ARGS); static void WalSndLastCycleHandler(SIGNAL_ARGS);
/* Prototypes for private functions */ /* Prototypes for private functions */
...@@ -299,11 +304,14 @@ WalSndErrorCleanup(void) ...@@ -299,11 +304,14 @@ WalSndErrorCleanup(void)
ReplicationSlotCleanup(); ReplicationSlotCleanup();
replication_active = false; replication_active = false;
if (walsender_ready_to_stop) if (got_SIGINT)
proc_exit(0); proc_exit(0);
/* Revert back to startup state */ /* Revert back to startup state */
WalSndSetState(WALSNDSTATE_STARTUP); WalSndSetState(WALSNDSTATE_STARTUP);
if (got_SIGUSR2)
WalSndSetState(WALSNDSTATE_STOPPING);
} }
/* /*
...@@ -676,7 +684,7 @@ StartReplication(StartReplicationCmd *cmd) ...@@ -676,7 +684,7 @@ StartReplication(StartReplicationCmd *cmd)
WalSndLoop(XLogSendPhysical); WalSndLoop(XLogSendPhysical);
replication_active = false; replication_active = false;
if (walsender_ready_to_stop) if (got_SIGINT)
proc_exit(0); proc_exit(0);
WalSndSetState(WALSNDSTATE_STARTUP); WalSndSetState(WALSNDSTATE_STARTUP);
...@@ -1053,7 +1061,7 @@ StartLogicalReplication(StartReplicationCmd *cmd) ...@@ -1053,7 +1061,7 @@ StartLogicalReplication(StartReplicationCmd *cmd)
{ {
ereport(LOG, ereport(LOG,
(errmsg("terminating walsender process after promotion"))); (errmsg("terminating walsender process after promotion")));
walsender_ready_to_stop = true; got_SIGINT = true;
} }
WalSndSetState(WALSNDSTATE_CATCHUP); WalSndSetState(WALSNDSTATE_CATCHUP);
...@@ -1103,7 +1111,7 @@ StartLogicalReplication(StartReplicationCmd *cmd) ...@@ -1103,7 +1111,7 @@ StartLogicalReplication(StartReplicationCmd *cmd)
ReplicationSlotRelease(); ReplicationSlotRelease();
replication_active = false; replication_active = false;
if (walsender_ready_to_stop) if (got_SIGINT)
proc_exit(0); proc_exit(0);
WalSndSetState(WALSNDSTATE_STARTUP); WalSndSetState(WALSNDSTATE_STARTUP);
...@@ -1290,6 +1298,14 @@ WalSndWaitForWal(XLogRecPtr loc) ...@@ -1290,6 +1298,14 @@ WalSndWaitForWal(XLogRecPtr loc)
else else
RecentFlushPtr = GetXLogReplayRecPtr(NULL); RecentFlushPtr = GetXLogReplayRecPtr(NULL);
/*
* If postmaster asked us to switch to the stopping state, do so.
* Shutdown is in progress and this will allow the checkpointer to
* move on with the shutdown checkpoint.
*/
if (got_SIGUSR2)
WalSndSetState(WALSNDSTATE_STOPPING);
/* /*
* If postmaster asked us to stop, don't wait here anymore. This will * If postmaster asked us to stop, don't wait here anymore. This will
* cause the xlogreader to return without reading a full record, which * cause the xlogreader to return without reading a full record, which
...@@ -1299,7 +1315,7 @@ WalSndWaitForWal(XLogRecPtr loc) ...@@ -1299,7 +1315,7 @@ WalSndWaitForWal(XLogRecPtr loc)
* RecentFlushPtr, so we can send all remaining data before shutting * RecentFlushPtr, so we can send all remaining data before shutting
* down. * down.
*/ */
if (walsender_ready_to_stop) if (got_SIGINT)
break; break;
/* /*
...@@ -1373,6 +1389,22 @@ exec_replication_command(const char *cmd_string) ...@@ -1373,6 +1389,22 @@ exec_replication_command(const char *cmd_string)
MemoryContext cmd_context; MemoryContext cmd_context;
MemoryContext old_context; MemoryContext old_context;
/*
* If WAL sender has been told that shutdown is getting close, switch its
* status accordingly to handle the next replication commands correctly.
*/
if (got_SIGUSR2)
WalSndSetState(WALSNDSTATE_STOPPING);
/*
* Throw error if in stopping mode. We need to prevent commands that could
* generate WAL while the shutdown checkpoint is being written. To be
* safe, we just prohibit all new commands.
*/
if (MyWalSnd->state == WALSNDSTATE_STOPPING)
ereport(ERROR,
(errmsg("cannot execute new commands while WAL sender is in stopping mode")));
/* /*
* CREATE_REPLICATION_SLOT ... LOGICAL exports a snapshot until the next * CREATE_REPLICATION_SLOT ... LOGICAL exports a snapshot until the next
* command arrives. Clean up the old stuff if there's anything. * command arrives. Clean up the old stuff if there's anything.
...@@ -2095,13 +2127,20 @@ WalSndLoop(WalSndSendDataCallback send_data) ...@@ -2095,13 +2127,20 @@ WalSndLoop(WalSndSendDataCallback send_data)
} }
/* /*
* When SIGUSR2 arrives, we send any outstanding logs up to the * At the reception of SIGUSR2, switch the WAL sender to the stopping
* state.
*/
if (got_SIGUSR2)
WalSndSetState(WALSNDSTATE_STOPPING);
/*
* When SIGINT arrives, we send any outstanding logs up to the
* shutdown checkpoint record (i.e., the latest record), wait for * shutdown checkpoint record (i.e., the latest record), wait for
* them to be replicated to the standby, and exit. This may be a * them to be replicated to the standby, and exit. This may be a
* normal termination at shutdown, or a promotion, the walsender * normal termination at shutdown, or a promotion, the walsender
* is not sure which. * is not sure which.
*/ */
if (walsender_ready_to_stop) if (got_SIGINT)
WalSndDone(send_data); WalSndDone(send_data);
} }
...@@ -2841,7 +2880,23 @@ WalSndXLogSendHandler(SIGNAL_ARGS) ...@@ -2841,7 +2880,23 @@ WalSndXLogSendHandler(SIGNAL_ARGS)
errno = save_errno; errno = save_errno;
} }
/* SIGUSR2: set flag to do a last cycle and shut down afterwards */ /* SIGUSR2: set flag to switch to stopping state */
static void
WalSndSwitchStopping(SIGNAL_ARGS)
{
    /*
     * Remember errno on entry so that whatever the calls below do to it is
     * not visible to the code this handler interrupted.
     */
    int         saved_errno = errno;

    /*
     * Only record the request here; the state change itself is performed
     * later by the code paths that test got_SIGUSR2.  Setting the latch
     * wakes the process if it is currently waiting on it.
     */
    got_SIGUSR2 = true;
    SetLatch(MyLatch);

    errno = saved_errno;
}
/*
* SIGINT: set flag to do a last cycle and shut down afterwards. The WAL
* sender should already have been switched to WALSNDSTATE_STOPPING at
* this point.
*/
static void static void
WalSndLastCycleHandler(SIGNAL_ARGS) WalSndLastCycleHandler(SIGNAL_ARGS)
{ {
...@@ -2856,7 +2911,7 @@ WalSndLastCycleHandler(SIGNAL_ARGS) ...@@ -2856,7 +2911,7 @@ WalSndLastCycleHandler(SIGNAL_ARGS)
if (!replication_active) if (!replication_active)
kill(MyProcPid, SIGTERM); kill(MyProcPid, SIGTERM);
walsender_ready_to_stop = true; got_SIGINT = true;
SetLatch(MyLatch); SetLatch(MyLatch);
errno = save_errno; errno = save_errno;
...@@ -2869,14 +2924,14 @@ WalSndSignals(void) ...@@ -2869,14 +2924,14 @@ WalSndSignals(void)
/* Set up signal handlers */ /* Set up signal handlers */
pqsignal(SIGHUP, WalSndSigHupHandler); /* set flag to read config pqsignal(SIGHUP, WalSndSigHupHandler); /* set flag to read config
* file */ * file */
pqsignal(SIGINT, SIG_IGN); /* not used */ pqsignal(SIGINT, WalSndLastCycleHandler); /* request a last cycle and
* shutdown */
pqsignal(SIGTERM, die); /* request shutdown */ pqsignal(SIGTERM, die); /* request shutdown */
pqsignal(SIGQUIT, quickdie); /* hard crash time */ pqsignal(SIGQUIT, quickdie); /* hard crash time */
InitializeTimeouts(); /* establishes SIGALRM handler */ InitializeTimeouts(); /* establishes SIGALRM handler */
pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN);
pqsignal(SIGUSR1, WalSndXLogSendHandler); /* request WAL sending */ pqsignal(SIGUSR1, WalSndXLogSendHandler); /* request WAL sending */
pqsignal(SIGUSR2, WalSndLastCycleHandler); /* request a last cycle and pqsignal(SIGUSR2, WalSndSwitchStopping); /* switch to stopping state */
* shutdown */
/* Reset some signals that are accepted by postmaster but not here */ /* Reset some signals that are accepted by postmaster but not here */
pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGCHLD, SIG_DFL);
...@@ -2954,6 +3009,50 @@ WalSndWakeup(void) ...@@ -2954,6 +3009,50 @@ WalSndWakeup(void)
} }
} }
/*
 * Wait until all WAL senders have reached the stopping state.  This is
 * used by the checkpointer to control when the shutdown checkpoint can
 * safely begin.
 *
 * Every walsender slot is examined under its spinlock.  Slots whose pid is
 * zero are skipped (presumably unused).  If any remaining slot is not yet
 * in WALSNDSTATE_STOPPING, sleep for 10 msec and rescan from the start.
 */
void
WalSndWaitStopping(void)
{
    bool        still_waiting;

    do
    {
        int         slot;

        still_waiting = false;

        /* Stop scanning as soon as one non-stopping WAL sender is found. */
        for (slot = 0; slot < max_wal_senders && !still_waiting; slot++)
        {
            WalSnd     *sender = &WalSndCtl->walsnds[slot];
            WalSndState cur_state;
            bool        in_use;

            /* Take a consistent snapshot of the fields we care about. */
            SpinLockAcquire(&sender->mutex);
            in_use = (sender->pid != 0);
            cur_state = sender->state;
            SpinLockRelease(&sender->mutex);

            if (in_use && cur_state != WALSNDSTATE_STOPPING)
                still_waiting = true;
        }

        /* Not everybody is stopping yet: back off before the next scan. */
        if (still_waiting)
            pg_usleep(10000L);  /* wait for 10 msec */
    } while (still_waiting);

    /* Confirmation is done for all WAL senders; safe to leave. */
}
/* Set state for current walsender (only called in walsender) */ /* Set state for current walsender (only called in walsender) */
void void
WalSndSetState(WalSndState state) WalSndSetState(WalSndState state)
...@@ -2987,6 +3086,8 @@ WalSndGetStateString(WalSndState state) ...@@ -2987,6 +3086,8 @@ WalSndGetStateString(WalSndState state)
return "catchup"; return "catchup";
case WALSNDSTATE_STREAMING: case WALSNDSTATE_STREAMING:
return "streaming"; return "streaming";
case WALSNDSTATE_STOPPING:
return "stopping";
} }
return "UNKNOWN"; return "UNKNOWN";
} }
......
...@@ -44,6 +44,7 @@ extern void WalSndSignals(void); ...@@ -44,6 +44,7 @@ extern void WalSndSignals(void);
extern Size WalSndShmemSize(void); extern Size WalSndShmemSize(void);
extern void WalSndShmemInit(void); extern void WalSndShmemInit(void);
extern void WalSndWakeup(void); extern void WalSndWakeup(void);
extern void WalSndWaitStopping(void);
extern void WalSndRqstFileReload(void); extern void WalSndRqstFileReload(void);
/* /*
......
...@@ -24,7 +24,8 @@ typedef enum WalSndState ...@@ -24,7 +24,8 @@ typedef enum WalSndState
WALSNDSTATE_STARTUP = 0, WALSNDSTATE_STARTUP = 0,
WALSNDSTATE_BACKUP, WALSNDSTATE_BACKUP,
WALSNDSTATE_CATCHUP, WALSNDSTATE_CATCHUP,
WALSNDSTATE_STREAMING WALSNDSTATE_STREAMING,
WALSNDSTATE_STOPPING
} WalSndState; } WalSndState;
/* /*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment