Fix postmaster's handling of a startup-process crash.

Ordinarily, a failure (unexpected exit status) of the startup subprocess should be considered fatal, so the postmaster should just close up shop and quit. However, if we sent the startup process a SIGQUIT or SIGKILL signal, the failure is hardly "unexpected", and we should attempt restart; this is necessary for recovery from ordinary backend crashes in hot-standby scenarios. I attempted to implement the latter rule with a two-line patch in commit 442231d7, but it now emerges that that patch was a few bricks shy of a load: it failed to distinguish the case of a signaled startup process from the case where the new startup process crashes before reaching database consistency. That resulted in infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the *current* startup process a kill signal. Rather than add yet another ad-hoc boolean to the postmaster's state, I chose to unify this with the existing RecoveryError flag into an enum tracking the startup process's state. That seems more consistent with the postmaster's general state machine design.

Back-patch to 9.0, like the previous patch.

Fix postmaster's handling of a startup-process crash.
Ordinarily, a failure (unexpected exit status) of the startup subprocess should be considered fatal, so the postmaster should just close up shop and quit. However, if we sent the startup process a SIGQUIT or SIGKILL signal, the failure is hardly "unexpected", and we should attempt restart; this is necessary for recovery from ordinary backend crashes in hot-standby scenarios. I attempted to implement the latter rule with a two-line patch in commit 442231d7, but it now emerges that that patch was a few bricks shy of a load: it failed to distinguish the case of a signaled startup process from the case where the new startup process crashes before reaching database consistency. That resulted in infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the *current* startup process a kill signal. Rather than add yet another ad-hoc boolean to the postmaster's state, I chose to unify this with the existing RecoveryError flag into an enum tracking the startup process's state. That seems more consistent with the postmaster's general state machine design.

Back-patch to 9.0, like the previous patch.
45811be9 · Tom Lane · 6ba365aa · 45811be9
Commit 45811be9 authored Jul 09, 2015 by Tom Lane
Show whitespace changes
Inline Side-by-side

Showing with 40 additions and 16 deletions

src/backend/postmaster/postmaster.c src/backend/postmaster/postmaster.c +40 -16

No files found.
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -249,6 +249,17 @@ static pid_t StartupPID = 0,
 			PgStatPID = 0,
 			SysLoggerPID = 0;
+/* Startup process's status */
+typedef enum
+{
+	STARTUP_NOT_RUNNING,
+	STARTUP_RUNNING,
+	STARTUP_SIGNALED,			/* we sent it a SIGQUIT or SIGKILL */
+	STARTUP_CRASHED
+} StartupStatusEnum;
+static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;
 /* Startup/shutdown state */
 #define			NoShutdown		0
 #define			SmartShutdown	1
@@ -258,7 +269,6 @@ static pid_t StartupPID = 0,
 static int	Shutdown = NoShutdown;
 static bool FatalError = false; /* T if recovering from backend crash */
-static bool RecoveryError = false;		/* T if WAL recovery failed */
 /*
 * We use a simple state machine to control startup, shutdown, and
@@ -301,8 +311,6 @@ static bool RecoveryError = false;		/* T if WAL recovery failed */
 * states, nor in PM_SHUTDOWN states (because we don't enter those states
 * when trying to recover from a crash).  It can be true in PM_STARTUP state,
 * because we don't clear it until we've successfully started WAL redo.
- * Similarly, RecoveryError means that we have crashed during recovery, and
- * should not try to restart.
 */
 typedef enum
 {
@@ -1246,6 +1254,7 @@ PostmasterMain(int argc, char *argv[])
 	 */
 	StartupPID = StartupDataBase();
 	Assert(StartupPID != 0);
+	StartupStatus = STARTUP_RUNNING;
 	pmState = PM_STARTUP;
 	/* Some workers may be scheduled to start now */
@@ -2591,6 +2600,7 @@ reaper(SIGNAL_ARGS)
 			if (Shutdown > NoShutdown &&
 				(EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
 			{
+				StartupStatus = STARTUP_NOT_RUNNING;
 				pmState = PM_WAIT_BACKENDS;
 				/* PostmasterStateMachine logic does the rest */
 				continue;
@@ -2600,6 +2610,7 @@ reaper(SIGNAL_ARGS)
 			{
 				ereport(LOG,
 						(errmsg("shutdown at recovery target")));
+				StartupStatus = STARTUP_NOT_RUNNING;
 				Shutdown = SmartShutdown;
 				TerminateChildren(SIGTERM);
 				pmState = PM_WAIT_BACKENDS;
@@ -2624,16 +2635,18 @@ reaper(SIGNAL_ARGS)
 			/*
 			 * After PM_STARTUP, any unexpected exit (including FATAL exit) of
 			 * the startup process is catastrophic, so kill other children,
-			 * and set RecoveryError so we don't try to reinitialize after
+			 * and set StartupStatus so we don't try to reinitialize after
-			 * they're gone.  Exception: if FatalError is already set, that
+			 * they're gone.  Exception: if StartupStatus is STARTUP_SIGNALED,
-			 * implies we previously sent the startup process a SIGQUIT, so
+			 * then we previously sent the startup process a SIGQUIT; so
 			 * that's probably the reason it died, and we do want to try to
 			 * restart in that case.
 			 */
 			if (!EXIT_STATUS_0(exitstatus))
 			{
-				if (!FatalError)
+				if (StartupStatus == STARTUP_SIGNALED)
-					RecoveryError = true;
+					StartupStatus = STARTUP_NOT_RUNNING;
+				else
+					StartupStatus = STARTUP_CRASHED;
 				HandleChildCrash(pid, exitstatus,
 								 _("startup process"));
 				continue;
@@ -2642,6 +2655,7 @@ reaper(SIGNAL_ARGS)
 			/*
 			 * Startup succeeded, commence normal operations
 			 */
+			StartupStatus = STARTUP_NOT_RUNNING;
 			FatalError = false;
 			Assert(AbortStartTime == 0);
 			ReachedNormalRunning = true;
@@ -3190,7 +3204,10 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 	/* Take care of the startup process too */
 	if (pid == StartupPID)
+	{
 		StartupPID = 0;
+		StartupStatus = STARTUP_CRASHED;
+	}
 	else if (StartupPID != 0 && take_action)
 	{
 		ereport(DEBUG2,
@@ -3198,6 +3215,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
 								 (int) StartupPID)));
 		signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+		StartupStatus = STARTUP_SIGNALED;
 	}
 	/* Take care of the bgwriter too */
@@ -3589,13 +3607,14 @@ PostmasterStateMachine(void)
 	}
 	/*
-	 * If recovery failed, or the user does not want an automatic restart
+	 * If the startup process failed, or the user does not want an automatic
-	 * after backend crashes, wait for all non-syslogger children to exit, and
+	 * restart after backend crashes, wait for all non-syslogger children to
-	 * then exit postmaster. We don't try to reinitialize when recovery fails,
+	 * exit, and then exit postmaster.  We don't try to reinitialize when the
-	 * because more than likely it will just fail again and we will keep
+	 * startup process fails, because more than likely it will just fail again
-	 * trying forever.
+	 * and we will keep trying forever.
 	 */
-	if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash))
+	if (pmState == PM_NO_CHILDREN &&
+		(StartupStatus == STARTUP_CRASHED || !restart_after_crash))
 		ExitPostmaster(1);
 	/*
@@ -3615,6 +3634,7 @@ PostmasterStateMachine(void)
 		StartupPID = StartupDataBase();
 		Assert(StartupPID != 0);
+		StartupStatus = STARTUP_RUNNING;
 		pmState = PM_STARTUP;
 		/* crash recovery started, reset SIGKILL flag */
 		AbortStartTime = 0;
@@ -3746,7 +3766,11 @@ TerminateChildren(int signal)
 {
 	SignalChildren(signal);
 	if (StartupPID != 0)
+	{
 		signal_child(StartupPID, signal);
+		if (signal == SIGQUIT || signal == SIGKILL)
+			StartupStatus = STARTUP_SIGNALED;
+	}
 	if (BgWriterPID != 0)
 		signal_child(BgWriterPID, signal);
 	if (CheckpointerPID != 0)