Adjust elog.c so that elog(FATAL) exits (including cases where ERROR is
promoted to FATAL) end in exit(1) not exit(0). Then change the postmaster to allow exit(1) without a system-wide panic, but not for the startup subprocess or the bgwriter. There were a couple of places that were using exit(1) to deliberately force a system-wide panic; adjust these to be exit(2) instead. This fixes the problem noted back in July that if the startup process exits with elog(ERROR), the postmaster would think everything is hunky-dory and proceed to start up. Alternative solutions such as trying to run the entire startup process as a critical section seem less clean, primarily because of the fact that a fair amount of startup code is shared by all postmaster children in the EXEC_BACKEND case. We'd need an ugly special case somewhere near the head of main.c to make it work if it's the child process's responsibility to determine what happens; and what's the point when the postmaster already treats different children differently?

Adjust elog.c so that elog(FATAL) exits (including cases where ERROR is
promoted to FATAL) end in exit(1) not exit(0). Then change the postmaster to allow exit(1) without a system-wide panic, but not for the startup subprocess or the bgwriter. There were a couple of places that were using exit(1) to deliberately force a system-wide panic; adjust these to be exit(2) instead. This fixes the problem noted back in July that if the startup process exits with elog(ERROR), the postmaster would think everything is hunky-dory and proceed to start up. Alternative solutions such as trying to run the entire startup process as a critical section seem less clean, primarily because of the fact that a fair amount of startup code is shared by all postmaster children in the EXEC_BACKEND case. We'd need an ugly special case somewhere near the head of main.c to make it work if it's the child process's responsibility to determine what happens; and what's the point when the postmaster already treats different children differently?
e82d9e62 · Tom Lane · 778bb7b6 · e82d9e62 · e82d9e62 · e82d9e62
Commit e82d9e62 authored Nov 21, 2006 by Tom Lane
5 changed files
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -8,7 +8,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.225 2006/10/04 00:29:49 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.226 2006/11/21 00:49:54 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -421,15 +421,8 @@ BootstrapMain(int argc, char *argv[])
 		case BS_XLOG_STARTUP:
 			bootstrap_signals();
 			StartupXLOG();
-			/*
-			 * These next two functions don't consider themselves critical,
-			 * but we'd best PANIC anyway if they fail.
-			 */
-			START_CRIT_SECTION();
 			LoadFreeSpaceMap();
 			BuildFlatFiles(false);
-			END_CRIT_SECTION();
 			proc_exit(0);		/* startup done */
 		case BS_XLOG_BGWRITER:

--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -37,7 +37,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.29 2006/10/06 17:13:59 petere Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.30 2006/11/21 00:49:55 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -503,12 +503,12 @@ bg_quickdie(SIGNAL_ARGS)
 	 * corrupted, so we don't want to try to clean up our transaction. Just
 	 * nail the windows shut and get out of town.
 	 *
-	 * Note we do exit(1) not exit(0).	This is to force the postmaster into a
+	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
 	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
 	 * backend.  This is necessary precisely because we don't clean up our
 	 * shared memory state.
 	 */
-	exit(1);
+	exit(2);
 }
 /* SIGHUP: set flag to re-read config file at next convenient time */

--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -37,7 +37,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.501 2006/11/05 22:42:09 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.502 2006/11/21 00:49:55 tgl Exp $
 *
 * NOTES
 *
@@ -358,6 +358,10 @@ static void ShmemBackendArrayRemove(pid_t pid);
 #define StartupDataBase()		StartChildProcess(BS_XLOG_STARTUP)
 #define StartBackgroundWriter() StartChildProcess(BS_XLOG_BGWRITER)
+/* Macros to check exit status of a child process */
+#define EXIT_STATUS_0(st)  ((st) == 0)
+#define EXIT_STATUS_1(st)  (WIFEXITED(st) && WEXITSTATUS(st) == 1)
 /*
 * Postmaster main entry point
@@ -2025,7 +2029,8 @@ reaper(SIGNAL_ARGS)
 		if (StartupPID != 0 && pid == StartupPID)
 		{
 			StartupPID = 0;
-			if (exitstatus != 0)
+			/* Note: FATAL exit of startup is treated as catastrophic */
+			if (!EXIT_STATUS_0(exitstatus))
 			{
 				LogChildExit(LOG, _("startup process"),
 							 pid, exitstatus);
@@ -2078,7 +2083,8 @@ reaper(SIGNAL_ARGS)
 		if (BgWriterPID != 0 && pid == BgWriterPID)
 		{
 			BgWriterPID = 0;
-			if (exitstatus == 0 && Shutdown > NoShutdown && !FatalError &&
+			if (EXIT_STATUS_0(exitstatus) &&
+				Shutdown > NoShutdown && !FatalError &&
 				!DLGetHead(BackendList) && AutoVacPID == 0)
 			{
 				/*
@@ -2096,7 +2102,8 @@ reaper(SIGNAL_ARGS)
 			}
 			/*
-			 * Any unexpected exit of the bgwriter is treated as a crash.
+			 * Any unexpected exit of the bgwriter (including FATAL exit)
+			 * is treated as a crash.
 			 */
 			HandleChildCrash(pid, exitstatus,
 							 _("background writer process"));
@@ -2104,15 +2111,16 @@ reaper(SIGNAL_ARGS)
 		}
 		/*
-		 * Was it the autovacuum process?  Normal exit can be ignored; we'll
+		 * Was it the autovacuum process?  Normal or FATAL exit can be
-		 * start a new one at the next iteration of the postmaster's main
+		 * ignored; we'll start a new one at the next iteration of the
-		 * loop, if necessary.  An unexpected exit is treated as a crash.
+		 * postmaster's main loop, if necessary.  Any other exit condition
+		 * is treated as a crash.
 		 */
 		if (AutoVacPID != 0 && pid == AutoVacPID)
 		{
 			AutoVacPID = 0;
 			autovac_stopped();
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
 				HandleChildCrash(pid, exitstatus,
 								 _("autovacuum process"));
 			continue;
@@ -2126,7 +2134,7 @@ reaper(SIGNAL_ARGS)
 		if (PgArchPID != 0 && pid == PgArchPID)
 		{
 			PgArchPID = 0;
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus))
 				LogChildExit(LOG, _("archiver process"),
 							 pid, exitstatus);
 			if (XLogArchivingActive() &&
@@ -2143,7 +2151,7 @@ reaper(SIGNAL_ARGS)
 		if (PgStatPID != 0 && pid == PgStatPID)
 		{
 			PgStatPID = 0;
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus))
 				LogChildExit(LOG, _("statistics collector process"),
 							 pid, exitstatus);
 			if (StartupPID == 0 && !FatalError && Shutdown == NoShutdown)
@@ -2157,7 +2165,7 @@ reaper(SIGNAL_ARGS)
 			SysLoggerPID = 0;
 			/* for safety's sake, launch new logger *first* */
 			SysLoggerPID = SysLogger_Start();
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus))
 				LogChildExit(LOG, _("system logger process"),
 							 pid, exitstatus);
 			continue;
@@ -2229,12 +2237,12 @@ CleanupBackend(int pid,
 	LogChildExit(DEBUG2, _("server process"), pid, exitstatus);
 	/*
-	 * If a backend dies in an ugly way (i.e. exit status not 0) then we must
+	 * If a backend dies in an ugly way then we must signal all other backends
-	 * signal all other backends to quickdie.  If exit status is zero we
+	 * to quickdie.  If exit status is zero (normal) or one (FATAL exit), we
-	 * assume everything is hunky dory and simply remove the backend from the
+	 * assume everything is all right and simply remove the backend from the
 	 * active backend list.
 	 */
-	if (exitstatus != 0)
+	if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
 	{
 		HandleChildCrash(pid, exitstatus, _("server process"));
 		return;

--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.516 2006/10/19 19:52:22 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.517 2006/11/21 00:49:55 tgl Exp $
 *
 * NOTES
 *	  this is the "main" module of the postgres backend and
@@ -2327,12 +2327,12 @@ quickdie(SIGNAL_ARGS)
 	 * corrupted, so we don't want to try to clean up our transaction. Just
 	 * nail the windows shut and get out of town.
 	 *
-	 * Note we do exit(1) not exit(0).	This is to force the postmaster into a
+	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
 	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
 	 * backend.  This is necessary precisely because we don't clean up our
 	 * shared memory state.
 	 */
-	exit(1);
+	exit(2);
 }
 /*
@@ -2374,7 +2374,7 @@ die(SIGNAL_ARGS)
 /*
 * Timeout or shutdown signal from postmaster during client authentication.
- * Simply exit(0).
+ * Simply exit(1).
 *
 * XXX: possible future improvement: try to send a message indicating
 * why we are disconnecting.  Problem is to be sure we don't block while
@@ -2383,7 +2383,7 @@ die(SIGNAL_ARGS)
 void
 authdie(SIGNAL_ARGS)
 {
-	exit(0);
+	exit(1);
 }
 /*

--- a/src/backend/utils/error/elog.c
+++ b/src/backend/utils/error/elog.c
@@ -42,7 +42,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.175 2006/10/01 22:08:18 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.176 2006/11/21 00:49:55 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -421,25 +421,23 @@ errfinish(int dummy,...)
 		 * fflush here is just to improve the odds that we get to see the
 		 * error message, in case things are so hosed that proc_exit crashes.
 		 * Any other code you might be tempted to add here should probably be
-		 * in an on_proc_exit callback instead.
+		 * in an on_proc_exit or on_shmem_exit callback instead.
 		 */
 		fflush(stdout);
 		fflush(stderr);
 		/*
-		 * If proc_exit is already running, we exit with nonzero exit code to
+		 * Do normal process-exit cleanup, then return exit code 1 to indicate
-		 * indicate that something's pretty wrong.  We also want to exit with
+		 * FATAL termination.  The postmaster may or may not consider this
-		 * nonzero exit code if not running under the postmaster (for example,
+		 * worthy of panic, depending on which subprocess returns it.
-		 * if we are being run from the initdb script, we'd better return an
-		 * error status).
 		 */
-		proc_exit(proc_exit_inprogress || !IsUnderPostmaster);
+		proc_exit(1);
 	}
 	if (elevel >= PANIC)
 	{
 		/*
-		 * Serious crash time. Postmaster will observe nonzero process exit
+		 * Serious crash time. Postmaster will observe SIGABRT process exit
 		 * status and kill the other backends too.
 		 *
 		 * XXX: what if we are *in* the postmaster?  abort() won't kill our