Adjust elog.c so that elog(FATAL) exits (including cases where ERROR is

promoted to FATAL) end in exit(1) not exit(0). Then change the postmaster to allow exit(1) without a system-wide panic, but not for the startup subprocess or the bgwriter. There were a couple of places that were using exit(1) to deliberately force a system-wide panic; adjust these to be exit(2) instead. This fixes the problem noted back in July that if the startup process exits with elog(ERROR), the postmaster would think everything is hunky-dory and proceed to start up. Alternative solutions such as trying to run the entire startup process as a critical section seem less clean, primarily because of the fact that a fair amount of startup code is shared by all postmaster children in the EXEC_BACKEND case. We'd need an ugly special case somewhere near the head of main.c to make it work if it's the child process's responsibility to determine what happens; and what's the point when the postmaster already treats different children differently?
2006-11-21 00:49:55 +00:00 · 2006-11-21 00:49:55 +00:00 · e82d9e6283
parent 778bb7b60d
commit e82d9e6283
5 changed files with 39 additions and 40 deletions
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@ -8,7 +8,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.225 2006/10/04 00:29:49 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.226 2006/11/21 00:49:54 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -421,15 +421,8 @@ BootstrapMain(int argc, char *argv[])
 		case BS_XLOG_STARTUP:
 			bootstrap_signals();
 			StartupXLOG();
-
-			/*
-			 * These next two functions don't consider themselves critical,
-			 * but we'd best PANIC anyway if they fail.
-			 */
-			START_CRIT_SECTION();
 			LoadFreeSpaceMap();
 			BuildFlatFiles(false);
-			END_CRIT_SECTION();
 			proc_exit(0);		/* startup done */

 		case BS_XLOG_BGWRITER:
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@ -37,7 +37,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.29 2006/10/06 17:13:59 petere Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.30 2006/11/21 00:49:55 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -503,12 +503,12 @@ bg_quickdie(SIGNAL_ARGS)
 	 * corrupted, so we don't want to try to clean up our transaction. Just
 	 * nail the windows shut and get out of town.
 	 *
-	 * Note we do exit(1) not exit(0).	This is to force the postmaster into a
+	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
 	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
 	 * backend.  This is necessary precisely because we don't clean up our
 	 * shared memory state.
 	 */
-	exit(1);
+	exit(2);
 }

 /* SIGHUP: set flag to re-read config file at next convenient time */
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@ -37,7 +37,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.501 2006/11/05 22:42:09 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.502 2006/11/21 00:49:55 tgl Exp $
 *
 * NOTES
 *
@ -358,6 +358,10 @@ static void ShmemBackendArrayRemove(pid_t pid);
 #define StartupDataBase()		StartChildProcess(BS_XLOG_STARTUP)
 #define StartBackgroundWriter() StartChildProcess(BS_XLOG_BGWRITER)

+/* Macros to check exit status of a child process */
+#define EXIT_STATUS_0(st)  ((st) == 0)
+#define EXIT_STATUS_1(st)  (WIFEXITED(st) && WEXITSTATUS(st) == 1)
+

 /*
 * Postmaster main entry point
@ -2025,7 +2029,8 @@ reaper(SIGNAL_ARGS)
 		if (StartupPID != 0 && pid == StartupPID)
 		{
 			StartupPID = 0;
-			if (exitstatus != 0)
+			/* Note: FATAL exit of startup is treated as catastrophic */
+			if (!EXIT_STATUS_0(exitstatus))
 			{
 				LogChildExit(LOG, _("startup process"),
 							 pid, exitstatus);
@ -2078,7 +2083,8 @@ reaper(SIGNAL_ARGS)
 		if (BgWriterPID != 0 && pid == BgWriterPID)
 		{
 			BgWriterPID = 0;
-			if (exitstatus == 0 && Shutdown > NoShutdown && !FatalError &&
+			if (EXIT_STATUS_0(exitstatus) &&
+				Shutdown > NoShutdown && !FatalError &&
 				!DLGetHead(BackendList) && AutoVacPID == 0)
 			{
 				/*
@ -2096,7 +2102,8 @@ reaper(SIGNAL_ARGS)
 			}

 			/*
-			 * Any unexpected exit of the bgwriter is treated as a crash.
+			 * Any unexpected exit of the bgwriter (including FATAL exit)
+			 * is treated as a crash.
 			 */
 			HandleChildCrash(pid, exitstatus,
 							 _("background writer process"));
@ -2104,15 +2111,16 @@ reaper(SIGNAL_ARGS)
 		}

 		/*
-		 * Was it the autovacuum process?  Normal exit can be ignored; we'll
-		 * start a new one at the next iteration of the postmaster's main
-		 * loop, if necessary.  An unexpected exit is treated as a crash.
+		 * Was it the autovacuum process?  Normal or FATAL exit can be
+		 * ignored; we'll start a new one at the next iteration of the
+		 * postmaster's main loop, if necessary.  Any other exit condition
+		 * is treated as a crash.
 		 */
 		if (AutoVacPID != 0 && pid == AutoVacPID)
 		{
 			AutoVacPID = 0;
 			autovac_stopped();
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
 				HandleChildCrash(pid, exitstatus,
 								 _("autovacuum process"));
 			continue;
@ -2126,7 +2134,7 @@ reaper(SIGNAL_ARGS)
 		if (PgArchPID != 0 && pid == PgArchPID)
 		{
 			PgArchPID = 0;
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus))
 				LogChildExit(LOG, _("archiver process"),
 							 pid, exitstatus);
 			if (XLogArchivingActive() &&
@ -2143,7 +2151,7 @@ reaper(SIGNAL_ARGS)
 		if (PgStatPID != 0 && pid == PgStatPID)
 		{
 			PgStatPID = 0;
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus))
 				LogChildExit(LOG, _("statistics collector process"),
 							 pid, exitstatus);
 			if (StartupPID == 0 && !FatalError && Shutdown == NoShutdown)
@ -2157,7 +2165,7 @@ reaper(SIGNAL_ARGS)
 			SysLoggerPID = 0;
 			/* for safety's sake, launch new logger *first* */
 			SysLoggerPID = SysLogger_Start();
-			if (exitstatus != 0)
+			if (!EXIT_STATUS_0(exitstatus))
 				LogChildExit(LOG, _("system logger process"),
 							 pid, exitstatus);
 			continue;
@ -2229,12 +2237,12 @@ CleanupBackend(int pid,
 	LogChildExit(DEBUG2, _("server process"), pid, exitstatus);

 	/*
-	 * If a backend dies in an ugly way (i.e. exit status not 0) then we must
-	 * signal all other backends to quickdie.  If exit status is zero we
-	 * assume everything is hunky dory and simply remove the backend from the
+	 * If a backend dies in an ugly way then we must signal all other backends
+	 * to quickdie.  If exit status is zero (normal) or one (FATAL exit), we
+	 * assume everything is all right and simply remove the backend from the
 	 * active backend list.
 	 */
-	if (exitstatus != 0)
+	if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
 	{
 		HandleChildCrash(pid, exitstatus, _("server process"));
 		return;
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.516 2006/10/19 19:52:22 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.517 2006/11/21 00:49:55 tgl Exp $
 *
 * NOTES
 *	  this is the "main" module of the postgres backend and
@ -2327,12 +2327,12 @@ quickdie(SIGNAL_ARGS)
 	 * corrupted, so we don't want to try to clean up our transaction. Just
 	 * nail the windows shut and get out of town.
 	 *
-	 * Note we do exit(1) not exit(0).	This is to force the postmaster into a
+	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
 	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
 	 * backend.  This is necessary precisely because we don't clean up our
 	 * shared memory state.
 	 */
-	exit(1);
+	exit(2);
 }

 /*
@ -2374,7 +2374,7 @@ die(SIGNAL_ARGS)

 /*
 * Timeout or shutdown signal from postmaster during client authentication.
- * Simply exit(0).
+ * Simply exit(1).
 *
 * XXX: possible future improvement: try to send a message indicating
 * why we are disconnecting.  Problem is to be sure we don't block while
@ -2383,7 +2383,7 @@ die(SIGNAL_ARGS)
 void
 authdie(SIGNAL_ARGS)
 {
-	exit(0);
+	exit(1);
 }

 /*
--- a/src/backend/utils/error/elog.c
+++ b/src/backend/utils/error/elog.c
@ -42,7 +42,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.175 2006/10/01 22:08:18 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.176 2006/11/21 00:49:55 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -421,25 +421,23 @@ errfinish(int dummy,...)
 		 * fflush here is just to improve the odds that we get to see the
 		 * error message, in case things are so hosed that proc_exit crashes.
 		 * Any other code you might be tempted to add here should probably be
-		 * in an on_proc_exit callback instead.
+		 * in an on_proc_exit or on_shmem_exit callback instead.
 		 */
 		fflush(stdout);
 		fflush(stderr);

 		/*
-		 * If proc_exit is already running, we exit with nonzero exit code to
-		 * indicate that something's pretty wrong.  We also want to exit with
-		 * nonzero exit code if not running under the postmaster (for example,
-		 * if we are being run from the initdb script, we'd better return an
-		 * error status).
+		 * Do normal process-exit cleanup, then return exit code 1 to indicate
+		 * FATAL termination.  The postmaster may or may not consider this
+		 * worthy of panic, depending on which subprocess returns it.
 		 */
-		proc_exit(proc_exit_inprogress || !IsUnderPostmaster);
+		proc_exit(1);
 	}

 	if (elevel >= PANIC)
 	{
 		/*
-		 * Serious crash time. Postmaster will observe nonzero process exit
+		 * Serious crash time. Postmaster will observe SIGABRT process exit
 		 * status and kill the other backends too.
 		 *
 		 * XXX: what if we are *in* the postmaster?  abort() won't kill our