Adjust elog.c so that elog(FATAL) exits (including cases where ERROR is

promoted to FATAL) end in exit(1) not exit(0).  Then change the postmaster to
allow exit(1) without a system-wide panic, but not for the startup subprocess
or the bgwriter.  There were a couple of places that were using exit(1) to
deliberately force a system-wide panic; adjust these to be exit(2) instead.
This fixes the problem noted back in July that if the startup process exits
with elog(ERROR), the postmaster would think everything is hunky-dory and
proceed to start up.  Alternative solutions such as trying to run the entire
startup process as a critical section seem less clean, primarily because of
the fact that a fair amount of startup code is shared by all postmaster
children in the EXEC_BACKEND case.  We'd need an ugly special case somewhere
near the head of main.c to make it work if it's the child process's
responsibility to determine what happens; and what's the point when the
postmaster already treats different children differently?
This commit is contained in:
Tom Lane 2006-11-21 00:49:55 +00:00
parent 778bb7b60d
commit e82d9e6283
5 changed files with 39 additions and 40 deletions

View File

@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.225 2006/10/04 00:29:49 momjian Exp $ * $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.226 2006/11/21 00:49:54 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -421,15 +421,8 @@ BootstrapMain(int argc, char *argv[])
case BS_XLOG_STARTUP: case BS_XLOG_STARTUP:
bootstrap_signals(); bootstrap_signals();
StartupXLOG(); StartupXLOG();
/*
* These next two functions don't consider themselves critical,
* but we'd best PANIC anyway if they fail.
*/
START_CRIT_SECTION();
LoadFreeSpaceMap(); LoadFreeSpaceMap();
BuildFlatFiles(false); BuildFlatFiles(false);
END_CRIT_SECTION();
proc_exit(0); /* startup done */ proc_exit(0); /* startup done */
case BS_XLOG_BGWRITER: case BS_XLOG_BGWRITER:

View File

@ -37,7 +37,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.29 2006/10/06 17:13:59 petere Exp $ * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.30 2006/11/21 00:49:55 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -503,12 +503,12 @@ bg_quickdie(SIGNAL_ARGS)
* corrupted, so we don't want to try to clean up our transaction. Just * corrupted, so we don't want to try to clean up our transaction. Just
* nail the windows shut and get out of town. * nail the windows shut and get out of town.
* *
* Note we do exit(1) not exit(0). This is to force the postmaster into a * Note we do exit(2) not exit(0). This is to force the postmaster into a
* system reset cycle if some idiot DBA sends a manual SIGQUIT to a random * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
* backend. This is necessary precisely because we don't clean up our * backend. This is necessary precisely because we don't clean up our
* shared memory state. * shared memory state.
*/ */
exit(1); exit(2);
} }
/* SIGHUP: set flag to re-read config file at next convenient time */ /* SIGHUP: set flag to re-read config file at next convenient time */

View File

@ -37,7 +37,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.501 2006/11/05 22:42:09 tgl Exp $ * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.502 2006/11/21 00:49:55 tgl Exp $
* *
* NOTES * NOTES
* *
@ -358,6 +358,10 @@ static void ShmemBackendArrayRemove(pid_t pid);
#define StartupDataBase() StartChildProcess(BS_XLOG_STARTUP) #define StartupDataBase() StartChildProcess(BS_XLOG_STARTUP)
#define StartBackgroundWriter() StartChildProcess(BS_XLOG_BGWRITER) #define StartBackgroundWriter() StartChildProcess(BS_XLOG_BGWRITER)
/* Macros to check exit status of a child process */
#define EXIT_STATUS_0(st) ((st) == 0)
#define EXIT_STATUS_1(st) (WIFEXITED(st) && WEXITSTATUS(st) == 1)
/* /*
* Postmaster main entry point * Postmaster main entry point
@ -2025,7 +2029,8 @@ reaper(SIGNAL_ARGS)
if (StartupPID != 0 && pid == StartupPID) if (StartupPID != 0 && pid == StartupPID)
{ {
StartupPID = 0; StartupPID = 0;
if (exitstatus != 0) /* Note: FATAL exit of startup is treated as catastrophic */
if (!EXIT_STATUS_0(exitstatus))
{ {
LogChildExit(LOG, _("startup process"), LogChildExit(LOG, _("startup process"),
pid, exitstatus); pid, exitstatus);
@ -2078,7 +2083,8 @@ reaper(SIGNAL_ARGS)
if (BgWriterPID != 0 && pid == BgWriterPID) if (BgWriterPID != 0 && pid == BgWriterPID)
{ {
BgWriterPID = 0; BgWriterPID = 0;
if (exitstatus == 0 && Shutdown > NoShutdown && !FatalError && if (EXIT_STATUS_0(exitstatus) &&
Shutdown > NoShutdown && !FatalError &&
!DLGetHead(BackendList) && AutoVacPID == 0) !DLGetHead(BackendList) && AutoVacPID == 0)
{ {
/* /*
@ -2096,7 +2102,8 @@ reaper(SIGNAL_ARGS)
} }
/* /*
* Any unexpected exit of the bgwriter is treated as a crash. * Any unexpected exit of the bgwriter (including FATAL exit)
* is treated as a crash.
*/ */
HandleChildCrash(pid, exitstatus, HandleChildCrash(pid, exitstatus,
_("background writer process")); _("background writer process"));
@ -2104,15 +2111,16 @@ reaper(SIGNAL_ARGS)
} }
/* /*
* Was it the autovacuum process? Normal exit can be ignored; we'll * Was it the autovacuum process? Normal or FATAL exit can be
* start a new one at the next iteration of the postmaster's main * ignored; we'll start a new one at the next iteration of the
* loop, if necessary. An unexpected exit is treated as a crash. * postmaster's main loop, if necessary. Any other exit condition
* is treated as a crash.
*/ */
if (AutoVacPID != 0 && pid == AutoVacPID) if (AutoVacPID != 0 && pid == AutoVacPID)
{ {
AutoVacPID = 0; AutoVacPID = 0;
autovac_stopped(); autovac_stopped();
if (exitstatus != 0) if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
HandleChildCrash(pid, exitstatus, HandleChildCrash(pid, exitstatus,
_("autovacuum process")); _("autovacuum process"));
continue; continue;
@ -2126,7 +2134,7 @@ reaper(SIGNAL_ARGS)
if (PgArchPID != 0 && pid == PgArchPID) if (PgArchPID != 0 && pid == PgArchPID)
{ {
PgArchPID = 0; PgArchPID = 0;
if (exitstatus != 0) if (!EXIT_STATUS_0(exitstatus))
LogChildExit(LOG, _("archiver process"), LogChildExit(LOG, _("archiver process"),
pid, exitstatus); pid, exitstatus);
if (XLogArchivingActive() && if (XLogArchivingActive() &&
@ -2143,7 +2151,7 @@ reaper(SIGNAL_ARGS)
if (PgStatPID != 0 && pid == PgStatPID) if (PgStatPID != 0 && pid == PgStatPID)
{ {
PgStatPID = 0; PgStatPID = 0;
if (exitstatus != 0) if (!EXIT_STATUS_0(exitstatus))
LogChildExit(LOG, _("statistics collector process"), LogChildExit(LOG, _("statistics collector process"),
pid, exitstatus); pid, exitstatus);
if (StartupPID == 0 && !FatalError && Shutdown == NoShutdown) if (StartupPID == 0 && !FatalError && Shutdown == NoShutdown)
@ -2157,7 +2165,7 @@ reaper(SIGNAL_ARGS)
SysLoggerPID = 0; SysLoggerPID = 0;
/* for safety's sake, launch new logger *first* */ /* for safety's sake, launch new logger *first* */
SysLoggerPID = SysLogger_Start(); SysLoggerPID = SysLogger_Start();
if (exitstatus != 0) if (!EXIT_STATUS_0(exitstatus))
LogChildExit(LOG, _("system logger process"), LogChildExit(LOG, _("system logger process"),
pid, exitstatus); pid, exitstatus);
continue; continue;
@ -2229,12 +2237,12 @@ CleanupBackend(int pid,
LogChildExit(DEBUG2, _("server process"), pid, exitstatus); LogChildExit(DEBUG2, _("server process"), pid, exitstatus);
/* /*
* If a backend dies in an ugly way (i.e. exit status not 0) then we must * If a backend dies in an ugly way then we must signal all other backends
* signal all other backends to quickdie. If exit status is zero we * to quickdie. If exit status is zero (normal) or one (FATAL exit), we
* assume everything is hunky dory and simply remove the backend from the * assume everything is all right and simply remove the backend from the
* active backend list. * active backend list.
*/ */
if (exitstatus != 0) if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
{ {
HandleChildCrash(pid, exitstatus, _("server process")); HandleChildCrash(pid, exitstatus, _("server process"));
return; return;

View File

@ -8,7 +8,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.516 2006/10/19 19:52:22 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.517 2006/11/21 00:49:55 tgl Exp $
* *
* NOTES * NOTES
* this is the "main" module of the postgres backend and * this is the "main" module of the postgres backend and
@ -2327,12 +2327,12 @@ quickdie(SIGNAL_ARGS)
* corrupted, so we don't want to try to clean up our transaction. Just * corrupted, so we don't want to try to clean up our transaction. Just
* nail the windows shut and get out of town. * nail the windows shut and get out of town.
* *
* Note we do exit(1) not exit(0). This is to force the postmaster into a * Note we do exit(2) not exit(0). This is to force the postmaster into a
* system reset cycle if some idiot DBA sends a manual SIGQUIT to a random * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
* backend. This is necessary precisely because we don't clean up our * backend. This is necessary precisely because we don't clean up our
* shared memory state. * shared memory state.
*/ */
exit(1); exit(2);
} }
/* /*
@ -2374,7 +2374,7 @@ die(SIGNAL_ARGS)
/* /*
* Timeout or shutdown signal from postmaster during client authentication. * Timeout or shutdown signal from postmaster during client authentication.
* Simply exit(0). * Simply exit(1).
* *
* XXX: possible future improvement: try to send a message indicating * XXX: possible future improvement: try to send a message indicating
* why we are disconnecting. Problem is to be sure we don't block while * why we are disconnecting. Problem is to be sure we don't block while
@ -2383,7 +2383,7 @@ die(SIGNAL_ARGS)
void void
authdie(SIGNAL_ARGS) authdie(SIGNAL_ARGS)
{ {
exit(0); exit(1);
} }
/* /*

View File

@ -42,7 +42,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.175 2006/10/01 22:08:18 tgl Exp $ * $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.176 2006/11/21 00:49:55 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -421,25 +421,23 @@ errfinish(int dummy,...)
* fflush here is just to improve the odds that we get to see the * fflush here is just to improve the odds that we get to see the
* error message, in case things are so hosed that proc_exit crashes. * error message, in case things are so hosed that proc_exit crashes.
* Any other code you might be tempted to add here should probably be * Any other code you might be tempted to add here should probably be
* in an on_proc_exit callback instead. * in an on_proc_exit or on_shmem_exit callback instead.
*/ */
fflush(stdout); fflush(stdout);
fflush(stderr); fflush(stderr);
/* /*
* If proc_exit is already running, we exit with nonzero exit code to * Do normal process-exit cleanup, then return exit code 1 to indicate
* indicate that something's pretty wrong. We also want to exit with * FATAL termination. The postmaster may or may not consider this
* nonzero exit code if not running under the postmaster (for example, * worthy of panic, depending on which subprocess returns it.
* if we are being run from the initdb script, we'd better return an
* error status).
*/ */
proc_exit(proc_exit_inprogress || !IsUnderPostmaster); proc_exit(1);
} }
if (elevel >= PANIC) if (elevel >= PANIC)
{ {
/* /*
* Serious crash time. Postmaster will observe nonzero process exit * Serious crash time. Postmaster will observe SIGABRT process exit
* status and kill the other backends too. * status and kill the other backends too.
* *
* XXX: what if we are *in* the postmaster? abort() won't kill our * XXX: what if we are *in* the postmaster? abort() won't kill our