Adjust elog.c so that elog(FATAL) exits (including cases where ERROR is
promoted to FATAL) end in exit(1) not exit(0).  Then change the postmaster to
allow exit(1) without a system-wide panic, but not for the startup subprocess
or the bgwriter.  There were a couple of places that were using exit(1) to
deliberately force a system-wide panic; adjust these to be exit(2) instead.
This fixes the problem noted back in July that if the startup process exits
with elog(ERROR), the postmaster would think everything is hunky-dory and
proceed to start up.  Alternative solutions such as trying to run the entire
startup process as a critical section seem less clean, primarily because of
the fact that a fair amount of startup code is shared by all postmaster
children in the EXEC_BACKEND case.  We'd need an ugly special case somewhere
near the head of main.c to make it work if it's the child process's
responsibility to determine what happens; and what's the point when the
postmaster already treats different children differently?
This commit is contained in:
Tom Lane 2006-11-21 00:49:55 +00:00
parent 778bb7b60d
commit e82d9e6283
5 changed files with 39 additions and 40 deletions

View File

@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.225 2006/10/04 00:29:49 momjian Exp $
* $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.226 2006/11/21 00:49:54 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -421,15 +421,8 @@ BootstrapMain(int argc, char *argv[])
case BS_XLOG_STARTUP:
bootstrap_signals();
StartupXLOG();
/*
* These next two functions don't consider themselves critical,
* but we'd best PANIC anyway if they fail.
*/
START_CRIT_SECTION();
LoadFreeSpaceMap();
BuildFlatFiles(false);
END_CRIT_SECTION();
proc_exit(0); /* startup done */
case BS_XLOG_BGWRITER:

View File

@ -37,7 +37,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.29 2006/10/06 17:13:59 petere Exp $
* $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.30 2006/11/21 00:49:55 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -503,12 +503,12 @@ bg_quickdie(SIGNAL_ARGS)
* corrupted, so we don't want to try to clean up our transaction. Just
* nail the windows shut and get out of town.
*
* Note we do exit(1) not exit(0). This is to force the postmaster into a
* Note we do exit(2) not exit(0). This is to force the postmaster into a
* system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
* backend. This is necessary precisely because we don't clean up our
* shared memory state.
*/
exit(1);
exit(2);
}
/* SIGHUP: set flag to re-read config file at next convenient time */

View File

@ -37,7 +37,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.501 2006/11/05 22:42:09 tgl Exp $
* $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.502 2006/11/21 00:49:55 tgl Exp $
*
* NOTES
*
@ -358,6 +358,10 @@ static void ShmemBackendArrayRemove(pid_t pid);
#define StartupDataBase() StartChildProcess(BS_XLOG_STARTUP)
#define StartBackgroundWriter() StartChildProcess(BS_XLOG_BGWRITER)
/* Macros to check exit status of a child process */
#define EXIT_STATUS_0(st) ((st) == 0)
#define EXIT_STATUS_1(st) (WIFEXITED(st) && WEXITSTATUS(st) == 1)
/*
* Postmaster main entry point
@ -2025,7 +2029,8 @@ reaper(SIGNAL_ARGS)
if (StartupPID != 0 && pid == StartupPID)
{
StartupPID = 0;
if (exitstatus != 0)
/* Note: FATAL exit of startup is treated as catastrophic */
if (!EXIT_STATUS_0(exitstatus))
{
LogChildExit(LOG, _("startup process"),
pid, exitstatus);
@ -2078,7 +2083,8 @@ reaper(SIGNAL_ARGS)
if (BgWriterPID != 0 && pid == BgWriterPID)
{
BgWriterPID = 0;
if (exitstatus == 0 && Shutdown > NoShutdown && !FatalError &&
if (EXIT_STATUS_0(exitstatus) &&
Shutdown > NoShutdown && !FatalError &&
!DLGetHead(BackendList) && AutoVacPID == 0)
{
/*
@ -2096,7 +2102,8 @@ reaper(SIGNAL_ARGS)
}
/*
* Any unexpected exit of the bgwriter is treated as a crash.
* Any unexpected exit of the bgwriter (including FATAL exit)
* is treated as a crash.
*/
HandleChildCrash(pid, exitstatus,
_("background writer process"));
@ -2104,15 +2111,16 @@ reaper(SIGNAL_ARGS)
}
/*
* Was it the autovacuum process? Normal exit can be ignored; we'll
* start a new one at the next iteration of the postmaster's main
* loop, if necessary. An unexpected exit is treated as a crash.
* Was it the autovacuum process? Normal or FATAL exit can be
* ignored; we'll start a new one at the next iteration of the
* postmaster's main loop, if necessary. Any other exit condition
* is treated as a crash.
*/
if (AutoVacPID != 0 && pid == AutoVacPID)
{
AutoVacPID = 0;
autovac_stopped();
if (exitstatus != 0)
if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
HandleChildCrash(pid, exitstatus,
_("autovacuum process"));
continue;
@ -2126,7 +2134,7 @@ reaper(SIGNAL_ARGS)
if (PgArchPID != 0 && pid == PgArchPID)
{
PgArchPID = 0;
if (exitstatus != 0)
if (!EXIT_STATUS_0(exitstatus))
LogChildExit(LOG, _("archiver process"),
pid, exitstatus);
if (XLogArchivingActive() &&
@ -2143,7 +2151,7 @@ reaper(SIGNAL_ARGS)
if (PgStatPID != 0 && pid == PgStatPID)
{
PgStatPID = 0;
if (exitstatus != 0)
if (!EXIT_STATUS_0(exitstatus))
LogChildExit(LOG, _("statistics collector process"),
pid, exitstatus);
if (StartupPID == 0 && !FatalError && Shutdown == NoShutdown)
@ -2157,7 +2165,7 @@ reaper(SIGNAL_ARGS)
SysLoggerPID = 0;
/* for safety's sake, launch new logger *first* */
SysLoggerPID = SysLogger_Start();
if (exitstatus != 0)
if (!EXIT_STATUS_0(exitstatus))
LogChildExit(LOG, _("system logger process"),
pid, exitstatus);
continue;
@ -2229,12 +2237,12 @@ CleanupBackend(int pid,
LogChildExit(DEBUG2, _("server process"), pid, exitstatus);
/*
* If a backend dies in an ugly way (i.e. exit status not 0) then we must
* signal all other backends to quickdie. If exit status is zero we
* assume everything is hunky dory and simply remove the backend from the
* If a backend dies in an ugly way then we must signal all other backends
* to quickdie. If exit status is zero (normal) or one (FATAL exit), we
* assume everything is all right and simply remove the backend from the
* active backend list.
*/
if (exitstatus != 0)
if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
{
HandleChildCrash(pid, exitstatus, _("server process"));
return;

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.516 2006/10/19 19:52:22 tgl Exp $
* $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.517 2006/11/21 00:49:55 tgl Exp $
*
* NOTES
* this is the "main" module of the postgres backend and
@ -2327,12 +2327,12 @@ quickdie(SIGNAL_ARGS)
* corrupted, so we don't want to try to clean up our transaction. Just
* nail the windows shut and get out of town.
*
* Note we do exit(1) not exit(0). This is to force the postmaster into a
* Note we do exit(2) not exit(0). This is to force the postmaster into a
* system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
* backend. This is necessary precisely because we don't clean up our
* shared memory state.
*/
exit(1);
exit(2);
}
/*
@ -2374,7 +2374,7 @@ die(SIGNAL_ARGS)
/*
* Timeout or shutdown signal from postmaster during client authentication.
* Simply exit(0).
* Simply exit(1).
*
* XXX: possible future improvement: try to send a message indicating
* why we are disconnecting. Problem is to be sure we don't block while
@ -2383,7 +2383,7 @@ die(SIGNAL_ARGS)
void
authdie(SIGNAL_ARGS)
{
exit(0);
exit(1);
}
/*

View File

@ -42,7 +42,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.175 2006/10/01 22:08:18 tgl Exp $
* $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.176 2006/11/21 00:49:55 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -421,25 +421,23 @@ errfinish(int dummy,...)
* fflush here is just to improve the odds that we get to see the
* error message, in case things are so hosed that proc_exit crashes.
* Any other code you might be tempted to add here should probably be
* in an on_proc_exit callback instead.
* in an on_proc_exit or on_shmem_exit callback instead.
*/
fflush(stdout);
fflush(stderr);
/*
* If proc_exit is already running, we exit with nonzero exit code to
* indicate that something's pretty wrong. We also want to exit with
* nonzero exit code if not running under the postmaster (for example,
* if we are being run from the initdb script, we'd better return an
* error status).
* Do normal process-exit cleanup, then return exit code 1 to indicate
* FATAL termination. The postmaster may or may not consider this
* worthy of panic, depending on which subprocess returns it.
*/
proc_exit(proc_exit_inprogress || !IsUnderPostmaster);
proc_exit(1);
}
if (elevel >= PANIC)
{
/*
* Serious crash time. Postmaster will observe nonzero process exit
* Serious crash time. Postmaster will observe SIGABRT process exit
* status and kill the other backends too.
*
* XXX: what if we are *in* the postmaster? abort() won't kill our