Fix postmaster's handling of a startup-process crash.

Ordinarily, a failure (unexpected exit status) of the startup subprocess
should be considered fatal, so the postmaster should just close up shop
and quit.  However, if we sent the startup process a SIGQUIT or SIGKILL
signal, the failure is hardly "unexpected", and we should attempt restart;
this is necessary for recovery from ordinary backend crashes in hot-standby
scenarios.  I attempted to implement the latter rule with a two-line patch
in commit 442231d7f7, but it now emerges that
that patch was a few bricks shy of a load: it failed to distinguish the
case of a signaled startup process from the case where the new startup
process crashes before reaching database consistency.  That resulted in
infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the
*current* startup process a kill signal.  Rather than add yet another
ad-hoc boolean to the postmaster's state, I chose to unify this with the
existing RecoveryError flag into an enum tracking the startup process's
state.  That seems more consistent with the postmaster's general state
machine design.

Back-patch to 9.0, like the previous patch.
This commit is contained in:
Tom Lane 2015-07-09 13:22:22 -04:00
parent 6ba365aa46
commit 45811be94e

View File

@ -249,6 +249,17 @@ static pid_t StartupPID = 0,
PgStatPID = 0, PgStatPID = 0,
SysLoggerPID = 0; SysLoggerPID = 0;
/* Startup process's status */
typedef enum
{
STARTUP_NOT_RUNNING,
STARTUP_RUNNING,
STARTUP_SIGNALED, /* we sent it a SIGQUIT or SIGKILL */
STARTUP_CRASHED
} StartupStatusEnum;
static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;
/* Startup/shutdown state */ /* Startup/shutdown state */
#define NoShutdown 0 #define NoShutdown 0
#define SmartShutdown 1 #define SmartShutdown 1
@ -258,7 +269,6 @@ static pid_t StartupPID = 0,
static int Shutdown = NoShutdown; static int Shutdown = NoShutdown;
static bool FatalError = false; /* T if recovering from backend crash */ static bool FatalError = false; /* T if recovering from backend crash */
static bool RecoveryError = false; /* T if WAL recovery failed */
/* /*
* We use a simple state machine to control startup, shutdown, and * We use a simple state machine to control startup, shutdown, and
@ -301,8 +311,6 @@ static bool RecoveryError = false; /* T if WAL recovery failed */
* states, nor in PM_SHUTDOWN states (because we don't enter those states * states, nor in PM_SHUTDOWN states (because we don't enter those states
* when trying to recover from a crash). It can be true in PM_STARTUP state, * when trying to recover from a crash). It can be true in PM_STARTUP state,
* because we don't clear it until we've successfully started WAL redo. * because we don't clear it until we've successfully started WAL redo.
* Similarly, RecoveryError means that we have crashed during recovery, and
* should not try to restart.
*/ */
typedef enum typedef enum
{ {
@ -1246,6 +1254,7 @@ PostmasterMain(int argc, char *argv[])
*/ */
StartupPID = StartupDataBase(); StartupPID = StartupDataBase();
Assert(StartupPID != 0); Assert(StartupPID != 0);
StartupStatus = STARTUP_RUNNING;
pmState = PM_STARTUP; pmState = PM_STARTUP;
/* Some workers may be scheduled to start now */ /* Some workers may be scheduled to start now */
@ -1666,7 +1675,7 @@ ServerLoop(void)
/* If we have lost the archiver, try to start a new one. */ /* If we have lost the archiver, try to start a new one. */
if (PgArchPID == 0 && PgArchStartupAllowed()) if (PgArchPID == 0 && PgArchStartupAllowed())
PgArchPID = pgarch_start(); PgArchPID = pgarch_start();
/* If we need to signal the autovacuum launcher, do so now */ /* If we need to signal the autovacuum launcher, do so now */
if (avlauncher_needs_signal) if (avlauncher_needs_signal)
@ -2591,6 +2600,7 @@ reaper(SIGNAL_ARGS)
if (Shutdown > NoShutdown && if (Shutdown > NoShutdown &&
(EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus))) (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
{ {
StartupStatus = STARTUP_NOT_RUNNING;
pmState = PM_WAIT_BACKENDS; pmState = PM_WAIT_BACKENDS;
/* PostmasterStateMachine logic does the rest */ /* PostmasterStateMachine logic does the rest */
continue; continue;
@ -2600,6 +2610,7 @@ reaper(SIGNAL_ARGS)
{ {
ereport(LOG, ereport(LOG,
(errmsg("shutdown at recovery target"))); (errmsg("shutdown at recovery target")));
StartupStatus = STARTUP_NOT_RUNNING;
Shutdown = SmartShutdown; Shutdown = SmartShutdown;
TerminateChildren(SIGTERM); TerminateChildren(SIGTERM);
pmState = PM_WAIT_BACKENDS; pmState = PM_WAIT_BACKENDS;
@ -2624,16 +2635,18 @@ reaper(SIGNAL_ARGS)
/* /*
* After PM_STARTUP, any unexpected exit (including FATAL exit) of * After PM_STARTUP, any unexpected exit (including FATAL exit) of
* the startup process is catastrophic, so kill other children, * the startup process is catastrophic, so kill other children,
* and set RecoveryError so we don't try to reinitialize after * and set StartupStatus so we don't try to reinitialize after
* they're gone. Exception: if FatalError is already set, that * they're gone. Exception: if StartupStatus is STARTUP_SIGNALED,
* implies we previously sent the startup process a SIGQUIT, so * then we previously sent the startup process a SIGQUIT; so
* that's probably the reason it died, and we do want to try to * that's probably the reason it died, and we do want to try to
* restart in that case. * restart in that case.
*/ */
if (!EXIT_STATUS_0(exitstatus)) if (!EXIT_STATUS_0(exitstatus))
{ {
if (!FatalError) if (StartupStatus == STARTUP_SIGNALED)
RecoveryError = true; StartupStatus = STARTUP_NOT_RUNNING;
else
StartupStatus = STARTUP_CRASHED;
HandleChildCrash(pid, exitstatus, HandleChildCrash(pid, exitstatus,
_("startup process")); _("startup process"));
continue; continue;
@ -2642,6 +2655,7 @@ reaper(SIGNAL_ARGS)
/* /*
* Startup succeeded, commence normal operations * Startup succeeded, commence normal operations
*/ */
StartupStatus = STARTUP_NOT_RUNNING;
FatalError = false; FatalError = false;
Assert(AbortStartTime == 0); Assert(AbortStartTime == 0);
ReachedNormalRunning = true; ReachedNormalRunning = true;
@ -2962,7 +2976,7 @@ CleanupBackgroundWorker(int pid,
ReportBackgroundWorkerPID(rw); /* report child death */ ReportBackgroundWorkerPID(rw); /* report child death */
LogChildExit(EXIT_STATUS_0(exitstatus) ? DEBUG1 : LOG, LogChildExit(EXIT_STATUS_0(exitstatus) ? DEBUG1 : LOG,
namebuf, pid, exitstatus); namebuf, pid, exitstatus);
return true; return true;
} }
@ -3190,7 +3204,10 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
/* Take care of the startup process too */ /* Take care of the startup process too */
if (pid == StartupPID) if (pid == StartupPID)
{
StartupPID = 0; StartupPID = 0;
StartupStatus = STARTUP_CRASHED;
}
else if (StartupPID != 0 && take_action) else if (StartupPID != 0 && take_action)
{ {
ereport(DEBUG2, ereport(DEBUG2,
@ -3198,6 +3215,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
(SendStop ? "SIGSTOP" : "SIGQUIT"), (SendStop ? "SIGSTOP" : "SIGQUIT"),
(int) StartupPID))); (int) StartupPID)));
signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT)); signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
StartupStatus = STARTUP_SIGNALED;
} }
/* Take care of the bgwriter too */ /* Take care of the bgwriter too */
@ -3589,13 +3607,14 @@ PostmasterStateMachine(void)
} }
/* /*
* If recovery failed, or the user does not want an automatic restart * If the startup process failed, or the user does not want an automatic
* after backend crashes, wait for all non-syslogger children to exit, and * restart after backend crashes, wait for all non-syslogger children to
* then exit postmaster. We don't try to reinitialize when recovery fails, * exit, and then exit postmaster. We don't try to reinitialize when the
* because more than likely it will just fail again and we will keep * startup process fails, because more than likely it will just fail again
* trying forever. * and we will keep trying forever.
*/ */
if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash)) if (pmState == PM_NO_CHILDREN &&
(StartupStatus == STARTUP_CRASHED || !restart_after_crash))
ExitPostmaster(1); ExitPostmaster(1);
/* /*
@ -3615,6 +3634,7 @@ PostmasterStateMachine(void)
StartupPID = StartupDataBase(); StartupPID = StartupDataBase();
Assert(StartupPID != 0); Assert(StartupPID != 0);
StartupStatus = STARTUP_RUNNING;
pmState = PM_STARTUP; pmState = PM_STARTUP;
/* crash recovery started, reset SIGKILL flag */ /* crash recovery started, reset SIGKILL flag */
AbortStartTime = 0; AbortStartTime = 0;
@ -3746,7 +3766,11 @@ TerminateChildren(int signal)
{ {
SignalChildren(signal); SignalChildren(signal);
if (StartupPID != 0) if (StartupPID != 0)
{
signal_child(StartupPID, signal); signal_child(StartupPID, signal);
if (signal == SIGQUIT || signal == SIGKILL)
StartupStatus = STARTUP_SIGNALED;
}
if (BgWriterPID != 0) if (BgWriterPID != 0)
signal_child(BgWriterPID, signal); signal_child(BgWriterPID, signal);
if (CheckpointerPID != 0) if (CheckpointerPID != 0)