From 193e0270752b07f8d0700710a39c4ec367f57339 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 9 Jul 2015 13:22:23 -0400 Subject: [PATCH] Fix postmaster's handling of a startup-process crash. Ordinarily, a failure (unexpected exit status) of the startup subprocess should be considered fatal, so the postmaster should just close up shop and quit. However, if we sent the startup process a SIGQUIT or SIGKILL signal, the failure is hardly "unexpected", and we should attempt restart; this is necessary for recovery from ordinary backend crashes in hot-standby scenarios. I attempted to implement the latter rule with a two-line patch in commit 442231d7f71764b8c628044e7ce2225f9aa43b67, but it now emerges that that patch was a few bricks shy of a load: it failed to distinguish the case of a signaled startup process from the case where the new startup process crashes before reaching database consistency. That resulted in infinitely respawning a new startup process only to have it crash again. To handle this properly, we really must track whether we have sent the *current* startup process a kill signal. Rather than add yet another ad-hoc boolean to the postmaster's state, I chose to unify this with the existing RecoveryError flag into an enum tracking the startup process's state. That seems more consistent with the postmaster's general state machine design. Back-patch to 9.0, like the previous patch. --- src/backend/postmaster/postmaster.c | 56 ++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index df8037b498..1bb3138a03 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -249,6 +249,17 @@ static pid_t StartupPID = 0, PgStatPID = 0, SysLoggerPID = 0; +/* Startup process's status */ +typedef enum +{ + STARTUP_NOT_RUNNING, + STARTUP_RUNNING, + STARTUP_SIGNALED, /* we sent it a SIGQUIT or SIGKILL */ + STARTUP_CRASHED +} StartupStatusEnum; + +static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING; + /* Startup/shutdown state */ #define NoShutdown 0 #define SmartShutdown 1 @@ -258,7 +269,6 @@ static pid_t StartupPID = 0, static int Shutdown = NoShutdown; static bool FatalError = false; /* T if recovering from backend crash */ -static bool RecoveryError = false; /* T if WAL recovery failed */ /* * We use a simple state machine to control startup, shutdown, and @@ -301,8 +311,6 @@ static bool RecoveryError = false; /* T if WAL recovery failed */ * states, nor in PM_SHUTDOWN states (because we don't enter those states * when trying to recover from a crash). It can be true in PM_STARTUP state, * because we don't clear it until we've successfully started WAL redo. - * Similarly, RecoveryError means that we have crashed during recovery, and - * should not try to restart. */ typedef enum { @@ -1246,6 +1254,7 @@ PostmasterMain(int argc, char *argv[]) */ StartupPID = StartupDataBase(); Assert(StartupPID != 0); + StartupStatus = STARTUP_RUNNING; pmState = PM_STARTUP; /* Some workers may be scheduled to start now */ @@ -1666,7 +1675,7 @@ ServerLoop(void) /* If we have lost the archiver, try to start a new one. */ if (PgArchPID == 0 && PgArchStartupAllowed()) - PgArchPID = pgarch_start(); + PgArchPID = pgarch_start(); /* If we need to signal the autovacuum launcher, do so now */ if (avlauncher_needs_signal) @@ -2591,6 +2600,7 @@ reaper(SIGNAL_ARGS) if (Shutdown > NoShutdown && (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus))) { + StartupStatus = STARTUP_NOT_RUNNING; pmState = PM_WAIT_BACKENDS; /* PostmasterStateMachine logic does the rest */ continue; @@ -2600,6 +2610,7 @@ reaper(SIGNAL_ARGS) { ereport(LOG, (errmsg("shutdown at recovery target"))); + StartupStatus = STARTUP_NOT_RUNNING; Shutdown = SmartShutdown; TerminateChildren(SIGTERM); pmState = PM_WAIT_BACKENDS; @@ -2624,16 +2635,18 @@ reaper(SIGNAL_ARGS) /* * After PM_STARTUP, any unexpected exit (including FATAL exit) of * the startup process is catastrophic, so kill other children, - * and set RecoveryError so we don't try to reinitialize after - * they're gone. Exception: if FatalError is already set, that - * implies we previously sent the startup process a SIGQUIT, so + * and set StartupStatus so we don't try to reinitialize after + * they're gone. Exception: if StartupStatus is STARTUP_SIGNALED, + * then we previously sent the startup process a SIGQUIT; so * that's probably the reason it died, and we do want to try to * restart in that case. */ if (!EXIT_STATUS_0(exitstatus)) { - if (!FatalError) - RecoveryError = true; + if (StartupStatus == STARTUP_SIGNALED) + StartupStatus = STARTUP_NOT_RUNNING; + else + StartupStatus = STARTUP_CRASHED; HandleChildCrash(pid, exitstatus, _("startup process")); continue; @@ -2642,6 +2655,7 @@ reaper(SIGNAL_ARGS) /* * Startup succeeded, commence normal operations */ + StartupStatus = STARTUP_NOT_RUNNING; FatalError = false; Assert(AbortStartTime == 0); ReachedNormalRunning = true; @@ -2962,7 +2976,7 @@ CleanupBackgroundWorker(int pid, ReportBackgroundWorkerPID(rw); /* report child death */ LogChildExit(EXIT_STATUS_0(exitstatus) ? DEBUG1 : LOG, - namebuf, pid, exitstatus); + namebuf, pid, exitstatus); return true; } @@ -3190,7 +3204,10 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) /* Take care of the startup process too */ if (pid == StartupPID) + { StartupPID = 0; + StartupStatus = STARTUP_CRASHED; + } else if (StartupPID != 0 && take_action) { ereport(DEBUG2, @@ -3198,6 +3215,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) (SendStop ? "SIGSTOP" : "SIGQUIT"), (int) StartupPID))); signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT)); + StartupStatus = STARTUP_SIGNALED; } /* Take care of the bgwriter too */ @@ -3589,13 +3607,14 @@ PostmasterStateMachine(void) } /* - * If recovery failed, or the user does not want an automatic restart - * after backend crashes, wait for all non-syslogger children to exit, and - * then exit postmaster. We don't try to reinitialize when recovery fails, - * because more than likely it will just fail again and we will keep - * trying forever. + * If the startup process failed, or the user does not want an automatic + * restart after backend crashes, wait for all non-syslogger children to + * exit, and then exit postmaster. We don't try to reinitialize when the + * startup process fails, because more than likely it will just fail again + * and we will keep trying forever. */ - if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash)) + if (pmState == PM_NO_CHILDREN && + (StartupStatus == STARTUP_CRASHED || !restart_after_crash)) ExitPostmaster(1); /* @@ -3615,6 +3634,7 @@ PostmasterStateMachine(void) StartupPID = StartupDataBase(); Assert(StartupPID != 0); + StartupStatus = STARTUP_RUNNING; pmState = PM_STARTUP; /* crash recovery started, reset SIGKILL flag */ AbortStartTime = 0; @@ -3746,7 +3766,11 @@ TerminateChildren(int signal) { SignalChildren(signal); if (StartupPID != 0) + { signal_child(StartupPID, signal); + if (signal == SIGQUIT || signal == SIGKILL) + StartupStatus = STARTUP_SIGNALED; + } if (BgWriterPID != 0) signal_child(BgWriterPID, signal); if (CheckpointerPID != 0)