Send SIGKILL to children if they don't die quickly in immediate shutdown

On immediate shutdown, or during a restart-after-crash sequence,
postmaster used to send SIGQUIT (and then abandon ship if shutdown); but
this is not a good strategy if backends don't die because of that
signal.  (This might happen, for example, if a backend gets tangled
trying to malloc() due to gettext(), as in an example illustrated by
MauMau.)  This causes problems when later trying to restart the server,
because some processes are still attached to the shared memory segment.

Instead of just abandoning such backends to their fates, we now have
postmaster hang around for a little while longer, send a SIGKILL after
some reasonable waiting period, and then exit.  This makes immediate
shutdown more reliable.

There is disagreement on whether it's best for postmaster to exit after
sending SIGKILL, or to stick around until all children have reported
death.  If this controversy is resolved differently than what this patch
implements, it's an easy change to make.

Bug reported by MauMau in message 20DAEA8949EC4E2289C6E8E58560DEC0@maumau

MauMau and Álvaro Herrera
This commit is contained in:
Alvaro Herrera 2013-06-28 17:20:53 -04:00
parent 457d6cf049
commit 82233ce7ea
3 changed files with 142 additions and 44 deletions

View File

@ -1362,11 +1362,11 @@ echo -1000 > /proc/self/oom_score_adj
<listitem>
<para>
This is the <firstterm>Immediate Shutdown</firstterm> mode.
The master <command>postgres</command> process will send a
<systemitem>SIGQUIT</systemitem> to all child processes and exit
immediately, without properly shutting itself down. The child processes
likewise exit immediately upon receiving
<systemitem>SIGQUIT</systemitem>. This will lead to recovery (by
The server will send <systemitem>SIGQUIT</systemitem> to all child
processes and wait for them to terminate. Those that don't terminate
within 5 seconds will be sent <systemitem>SIGKILL</systemitem> by the
master <command>postgres</command> process, which will then terminate
without further waiting. This will lead to recovery (by
replaying the WAL log) upon next start-up. This is recommended
only in emergencies.
</para>

View File

@ -275,6 +275,7 @@ static pid_t StartupPID = 0,
#define NoShutdown 0
#define SmartShutdown 1
#define FastShutdown 2
#define ImmediateShutdown 3
static int Shutdown = NoShutdown;
@ -345,6 +346,10 @@ typedef enum
static PMState pmState = PM_INIT;
/* Start time of abort processing at immediate shutdown or child crash */
static time_t AbortStartTime;
#define SIGKILL_CHILDREN_AFTER_SECS 5
static bool ReachedNormalRunning = false; /* T if we've reached PM_RUN */
bool ClientAuthInProgress = false; /* T during new-client
@ -421,6 +426,7 @@ static void RandomSalt(char *md5Salt);
static void signal_child(pid_t pid, int signal);
static bool SignalSomeChildren(int signal, int targets);
static bool SignalUnconnectedWorkers(int signal);
static void TerminateChildren(int signal);
#define SignalChildren(sig) SignalSomeChildren(sig, BACKEND_TYPE_ALL)
@ -1427,8 +1433,18 @@ DetermineSleepTime(struct timeval * timeout)
if (Shutdown > NoShutdown ||
(!StartWorkerNeeded && !HaveCrashedWorker))
{
timeout->tv_sec = 60;
timeout->tv_usec = 0;
if (AbortStartTime > 0)
{
	/*
	 * Abort processing is under way and the SIGKILL stopwatch is
	 * running: sleep only for the time remaining until the deadline,
	 * but never less than 1 second.
	 *
	 * This has to be Max(), not Min(): Min() would cap every sleep at
	 * 1 second and, once the deadline has passed, would hand back a
	 * zero or negative tv_sec.
	 */
	timeout->tv_sec = Max(SIGKILL_CHILDREN_AFTER_SECS -
						  (time(NULL) - AbortStartTime), 1);
	timeout->tv_usec = 0;
}
else
{
	/* no abort deadline pending: sleep for 60 seconds */
	timeout->tv_sec = 60;
	timeout->tv_usec = 0;
}
return;
}
@ -1660,6 +1676,28 @@ ServerLoop(void)
TouchSocketLockFiles();
last_touch_time = now;
}
/*
 * If we already sent SIGQUIT to children and they are slow to shut
 * down, it's time to send them SIGKILL.  This doesn't happen normally,
 * but under certain conditions backends can get stuck while shutting
 * down.  This is a last-resort measure to get them unwedged.
 *
 * Note we also do this during recovery from a process crash.
 *
 * AbortStartTime stays zero until abort processing starts the stopwatch,
 * so the elapsed-time test is only meaningful once it is nonzero.
 */
if ((Shutdown >= ImmediateShutdown || (FatalError && !SendStop)) &&
	AbortStartTime > 0 &&
	now - AbortStartTime >= SIGKILL_CHILDREN_AFTER_SECS)
{
	/* We were gentle with them before. Not anymore */
	TerminateChildren(SIGKILL);

	/*
	 * Additionally, unless we're recovering from a process crash, it's
	 * now the time for postmaster to abandon ship.
	 */
	if (!FatalError)
		ExitPostmaster(1);

	/*
	 * Still running, i.e. the crash-recovery case: stop the stopwatch
	 * so that we don't send SIGKILL again on every subsequent pass
	 * through the loop.
	 */
	AbortStartTime = 0;
}
}
}
@ -2455,30 +2493,27 @@ pmdie(SIGNAL_ARGS)
/*
* Immediate Shutdown:
*
* abort all children with SIGQUIT and exit without attempt to
* properly shut down data base system.
* abort all children with SIGQUIT, wait for them to exit,
* terminate remaining ones with SIGKILL, then exit without
* attempt to properly shut down the data base system.
*/
if (Shutdown >= ImmediateShutdown)
break;
Shutdown = ImmediateShutdown;
ereport(LOG,
(errmsg("received immediate shutdown request")));
SignalChildren(SIGQUIT);
if (StartupPID != 0)
signal_child(StartupPID, SIGQUIT);
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGQUIT);
if (CheckpointerPID != 0)
signal_child(CheckpointerPID, SIGQUIT);
if (WalWriterPID != 0)
signal_child(WalWriterPID, SIGQUIT);
if (WalReceiverPID != 0)
signal_child(WalReceiverPID, SIGQUIT);
if (AutoVacPID != 0)
signal_child(AutoVacPID, SIGQUIT);
if (PgArchPID != 0)
signal_child(PgArchPID, SIGQUIT);
if (PgStatPID != 0)
signal_child(PgStatPID, SIGQUIT);
SignalUnconnectedWorkers(SIGQUIT);
ExitPostmaster(0);
TerminateChildren(SIGQUIT);
pmState = PM_WAIT_BACKENDS;
/* set stopwatch for them to die */
AbortStartTime = time(NULL);
/*
* Now wait for backends to exit. If there are none,
* PostmasterStateMachine will take the next step.
*/
PostmasterStateMachine();
break;
}
@ -2952,12 +2987,17 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
dlist_mutable_iter iter;
slist_iter siter;
Backend *bp;
bool take_action;
/*
* Make log entry unless there was a previous crash (if so, nonzero exit
* status is to be expected in SIGQUIT response; don't clutter log)
* We only log messages and send signals if this is the first process crash
* and we're not doing an immediate shutdown; otherwise, we're only here to
* update postmaster's idea of live processes. If we have already signalled
* children, nonzero exit status is to be expected, so don't clutter log.
*/
if (!FatalError)
take_action = !FatalError && Shutdown != ImmediateShutdown;
if (take_action)
{
LogChildExit(LOG, procname, pid, exitstatus);
ereport(LOG,
@ -3003,7 +3043,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
* (-s on command line), then we send SIGSTOP instead, so that we
* can get core dumps from all backends by hand.
*/
if (!FatalError)
if (take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
@ -3055,7 +3095,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
if (bp->bkend_type == BACKEND_TYPE_BGWORKER)
continue;
if (!FatalError)
if (take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
@ -3069,7 +3109,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
/* Take care of the startup process too */
if (pid == StartupPID)
StartupPID = 0;
else if (StartupPID != 0 && !FatalError)
else if (StartupPID != 0 && take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
@ -3081,7 +3121,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
/* Take care of the bgwriter too */
if (pid == BgWriterPID)
BgWriterPID = 0;
else if (BgWriterPID != 0 && !FatalError)
else if (BgWriterPID != 0 && take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
@ -3093,7 +3133,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
/* Take care of the checkpointer too */
if (pid == CheckpointerPID)
CheckpointerPID = 0;
else if (CheckpointerPID != 0 && !FatalError)
else if (CheckpointerPID != 0 && take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
@ -3105,7 +3145,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
/* Take care of the walwriter too */
if (pid == WalWriterPID)
WalWriterPID = 0;
else if (WalWriterPID != 0 && !FatalError)
else if (WalWriterPID != 0 && take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
@ -3117,7 +3157,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
/* Take care of the walreceiver too */
if (pid == WalReceiverPID)
WalReceiverPID = 0;
else if (WalReceiverPID != 0 && !FatalError)
else if (WalReceiverPID != 0 && take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
@ -3129,7 +3169,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
/* Take care of the autovacuum launcher too */
if (pid == AutoVacPID)
AutoVacPID = 0;
else if (AutoVacPID != 0 && !FatalError)
else if (AutoVacPID != 0 && take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
@ -3144,7 +3184,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
* simplifies the state-machine logic in the case where a shutdown request
* arrives during crash processing.)
*/
if (PgArchPID != 0 && !FatalError)
if (PgArchPID != 0 && take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
@ -3159,7 +3199,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
* simplifies the state-machine logic in the case where a shutdown request
* arrives during crash processing.)
*/
if (PgStatPID != 0 && !FatalError)
if (PgStatPID != 0 && take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
@ -3171,7 +3211,9 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
/* We do NOT restart the syslogger */
FatalError = true;
if (Shutdown != ImmediateShutdown)
FatalError = true;
/* We now transit into a state of waiting for children to die */
if (pmState == PM_RECOVERY ||
pmState == PM_HOT_STANDBY ||
@ -3180,6 +3222,13 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
pmState == PM_WAIT_READONLY ||
pmState == PM_SHUTDOWN)
pmState = PM_WAIT_BACKENDS;
/*
* .. and if this doesn't happen quickly enough, now the clock is ticking
* for us to kill them without mercy.
*/
if (AbortStartTime == 0)
AbortStartTime = time(NULL);
}
/*
@ -3316,7 +3365,7 @@ PostmasterStateMachine(void)
WalWriterPID == 0 &&
AutoVacPID == 0)
{
if (FatalError)
if (Shutdown >= ImmediateShutdown || FatalError)
{
/*
* Start waiting for dead_end children to die. This state
@ -3326,7 +3375,8 @@ PostmasterStateMachine(void)
/*
* We already SIGQUIT'd the archiver and stats processes, if
* any, when we entered FatalError state.
* any, when we started immediate shutdown or entered
* FatalError state.
*/
}
else
@ -3511,6 +3561,7 @@ signal_child(pid_t pid, int signal)
case SIGTERM:
case SIGQUIT:
case SIGSTOP:
case SIGKILL:
if (kill(-pid, signal) < 0)
elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) (-pid), signal);
break;
@ -3597,6 +3648,33 @@ SignalSomeChildren(int signal, int target)
return signaled;
}
/*
 * TerminateChildren -- deliver a termination signal to our children
 *
 * This considers all of our child processes, except syslogger and dead_end
 * backends.  Children covered by SignalChildren() are signalled first,
 * then every individually tracked child whose PID is nonzero, and
 * finally the unconnected workers.
 */
static void
TerminateChildren(int signal)
{
	/* individually tracked child PIDs, in the order they are signalled */
	static pid_t *const tracked_pids[] = {
		&StartupPID,
		&BgWriterPID,
		&CheckpointerPID,
		&WalWriterPID,
		&WalReceiverPID,
		&AutoVacPID,
		&PgArchPID,
		&PgStatPID
	};
	int			i;

	SignalChildren(signal);

	for (i = 0; i < (int) (sizeof(tracked_pids) / sizeof(tracked_pids[0])); i++)
	{
		/* skip entries whose PID is currently zero */
		if (*tracked_pids[i] != 0)
			signal_child(*tracked_pids[i], signal);
	}

	SignalUnconnectedWorkers(signal);
}
/*
* BackendStartup -- start backend process
*

View File

@ -38,6 +38,26 @@ pgkill(int pid, int sig)
errno = EINVAL;
return -1;
}
/* special case for SIGKILL: just ask the system to terminate the target */
if (sig == SIGKILL)
{
HANDLE prochandle;
if ((prochandle = OpenProcess(PROCESS_TERMINATE, FALSE, (DWORD) pid)) == NULL)
{
errno = ESRCH;
return -1;
}
if (!TerminateProcess(prochandle, 255))
{
_dosmaperr(GetLastError());
CloseHandle(prochandle);
return -1;
}
CloseHandle(prochandle);
return 0;
}
snprintf(pipename, sizeof(pipename), "\\\\.\\pipe\\pgsignal_%u", pid);
if (CallNamedPipe(pipename, &sigData, 1, &sigRet, 1, &bytes, 1000))