mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-10-01 07:01:19 +02:00
In immediate shutdown, postmaster should not exit till children are gone.
This adjusts commit 82233ce7ea so that the
postmaster does not exit until all its child processes have exited, even
if the 5-second timeout elapses and we have to send SIGKILL. There is no
great value in having the postmaster process quit sooner, and doing so can
mislead onlookers into thinking that the cluster is fully terminated when
actually some child processes still survive.
This effect might explain recent test failures on buildfarm member hamster,
wherein we failed to restart a cluster just after shutting it down with
"pg_ctl stop -m immediate".
I also did a bit of code review/beautification, including fixing a faulty
use of the Max() macro on a volatile expression.
Back-patch to 9.4. In older branches, the postmaster never waited for
children to exit during immediate shutdowns, and changing that would be
too much of a behavioral change.
This commit is contained in:
parent
da1a9d0f5b
commit
48913db887
@ -1441,10 +1441,11 @@ $ <userinput>sysctl -w vm.nr_hugepages=3170</userinput>
|
|||||||
<para>
|
<para>
|
||||||
This is the <firstterm>Immediate Shutdown</firstterm> mode.
|
This is the <firstterm>Immediate Shutdown</firstterm> mode.
|
||||||
The server will send <systemitem>SIGQUIT</systemitem> to all child
|
The server will send <systemitem>SIGQUIT</systemitem> to all child
|
||||||
processes and wait for them to terminate. Those that don't terminate
|
processes and wait for them to terminate. If any do not terminate
|
||||||
within 5 seconds, will be sent <systemitem>SIGKILL</systemitem> by the
|
within 5 seconds, they will be sent <systemitem>SIGKILL</systemitem>.
|
||||||
master <command>postgres</command> process, which will then terminate
|
The master server process exits as soon as all child processes have
|
||||||
without further waiting. This will lead to recovery (by
|
exited, without doing normal database shutdown processing.
|
||||||
|
This will lead to recovery (by
|
||||||
replaying the WAL log) upon next start-up. This is recommended
|
replaying the WAL log) upon next start-up. This is recommended
|
||||||
only in emergencies.
|
only in emergencies.
|
||||||
</para>
|
</para>
|
||||||
|
@ -324,8 +324,10 @@ typedef enum
|
|||||||
|
|
||||||
static PMState pmState = PM_INIT;
|
static PMState pmState = PM_INIT;
|
||||||
|
|
||||||
/* Start time of abort processing at immediate shutdown or child crash */
|
/* Start time of SIGKILL timeout during immediate shutdown or child crash */
|
||||||
static time_t AbortStartTime;
|
/* Zero means timeout is not running */
|
||||||
|
static time_t AbortStartTime = 0;
|
||||||
|
/* Length of said timeout */
|
||||||
#define SIGKILL_CHILDREN_AFTER_SECS 5
|
#define SIGKILL_CHILDREN_AFTER_SECS 5
|
||||||
|
|
||||||
static bool ReachedNormalRunning = false; /* T if we've reached PM_RUN */
|
static bool ReachedNormalRunning = false; /* T if we've reached PM_RUN */
|
||||||
@ -1419,7 +1421,8 @@ checkDataDir(void)
|
|||||||
* In normal conditions we wait at most one minute, to ensure that the other
|
* In normal conditions we wait at most one minute, to ensure that the other
|
||||||
* background tasks handled by ServerLoop get done even when no requests are
|
* background tasks handled by ServerLoop get done even when no requests are
|
||||||
* arriving. However, if there are background workers waiting to be started,
|
* arriving. However, if there are background workers waiting to be started,
|
||||||
* we don't actually sleep so that they are quickly serviced.
|
* we don't actually sleep so that they are quickly serviced. Other exception
|
||||||
|
* cases are as shown in the code.
|
||||||
*/
|
*/
|
||||||
static void
|
static void
|
||||||
DetermineSleepTime(struct timeval * timeout)
|
DetermineSleepTime(struct timeval * timeout)
|
||||||
@ -1433,11 +1436,12 @@ DetermineSleepTime(struct timeval * timeout)
|
|||||||
if (Shutdown > NoShutdown ||
|
if (Shutdown > NoShutdown ||
|
||||||
(!StartWorkerNeeded && !HaveCrashedWorker))
|
(!StartWorkerNeeded && !HaveCrashedWorker))
|
||||||
{
|
{
|
||||||
if (AbortStartTime > 0)
|
if (AbortStartTime != 0)
|
||||||
{
|
{
|
||||||
/* time left to abort; clamp to 0 in case it already expired */
|
/* time left to abort; clamp to 0 in case it already expired */
|
||||||
timeout->tv_sec = Max(SIGKILL_CHILDREN_AFTER_SECS -
|
timeout->tv_sec = SIGKILL_CHILDREN_AFTER_SECS -
|
||||||
(time(NULL) - AbortStartTime), 0);
|
(time(NULL) - AbortStartTime);
|
||||||
|
timeout->tv_sec = Max(timeout->tv_sec, 0);
|
||||||
timeout->tv_usec = 0;
|
timeout->tv_usec = 0;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -1707,20 +1711,13 @@ ServerLoop(void)
|
|||||||
* Note we also do this during recovery from a process crash.
|
* Note we also do this during recovery from a process crash.
|
||||||
*/
|
*/
|
||||||
if ((Shutdown >= ImmediateShutdown || (FatalError && !SendStop)) &&
|
if ((Shutdown >= ImmediateShutdown || (FatalError && !SendStop)) &&
|
||||||
AbortStartTime > 0 &&
|
AbortStartTime != 0 &&
|
||||||
now - AbortStartTime >= SIGKILL_CHILDREN_AFTER_SECS)
|
(now - AbortStartTime) >= SIGKILL_CHILDREN_AFTER_SECS)
|
||||||
{
|
{
|
||||||
/* We were gentle with them before. Not anymore */
|
/* We were gentle with them before. Not anymore */
|
||||||
TerminateChildren(SIGKILL);
|
TerminateChildren(SIGKILL);
|
||||||
/* reset flag so we don't SIGKILL again */
|
/* reset flag so we don't SIGKILL again */
|
||||||
AbortStartTime = 0;
|
AbortStartTime = 0;
|
||||||
|
|
||||||
/*
|
|
||||||
* Additionally, unless we're recovering from a process crash,
|
|
||||||
* it's now the time for postmaster to abandon ship.
|
|
||||||
*/
|
|
||||||
if (!FatalError)
|
|
||||||
ExitPostmaster(1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user