Improve client error messages for immediate-stop situations.

Up to now, if the DBA issued "pg_ctl stop -m immediate", the message
sent to clients was the same as for a crash-and-restart situation.
This is confusing, not least because the message claims that the
database will soon be up again, something we have no business
predicting.

Improve things so that we can generate distinct messages for the two
cases (and also recognize an ad-hoc SIGQUIT, should somebody try that).
To do that, add a field to pmsignal.c's shared memory data structure
that the postmaster sets just before broadcasting SIGQUIT to its
children.  No interlocking seems to be necessary; the intervening
signal-sending and signal-receipt should sufficiently serialize accesses
to the field.  Hence, this isn't any riskier than the existing usages
of pmsignal.c.

We might in future extend this idea to improve other
postmaster-to-children signal scenarios, although none of them
currently seem to be as badly overloaded as SIGQUIT.

Discussion: https://postgr.es/m/559291.1608587013@sss.pgh.pa.us
This commit is contained in:
Tom Lane 2020-12-24 12:58:32 -05:00
parent 90fbf7c57d
commit 7e784d1dc1
4 changed files with 86 additions and 16 deletions

View File

@ -218,6 +218,7 @@ int ReservedBackends;
/* The socket(s) we're listening to. */
#define MAXLISTEN 64
static pgsocket ListenSocket[MAXLISTEN];
/*
* These globals control the behavior of the postmaster in case some
* backend dumps core. Normally, it kills all peers of the dead backend
@ -2887,6 +2888,8 @@ pmdie(SIGNAL_ARGS)
sd_notify(0, "STOPPING=1");
#endif
/* tell children to shut down ASAP */
SetQuitSignalReason(PMQUIT_FOR_STOP);
TerminateChildren(SIGQUIT);
pmState = PM_WAIT_BACKENDS;
@ -3464,6 +3467,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
LogChildExit(LOG, procname, pid, exitstatus);
ereport(LOG,
(errmsg("terminating any other active server processes")));
SetQuitSignalReason(PMQUIT_FOR_CRASH);
}
/* Process background workers. */

View File

@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
*
* pmsignal.c
* routines for signaling the postmaster from its child processes
* routines for signaling between the postmaster and its child processes
*
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
@ -55,6 +55,10 @@
* but carries the extra information that the child is a WAL sender.
* WAL senders too start in ACTIVE state, but switch to WALSENDER once they
* start streaming the WAL (and they never go back to ACTIVE after that).
*
* We also have a shared-memory field that is used for communication in
* the opposite direction, from postmaster to children: it tells why the
* postmaster has broadcasted SIGQUIT signals, if indeed it has done so.
*/
#define PM_CHILD_UNUSED 0 /* these values must fit in sig_atomic_t */
@ -65,8 +69,10 @@
/* "typedef struct PMSignalData PMSignalData" appears in pmsignal.h */
struct PMSignalData
{
/* per-reason flags */
/* per-reason flags for signaling the postmaster */
sig_atomic_t PMSignalFlags[NUM_PMSIGNALS];
/* global flags for signals from postmaster to children */
QuitSignalReason sigquit_reason; /* why SIGQUIT was sent */
/* per-child-process flags */
int num_child_flags; /* # of entries in PMChildFlags[] */
int next_child_flag; /* next slot to try to assign */
@ -134,6 +140,7 @@ PMSignalShmemInit(void)
if (!found)
{
/* initialize all flags to zeroes */
MemSet(unvolatize(PMSignalData *, PMSignalState), 0, PMSignalShmemSize());
PMSignalState->num_child_flags = MaxLivePostmasterChildren();
}
@ -171,6 +178,34 @@ CheckPostmasterSignal(PMSignalReason reason)
return false;
}
/*
* SetQuitSignalReason - broadcast the reason for a system shutdown.
* Should be called by postmaster before sending SIGQUIT to children.
*
* Note: in a crash-and-restart scenario, the "reason" field gets cleared
* as a part of rebuilding shared memory; the postmaster need not do it
* explicitly.
*/
void
SetQuitSignalReason(QuitSignalReason reason)
{
PMSignalState->sigquit_reason = reason;
}
/*
* GetQuitSignalReason - obtain the reason for a system shutdown.
* Called by child processes when they receive SIGQUIT.
* If the postmaster hasn't actually sent SIGQUIT, will return PMQUIT_NOT_SENT.
*/
QuitSignalReason
GetQuitSignalReason(void)
{
/* This is called in signal handlers, so be extra paranoid. */
if (!IsUnderPostmaster || PMSignalState == NULL)
return PMQUIT_NOT_SENT;
return PMSignalState->sigquit_reason;
}
/*
* AssignPostmasterChildSlot - select an unused slot for a new postmaster

View File

@ -67,6 +67,7 @@
#include "rewrite/rewriteHandler.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procsignal.h"
#include "storage/sinval.h"
@ -2752,8 +2753,8 @@ drop_unnamed_stmt(void)
/*
* quickdie() occurs when signaled SIGQUIT by the postmaster.
*
* Some backend has bought the farm,
* so we need to stop what we're doing and exit.
* Either some backend has bought the farm, or we've been told to shut down
* "immediately"; so we need to stop what we're doing and exit.
*/
void
quickdie(SIGNAL_ARGS)
@ -2788,18 +2789,36 @@ quickdie(SIGNAL_ARGS)
* wrong, so there's not much to lose. Assuming the postmaster is still
* running, it will SIGKILL us soon if we get stuck for some reason.
*
* Ideally this should be ereport(FATAL), but then we'd not get control
* back...
* Ideally these should be ereport(FATAL), but then we'd not get control
* back to force the correct type of process exit.
*/
ereport(WARNING,
(errcode(ERRCODE_CRASH_SHUTDOWN),
errmsg("terminating connection because of crash of another server process"),
errdetail("The postmaster has commanded this server process to roll back"
" the current transaction and exit, because another"
" server process exited abnormally and possibly corrupted"
" shared memory."),
errhint("In a moment you should be able to reconnect to the"
" database and repeat your command.")));
switch (GetQuitSignalReason())
{
case PMQUIT_NOT_SENT:
/* Hmm, SIGQUIT arrived out of the blue */
ereport(WARNING,
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection because of unexpected SIGQUIT signal")));
break;
case PMQUIT_FOR_CRASH:
/* A crash-and-restart cycle is in progress */
ereport(WARNING,
(errcode(ERRCODE_CRASH_SHUTDOWN),
errmsg("terminating connection because of crash of another server process"),
errdetail("The postmaster has commanded this server process to roll back"
" the current transaction and exit, because another"
" server process exited abnormally and possibly corrupted"
" shared memory."),
errhint("In a moment you should be able to reconnect to the"
" database and repeat your command.")));
break;
case PMQUIT_FOR_STOP:
/* Immediate-mode stop */
ereport(WARNING,
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating connection due to immediate shutdown command")));
break;
}
/*
* We DO NOT want to run proc_exit() or atexit() callbacks -- we're here

View File

@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
*
* pmsignal.h
* routines for signaling the postmaster from its child processes
* routines for signaling between the postmaster and its child processes
*
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
@ -45,6 +45,16 @@ typedef enum
NUM_PMSIGNALS /* Must be last value of enum! */
} PMSignalReason;
/*
* Reasons why the postmaster would send SIGQUIT to its children.
*/
typedef enum
{
PMQUIT_NOT_SENT = 0, /* postmaster hasn't sent SIGQUIT */
PMQUIT_FOR_CRASH, /* some other backend bought the farm */
PMQUIT_FOR_STOP /* immediate stop was commanded */
} QuitSignalReason;
/* PMSignalData is an opaque struct, details known only within pmsignal.c */
typedef struct PMSignalData PMSignalData;
@ -55,6 +65,8 @@ extern Size PMSignalShmemSize(void);
extern void PMSignalShmemInit(void);
extern void SendPostmasterSignal(PMSignalReason reason);
extern bool CheckPostmasterSignal(PMSignalReason reason);
extern void SetQuitSignalReason(QuitSignalReason reason);
extern QuitSignalReason GetQuitSignalReason(void);
extern int AssignPostmasterChildSlot(void);
extern bool ReleasePostmasterChildSlot(int slot);
extern bool IsPostmasterChildWalSender(int slot);