Fix postmaster's handling of fork failure for a bgworker process.

This corner case didn't behave nicely at all: the postmaster would
(partially) update its state as though the process had started
successfully, and be quite confused thereafter.  Fix it to act
like the worker had crashed, instead.

In passing, refactor so that do_start_bgworker contains all the
state-change logic for bgworker launch, rather than just some of it.

Back-patch as far as 9.4.  9.3 contains similar logic, but it's just
enough different that I don't feel comfortable applying the patch
without more study; and the use of bgworkers in 9.3 was so small
that it doesn't seem worth the extra work.

transam/parallel.c is still entirely unprepared for the possibility
of bgworker startup failure, but that seems like material for a
separate patch.

Discussion: https://postgr.es/m/4905.1492813727@sss.pgh.pa.us
This commit is contained in:
Tom Lane 2017-04-24 12:16:58 -04:00
parent 4b34624daa
commit 4fe04244b5
1 changed files with 77 additions and 33 deletions

View File

@ -420,6 +420,7 @@ static void TerminateChildren(int signal);
#define SignalChildren(sig) SignalSomeChildren(sig, BACKEND_TYPE_ALL) #define SignalChildren(sig) SignalSomeChildren(sig, BACKEND_TYPE_ALL)
static int CountChildren(int target); static int CountChildren(int target);
static bool assign_backendlist_entry(RegisteredBgWorker *rw);
static void maybe_start_bgworker(void); static void maybe_start_bgworker(void);
static bool CreateOptsFile(int argc, char *argv[], char *fullprogname); static bool CreateOptsFile(int argc, char *argv[], char *fullprogname);
static pid_t StartChildProcess(AuxProcType type); static pid_t StartChildProcess(AuxProcType type);
@ -5531,13 +5532,33 @@ bgworker_forkexec(int shmem_slot)
* Start a new bgworker. * Start a new bgworker.
* Starting time conditions must have been checked already. * Starting time conditions must have been checked already.
* *
* Returns true on success, false on failure.
* In either case, update the RegisteredBgWorker's state appropriately.
*
* This code is heavily based on autovacuum.c, q.v. * This code is heavily based on autovacuum.c, q.v.
*/ */
static void static bool
do_start_bgworker(RegisteredBgWorker *rw) do_start_bgworker(RegisteredBgWorker *rw)
{ {
pid_t worker_pid; pid_t worker_pid;
Assert(rw->rw_pid == 0);
/*
* Allocate and assign the Backend element. Note we must do this before
* forking, so that we can handle out of memory properly.
*
* Treat failure as though the worker had crashed. That way, the
* postmaster will wait a bit before attempting to start it again; if it
* tried again right away, most likely it'd find itself repeating the
* out-of-memory or fork failure condition.
*/
if (!assign_backendlist_entry(rw))
{
rw->rw_crashed_at = GetCurrentTimestamp();
return false;
}
ereport(DEBUG1, ereport(DEBUG1,
(errmsg("starting background worker process \"%s\"", (errmsg("starting background worker process \"%s\"",
rw->rw_worker.bgw_name))); rw->rw_worker.bgw_name)));
@ -5549,9 +5570,17 @@ do_start_bgworker(RegisteredBgWorker *rw)
#endif #endif
{ {
case -1: case -1:
/* in postmaster, fork failed ... */
ereport(LOG, ereport(LOG,
(errmsg("could not fork worker process: %m"))); (errmsg("could not fork worker process: %m")));
return; /* undo what assign_backendlist_entry did */
ReleasePostmasterChildSlot(rw->rw_child_slot);
rw->rw_child_slot = 0;
free(rw->rw_backend);
rw->rw_backend = NULL;
/* mark entry as crashed, so we'll try again later */
rw->rw_crashed_at = GetCurrentTimestamp();
break;
#ifndef EXEC_BACKEND #ifndef EXEC_BACKEND
case 0: case 0:
@ -5575,14 +5604,24 @@ do_start_bgworker(RegisteredBgWorker *rw)
PostmasterContext = NULL; PostmasterContext = NULL;
StartBackgroundWorker(); StartBackgroundWorker();
exit(1); /* should not get here */
break; break;
#endif #endif
default: default:
/* in postmaster, fork successful ... */
rw->rw_pid = worker_pid; rw->rw_pid = worker_pid;
rw->rw_backend->pid = rw->rw_pid; rw->rw_backend->pid = rw->rw_pid;
ReportBackgroundWorkerPID(rw); ReportBackgroundWorkerPID(rw);
break; /* add new worker to lists of backends */
dlist_push_head(&BackendList, &rw->rw_backend->elem);
#ifdef EXEC_BACKEND
ShmemBackendArrayAdd(rw->rw_backend);
#endif
return true;
} }
return false;
} }
/* /*
@ -5629,6 +5668,8 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
* Allocate the Backend struct for a connected background worker, but don't * Allocate the Backend struct for a connected background worker, but don't
* add it to the list of backends just yet. * add it to the list of backends just yet.
* *
* On failure, return false without changing any worker state.
*
* Some info from the Backend is copied into the passed rw. * Some info from the Backend is copied into the passed rw.
*/ */
static bool static bool
@ -5647,8 +5688,6 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
ereport(LOG, ereport(LOG,
(errcode(ERRCODE_INTERNAL_ERROR), (errcode(ERRCODE_INTERNAL_ERROR),
errmsg("could not generate random cancel key"))); errmsg("could not generate random cancel key")));
rw->rw_crashed_at = GetCurrentTimestamp();
return false; return false;
} }
@ -5658,14 +5697,6 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
ereport(LOG, ereport(LOG,
(errcode(ERRCODE_OUT_OF_MEMORY), (errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory"))); errmsg("out of memory")));
/*
* The worker didn't really crash, but setting this nonzero makes
* postmaster wait a bit before attempting to start it again; if it
* tried again right away, most likely it'd find itself under the same
* memory pressure.
*/
rw->rw_crashed_at = GetCurrentTimestamp();
return false; return false;
} }
@ -5687,6 +5718,11 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
* As a side effect, the bgworker control variables are set or reset whenever * As a side effect, the bgworker control variables are set or reset whenever
* there are more workers to start after this one, and whenever the overall * there are more workers to start after this one, and whenever the overall
* system state requires it. * system state requires it.
*
* The reason we start at most one worker per call is to avoid consuming the
* postmaster's attention for too long when many such requests are pending.
* As long as StartWorkerNeeded is true, ServerLoop will not block and will
* call this function again after dealing with any other issues.
*/ */
static void static void
maybe_start_bgworker(void) maybe_start_bgworker(void)
@ -5694,13 +5730,19 @@ maybe_start_bgworker(void)
slist_mutable_iter iter; slist_mutable_iter iter;
TimestampTz now = 0; TimestampTz now = 0;
/*
* During crash recovery, we have no need to be called until the state
* transition out of recovery.
*/
if (FatalError) if (FatalError)
{ {
StartWorkerNeeded = false; StartWorkerNeeded = false;
HaveCrashedWorker = false; HaveCrashedWorker = false;
return; /* not yet */ return;
} }
/* Don't need to be called again unless we find a reason for it below */
StartWorkerNeeded = false;
HaveCrashedWorker = false; HaveCrashedWorker = false;
slist_foreach_modify(iter, &BackgroundWorkerList) slist_foreach_modify(iter, &BackgroundWorkerList)
@ -5709,11 +5751,11 @@ maybe_start_bgworker(void)
rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur); rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur);
/* already running? */ /* ignore if already running */
if (rw->rw_pid != 0) if (rw->rw_pid != 0)
continue; continue;
/* marked for death? */ /* if marked for death, clean up and remove from list */
if (rw->rw_terminate) if (rw->rw_terminate)
{ {
ForgetBackgroundWorker(&iter); ForgetBackgroundWorker(&iter);
@ -5735,12 +5777,14 @@ maybe_start_bgworker(void)
continue; continue;
} }
/* read system time only when needed */
if (now == 0) if (now == 0)
now = GetCurrentTimestamp(); now = GetCurrentTimestamp();
if (!TimestampDifferenceExceeds(rw->rw_crashed_at, now, if (!TimestampDifferenceExceeds(rw->rw_crashed_at, now,
rw->rw_worker.bgw_restart_time * 1000)) rw->rw_worker.bgw_restart_time * 1000))
{ {
/* Set flag to remember that we have workers to start later */
HaveCrashedWorker = true; HaveCrashedWorker = true;
continue; continue;
} }
@ -5748,35 +5792,35 @@ maybe_start_bgworker(void)
if (bgworker_should_start_now(rw->rw_worker.bgw_start_time)) if (bgworker_should_start_now(rw->rw_worker.bgw_start_time))
{ {
/* reset crash time before calling assign_backendlist_entry */ /* reset crash time before trying to start worker */
rw->rw_crashed_at = 0; rw->rw_crashed_at = 0;
/* /*
* Allocate and assign the Backend element. Note we must do this * Try to start the worker.
* before forking, so that we can handle out of memory properly. *
* On failure, give up processing workers for now, but set
* StartWorkerNeeded so we'll come back here on the next iteration
* of ServerLoop to try again. (We don't want to wait, because
* there might be additional ready-to-run workers.) We could set
* HaveCrashedWorker as well, since this worker is now marked
* crashed, but there's no need because the next run of this
* function will do that.
*/ */
if (!assign_backendlist_entry(rw)) if (!do_start_bgworker(rw))
{
StartWorkerNeeded = true;
return; return;
}
do_start_bgworker(rw); /* sets rw->rw_pid */
dlist_push_head(&BackendList, &rw->rw_backend->elem);
#ifdef EXEC_BACKEND
ShmemBackendArrayAdd(rw->rw_backend);
#endif
/* /*
* Have ServerLoop call us again. Note that there might not * Quit, but have ServerLoop call us again to look for additional
* actually *be* another runnable worker, but we don't care all * ready-to-run workers. There might not be any, but we'll find
* that much; we will find out the next time we run. * out the next time we run.
*/ */
StartWorkerNeeded = true; StartWorkerNeeded = true;
return; return;
} }
} }
/* no runnable worker found */
StartWorkerNeeded = false;
} }
/* /*