Provide a reliable mechanism for terminating a background worker.

Although previously-introduced APIs allow the process that registers a
background worker to obtain the worker's PID, there's no way to prevent
a worker that is not currently running from being restarted.  This
patch introduces a new API TerminateBackgroundWorker() that prevents
the background worker from being restarted, terminates it if it is
currently running, and causes it to be unregistered if or when it is
not running.

Patch by me.  Review by Michael Paquier and KaiGai Kohei.
This commit is contained in:
Robert Haas 2013-10-18 10:21:25 -04:00
parent c2316dcda1
commit 523beaa11b
5 changed files with 96 additions and 19 deletions

View File

@ -184,13 +184,18 @@ typedef struct BackgroundWorker
argument to <function>RegisterDynamicBackgroundWorker</function>. If the argument to <function>RegisterDynamicBackgroundWorker</function>. If the
worker is successfully registered, this pointer will be initialized with an worker is successfully registered, this pointer will be initialized with an
opaque handle that can subsequently be passed to opaque handle that can subsequently be passed to
<function>GetBackgroundWorkerPid(<parameter>BackgroundWorkerHandle *</parameter>, <parameter>pid_t *</parameter>)</function>. <function>GetBackgroundWorkerPid(<parameter>BackgroundWorkerHandle *</parameter>, <parameter>pid_t *</parameter>)</function> or
This function can be used to poll the status of the worker: a return <function>TerminateBackgroundWorker(<parameter>BackgroundWorkerHandle *</parameter>)</function>.
value of <literal>BGWH_NOT_YET_STARTED</> indicates that the worker has not <function>GetBackgroundWorker</> can be used to poll the status of the
yet been started by the postmaster; <literal>BGWH_STOPPED</literal> worker: a return value of <literal>BGWH_NOT_YET_STARTED</> indicates that
indicates that it has been started but is no longer running; and the worker has not yet been started by the postmaster;
<literal>BGWH_STARTED</literal> indicates that it is currently running. <literal>BGWH_STOPPED</literal> indicates that it has been started but is
In this last case, the PID will also be returned via the second argument. no longer running; and <literal>BGWH_STARTED</literal> indicates that it is
currently running. In this last case, the PID will also be returned via the
second argument.
<function>TerminateBackgroundWorker</> causes the postmaster to send
<literal>SIGTERM</> to the worker if it is running, and to unregister it
as soon as it is not.
</para> </para>
<para> <para>

View File

@ -55,6 +55,11 @@ slist_head BackgroundWorkerList = SLIST_STATIC_INIT(BackgroundWorkerList);
* must fully initialize the slot - and insert a write memory barrier - before * must fully initialize the slot - and insert a write memory barrier - before
* marking it as in use. * marking it as in use.
* *
* As an exception, however, even when the slot is in use, regular backends
* may set the 'terminate' flag for a slot, telling the postmaster not
* to restart it. Once the background worker is no longer running, the slot
* will be released for reuse.
*
* In addition to coordinating with the postmaster, backends modifying this * In addition to coordinating with the postmaster, backends modifying this
* data structure must coordinate with each other. Since they can take locks, * data structure must coordinate with each other. Since they can take locks,
* this is straightforward: any backend wishing to manipulate a slot must * this is straightforward: any backend wishing to manipulate a slot must
@ -67,6 +72,7 @@ slist_head BackgroundWorkerList = SLIST_STATIC_INIT(BackgroundWorkerList);
typedef struct BackgroundWorkerSlot typedef struct BackgroundWorkerSlot
{ {
bool in_use; bool in_use;
bool terminate;
pid_t pid; /* InvalidPid = not started yet; 0 = dead */ pid_t pid; /* InvalidPid = not started yet; 0 = dead */
uint64 generation; /* incremented when slot is recycled */ uint64 generation; /* incremented when slot is recycled */
BackgroundWorker worker; BackgroundWorker worker;
@ -134,6 +140,7 @@ BackgroundWorkerShmemInit(void)
rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur);
Assert(slotno < max_worker_processes); Assert(slotno < max_worker_processes);
slot->in_use = true; slot->in_use = true;
slot->terminate = false;
slot->pid = InvalidPid; slot->pid = InvalidPid;
slot->generation = 0; slot->generation = 0;
rw->rw_shmem_slot = slotno; rw->rw_shmem_slot = slotno;
@ -223,14 +230,29 @@ BackgroundWorkerStateChange(void)
*/ */
pg_read_barrier(); pg_read_barrier();
/* /* See whether we already know about this worker. */
* See whether we already know about this worker. If not, we need
* to update our backend-private BackgroundWorkerList to match shared
* memory.
*/
rw = FindRegisteredWorkerBySlotNumber(slotno); rw = FindRegisteredWorkerBySlotNumber(slotno);
if (rw != NULL) if (rw != NULL)
{
/*
* In general, the worker data can't change after it's initially
* registered. However, someone can set the terminate flag.
*/
if (slot->terminate && !rw->rw_terminate)
{
rw->rw_terminate = true;
if (rw->rw_pid != 0)
kill(rw->rw_pid, SIGTERM);
}
continue; continue;
}
/* If it's already flagged as do not restart, just release the slot. */
if (slot->terminate)
{
slot->in_use = false;
continue;
}
/* /*
* Copy the registration data into the registered workers list. * Copy the registration data into the registered workers list.
@ -292,6 +314,7 @@ BackgroundWorkerStateChange(void)
rw->rw_child_slot = 0; rw->rw_child_slot = 0;
rw->rw_crashed_at = 0; rw->rw_crashed_at = 0;
rw->rw_shmem_slot = slotno; rw->rw_shmem_slot = slotno;
rw->rw_terminate = false;
/* Log it! */ /* Log it! */
ereport(LOG, ereport(LOG,
@ -714,6 +737,7 @@ RegisterBackgroundWorker(BackgroundWorker *worker)
rw->rw_pid = 0; rw->rw_pid = 0;
rw->rw_child_slot = 0; rw->rw_child_slot = 0;
rw->rw_crashed_at = 0; rw->rw_crashed_at = 0;
rw->rw_terminate = false;
slist_push_head(&BackgroundWorkerList, &rw->rw_lnode); slist_push_head(&BackgroundWorkerList, &rw->rw_lnode);
} }
@ -764,6 +788,7 @@ RegisterDynamicBackgroundWorker(BackgroundWorker *worker,
memcpy(&slot->worker, worker, sizeof(BackgroundWorker)); memcpy(&slot->worker, worker, sizeof(BackgroundWorker));
slot->pid = InvalidPid; /* indicates not started yet */ slot->pid = InvalidPid; /* indicates not started yet */
slot->generation++; slot->generation++;
slot->terminate = false;
generation = slot->generation; generation = slot->generation;
/* /*
@ -905,3 +930,33 @@ WaitForBackgroundWorkerStartup(BackgroundWorkerHandle *handle, pid_t *pidp)
set_latch_on_sigusr1 = save_set_latch_on_sigusr1; set_latch_on_sigusr1 = save_set_latch_on_sigusr1;
return status; return status;
} }
/*
* Instruct the postmaster to terminate a background worker.
*
* Note that it's safe to do this without regard to whether the worker is
* still running, or even if the worker may already have existed and been
* unregistered.
*/
void
TerminateBackgroundWorker(BackgroundWorkerHandle *handle)
{
BackgroundWorkerSlot *slot;
bool signal_postmaster = false;
Assert(handle->slot < max_worker_processes);
slot = &BackgroundWorkerData->slot[handle->slot];
/* Set terminate flag in shared memory, unless slot has been reused. */
LWLockAcquire(BackgroundWorkerLock, LW_EXCLUSIVE);
if (handle->generation == slot->generation)
{
slot->terminate = true;
signal_postmaster = true;
}
LWLockRelease(BackgroundWorkerLock);
/* Make sure the postmaster notices the change to shared memory. */
if (signal_postmaster)
SendPostmasterSignal(PMSIGNAL_BACKGROUND_WORKER_CHANGE);
}

View File

@ -1463,7 +1463,8 @@ DetermineSleepTime(struct timeval * timeout)
if (rw->rw_crashed_at == 0) if (rw->rw_crashed_at == 0)
continue; continue;
if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART) if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART
|| rw->rw_terminate)
{ {
ForgetBackgroundWorker(&siter); ForgetBackgroundWorker(&siter);
continue; continue;
@ -5471,6 +5472,13 @@ maybe_start_bgworker(void)
if (rw->rw_pid != 0) if (rw->rw_pid != 0)
continue; continue;
/* marked for death? */
if (rw->rw_terminate)
{
ForgetBackgroundWorker(&iter);
continue;
}
/* /*
* If this worker has crashed previously, maybe it needs to be * If this worker has crashed previously, maybe it needs to be
* restarted (unless on registration it specified it doesn't want to * restarted (unless on registration it specified it doesn't want to

View File

@ -9,17 +9,22 @@
* worker. Workers can also be registered dynamically at runtime. In either * worker. Workers can also be registered dynamically at runtime. In either
* case, the worker process is forked from the postmaster and runs the * case, the worker process is forked from the postmaster and runs the
* user-supplied "main" function. This code may connect to a database and * user-supplied "main" function. This code may connect to a database and
* run transactions. Once started, it stays active until shutdown or crash; * run transactions. Workers can remain active indefinitely, but will be
* unless the restart interval is declared as BGW_NEVER_RESTART and the * terminated if a shutdown or crash occurs.
* process exits with a return code of 1; workers that do this are
* automatically unregistered by the postmaster.
* *
* If the fork() call fails in the postmaster, it will try again later. Note * If the fork() call fails in the postmaster, it will try again later. Note
* that the failure can only be transient (fork failure due to high load, * that the failure can only be transient (fork failure due to high load,
* memory pressure, too many processes, etc); more permanent problems, like * memory pressure, too many processes, etc); more permanent problems, like
* failure to connect to a database, are detected later in the worker and dealt * failure to connect to a database, are detected later in the worker and dealt
* with just by having the worker exit normally. Postmaster will launch a new * with just by having the worker exit normally. A worker which exits with a
* worker again later. * return code of 0 will be immediately restarted by the postmaster. A worker
* which exits with a return code of 1 will be restarted after the configured
* restart interval, or never if that interval is set to BGW_NEVER_RESTART.
* The TerminateBackgroundWorker() function can be used to terminate a
* dynamically registered background worker; the worker will be sent a SIGTERM
* and will not be restarted after it exits. Whenever the postmaster knows
* that a worker will not be restarted, it unregisters the worker, freeing up
* that worker's slot for use by a new worker.
* *
* Note that there might be more than one worker in a database concurrently, * Note that there might be more than one worker in a database concurrently,
* and the same module may request more than one worker running the same (or * and the same module may request more than one worker running the same (or
@ -107,6 +112,9 @@ extern BgwHandleStatus GetBackgroundWorkerPid(BackgroundWorkerHandle *handle,
extern BgwHandleStatus WaitForBackgroundWorkerStartup(BackgroundWorkerHandle * extern BgwHandleStatus WaitForBackgroundWorkerStartup(BackgroundWorkerHandle *
handle, pid_t *pid); handle, pid_t *pid);
/* Terminate a bgworker */
extern void TerminateBackgroundWorker(BackgroundWorkerHandle *handle);
/* This is valid in a running worker */ /* This is valid in a running worker */
extern BackgroundWorker *MyBgworkerEntry; extern BackgroundWorker *MyBgworkerEntry;

View File

@ -31,6 +31,7 @@ typedef struct RegisteredBgWorker
int rw_child_slot; int rw_child_slot;
TimestampTz rw_crashed_at; /* if not 0, time it last crashed */ TimestampTz rw_crashed_at; /* if not 0, time it last crashed */
int rw_shmem_slot; int rw_shmem_slot;
bool rw_terminate;
slist_node rw_lnode; /* list link */ slist_node rw_lnode; /* list link */
} RegisteredBgWorker; } RegisteredBgWorker;