From 8ff518699f19dd0a5076f5090bac8400b8233f7f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 11 Apr 2017 12:31:00 -0400 Subject: [PATCH] Fix confusion of max_parallel_workers mechanism following crash. Commit b460f5d6693103076dc554aa7cbb96e1e53074f9 failed to contemplate the possibilit that a parallel worker registered before a crash would be unregistered only after the crash; if that happened, we'd end up with parallel_terminate_count > parallel_register_count and the system would refuse to launch any more parallel workers. The easiest way to fix that seems to be to forget BGW_NEVER_RESTART workers in ResetBackgroundWorkerCrashTimes() rather than leaving them around to be cleaned up after the conclusion of the restart, so that they go away before rather than after shared memory is reset. To make sure that this fix is water-tight, don't allow parallel workers to be anything other than BGW_NEVER_RESTART, so that after recovering from a crash, 0 is guaranteed to be the correct starting value for parallel_register_count. The core code wouldn't do this anyway, but somebody might try to do it in extension code. Report by Thomas Vondra. Patch by me, reviewed by Kuntal Ghosh. Discussion: http://postgr.es/m/CAGz5QC+AVEVS+3rBKRq83AxkJLMZ1peMt4nnrQwczxOrmo3CNw@mail.gmail.com --- src/backend/postmaster/bgworker.c | 48 +++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 0823317b78..6b2385c471 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -515,13 +515,34 @@ ResetBackgroundWorkerCrashTimes(void) rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur); - /* - * For workers that should not be restarted, we don't want to lose the - * information that they have crashed; otherwise, they would be - * restarted, which is wrong. - */ - if (rw->rw_worker.bgw_restart_time != BGW_NEVER_RESTART) + if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART) + { + /* + * Workers marked BGW_NVER_RESTART shouldn't get relaunched after + * the crash, so forget about them. (If we wait until after the + * crash to forget about them, and they are parallel workers, + * parallel_terminate_count will get incremented after we've + * already zeroed parallel_register_count, which would be bad.) + */ + ForgetBackgroundWorker(&iter); + } + else + { + /* + * The accounting which we do via parallel_register_count and + * parallel_terminate_count would get messed up if a worker marked + * parallel could survive a crash and restart cycle. All such + * workers should be marked BGW_NEVER_RESTART, and thus control + * should never reach this branch. + */ + Assert((rw->rw_worker.bgw_flags & BGWORKER_CLASS_PARALLEL) == 0); + + /* + * Allow this worker to be restarted immediately after we finish + * resetting. + */ rw->rw_crashed_at = 0; + } } } @@ -589,6 +610,21 @@ SanityCheckBackgroundWorker(BackgroundWorker *worker, int elevel) return false; } + /* + * Parallel workers may not be configured for restart, because the + * parallel_register_count/parallel_terminate_count accounting can't + * handle parallel workers lasting through a crash-and-restart cycle. + */ + if (worker->bgw_restart_time != BGW_NEVER_RESTART && + (worker->bgw_flags & BGWORKER_CLASS_PARALLEL) != 0) + { + ereport(elevel, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("background worker \"%s\": parallel workers may not be configured for restart", + worker->bgw_name))); + return false; + } + return true; }