diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 6cb32fcb60..e181950c0f 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -19,7 +19,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/pgarch.c,v 1.37 2008/01/01 19:45:51 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/pgarch.c,v 1.38 2008/01/11 00:54:08 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -77,12 +77,15 @@ * ---------- */ static time_t last_pgarch_start_time; +static time_t last_sigterm_time = 0; /* * Flags set by interrupt handlers for later service in the main loop. */ static volatile sig_atomic_t got_SIGHUP = false; +static volatile sig_atomic_t got_SIGTERM = false; static volatile sig_atomic_t wakened = false; +static volatile sig_atomic_t ready_to_stop = false; /* ---------- * Local function forward declarations @@ -95,7 +98,9 @@ static pid_t pgarch_forkexec(void); NON_EXEC_STATIC void PgArchiverMain(int argc, char *argv[]); static void pgarch_exit(SIGNAL_ARGS); static void ArchSigHupHandler(SIGNAL_ARGS); +static void ArchSigTermHandler(SIGNAL_ARGS); static void pgarch_waken(SIGNAL_ARGS); +static void pgarch_waken_stop(SIGNAL_ARGS); static void pgarch_MainLoop(void); static void pgarch_ArchiverCopyLoop(void); static bool pgarch_archiveXlog(char *xlog); @@ -236,16 +241,16 @@ PgArchiverMain(int argc, char *argv[]) /* * Ignore all signals usually bound to some action in the postmaster, - * except for SIGHUP, SIGUSR1 and SIGQUIT. + * except for SIGHUP, SIGTERM, SIGUSR1, SIGUSR2, and SIGQUIT. */ pqsignal(SIGHUP, ArchSigHupHandler); pqsignal(SIGINT, SIG_IGN); - pqsignal(SIGTERM, SIG_IGN); + pqsignal(SIGTERM, ArchSigTermHandler); pqsignal(SIGQUIT, pgarch_exit); pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, pgarch_waken); - pqsignal(SIGUSR2, SIG_IGN); + pqsignal(SIGUSR2, pgarch_waken_stop); pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); @@ -267,28 +272,47 @@ PgArchiverMain(int argc, char *argv[]) static void pgarch_exit(SIGNAL_ARGS) { - /* - * For now, we just nail the doors shut and get out of town. It might - * seem cleaner to finish up any pending archive copies, but there's a - * nontrivial risk that init will kill us partway through. - */ - exit(0); + /* SIGQUIT means curl up and die ... */ + exit(1); } -/* SIGHUP: set flag to re-read config file at next convenient time */ +/* SIGHUP signal handler for archiver process */ static void ArchSigHupHandler(SIGNAL_ARGS) { + /* set flag to re-read config file at next convenient time */ got_SIGHUP = true; } +/* SIGTERM signal handler for archiver process */ +static void +ArchSigTermHandler(SIGNAL_ARGS) +{ + /* + * The postmaster never sends us SIGTERM, so we assume that this means + * that init is trying to shut down the whole system. If we hang around + * too long we'll get SIGKILL'd. Set flag to prevent starting any more + * archive commands. + */ + got_SIGTERM = true; +} + /* SIGUSR1 signal handler for archiver process */ static void pgarch_waken(SIGNAL_ARGS) { + /* set flag that there is work to be done */ wakened = true; } +/* SIGUSR2 signal handler for archiver process */ +static void +pgarch_waken_stop(SIGNAL_ARGS) +{ + /* set flag to do a final cycle and shut down afterwards */ + ready_to_stop = true; +} + /* * pgarch_MainLoop * @@ -298,6 +322,7 @@ static void pgarch_MainLoop(void) { time_t last_copy_time = 0; + bool time_to_stop; /* * We run the copy loop immediately upon entry, in case there are @@ -309,6 +334,9 @@ pgarch_MainLoop(void) do { + /* When we get SIGUSR2, we do one more archive cycle, then exit */ + time_to_stop = ready_to_stop; + /* Check for config update */ if (got_SIGHUP) { @@ -316,8 +344,26 @@ pgarch_MainLoop(void) ProcessConfigFile(PGC_SIGHUP); } + /* + * If we've gotten SIGTERM, we normally just sit and do nothing until + * SIGUSR2 arrives. However, that means a random SIGTERM would + * disable archiving indefinitely, which doesn't seem like a good + * idea. If more than 60 seconds pass since SIGTERM, exit anyway, + * so that the postmaster can start a new archiver if needed. + */ + if (got_SIGTERM) + { + time_t curtime = time(NULL); + + if (last_sigterm_time == 0) + last_sigterm_time = curtime; + else if ((unsigned int) (curtime - last_sigterm_time) >= + (unsigned int) 60) + break; + } + /* Do what we're here for */ - if (wakened) + if (wakened || time_to_stop) { wakened = false; pgarch_ArchiverCopyLoop(); @@ -334,7 +380,8 @@ pgarch_MainLoop(void) * sleep into 1-second increments, and check for interrupts after each * nap. */ - while (!(wakened || got_SIGHUP)) + while (!(wakened || ready_to_stop || got_SIGHUP || + !PostmasterIsAlive(true))) { time_t curtime; @@ -344,7 +391,13 @@ pgarch_MainLoop(void) (unsigned int) PGARCH_AUTOWAKE_INTERVAL) wakened = true; } - } while (PostmasterIsAlive(true)); + + /* + * The archiver quits either when the postmaster dies (not expected) + * or after completing one more archiving cycle after receiving + * SIGUSR2. + */ + } while (PostmasterIsAlive(true) && !time_to_stop); } /* @@ -377,8 +430,14 @@ pgarch_ArchiverCopyLoop(void) for (;;) { - /* Abandon processing if we notice our postmaster has died */ - if (!PostmasterIsAlive(true)) + /* + * Do not initiate any more archive commands after receiving + * SIGTERM, nor after the postmaster has died unexpectedly. + * The first condition is to try to keep from having init + * SIGKILL the command, and the second is to avoid conflicts + * with another archiver spawned by a newer postmaster. + */ + if (got_SIGTERM || !PostmasterIsAlive(true)) return; if (pgarch_archiveXlog(xlog)) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index ba6c9b9183..fe1ed795f9 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -37,7 +37,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.550 2008/01/01 19:45:51 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.551 2008/01/11 00:54:09 tgl Exp $ * * NOTES * @@ -244,7 +244,7 @@ static bool FatalError = false; /* T if recovering from backend crash */ * Notice that this state variable does not distinguish *why* we entered * PM_WAIT_BACKENDS or later states --- Shutdown and FatalError must be * consulted to find that out. FatalError is never true in PM_RUN state, nor - * in PM_SHUTDOWN state (because we don't enter that state when trying to + * in PM_SHUTDOWN states (because we don't enter those states when trying to * recover from a crash). It can be true in PM_STARTUP state, because we * don't clear it until we've successfully recovered. */ @@ -255,6 +255,7 @@ typedef enum PM_RUN, /* normal "database is alive" state */ PM_WAIT_BACKENDS, /* waiting for live backends to exit */ PM_SHUTDOWN, /* waiting for bgwriter to do shutdown ckpt */ + PM_SHUTDOWN_2, /* waiting for archiver to finish */ PM_WAIT_DEAD_END, /* waiting for dead_end children to exit */ PM_NO_CHILDREN /* all important children have exited */ } PMState; @@ -1312,12 +1313,8 @@ ServerLoop(void) start_autovac_launcher = false; /* signal processed */ } - /* - * If we have lost the archiver, try to start a new one. We do this - * even if we are shutting down, to allow archiver to take care of any - * remaining WAL files. - */ - if (XLogArchivingActive() && PgArchPID == 0 && pmState >= PM_RUN) + /* If we have lost the archiver, try to start a new one */ + if (XLogArchivingActive() && PgArchPID == 0 && pmState == PM_RUN) PgArchPID = pgarch_start(); /* If we have lost the stats collector, try to start a new one */ @@ -2175,12 +2172,31 @@ reaper(SIGNAL_ARGS) * checkpoint. (If for some reason it didn't, recovery will * occur on next postmaster start.) * - * At this point we should have no normal children left (else - * we'd not be in PM_SHUTDOWN state) but we might have - * dead_end children. + * At this point we should have no normal backend children + * left (else we'd not be in PM_SHUTDOWN state) but we might + * have dead_end children to wait for. + * + * If we have an archiver subprocess, tell it to do a last + * archive cycle and quit; otherwise we can go directly to + * PM_WAIT_DEAD_END state. */ Assert(Shutdown > NoShutdown); - pmState = PM_WAIT_DEAD_END; + + if (PgArchPID != 0) + { + /* Waken archiver for the last time */ + signal_child(PgArchPID, SIGUSR2); + pmState = PM_SHUTDOWN_2; + } + else + pmState = PM_WAIT_DEAD_END; + + /* + * We can also shut down the stats collector now; there's + * nothing left for it to do. + */ + if (PgStatPID != 0) + signal_child(PgStatPID, SIGQUIT); } else { @@ -2227,7 +2243,8 @@ reaper(SIGNAL_ARGS) /* * Was it the archiver? If so, just try to start a new one; no need * to force reset of the rest of the system. (If fail, we'll try - * again in future cycles of the main loop.) + * again in future cycles of the main loop.) But if we were waiting + * for it to shut down, advance to the next shutdown step. */ if (pid == PgArchPID) { @@ -2235,8 +2252,10 @@ reaper(SIGNAL_ARGS) if (!EXIT_STATUS_0(exitstatus)) LogChildExit(LOG, _("archiver process"), pid, exitstatus); - if (XLogArchivingActive() && pmState >= PM_RUN) + if (XLogArchivingActive() && pmState == PM_RUN) PgArchPID = pgarch_start(); + else if (pmState == PM_SHUTDOWN_2) + pmState = PM_WAIT_DEAD_END; continue; } @@ -2563,6 +2582,11 @@ PostmasterStateMachine(void) * change causes ServerLoop to stop creating new ones. */ pmState = PM_WAIT_DEAD_END; + + /* + * We already SIGQUIT'd the archiver and stats processes, + * if any, when we entered FatalError state. + */ } else { @@ -2591,13 +2615,13 @@ PostmasterStateMachine(void) */ FatalError = true; pmState = PM_WAIT_DEAD_END; + + /* Kill the archiver and stats collector too */ + if (PgArchPID != 0) + signal_child(PgArchPID, SIGQUIT); + if (PgStatPID != 0) + signal_child(PgStatPID, SIGQUIT); } - /* Tell pgarch to shut down too; nothing left for it to do */ - if (PgArchPID != 0) - signal_child(PgArchPID, SIGQUIT); - /* Tell pgstat to shut down too; nothing left for it to do */ - if (PgStatPID != 0) - signal_child(PgStatPID, SIGQUIT); } } } @@ -2606,16 +2630,26 @@ PostmasterStateMachine(void) { /* * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty - * (ie, no dead_end children remain). + * (ie, no dead_end children remain), and the archiver and stats + * collector are gone too. + * + * The reason we wait for those two is to protect them against a new + * postmaster starting conflicting subprocesses; this isn't an + * ironclad protection, but it at least helps in the + * shutdown-and-immediately-restart scenario. Note that they have + * already been sent appropriate shutdown signals, either during a + * normal state transition leading up to PM_WAIT_DEAD_END, or during + * FatalError processing. */ - if (!DLGetHead(BackendList)) + if (DLGetHead(BackendList) == NULL && + PgArchPID == 0 && PgStatPID == 0) { /* These other guys should be dead already */ Assert(StartupPID == 0); Assert(BgWriterPID == 0); Assert(WalWriterPID == 0); Assert(AutoVacPID == 0); - /* archiver, stats, and syslogger are not considered here */ + /* syslogger is not considered here */ pmState = PM_NO_CHILDREN; } } @@ -2628,14 +2662,9 @@ PostmasterStateMachine(void) * we got SIGTERM from init --- there may well not be time for recovery * before init decides to SIGKILL us.) * - * Note: we do not wait around for exit of the archiver or stats - * processes. They've been sent SIGQUIT by this point (either when we - * entered PM_SHUTDOWN state, or when we set FatalError, and at least one - * of those must have happened by now). In any case they contain logic to - * commit hara-kiri if they notice the postmaster is gone. Since they - * aren't connected to shared memory, they pose no problem for shutdown. - * The syslogger is not considered either, since it's intended to survive - * till the postmaster exits. + * Note that the syslogger continues to run. It will exit when it sees + * EOF on its input pipe, which happens when there are no more upstream + * processes. */ if (Shutdown > NoShutdown && pmState == PM_NO_CHILDREN) { @@ -2652,10 +2681,8 @@ PostmasterStateMachine(void) } /* - * If we need to recover from a crash, wait for all shmem-connected - * children to exit, then reset shmem and StartupDataBase. (We can ignore - * the archiver and stats processes here since they are not connected to - * shmem.) + * If we need to recover from a crash, wait for all non-syslogger + * children to exit, then reset shmem and StartupDataBase. */ if (FatalError && pmState == PM_NO_CHILDREN) { @@ -3782,7 +3809,7 @@ sigusr1_handler(SIGNAL_ARGS) } if (CheckPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER) && - PgArchPID != 0 && Shutdown <= SmartShutdown) + PgArchPID != 0) { /* * Send SIGUSR1 to archiver process, to wake it up and begin archiving