From dde70cc313683e47e71997759c6029b4220f71c0 Mon Sep 17 00:00:00 2001 From: Simon Riggs Date: Wed, 7 Sep 2011 09:09:47 +0100 Subject: [PATCH] Emit cascaded standby message on shutdown only when appropriate. Adds additional test for active walsenders and closes a race condition for when we failover when a new walsender was connecting. Reported and fixed by Fujii Masao. Review by Heikki Linnakangas --- src/backend/postmaster/postmaster.c | 5 +++-- src/backend/replication/walsender.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 6e231a538f..df4a2aa885 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -2328,10 +2328,11 @@ reaper(SIGNAL_ARGS) * XXX should avoid the need for disconnection. When we do, * am_cascading_walsender should be replaced with RecoveryInProgress() */ - if (max_wal_senders > 0) + if (max_wal_senders > 0 && CountChildren(BACKEND_TYPE_WALSND) > 0) { ereport(LOG, - (errmsg("terminating all walsender processes to force cascaded standby(s) to update timeline and reconnect"))); + (errmsg("terminating all walsender processes to force cascaded " + "standby(s) to update timeline and reconnect"))); SignalSomeChildren(SIGUSR2, BACKEND_TYPE_WALSND); } diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 0e8098abf4..474567a204 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -368,6 +368,35 @@ StartReplication(StartReplicationCmd *cmd) MarkPostmasterChildWalSender(); SendPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE); + /* + * When promoting a cascading standby, postmaster sends SIGUSR2 to + * any cascading walsenders to kill them. But there is a corner-case where + * such walsender fails to receive SIGUSR2 and survives a standby promotion + * unexpectedly. 
This happens when postmaster sends SIGUSR2 before + * the walsender marks itself as a WAL sender, because postmaster sends + * SIGUSR2 to only the processes marked as a WAL sender. + * + * To avoid this corner-case, if recovery is NOT in progress even though + * the walsender is cascading one, we do the same thing as SIGUSR2 signal + * handler does, i.e., set walsender_ready_to_stop to true. Which causes + * the walsender to end later. + * + * When terminating cascading walsenders, usually postmaster writes + * the log message announcing the terminations. But there is a race condition + * here. If there is no walsender except this process before reaching here, + * postmaster thinks that there is no walsender and suppresses that + * log message. To handle this case, we always emit that log message here. + * This might cause duplicate log messages, but which is less likely to happen, + * so it's not worth writing some code to suppress them. + */ + if (am_cascading_walsender && !RecoveryInProgress()) + { + ereport(LOG, + (errmsg("terminating walsender process to force cascaded standby " + "to update timeline and reconnect"))); + walsender_ready_to_stop = true; + } + /* * We assume here that we're logging enough information in the WAL for * log-shipping, since this is checked in PostmasterMain().