Add GUC to control the time to wait before retrieving WAL after failed attempt.

Previously when the standby server failed to retrieve WAL files from any sources
(i.e., streaming replication, local pg_xlog directory or WAL archive), it always
waited for five seconds (hard-coded) before the next attempt. For example,
this is problematic in warm-standby because restore_command can fail
every five seconds even while new WAL file is expected to be unavailable for
a long time and flood the log files with its error messages.

This commit adds new parameter, wal_retrieve_retry_interval, to control that
wait time.

Alexey Vasiliev and Michael Paquier, reviewed by Andres Freund and me.
This commit is contained in:
Fujii Masao 2015-02-23 20:55:17 +09:00
parent 2a3f6e368b
commit 5d2b45e3f7
5 changed files with 57 additions and 13 deletions

View File

@ -2985,6 +2985,24 @@ include_dir 'conf.d'
</listitem>
</varlistentry>
<varlistentry id="guc-wal-retrieve-retry-interval" xreflabel="wal_retrieve_retry_interval">
<term><varname>wal_retrieve_retry_interval</varname> (<type>integer</type>)
<indexterm>
<primary><varname>wal_retrieve_retry_interval</> configuration parameter</primary>
</indexterm>
</term>
<listitem>
<para>
Specify how long the standby server should wait when WAL data is not
available from any sources (streaming replication,
local <filename>pg_xlog</> or WAL archive) before retrying to
retrieve WAL data. This parameter can only be set in the
<filename>postgresql.conf</> file or on the server command line.
The default value is 5 seconds. Units are milliseconds if not specified.
</para>
</listitem>
</varlistentry>
</variablelist>
</sect2>
</sect1>

View File

@ -93,6 +93,7 @@ int sync_method = DEFAULT_SYNC_METHOD;
int wal_level = WAL_LEVEL_MINIMAL;
int CommitDelay = 0; /* precommit delay in microseconds */
int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int wal_retrieve_retry_interval = 5000;
#ifdef WAL_DEBUG
bool XLOG_DEBUG = false;
@ -10340,8 +10341,8 @@ static bool
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
bool fetching_ckpt, XLogRecPtr tliRecPtr)
{
static pg_time_t last_fail_time = 0;
pg_time_t now;
static TimestampTz last_fail_time = 0;
TimestampTz now;
/*-------
* Standby mode is implemented by a state machine:
@ -10351,7 +10352,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
* 2. Check trigger file
* 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
* 4. Rescan timelines
* 5. Sleep 5 seconds, and loop back to 1.
* 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
*
* Failure to read from the current source advances the state machine to
* the next state.
@ -10490,14 +10491,25 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
* machine, so we've exhausted all the options for
* obtaining the requested WAL. We're going to loop back
* and retry from the archive, but if it hasn't been long
* since last attempt, sleep 5 seconds to avoid
* busy-waiting.
* since last attempt, sleep wal_retrieve_retry_interval
* milliseconds to avoid busy-waiting.
*/
now = (pg_time_t) time(NULL);
if ((now - last_fail_time) < 5)
now = GetCurrentTimestamp();
if (!TimestampDifferenceExceeds(last_fail_time, now,
wal_retrieve_retry_interval))
{
pg_usleep(1000000L * (5 - (now - last_fail_time)));
now = (pg_time_t) time(NULL);
long secs, wait_time;
int usecs;
TimestampDifference(last_fail_time, now, &secs, &usecs);
wait_time = wal_retrieve_retry_interval -
(secs * 1000 + usecs / 1000);
WaitLatch(&XLogCtl->recoveryWakeupLatch,
WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
wait_time);
ResetLatch(&XLogCtl->recoveryWakeupLatch);
now = GetCurrentTimestamp();
}
last_fail_time = now;
currentSource = XLOG_FROM_ARCHIVE;
@ -10653,12 +10665,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
}
/*
* Wait for more WAL to arrive. Time out after 5 seconds,
* like when polling the archive, to react to a trigger
* file promptly.
* Wait for more WAL to arrive. Time out after 5 seconds
* to react to a trigger file promptly.
*/
WaitLatch(&XLogCtl->recoveryWakeupLatch,
WL_LATCH_SET | WL_TIMEOUT,
WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
5000L);
ResetLatch(&XLogCtl->recoveryWakeupLatch);
break;

View File

@ -2363,6 +2363,18 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
{
{"wal_retrieve_retry_interval", PGC_SIGHUP, REPLICATION_STANDBY,
gettext_noop("Sets the time to wait before retrying to retrieve WAL"
"after a failed attempt."),
NULL,
GUC_UNIT_MS
},
&wal_retrieve_retry_interval,
5000, 1, INT_MAX,
NULL, NULL, NULL
},
{
{"wal_segment_size", PGC_INTERNAL, PRESET_OPTIONS,
gettext_noop("Shows the number of pages per write ahead log segment."),

View File

@ -260,6 +260,8 @@
#wal_receiver_timeout = 60s # time that receiver waits for
# communication from master
# in milliseconds; 0 disables
#wal_retrieve_retry_interval = 5s # time to wait before retrying to
# retrieve WAL after a failed attempt
#------------------------------------------------------------------------------

View File

@ -93,6 +93,7 @@ extern int CheckPointSegments;
extern int wal_keep_segments;
extern int XLOGbuffers;
extern int XLogArchiveTimeout;
extern int wal_retrieve_retry_interval;
extern bool XLogArchiveMode;
extern char *XLogArchiveCommand;
extern bool EnableHotStandby;