Commit 5d2b45e3 authored by Fujii Masao

Add GUC to control the time to wait before retrieving WAL after failed attempt.

Previously, when the standby server failed to retrieve WAL files from any source
(i.e., streaming replication, local pg_xlog directory or WAL archive), it always
waited for five seconds (hard-coded) before the next attempt. This is
problematic, for example, in warm standby: restore_command can fail
every five seconds even while a new WAL file is expected to be unavailable for
a long time, flooding the log files with its error messages.

This commit adds a new parameter, wal_retrieve_retry_interval, to control that
wait time.

Alexey Vasiliev and Michael Paquier, reviewed by Andres Freund and me.
parent 2a3f6e36
...@@ -2985,6 +2985,24 @@ include_dir 'conf.d' ...@@ -2985,6 +2985,24 @@ include_dir 'conf.d'
</listitem> </listitem>
</varlistentry> </varlistentry>
<varlistentry id="guc-wal-retrieve-retry-interval" xreflabel="wal_retrieve_retry_interval">
<term><varname>wal_retrieve_retry_interval</varname> (<type>integer</type>)
<indexterm>
<primary><varname>wal_retrieve_retry_interval</> configuration parameter</primary>
</indexterm>
</term>
<listitem>
<para>
Specifies how long the standby server should wait when WAL data is not
available from any source (streaming replication,
local <filename>pg_xlog</> or WAL archive) before retrying to
retrieve WAL data. This parameter can only be set in the
<filename>postgresql.conf</> file or on the server command line.
The default value is 5 seconds. Units are milliseconds if not specified.
</para>
</listitem>
</varlistentry>
</variablelist> </variablelist>
</sect2> </sect2>
</sect1> </sect1>
......
...@@ -93,6 +93,7 @@ int sync_method = DEFAULT_SYNC_METHOD; ...@@ -93,6 +93,7 @@ int sync_method = DEFAULT_SYNC_METHOD;
int wal_level = WAL_LEVEL_MINIMAL; int wal_level = WAL_LEVEL_MINIMAL;
int CommitDelay = 0; /* precommit delay in microseconds */ int CommitDelay = 0; /* precommit delay in microseconds */
int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int wal_retrieve_retry_interval = 5000; /* wait before retrying WAL retrieval, in milliseconds */
#ifdef WAL_DEBUG #ifdef WAL_DEBUG
bool XLOG_DEBUG = false; bool XLOG_DEBUG = false;
...@@ -10340,8 +10341,8 @@ static bool ...@@ -10340,8 +10341,8 @@ static bool
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
bool fetching_ckpt, XLogRecPtr tliRecPtr) bool fetching_ckpt, XLogRecPtr tliRecPtr)
{ {
static pg_time_t last_fail_time = 0; static TimestampTz last_fail_time = 0;
pg_time_t now; TimestampTz now;
/*------- /*-------
* Standby mode is implemented by a state machine: * Standby mode is implemented by a state machine:
...@@ -10351,7 +10352,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, ...@@ -10351,7 +10352,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
* 2. Check trigger file * 2. Check trigger file
* 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
* 4. Rescan timelines * 4. Rescan timelines
* 5. Sleep 5 seconds, and loop back to 1. * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
* *
* Failure to read from the current source advances the state machine to * Failure to read from the current source advances the state machine to
* the next state. * the next state.
...@@ -10490,14 +10491,25 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, ...@@ -10490,14 +10491,25 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
* machine, so we've exhausted all the options for * machine, so we've exhausted all the options for
* obtaining the requested WAL. We're going to loop back * obtaining the requested WAL. We're going to loop back
* and retry from the archive, but if it hasn't been long * and retry from the archive, but if it hasn't been long
* since last attempt, sleep 5 seconds to avoid * since last attempt, sleep wal_retrieve_retry_interval
* busy-waiting. * milliseconds to avoid busy-waiting.
*/ */
now = (pg_time_t) time(NULL); now = GetCurrentTimestamp();
if ((now - last_fail_time) < 5) if (!TimestampDifferenceExceeds(last_fail_time, now,
wal_retrieve_retry_interval))
{ {
pg_usleep(1000000L * (5 - (now - last_fail_time))); long secs, wait_time;
now = (pg_time_t) time(NULL); int usecs;
TimestampDifference(last_fail_time, now, &secs, &usecs);
wait_time = wal_retrieve_retry_interval -
(secs * 1000 + usecs / 1000);
WaitLatch(&XLogCtl->recoveryWakeupLatch,
WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
wait_time);
ResetLatch(&XLogCtl->recoveryWakeupLatch);
now = GetCurrentTimestamp();
} }
last_fail_time = now; last_fail_time = now;
currentSource = XLOG_FROM_ARCHIVE; currentSource = XLOG_FROM_ARCHIVE;
...@@ -10653,12 +10665,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, ...@@ -10653,12 +10665,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
} }
/* /*
* Wait for more WAL to arrive. Time out after 5 seconds, * Wait for more WAL to arrive. Time out after 5 seconds
* like when polling the archive, to react to a trigger * to react to a trigger file promptly.
* file promptly.
*/ */
WaitLatch(&XLogCtl->recoveryWakeupLatch, WaitLatch(&XLogCtl->recoveryWakeupLatch,
WL_LATCH_SET | WL_TIMEOUT, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
5000L); 5000L);
ResetLatch(&XLogCtl->recoveryWakeupLatch); ResetLatch(&XLogCtl->recoveryWakeupLatch);
break; break;
......
...@@ -2363,6 +2363,18 @@ static struct config_int ConfigureNamesInt[] = ...@@ -2363,6 +2363,18 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL NULL, NULL, NULL
}, },
{
	/*
	 * wal_retrieve_retry_interval: how long the standby waits before
	 * retrying to retrieve WAL after a failed attempt.  The value is in
	 * milliseconds (GUC_UNIT_MS); 5000 corresponds to the documented
	 * 5 second default.
	 *
	 * Adjacent string literals are concatenated with no separator, so the
	 * first fragment of the description must end with a space; otherwise
	 * the message reads "...retrieve WALafter a failed attempt.".
	 */
	{"wal_retrieve_retry_interval", PGC_SIGHUP, REPLICATION_STANDBY,
		gettext_noop("Sets the time to wait before retrying to retrieve WAL "
					 "after a failed attempt."),
		NULL,
		GUC_UNIT_MS
	},
	&wal_retrieve_retry_interval,
	5000, 1, INT_MAX,
	NULL, NULL, NULL
},
{ {
{"wal_segment_size", PGC_INTERNAL, PRESET_OPTIONS, {"wal_segment_size", PGC_INTERNAL, PRESET_OPTIONS,
gettext_noop("Shows the number of pages per write ahead log segment."), gettext_noop("Shows the number of pages per write ahead log segment."),
......
...@@ -260,6 +260,8 @@ ...@@ -260,6 +260,8 @@
#wal_receiver_timeout = 60s # time that receiver waits for #wal_receiver_timeout = 60s # time that receiver waits for
# communication from master # communication from master
# in milliseconds; 0 disables # in milliseconds; 0 disables
#wal_retrieve_retry_interval = 5s # time to wait before retrying to
# retrieve WAL after a failed attempt
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
......
...@@ -93,6 +93,7 @@ extern int CheckPointSegments; ...@@ -93,6 +93,7 @@ extern int CheckPointSegments;
extern int wal_keep_segments; extern int wal_keep_segments;
extern int XLOGbuffers; extern int XLOGbuffers;
extern int XLogArchiveTimeout; extern int XLogArchiveTimeout;
extern int wal_retrieve_retry_interval;
extern bool XLogArchiveMode; extern bool XLogArchiveMode;
extern char *XLogArchiveCommand; extern char *XLogArchiveCommand;
extern bool EnableHotStandby; extern bool EnableHotStandby;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment