Commit 36da3cfb authored by Simon Riggs's avatar Simon Riggs

Allow time delayed standbys and recovery

Set min_recovery_apply_delay to force a delay in recovery apply for commit and
restore point WAL records. Other records are replayed immediately. Delay is
measured between WAL record time and local standby time.

Robert Haas, Fabrízio de Royes Mello and Simon Riggs
Detailed review by Mitsumasa Kondo
parent 841a6548
......@@ -142,6 +142,56 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows
</listitem>
</varlistentry>
<varlistentry id="min-recovery-apply-delay" xreflabel="min_recovery_apply_delay">
<term><varname>min_recovery_apply_delay</varname> (<type>integer</type>)</term>
<indexterm>
<primary><varname>min_recovery_apply_delay</> recovery parameter</primary>
</indexterm>
<listitem>
<para>
By default, a standby server keeps restoring WAL records from the
primary as soon as possible. It may be useful to have a time-delayed
copy of the data, offering various options to correct data loss errors.
This parameter allows you to delay recovery by a fixed period of time,
specified in milliseconds if no unit is specified. For example, if
you set this parameter to <literal>5min</literal>, the standby will
replay each transaction commit only when the system time on the standby
is at least five minutes past the commit time reported by the master.
</para>
<para>
It is possible that the replication delay between servers exceeds the
value of this parameter, in which case no delay is added.
Note that the delay is calculated between the WAL timestamp as written
on master and the time on the current standby. Delays
in transfer because of networks or cascading replication configurations
may reduce the actual wait time significantly. If the system
clocks on master and standby are not synchronised, this may lead to
recovery applying records earlier than expected but is not a major issue
because the useful settings of the parameter are much larger than
typical time deviation between the servers. Be careful to allow for
different timezone settings on master and standby.
</para>
<para>
The delay occurs only on WAL records for COMMIT and Restore Points.
Other records may be replayed earlier than the specified delay, which
is not an issue for MVCC though may potentially increase the number
of recovery conflicts generated.
</para>
<para>
The delay occurs until the standby is promoted or triggered. After that
the standby will end recovery without further waiting.
</para>
<para>
This parameter is intended for use with streaming replication deployments;
however, if the parameter is specified it will be honoured in all cases.
Synchronous replication is not affected by this setting because there is
not yet any setting to request synchronous apply of transaction commits.
<varname>hot_standby_feedback</> will be delayed by use of this feature
which could lead to bloat on the master; use both together with care.
</para>
</listitem>
</varlistentry>
</variablelist>
</sect1>
......
......@@ -123,6 +123,15 @@
#
#trigger_file = ''
#
# By default, a standby server keeps restoring XLOG records from the
# primary as soon as possible. If you want to explicitly delay the replay of
# committed transactions from the master, specify a recovery apply delay.
# For example, if you set this parameter to 5min, the standby will replay
# each transaction commit only when the system time on the standby is at least
# five minutes past the commit time reported by the master.
#
#min_recovery_apply_delay = 0
#
#---------------------------------------------------------------------------
# HOT STANDBY PARAMETERS
#---------------------------------------------------------------------------
......
......@@ -218,6 +218,8 @@ static bool recoveryPauseAtTarget = true;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
static char *recoveryTargetName;
static int min_recovery_apply_delay = 0;
static TimestampTz recoveryDelayUntilTime;
/* options taken from recovery.conf for XLOG streaming */
static bool StandbyModeRequested = false;
......@@ -728,8 +730,10 @@ static bool holdingAllSlots = false;
static void readRecoveryCommandFile(void);
static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis, bool *delayThis);
static void recoveryPausesHere(void);
static void recoveryApplyDelay(void);
static bool SetRecoveryDelayUntilTime(TimestampTz xtime);
static void SetLatestXTime(TimestampTz xtime);
static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(void);
......@@ -5476,6 +5480,19 @@ readRecoveryCommandFile(void)
(errmsg_internal("trigger_file = '%s'",
TriggerFile)));
}
else if (strcmp(item->name, "min_recovery_apply_delay") == 0)
{
const char *hintmsg;
if (!parse_int(item->value, &min_recovery_apply_delay, GUC_UNIT_MS,
&hintmsg))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("parameter \"%s\" requires a temporal value", "min_recovery_apply_delay"),
hintmsg ? errhint("%s", _(hintmsg)) : 0));
ereport(DEBUG2,
(errmsg("min_recovery_apply_delay = '%s'", item->value)));
}
else
ereport(FATAL,
(errmsg("unrecognized recovery parameter \"%s\"",
......@@ -5625,10 +5642,11 @@ exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
* We also track the timestamp of the latest applied COMMIT/ABORT
* record in XLogCtl->recoveryLastXTime, for logging purposes.
* Also, some information is saved in recoveryStopXid et al for use in
* annotating the new timeline's history file.
* annotating the new timeline's history file; and recoveryDelayUntilTime
* is updated, for time-delayed standbys.
*/
static bool
recoveryStopsHere(XLogRecord *record, bool *includeThis)
recoveryStopsHere(XLogRecord *record, bool *includeThis, bool *delayThis)
{
bool stopsHere;
uint8 record_info;
......@@ -5645,6 +5663,8 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
recordXtime = recordXactCommitData->xact_time;
*delayThis = SetRecoveryDelayUntilTime(recordXactCommitData->xact_time);
}
else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
{
......@@ -5652,6 +5672,8 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
recordXtime = recordXactCommitData->xact_time;
*delayThis = SetRecoveryDelayUntilTime(recordXactCommitData->xact_time);
}
else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
{
......@@ -5659,6 +5681,13 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
recordXtime = recordXactAbortData->xact_time;
/*
* We deliberately choose not to delay aborts since they have no
* effect on MVCC. We already allow replay of records that don't
* have a timestamp, so there is already opportunity for issues
* caused by early conflicts on standbys.
*/
}
else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
{
......@@ -5667,6 +5696,8 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
recordXtime = recordRestorePointData->rp_time;
strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
*delayThis = SetRecoveryDelayUntilTime(recordRestorePointData->rp_time);
}
else
return false;
......@@ -5833,6 +5864,66 @@ SetRecoveryPause(bool recoveryPause)
SpinLockRelease(&xlogctl->info_lck);
}
/*
 * Compute the earliest local time at which a record stamped with xtime
 * may be applied, and store it in recoveryDelayUntilTime.
 *
 * Returns true when min_recovery_apply_delay is non-zero, i.e. the caller
 * should wait before applying the record.  Returns false, leaving
 * recoveryDelayUntilTime untouched, when no delay is configured.
 */
static bool
SetRecoveryDelayUntilTime(TimestampTz xtime)
{
	/* No delay configured: nothing to compute. */
	if (min_recovery_apply_delay == 0)
		return false;

	recoveryDelayUntilTime =
		TimestampTzPlusMilliseconds(xtime, min_recovery_apply_delay);

	return true;
}
/*
 * When min_recovery_apply_delay is set, we wait long enough to make sure
 * certain record types are applied at least that interval behind the master.
 * See recoveryStopsHere().
 *
 * This function blocks on the recovery wakeup latch until the current time
 * reaches recoveryDelayUntilTime, or until a standby trigger is detected,
 * whichever comes first.  It takes no arguments and returns nothing; the
 * target time is read from the file-level variable recoveryDelayUntilTime.
 *
 * Note that the delay is calculated between the WAL record log time and
 * the current time on standby. We would prefer to keep track of when this
 * standby received each WAL record, which would allow a more consistent
 * approach and one not affected by time synchronisation issues, but that
 * is significantly more effort and complexity for little actual gain in
 * usability.
 */
static void
recoveryApplyDelay(void)
{
	while (true)
	{
		long		secs;
		int			microsecs;
		long		timeout_ms;

		ResetLatch(&XLogCtl->recoveryWakeupLatch);

		/* might change the trigger file's location */
		HandleStartupProcInterrupts();

		/* Promotion/trigger requested: stop waiting immediately. */
		if (CheckForStandbyTrigger())
			break;

		/*
		 * Wait for difference between GetCurrentTimestamp() and
		 * recoveryDelayUntilTime
		 */
		TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
							&secs, &microsecs);

		/* Nothing left to wait for. */
		if (secs <= 0 && microsecs <= 0)
			break;

		elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
			 secs, microsecs / 1000);

		/*
		 * Round the remaining time up to the next whole millisecond.  A
		 * truncating conversion yields a timeout of 0 whenever less than one
		 * millisecond remains, which would make this loop spin without
		 * sleeping until the target time has passed.
		 */
		timeout_ms = secs * 1000L + (microsecs + 999) / 1000;

		WaitLatch(&XLogCtl->recoveryWakeupLatch,
				  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
				  timeout_ms);
	}
}
/*
* Save timestamp of latest processed commit/abort record.
*
......@@ -6660,6 +6751,7 @@ StartupXLOG(void)
{
bool recoveryContinue = true;
bool recoveryApply = true;
bool recoveryDelay = false;
ErrorContextCallback errcallback;
TimestampTz xtime;
......@@ -6719,7 +6811,7 @@ StartupXLOG(void)
/*
* Have we reached our recovery target?
*/
if (recoveryStopsHere(record, &recoveryApply))
if (recoveryStopsHere(record, &recoveryApply, &recoveryDelay))
{
if (recoveryPauseAtTarget)
{
......@@ -6734,6 +6826,25 @@ StartupXLOG(void)
break;
}
/*
* If we've been asked to lag the master, wait on
* latch until enough time has passed.
*/
if (recoveryDelay)
{
recoveryApplyDelay();
/*
* We test for paused recovery again here. If
* user sets delayed apply, it may be because
* they expect to pause recovery in case of
* problems, so we must test again here otherwise
* pausing during the delay-wait wouldn't work.
*/
if (xlogctl->recoveryPause)
recoveryPausesHere();
}
/* Setup error traceback support for ereport() */
errcallback.callback = rm_redo_error_callback;
errcallback.arg = (void *) record;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment