Commit 4e87c483 authored by Michael Paquier's avatar Michael Paquier

Fix handling of WAL segments ready to be archived during crash recovery

78ea8b5d has fixed an issue related to the recycling of WAL segments on
standbys depending on archive_mode.  However, it has introduced a
regression with the handling of WAL segments ready to be archived during
crash recovery, causing those files to be recycled without getting
archived.

This commit fixes the regression by tracking in shared memory if a live
cluster is either in crash recovery or archive recovery as the handling
of WAL segments ready to be archived is different in both cases (those
WAL segments should not be removed during crash recovery), and by using
this new shared memory state to decide if a segment can be recycled or
not.  Previously, it was not possible to know if a cluster was in crash
recovery or archive recovery as the shared state was able to track only
if recovery was happening or not, leading to the problem.

A set of TAP tests is added to close the gap here, making sure that WAL
segments ready to be archived are correctly handled when a cluster is in
archive or crash recovery with archive_mode set to "on" or "always", for
both standby and primary.

Reported-by: Benoît Lobréau
Author: Jehan-Guillaume de Rorthais
Reviewed-by: Kyotaro Horiguchi, Fujii Masao, Michael Paquier
Discussion: https://postgr.es/m/20200331172229.40ee00dc@firost
Backpatch-through: 9.5
parent 3436c5e2
...@@ -221,8 +221,9 @@ static TimeLineID receiveTLI = 0; ...@@ -221,8 +221,9 @@ static TimeLineID receiveTLI = 0;
static bool lastFullPageWrites; static bool lastFullPageWrites;
/* /*
* Local copy of SharedRecoveryInProgress variable. True actually means "not * Local copy of the state tracked by SharedRecoveryState in shared memory.
* known, need to check the shared state". * It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually
* means "not known, need to check the shared state".
*/ */
static bool LocalRecoveryInProgress = true; static bool LocalRecoveryInProgress = true;
...@@ -653,10 +654,10 @@ typedef struct XLogCtlData ...@@ -653,10 +654,10 @@ typedef struct XLogCtlData
TimeLineID PrevTimeLineID; TimeLineID PrevTimeLineID;
/* /*
* SharedRecoveryInProgress indicates if we're still in crash or archive * SharedRecoveryState indicates if we're still in crash or archive
* recovery. Protected by info_lck. * recovery. Protected by info_lck.
*/ */
bool SharedRecoveryInProgress; RecoveryState SharedRecoveryState;
/* /*
* SharedHotStandbyActive indicates if we allow hot standby queries to be * SharedHotStandbyActive indicates if we allow hot standby queries to be
...@@ -4434,6 +4435,16 @@ ReadRecord(XLogReaderState *xlogreader, int emode, ...@@ -4434,6 +4435,16 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
updateMinRecoveryPoint = true; updateMinRecoveryPoint = true;
UpdateControlFile(); UpdateControlFile();
/*
* We update SharedRecoveryState while holding the lock on
* ControlFileLock so both states are consistent in shared
* memory.
*/
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
SpinLockRelease(&XLogCtl->info_lck);
LWLockRelease(ControlFileLock); LWLockRelease(ControlFileLock);
CheckRecoveryConsistency(); CheckRecoveryConsistency();
...@@ -5166,7 +5177,7 @@ XLOGShmemInit(void) ...@@ -5166,7 +5177,7 @@ XLOGShmemInit(void)
* in additional info.) * in additional info.)
*/ */
XLogCtl->XLogCacheBlck = XLOGbuffers - 1; XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
XLogCtl->SharedRecoveryInProgress = true; XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
XLogCtl->SharedHotStandbyActive = false; XLogCtl->SharedHotStandbyActive = false;
XLogCtl->SharedPromoteIsTriggered = false; XLogCtl->SharedPromoteIsTriggered = false;
XLogCtl->WalWriterSleeping = false; XLogCtl->WalWriterSleeping = false;
...@@ -6871,7 +6882,13 @@ StartupXLOG(void) ...@@ -6871,7 +6882,13 @@ StartupXLOG(void)
*/ */
dbstate_at_startup = ControlFile->state; dbstate_at_startup = ControlFile->state;
if (InArchiveRecovery) if (InArchiveRecovery)
{
ControlFile->state = DB_IN_ARCHIVE_RECOVERY; ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
SpinLockRelease(&XLogCtl->info_lck);
}
else else
{ {
ereport(LOG, ereport(LOG,
...@@ -6884,6 +6901,10 @@ StartupXLOG(void) ...@@ -6884,6 +6901,10 @@ StartupXLOG(void)
ControlFile->checkPointCopy.ThisTimeLineID, ControlFile->checkPointCopy.ThisTimeLineID,
recoveryTargetTLI))); recoveryTargetTLI)));
ControlFile->state = DB_IN_CRASH_RECOVERY; ControlFile->state = DB_IN_CRASH_RECOVERY;
SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
SpinLockRelease(&XLogCtl->info_lck);
} }
ControlFile->checkPoint = checkPointLoc; ControlFile->checkPoint = checkPointLoc;
ControlFile->checkPointCopy = checkPoint; ControlFile->checkPointCopy = checkPoint;
...@@ -7911,7 +7932,7 @@ StartupXLOG(void) ...@@ -7911,7 +7932,7 @@ StartupXLOG(void)
ControlFile->time = (pg_time_t) time(NULL); ControlFile->time = (pg_time_t) time(NULL);
SpinLockAcquire(&XLogCtl->info_lck); SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->SharedRecoveryInProgress = false; XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
SpinLockRelease(&XLogCtl->info_lck); SpinLockRelease(&XLogCtl->info_lck);
UpdateControlFile(); UpdateControlFile();
...@@ -8057,7 +8078,7 @@ RecoveryInProgress(void) ...@@ -8057,7 +8078,7 @@ RecoveryInProgress(void)
*/ */
volatile XLogCtlData *xlogctl = XLogCtl; volatile XLogCtlData *xlogctl = XLogCtl;
LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress; LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
/* /*
* Initialize TimeLineID and RedoRecPtr when we discover that recovery * Initialize TimeLineID and RedoRecPtr when we discover that recovery
...@@ -8069,8 +8090,8 @@ RecoveryInProgress(void) ...@@ -8069,8 +8090,8 @@ RecoveryInProgress(void)
{ {
/* /*
* If we just exited recovery, make sure we read TimeLineID and * If we just exited recovery, make sure we read TimeLineID and
* RedoRecPtr after SharedRecoveryInProgress (for machines with * RedoRecPtr after SharedRecoveryState (for machines with weak
* weak memory ordering). * memory ordering).
*/ */
pg_memory_barrier(); pg_memory_barrier();
InitXLOGAccess(); InitXLOGAccess();
...@@ -8086,6 +8107,24 @@ RecoveryInProgress(void) ...@@ -8086,6 +8107,24 @@ RecoveryInProgress(void)
} }
} }
/*
 * GetRecoveryState
 *
 * Report the recovery state currently published in shared memory.
 *
 * The shared value is read while holding info_lck.  It is kept consistent
 * with the contents of the control file; the possible values of
 * RecoveryState are described in xlog.h.
 */
RecoveryState
GetRecoveryState(void)
{
	RecoveryState state;

	/* Take a snapshot of the shared field under the spinlock. */
	SpinLockAcquire(&XLogCtl->info_lck);
	state = XLogCtl->SharedRecoveryState;
	SpinLockRelease(&XLogCtl->info_lck);

	return state;
}
/* /*
* Is HotStandby active yet? This is only important in special backends * Is HotStandby active yet? This is only important in special backends
* since normal backends won't ever be able to connect until this returns * since normal backends won't ever be able to connect until this returns
......
...@@ -572,18 +572,25 @@ XLogArchiveCheckDone(const char *xlog) ...@@ -572,18 +572,25 @@ XLogArchiveCheckDone(const char *xlog)
{ {
char archiveStatusPath[MAXPGPATH]; char archiveStatusPath[MAXPGPATH];
struct stat stat_buf; struct stat stat_buf;
bool inRecovery = RecoveryInProgress();
/* The file is always deletable if archive_mode is "off". */
if (!XLogArchivingActive())
return true;
/* /*
* The file is always deletable if archive_mode is "off". On standbys * During archive recovery, the file is deletable if archive_mode is not
* archiving is disabled if archive_mode is "on", and enabled with * "always".
* "always". On a primary, archiving is enabled if archive_mode is "on"
* or "always".
*/ */
if (!((XLogArchivingActive() && !inRecovery) || if (!XLogArchivingAlways() &&
(XLogArchivingAlways() && inRecovery))) GetRecoveryState() == RECOVERY_STATE_ARCHIVE)
return true; return true;
/*
* At this point of the logic, note that we are either a primary with
* archive_mode set to "on" or "always", or a standby with archive_mode
* set to "always".
*/
/* First check for .done --- this means archiver is done with it */ /* First check for .done --- this means archiver is done with it */
StatusFilePath(archiveStatusPath, xlog, ".done"); StatusFilePath(archiveStatusPath, xlog, ".done");
if (stat(archiveStatusPath, &stat_buf) == 0) if (stat(archiveStatusPath, &stat_buf) == 0)
......
...@@ -166,6 +166,14 @@ typedef enum WalLevel ...@@ -166,6 +166,14 @@ typedef enum WalLevel
WAL_LEVEL_LOGICAL WAL_LEVEL_LOGICAL
} WalLevel; } WalLevel;
/*
 * Recovery states.
 *
 * Tells crash recovery apart from archive recovery, and both of them from
 * a cluster that is done with recovery.
 */
typedef enum RecoveryState
{
	/* crash recovery */
	RECOVERY_STATE_CRASH = 0,

	/* archive recovery */
	RECOVERY_STATE_ARCHIVE,

	/* currently in production */
	RECOVERY_STATE_DONE
} RecoveryState;
extern PGDLLIMPORT int wal_level; extern PGDLLIMPORT int wal_level;
/* Is WAL archiving enabled (always or only while server is running normally)? */ /* Is WAL archiving enabled (always or only while server is running normally)? */
...@@ -291,6 +299,7 @@ extern const char *xlog_identify(uint8 info); ...@@ -291,6 +299,7 @@ extern const char *xlog_identify(uint8 info);
extern void issue_xlog_fsync(int fd, XLogSegNo segno); extern void issue_xlog_fsync(int fd, XLogSegNo segno);
extern bool RecoveryInProgress(void); extern bool RecoveryInProgress(void);
extern RecoveryState GetRecoveryState(void);
extern bool HotStandbyActive(void); extern bool HotStandbyActive(void);
extern bool HotStandbyActiveInReplay(void); extern bool HotStandbyActiveInReplay(void);
extern bool XLogInsertAllowed(void); extern bool XLogInsertAllowed(void);
......
#
# Tests related to WAL archiving and recovery.
#
use strict;
use warnings;
use PostgresNode;
use TestLib;
use Test::More tests => 16;
use Config;
# Create and start the primary node, initialized with WAL archiving and
# streaming enabled.
my $primary = get_new_node('master');
$primary->init(
has_archiving => 1,
allows_streaming => 1);
# Autovacuum is turned off for this node (presumably to avoid background
# activity interfering with the segment tracking done below -- confirm).
$primary->append_conf('postgresql.conf', 'autovacuum = off');
$primary->start;
# Data directory of the primary, used to look at archive status files.
my $primary_data = $primary->data_dir;
# Temporarily use an archive_command value that makes the archiver fail
# while archiving is enabled.  Note that we cannot use a command that does
# not exist, as in this case the archiver process would just exit without
# reporting the failure to pg_stat_archiver.  A plain "false" cannot be
# used either, as that is not portable on Windows.  So, as a portable
# solution, use an archive command based on a command known to work but
# that will fail: a copy with an incorrect source path.
my $incorrect_command =
$TestLib::windows_os
? qq{copy "%p_does_not_exist" "%f_does_not_exist"}
: qq{cp "%p_does_not_exist" "%f_does_not_exist"};
# Install the failing command and reload the configuration so that it is
# taken into account.
$primary->safe_psql(
'postgres', qq{
ALTER SYSTEM SET archive_command TO '$incorrect_command';
SELECT pg_reload_conf();
});
# Save the name of the WAL segment currently in use, then switch to a new
# segment.  The saved segment is used to track the activity of the archiver.
my $segment_name_1 = $primary->safe_psql('postgres',
q{SELECT pg_walfile_name(pg_current_wal_lsn())});
# Archive status files of this segment, as paths relative to the data
# directory of a node.
my $segment_path_1 = "pg_wal/archive_status/$segment_name_1";
my $segment_path_1_ready = "$segment_path_1.ready";
my $segment_path_1_done = "$segment_path_1.done";
# Generate some WAL, force a segment switch so that the saved segment
# becomes a candidate for archiving, then checkpoint.
$primary->safe_psql(
'postgres', q{
CREATE TABLE mine AS SELECT generate_series(1,10) AS x;
SELECT pg_switch_wal();
CHECKPOINT;
});
# Wait for an archive failure.
$primary->poll_query_until('postgres',
q{SELECT failed_count > 0 FROM pg_stat_archiver}, 't')
or die "Timed out while waiting for archiving to fail";
# As archiving fails, the segment has to remain marked as .ready and must
# not be marked as .done.
ok( -f "$primary_data/$segment_path_1_ready",
".ready file exists for WAL segment $segment_name_1 waiting to be archived"
);
ok( !-f "$primary_data/$segment_path_1_done",
".done file does not exist for WAL segment $segment_name_1 waiting to be archived"
);
# Archiving is still failing at this point: no segment has been archived
# yet, and the segment switched out above is the last one reported as
# failed.  The test description is double-quoted so that the segment name
# is interpolated into it.
is( $primary->safe_psql(
'postgres', q{
SELECT archived_count, last_failed_wal
FROM pg_stat_archiver
}),
"0|$segment_name_1",
"pg_stat_archiver failed to archive $segment_name_1");
# Crash the cluster for the next test, in charge of checking that
# non-archived WAL segments are not removed.
$primary->stop('immediate');
# The recovery tests done with standbys partially check the recovery
# behavior when restoring a backup taken using a snapshot with no
# pg_start/stop_backup.  In this situation, the recovered standby should
# first enter crash recovery, then switch to regular archive recovery.
# Note that the base backup is taken here, while archive_command is still
# set to fail.  This is necessary for the assumptions of the tests done
# with the standbys below.
$primary->backup_fs_cold('backup');
# Restart the primary, which goes through crash recovery because of the
# immediate stop above.
$primary->start;
ok( -f "$primary_data/$segment_path_1_ready",
".ready file for WAL segment $segment_name_1 still exists after crash recovery on primary"
);
# Allow WAL archiving again and wait for a success.
$primary->safe_psql(
'postgres', q{
ALTER SYSTEM RESET archive_command;
SELECT pg_reload_conf();
});
$primary->poll_query_until('postgres',
q{SELECT archived_count FROM pg_stat_archiver}, '1')
or die "Timed out while waiting for archiving to finish";
# Once archived, the status of the segment switches from .ready to .done,
# and the success shows up in pg_stat_archiver.
ok(!-f "$primary_data/$segment_path_1_ready",
".ready file for archived WAL segment $segment_name_1 removed");
ok(-f "$primary_data/$segment_path_1_done",
".done file for archived WAL segment $segment_name_1 exists");
is( $primary->safe_psql(
'postgres', q{ SELECT last_archived_wal FROM pg_stat_archiver }),
$segment_name_1,
"archive success reported in pg_stat_archiver for WAL segment $segment_name_1"
);
# Create some WAL activity and a new checkpoint so that the next standby
# can create a restartpoint.  As this standby starts in crash recovery
# because of the cold backup taken previously, it needs a clean
# restartpoint to deal with existing status files.
my $segment_name_2 = $primary->safe_psql('postgres',
q{SELECT pg_walfile_name(pg_current_wal_lsn())});
# Archive status files of this second segment, as paths relative to the
# data directory of a node.
my $segment_path_2 = "pg_wal/archive_status/$segment_name_2";
my $segment_path_2_ready = "$segment_path_2.ready";
my $segment_path_2_done = "$segment_path_2.done";
$primary->safe_psql(
'postgres', q{
INSERT INTO mine SELECT generate_series(10,20) AS x;
SELECT pg_switch_wal();
CHECKPOINT;
});
# Wait until the primary reports the second segment as the last one
# archived.
$primary->poll_query_until('postgres',
q{ SELECT last_archived_wal FROM pg_stat_archiver },
$segment_name_2)
or die "Timed out while waiting for archiving to finish";
# Test standby with archive_mode = on.  The node is created from the cold
# backup of the primary, with has_restoring set (presumably so that it
# restores WAL from the archives of the primary -- confirm in PostgresNode).
my $standby1 = get_new_node('standby');
$standby1->init_from_backup($primary, 'backup', has_restoring => 1);
$standby1->append_conf('postgresql.conf', "archive_mode = on");
my $standby1_data = $standby1->data_dir;
$standby1->start;
# Issue a CHECKPOINT on the standby before looking at its status files.
$standby1->safe_psql('postgres', q{CHECKPOINT});
# Recovery with archive_mode=on does not keep .ready signal files inherited
# from backup. Note that this WAL segment existed in the backup.
ok( !-f "$standby1_data/$segment_path_1_ready",
".ready file for WAL segment $segment_name_1 present in backup got removed with archive_mode=on on standby"
);
# Recovery with archive_mode=on should not create .ready files.
# Note that this segment did not exist in the backup.
ok( !-f "$standby1_data/$segment_path_2_ready",
".ready file for WAL segment $segment_name_2 not created on standby when archive_mode=on on standby"
);
# Recovery with archive_mode = on creates .done files.
ok( -f "$standby1_data/$segment_path_2_done",
".done file for WAL segment $segment_name_2 created when archive_mode=on on standby"
);
# Test recovery with archive_mode = always, which should always keep
# .ready files if archiving is enabled.  Here we want the archive command
# to fail so that the .ready files persist.  Note that this node has
# inherited the archive command of the previous cold backup, which causes
# archiving failures.
my $standby2 = get_new_node('standby2');
$standby2->init_from_backup($primary, 'backup', has_restoring => 1);
$standby2->append_conf('postgresql.conf', 'archive_mode = always');
my $standby2_data = $standby2->data_dir;
$standby2->start;
# Issue a CHECKPOINT on the standby before looking at its status files.
$standby2->safe_psql('postgres', q{CHECKPOINT});
# Both the segment inherited from the backup and the one created
# afterwards have to be marked as .ready.
ok( -f "$standby2_data/$segment_path_1_ready",
".ready file for WAL segment $segment_name_1 existing in backup is kept with archive_mode=always on standby"
);
ok( -f "$standby2_data/$segment_path_2_ready",
".ready file for WAL segment $segment_name_2 created with archive_mode=always on standby"
);
# Reset statistics of the archiver for the next checks.
$standby2->safe_psql('postgres', q{SELECT pg_stat_reset_shared('archiver')});
# Now crash the cluster to check that the recovery step does not
# remove non-archived WAL segments on a standby where archiving
# is enabled.
$standby2->stop('immediate');
$standby2->start;
ok( -f "$standby2_data/$segment_path_1_ready",
"WAL segment still ready to archive after crash recovery on standby with archive_mode=always"
);
# Allow WAL archiving again, and wait for the segments to be archived.
$standby2->safe_psql(
'postgres', q{
ALTER SYSTEM RESET archive_command;
SELECT pg_reload_conf();
});
$standby2->poll_query_until('postgres',
q{SELECT last_archived_wal FROM pg_stat_archiver},
$segment_name_2)
or die "Timed out while waiting for archiving to finish";
# Statistics were reset above, so the count covers only the two segments
# tracked by this test.
is( $standby2->safe_psql(
'postgres', q{SELECT archived_count FROM pg_stat_archiver}),
'2',
"correct number of WAL segments archived from standby");
# After a successful archiving, both segments switch from .ready to .done.
ok( !-f "$standby2_data/$segment_path_1_ready"
&& !-f "$standby2_data/$segment_path_2_ready",
".ready files removed after archive success with archive_mode=always on standby"
);
ok( -f "$standby2_data/$segment_path_1_done"
&& -f "$standby2_data/$segment_path_2_done",
".done files created after archive success with archive_mode=always on standby"
);
...@@ -1983,6 +1983,7 @@ RecordCacheEntry ...@@ -1983,6 +1983,7 @@ RecordCacheEntry
RecordCompareData RecordCompareData
RecordIOData RecordIOData
RecoveryLockListsEntry RecoveryLockListsEntry
RecoveryState
RecoveryTargetTimeLineGoal RecoveryTargetTimeLineGoal
RecoveryTargetType RecoveryTargetType
RectBox RectBox
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment