Commit 9c4f5192 authored by Heikki Linnakangas's avatar Heikki Linnakangas

Allow pg_rewind to use a standby server as the source system.

Using a hot standby server as the source has not been possible, because
pg_rewind creates a temporary table in the source system, to hold the
list of file ranges that need to be fetched. Refactor it to queue up the
file fetch requests in pg_rewind's memory, so that the temporary table
is no longer needed.

Also update the logic to compute 'minRecoveryPoint' correctly, when the
source is a standby server.

Reviewed-by: Kyotaro Horiguchi, Soumyadeep Chakraborty
Discussion: https://www.postgresql.org/message-id/0c5b3783-af52-3ee5-f8fa-6e794061f70d%40iki.fi
parent 1b2b19f7
......@@ -173,7 +173,7 @@ PostgreSQL documentation
with a role having sufficient permissions to execute the functions
used by <application>pg_rewind</application> on the source server
(see Notes section for details) or a superuser role. This option
requires the source server to be running and not in recovery mode.
requires the source server to be running and accepting connections.
</para>
</listitem>
</varlistentry>
......
This diff is collapsed.
......@@ -50,6 +50,7 @@ static void disconnect_atexit(void);
static ControlFileData ControlFile_target;
static ControlFileData ControlFile_source;
static ControlFileData ControlFile_source_after;
const char *progname;
int WalSegSz;
......@@ -486,6 +487,8 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
XLogRecPtr endrec;
TimeLineID endtli;
ControlFileData ControlFile_new;
size_t size;
char *buffer;
/*
* Execute the actions in the file map, fetching data from the source
......@@ -552,40 +555,104 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
}
}
/*
* We've now copied the list of file ranges that we need to fetch to the
* temporary table. Now, actually fetch all of those ranges.
*/
/* Complete any remaining range-fetches that we queued up above. */
source->finish_fetch(source);
close_target_file();
progress_report(true);
/*
* Fetch the control file from the source last. This ensures that the
* minRecoveryPoint is up-to-date.
*/
buffer = source->fetch_file(source, "global/pg_control", &size);
digestControlFile(&ControlFile_source_after, buffer, size);
pg_free(buffer);
/*
* Sanity check: If the source is a local system, the control file should
* not have changed since we started.
*
* XXX: We assume it hasn't been modified, but actually, what could go
* wrong? The logic handles a libpq source that's modified concurrently,
* why not a local datadir?
*/
if (datadir_source &&
memcmp(&ControlFile_source, &ControlFile_source_after,
sizeof(ControlFileData)) != 0)
{
pg_fatal("source system was modified while pg_rewind was running");
}
if (showprogress)
pg_log_info("creating backup label and updating control file");
createBackupLabel(chkptredo, chkpttli, chkptrec);
/*
* Update control file of target. Make it ready to perform archive
* recovery when restarting.
* Create a backup label file, to tell the target where to begin the WAL
* replay. Normally, from the last common checkpoint between the source
* and the target. But if the source is a standby server, it's possible
* that the last common checkpoint is *after* the standby's restartpoint.
* That implies that the source server has applied the checkpoint record,
* but hasn't performed a corresponding restartpoint yet. Make sure we
* start at the restartpoint's redo point in that case.
*
* Like in an online backup, it's important that we replay all the WAL
* that was generated while we copied the files over. To enforce that, set
* 'minRecoveryPoint' in the control file.
* Use the old version of the source's control file for this. The server
* might have finished the restartpoint after we started copying files,
* but we must begin from the redo point at the time we started copying files.
*/
memcpy(&ControlFile_new, &ControlFile_source, sizeof(ControlFileData));
if (ControlFile_source.checkPointCopy.redo < chkptredo)
{
chkptredo = ControlFile_source.checkPointCopy.redo;
chkpttli = ControlFile_source.checkPointCopy.ThisTimeLineID;
chkptrec = ControlFile_source.checkPoint;
}
createBackupLabel(chkptredo, chkpttli, chkptrec);
/*
* Update control file of target, to tell the target how far it must
* replay the WAL (minRecoveryPoint).
*/
if (connstr_source)
{
endrec = source->get_current_wal_insert_lsn(source);
endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
/*
* The source is a live server. Like in an online backup, it's
* important that we recover all the WAL that was generated while we
* were copying files.
*/
if (ControlFile_source_after.state == DB_IN_ARCHIVE_RECOVERY)
{
/*
* Source is a standby server. We must replay to its
* minRecoveryPoint.
*/
endrec = ControlFile_source_after.minRecoveryPoint;
endtli = ControlFile_source_after.minRecoveryPointTLI;
}
else
{
/*
* Source is a production, non-standby, server. We must replay to
* the last WAL insert location.
*/
if (ControlFile_source_after.state != DB_IN_PRODUCTION)
pg_fatal("source system was in unexpected state at end of rewind");
endrec = source->get_current_wal_insert_lsn(source);
endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
}
}
else
{
endrec = ControlFile_source.checkPoint;
endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;
/*
* Source is a local data directory. It should've shut down cleanly,
* and we must replay to the latest shutdown checkpoint.
*/
endrec = ControlFile_source_after.checkPoint;
endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID;
}
memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData));
ControlFile_new.minRecoveryPoint = endrec;
ControlFile_new.minRecoveryPointTLI = endtli;
ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
......
......@@ -40,10 +40,22 @@ sub run_test
"in standby1";
append_to_file "$test_standby_datadir/tst_standby_dir/standby_file2",
"in standby2";
mkdir "$test_standby_datadir/tst_standby_dir/standby_subdir/";
append_to_file
"$test_standby_datadir/tst_standby_dir/standby_subdir/standby_file3",
"$test_standby_datadir/tst_standby_dir/standby_file3 with 'quotes'",
"in standby3";
append_to_file
"$test_standby_datadir/tst_standby_dir/standby_file4 with double\"quote",
"in standby4";
append_to_file
"$test_standby_datadir/tst_standby_dir/standby_file5 with back\\slash",
"in standby5";
append_to_file
"$test_standby_datadir/tst_standby_dir/standby_file6_with_backslash\\\"and_double-quote",
"in standby6";
mkdir "$test_standby_datadir/tst_standby_dir/standby_subdir/";
append_to_file
"$test_standby_datadir/tst_standby_dir/standby_subdir/standby_file7",
"in standby7";
mkdir "$test_primary_datadir/tst_primary_dir";
append_to_file "$test_primary_datadir/tst_primary_dir/primary_file1",
......@@ -58,7 +70,9 @@ sub run_test
RewindTest::promote_standby();
RewindTest::run_pg_rewind($test_mode);
# List files in the data directory after rewind.
# List files in the data directory after rewind. All the files that
# were present in the standby should be present after rewind, and
# all the files that were added on the primary should be removed.
my @paths;
find(
sub {
......@@ -78,8 +92,12 @@ sub run_test
"$test_primary_datadir/tst_standby_dir",
"$test_primary_datadir/tst_standby_dir/standby_file1",
"$test_primary_datadir/tst_standby_dir/standby_file2",
"$test_primary_datadir/tst_standby_dir/standby_file3 with 'quotes'",
"$test_primary_datadir/tst_standby_dir/standby_file4 with double\"quote",
"$test_primary_datadir/tst_standby_dir/standby_file5 with back\\slash",
"$test_primary_datadir/tst_standby_dir/standby_file6_with_backslash\\\"and_double-quote",
"$test_primary_datadir/tst_standby_dir/standby_subdir",
"$test_primary_datadir/tst_standby_dir/standby_subdir/standby_file3"
"$test_primary_datadir/tst_standby_dir/standby_subdir/standby_file7"
],
"file lists match");
......
#
# Test using a standby server as the source.
#
# This sets up three nodes: A, B and C. First, A is the primary,
# B follows A, and C follows B:
#
# A (primary) <--- B (standby) <--- C (standby)
#
#
# Then we promote C, and insert some divergent rows in A and C:
#
# A (primary) <--- B (standby) C (primary)
#
#
# Finally, we run pg_rewind on C, to re-point it at B again:
#
# A (primary) <--- B (standby) <--- C (standby)
#
#
# The test is similar to the basic tests, but since we're dealing with
# three nodes, not two, we cannot use most of the RewindTest functions
# as is.
use strict;
use warnings;
use TestLib;
use Test::More tests => 3;
use FindBin;
use lib $FindBin::RealBin;
use File::Copy;
use PostgresNode;
use RewindTest;
# Scratch directory for stashing node C's postgresql.conf across the rewind.
my $tmp_folder = TestLib::tempdir;
my $node_a;
my $node_b;
my $node_c;
# Set up node A, as primary
#
# A (primary)
setup_cluster('a');
start_primary();
# RewindTest exports $node_primary; keep our own handle to node A as well.
$node_a = $node_primary;
# Create a test table and insert a row in primary.
$node_a->safe_psql('postgres', "CREATE TABLE tbl1 (d text)");
$node_a->safe_psql('postgres', "INSERT INTO tbl1 VALUES ('in A')");
primary_psql("CHECKPOINT");
# Set up node B and C, as cascaded standbys
#
# A (primary) <--- B (standby) <--- C (standby)
$node_a->backup('my_backup');
$node_b = get_new_node('node_b');
$node_b->init_from_backup($node_a, 'my_backup', has_streaming => 1);
$node_b->set_standby_mode();
$node_b->start;
# Initialize C from a backup of B, so that C follows B rather than A.
$node_b->backup('my_backup');
$node_c = get_new_node('node_c');
$node_c->init_from_backup($node_b, 'my_backup', has_streaming => 1);
$node_c->set_standby_mode();
$node_c->start;
# Insert additional data on A, and wait for both standbys to catch up.
$node_a->safe_psql('postgres',
"INSERT INTO tbl1 values ('in A, before promotion')");
$node_a->safe_psql('postgres', 'CHECKPOINT');
my $lsn = $node_a->lsn('insert');
$node_a->wait_for_catchup('node_b', 'write', $lsn);
$node_b->wait_for_catchup('node_c', 'write', $lsn);
# Promote C
#
# A (primary) <--- B (standby) C (primary)
$node_c->promote;
$node_c->safe_psql('postgres', "checkpoint");
# Insert a row in A. This causes A/B and C to have "diverged", so that it's
# no longer possible to just apply the standby's logs over primary directory
# - you need to rewind.
$node_a->safe_psql('postgres',
"INSERT INTO tbl1 VALUES ('in A, after C was promoted')");
# Also insert a new row in the standby, which won't be present in the
# old primary.
$node_c->safe_psql('postgres',
"INSERT INTO tbl1 VALUES ('in C, after C was promoted')");
#
# All set up. We're ready to run pg_rewind.
#
my $node_c_pgdata = $node_c->data_dir;
# Stop the node and be ready to perform the rewind.
$node_c->stop('fast');
# Keep a temporary postgresql.conf or it would be overwritten during the rewind.
copy(
"$node_c_pgdata/postgresql.conf",
"$tmp_folder/node_c-postgresql.conf.tmp");
{
# Temporarily unset PGAPPNAME so that the server doesn't
# inherit it. Otherwise this could affect libpqwalreceiver
# connections in confusing ways.
local %ENV = %ENV;
delete $ENV{PGAPPNAME};
# Do rewind using a remote connection as source, generating
# recovery configuration automatically. Note that the source is
# node B, a standby: this is the new capability under test.
command_ok(
[
'pg_rewind', "--debug",
"--source-server", $node_b->connstr('postgres'),
"--target-pgdata=$node_c_pgdata", "--no-sync",
"--write-recovery-conf"
],
'pg_rewind remote');
}
# Now move back postgresql.conf with old settings
move(
"$tmp_folder/node_c-postgresql.conf.tmp",
"$node_c_pgdata/postgresql.conf");
# Restart the node.
$node_c->start;
# set RewindTest::node_primary to point to the rewound node, so that we can
# use check_query()
$node_primary = $node_c;
# Run some checks to verify that C has been successfully rewound,
# and connected back to follow B. The row inserted on C after promotion
# must be gone; the rows inserted on A must all be visible.
check_query(
'SELECT * FROM tbl1',
qq(in A
in A, before promotion
in A, after C was promoted
),
'table content after rewind');
# Insert another row, and observe that it's cascaded from A to B to C.
$node_a->safe_psql('postgres',
"INSERT INTO tbl1 values ('in A, after rewind')");
$lsn = $node_a->lsn('insert');
$node_b->wait_for_catchup('node_c', 'write', $lsn);
check_query(
'SELECT * FROM tbl1',
qq(in A
in A, before promotion
in A, after C was promoted
in A, after rewind
),
'table content after rewind and insert');
# clean up
$node_a->teardown_node;
$node_b->teardown_node;
$node_c->teardown_node;
exit(0);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment