Improve handling and coverage of --no-ensure-shutdown in pg_rewind

This includes a couple of changes around the new behavior of pg_rewind which enforces recovery to happen once on a cluster not shut down cleanly: - Some comments and documentation improvements. - Shutdown the cluster to rewind with immediate mode in all the tests, this allows to check after the forced recovery behavior which is wanted as new default. - Use -F for the forced recovery step, so as postgres does not use fsync. This was useless as a final sync is done once the tool is done. Author: Michael Paquier Reviewed-by: Alexey Kondratov Discussion: https://postgr.es/m/20191004083721.GA1829@paquier.xyz

Improve handling and coverage of --no-ensure-shutdown in pg_rewind
This includes a couple of changes around the new behavior of pg_rewind which enforces recovery to happen once on a cluster not shut down cleanly: - Some comments and documentation improvements. - Shutdown the cluster to rewind with immediate mode in all the tests, this allows to check after the forced recovery behavior which is wanted as new default. - Use -F for the forced recovery step, so as postgres does not use fsync. This was useless as a final sync is done once the tool is done. Author: Michael Paquier Reviewed-by: Alexey Kondratov Discussion: https://postgr.es/m/20191004083721.GA1829@paquier.xyz
caa07835 · Michael Paquier · 732457b5 · caa07835 · caa07835 · caa07835
Commit caa07835 authored Oct 07, 2019 by Michael Paquier
4 changed files
--- a/doc/src/sgml/ref/pg_rewind.sgml
+++ b/doc/src/sgml/ref/pg_rewind.sgml
@@ -169,9 +169,11 @@ PostgreSQL documentation
      <term><option>--no-ensure-shutdown</option></term>
      <listitem>
       <para>
-        <application>pg_rewind</application> verifies that the target server
+        <application>pg_rewind</application> requires that the target server
-        is cleanly shutdown before rewinding; by default, if it isn't, it
+        is cleanly shut down before rewinding. By default, if the target server
-        starts the server in single-user mode to complete crash recovery.
+        is not shut down cleanly, <application>pg_rewind</application> starts
+        the target server in single-user mode to complete crash recovery first,
+        and stops it.
        By passing this option, <application>pg_rewind</application> skips
        this and errors out immediately if the server is not cleanly shut
        down. Users are expected to handle the situation themselves in that

--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -270,11 +270,12 @@ main(int argc, char **argv)
 	pg_free(buffer);
 	/*
-	 * If the target instance was not cleanly shut down, run a single-user
+	 * If the target instance was not cleanly shut down, start and stop the
-	 * postgres session really quickly and reload the control file to get the
+	 * target cluster once in single-user mode to enforce recovery to finish,
-	 * new state. Note if no_ensure_shutdown is specified, pg_rewind won't do
+	 * ensuring that the cluster can be used by pg_rewind.  Note that if
-	 * that automatically. That means users need to do themselves in advance,
+	 * no_ensure_shutdown is specified, pg_rewind ignores this step, and users
-	 * else pg_rewind will soon quit, see sanityChecks().
+	 * need to make sure by themselves that the target cluster is in a clean
+	 * state.
 	 */
 	if (!no_ensure_shutdown &&
 		ControlFile_target.state != DB_SHUTDOWNED &&
@@ -847,8 +848,12 @@ ensureCleanShutdown(const char *argv0)
 	if (dry_run)
 		return;
-	/* finally run postgres in single-user mode */
+	/*
-	snprintf(cmd, MAXCMDLEN, "\"%s\" --single -D \"%s\" template1 < \"%s\"",
+	 * Finally run postgres in single-user mode.  There is no need to use
+	 * fsync here.  This makes the recovery faster, and the target data folder
+	 * is synced at the end anyway.
+	 */
+	snprintf(cmd, MAXCMDLEN, "\"%s\" --single -F -D \"%s\" template1 < \"%s\"",
 			 exec_path, datadir_target, DEVNULL);
 	if (system(cmd) != 0)

--- a/src/bin/pg_rewind/t/005_same_timeline.pl
+++ b/src/bin/pg_rewind/t/005_same_timeline.pl
+#
+# Test that running pg_rewind with the source and target clusters
+# on the same timeline runs successfully.
+#
 use strict;
 use warnings;
 use TestLib;

--- a/src/bin/pg_rewind/t/RewindTest.pm
+++ b/src/bin/pg_rewind/t/RewindTest.pm
@@ -227,8 +227,10 @@ sub run_pg_rewind
 	# Append the rewind-specific role to the connection string.
 	$standby_connstr = "$standby_connstr user=rewind_user";
-	# Stop the master and be ready to perform the rewind
+	# Stop the master and be ready to perform the rewind.  The cluster
-	$node_master->stop;
+	# needs recovery to finish once, and pg_rewind makes sure that it
+	# happens automatically.
+	$node_master->stop('immediate');
 	# At this point, the rewind processing is ready to run.
 	# We now have a very simple scenario with a few diverged WAL record.
@@ -260,19 +262,21 @@ sub run_pg_rewind
 	}
 	elsif ($test_mode eq "remote")
 	{
+		# Do rewind using a remote connection as source, generating
-		# Do rewind using a remote connection as source
+		# recovery configuration automatically.
 		command_ok(
 			[
 				'pg_rewind',                      "--debug",
 				"--source-server",                $standby_connstr,
-				"--target-pgdata=$master_pgdata", "-R",
+				"--target-pgdata=$master_pgdata", "--no-sync",
-				"--no-sync"
+				"--write-recovery-conf"
 			],
 			'pg_rewind remote');
-		# Check that standby.signal has been created.
+		# Check that standby.signal is here as recovery configuration
-		ok(-e "$master_pgdata/standby.signal");
+		# was requested.
+		ok( -e "$master_pgdata/standby.signal",
+			'standby.signal created after pg_rewind');
 		# Now, when pg_rewind apparently succeeded with minimal permissions,
 		# add REPLICATION privilege.  So we could test that new standby