Commit 5c995139 authored by Michael Paquier

Fix various checksum check problems for pg_verify_checksums and base backups

Three issues are fixed in this patch:
- Base backups forgot to ignore files specific to EXEC_BACKEND, leading
to spurious warnings when checksums are enabled, per analysis from me.
- pg_verify_checksums forgot about files specific to EXEC_BACKEND,
leading to failures of the tool on any such build, particularly Windows.
This error was originally found by newly-introduced TAP tests in various
buildfarm members using EXEC_BACKEND.
- pg_verify_checksums forgot to account for temporary files and temporary
paths, which could be valid relation files, without checksums, per
report from Andres Freund.  More tests are added to cover this case.

A new test case which emulates corruption for a file in a different
tablespace is added, coming from Michael Banck, while I have coded
the main code and refactored the test code.

Author: Michael Banck, Michael Paquier
Reviewed-by: Stephen Frost, David Steele
Discussion: https://postgr.es/m/20181021134206.GA14282@paquier.xyz
parent a1c91dd1
...@@ -189,12 +189,19 @@ static const char *excludeFiles[] = ...@@ -189,12 +189,19 @@ static const char *excludeFiles[] =
/* /*
* List of files excluded from checksum validation. * List of files excluded from checksum validation.
*
* Note: this list should be kept in sync with what pg_verify_checksums.c
* includes.
*/ */
static const char *const noChecksumFiles[] = { static const char *const noChecksumFiles[] = {
"pg_control", "pg_control",
"pg_filenode.map", "pg_filenode.map",
"pg_internal.init", "pg_internal.init",
"PG_VERSION", "PG_VERSION",
#ifdef EXEC_BACKEND
"config_exec_params",
"config_exec_params.new",
#endif
NULL, NULL,
}; };
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "storage/bufpage.h" #include "storage/bufpage.h"
#include "storage/checksum.h" #include "storage/checksum.h"
#include "storage/checksum_impl.h" #include "storage/checksum_impl.h"
#include "storage/fd.h"
static int64 files = 0; static int64 files = 0;
...@@ -49,11 +50,20 @@ usage(void) ...@@ -49,11 +50,20 @@ usage(void)
printf(_("Report bugs to <pgsql-bugs@postgresql.org>.\n")); printf(_("Report bugs to <pgsql-bugs@postgresql.org>.\n"));
} }
/*
* List of files excluded from checksum validation.
*
* Note: this list should be kept in sync with what basebackup.c includes.
*/
static const char *const skip[] = { static const char *const skip[] = {
"pg_control", "pg_control",
"pg_filenode.map", "pg_filenode.map",
"pg_internal.init", "pg_internal.init",
"PG_VERSION", "PG_VERSION",
#ifdef EXEC_BACKEND
"config_exec_params",
"config_exec_params.new",
#endif
NULL, NULL,
}; };
...@@ -62,13 +72,10 @@ skipfile(const char *fn) ...@@ -62,13 +72,10 @@ skipfile(const char *fn)
{ {
const char *const *f; const char *const *f;
if (strcmp(fn, ".") == 0 ||
strcmp(fn, "..") == 0)
return true;
for (f = skip; *f; f++) for (f = skip; *f; f++)
if (strcmp(*f, fn) == 0) if (strcmp(*f, fn) == 0)
return true; return true;
return false; return false;
} }
...@@ -146,9 +153,22 @@ scan_directory(const char *basedir, const char *subdir) ...@@ -146,9 +153,22 @@ scan_directory(const char *basedir, const char *subdir)
char fn[MAXPGPATH]; char fn[MAXPGPATH];
struct stat st; struct stat st;
if (skipfile(de->d_name)) if (strcmp(de->d_name, ".") == 0 ||
strcmp(de->d_name, "..") == 0)
continue;
/* Skip temporary files */
if (strncmp(de->d_name,
PG_TEMP_FILE_PREFIX,
strlen(PG_TEMP_FILE_PREFIX)) == 0)
continue; continue;
/*
 * Skip temporary folders.  This must "continue" to the next directory
 * entry, like the checks just above: returning here would abandon the
 * scan of every remaining entry in the current directory as soon as a
 * temporary folder is met.
 */
if (strncmp(de->d_name,
            PG_TEMP_FILES_DIR,
            strlen(PG_TEMP_FILES_DIR)) == 0)
    continue;
snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name); snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name);
if (lstat(fn, &st) < 0) if (lstat(fn, &st) < 0)
{ {
...@@ -163,6 +183,9 @@ scan_directory(const char *basedir, const char *subdir) ...@@ -163,6 +183,9 @@ scan_directory(const char *basedir, const char *subdir)
*segmentpath; *segmentpath;
BlockNumber segmentno = 0; BlockNumber segmentno = 0;
if (skipfile(de->d_name))
continue;
/* /*
* Cut off at the segment boundary (".") to get the segment number * Cut off at the segment boundary (".") to get the segment number
* in order to mix it into the checksum. Then also cut off at the * in order to mix it into the checksum. Then also cut off at the
......
...@@ -5,7 +5,74 @@ use strict; ...@@ -5,7 +5,74 @@ use strict;
use warnings; use warnings;
use PostgresNode; use PostgresNode;
use TestLib; use TestLib;
use Test::More tests => 36; use Test::More tests => 45;
# Utility routine to create and check a table with corrupted checksums
# on a wanted tablespace.  Note that this stops and starts the node
# multiple times to perform the checks, leaving the node started
# at the end.
#
# Arguments:
#   $node       - node object to run against; queries are issued right
#                 away, so it has to be started when this is called.
#   $table      - name of the table to create, corrupt and then drop.
#   $tablespace - name of the tablespace the table is moved to.
sub check_relation_corruption
{
    my ($node, $table, $tablespace) = @_;
    my $pgdata = $node->data_dir;

    # Create the table, keeping autovacuum away from it, and move it to
    # the wanted tablespace.
    $node->safe_psql('postgres',
        "SELECT a INTO $table FROM generate_series(1,10000) AS a;
        ALTER TABLE $table SET (autovacuum_enabled=false);");
    $node->safe_psql('postgres',
        "ALTER TABLE $table SET TABLESPACE $tablespace;");

    # Gather the on-disk location of the relation while the node still
    # accepts queries.
    my $file_corrupted = $node->safe_psql('postgres',
        "SELECT pg_relation_filepath('$table');");
    my $relfilenode_corrupted = $node->safe_psql('postgres',
        "SELECT relfilenode FROM pg_class WHERE relname = '$table';");

    # Offset at which the corruption is written, past the page header.
    my $pageheader_size = 24;

    $node->stop;

    # Checksums are correct for single relfilenode as the table is not
    # corrupted yet.
    command_ok(['pg_verify_checksums', '-D', $pgdata,
            '-r', $relfilenode_corrupted],
        "succeeds for single relfilenode on tablespace $tablespace with offline cluster");

    # Time to create some corruption.  The file is handled with the
    # unbuffered sysseek/syswrite pair only, as mixing buffered seek()
    # with syswrite() is not reliable.  The payload is double-quoted so
    # that real NUL bytes get written.
    my $corrupted_path = "$pgdata/$file_corrupted";
    open my $file, '+<', $corrupted_path
      or die "could not open \"$corrupted_path\": $!";
    binmode $file;
    sysseek($file, $pageheader_size, 0)
      or die "could not seek in \"$corrupted_path\": $!";
    defined syswrite($file, "\0\0\0\0\0\0\0\0\0")
      or die "could not write to \"$corrupted_path\": $!";
    close $file
      or die "could not close \"$corrupted_path\": $!";

    # Checksum checks on single relfilenode fail
    $node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata, '-r',
            $relfilenode_corrupted],
        1,
        [qr/Bad checksums:.*1/],
        [qr/checksum verification failed/],
        "fails with corrupted data for single relfilenode on tablespace $tablespace");

    # Global checksum checks fail as well
    $node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata],
        1,
        [qr/Bad checksums:.*1/],
        [qr/checksum verification failed/],
        "fails with corrupted data on tablespace $tablespace");

    # Drop corrupted table again and make sure there is no more corruption.
    $node->start;
    $node->safe_psql('postgres', "DROP TABLE $table;");
    $node->stop;
    $node->command_ok(['pg_verify_checksums', '-D', $pgdata],
        "succeeds again after table drop on tablespace $tablespace");

    # Leave the node running for the caller.
    $node->start;
    return;
}
# Initialize node with checksums enabled. # Initialize node with checksums enabled.
my $node = get_new_node('node_checksum'); my $node = get_new_node('node_checksum');
...@@ -27,6 +94,12 @@ append_to_file "$pgdata/global/99999_init.123", ""; ...@@ -27,6 +94,12 @@ append_to_file "$pgdata/global/99999_init.123", "";
append_to_file "$pgdata/global/99999_fsm.123", ""; append_to_file "$pgdata/global/99999_fsm.123", "";
append_to_file "$pgdata/global/99999_vm.123", ""; append_to_file "$pgdata/global/99999_vm.123", "";
# These are temporary files and folders with dummy contents, which
# should be ignored by the scan.
append_to_file "$pgdata/global/pgsql_tmp_123", "foo";
mkdir "$pgdata/global/pgsql_tmp";
append_to_file "$pgdata/global/pgsql_tmp/1.1", "foo";
# Checksums pass on a newly-created cluster # Checksums pass on a newly-created cluster
command_ok(['pg_verify_checksums', '-D', $pgdata], command_ok(['pg_verify_checksums', '-D', $pgdata],
"succeeds with offline cluster"); "succeeds with offline cluster");
...@@ -36,47 +109,16 @@ $node->start; ...@@ -36,47 +109,16 @@ $node->start;
command_fails(['pg_verify_checksums', '-D', $pgdata], command_fails(['pg_verify_checksums', '-D', $pgdata],
"fails with online cluster"); "fails with online cluster");
# Create table to corrupt and get its relfilenode # Check corruption of table on default tablespace.
$node->safe_psql('postgres', check_relation_corruption($node, 'corrupt1', 'pg_default');
"SELECT a INTO corrupt1 FROM generate_series(1,10000) AS a;
ALTER TABLE corrupt1 SET (autovacuum_enabled=false);");
my $file_corrupted = $node->safe_psql('postgres',
"SELECT pg_relation_filepath('corrupt1')");
my $relfilenode_corrupted = $node->safe_psql('postgres',
"SELECT relfilenode FROM pg_class WHERE relname = 'corrupt1';");
# Set page header and block size
my $pageheader_size = 24;
my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
$node->stop;
# Checksums are correct for single relfilenode as the table is not
# corrupted yet.
command_ok(['pg_verify_checksums', '-D', $pgdata,
'-r', $relfilenode_corrupted],
"succeeds for single relfilenode with offline cluster");
# Time to create some corruption
open my $file, '+<', "$pgdata/$file_corrupted";
seek($file, $pageheader_size, 0);
syswrite($file, '\0\0\0\0\0\0\0\0\0');
close $file;
# Global checksum checks fail # Create tablespace to check corruptions in a non-default tablespace.
$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata], my $basedir = $node->basedir;
1, my $tablespace_dir = "$basedir/ts_corrupt_dir";
[qr/Bad checksums:.*1/], mkdir ($tablespace_dir);
[qr/checksum verification failed/], $node->safe_psql('postgres',
'fails with corrupted data'); "CREATE TABLESPACE ts_corrupt LOCATION '$tablespace_dir';");
check_relation_corruption($node, 'corrupt2', 'ts_corrupt');
# Checksum checks on single relfilenode fail
$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata, '-r',
$relfilenode_corrupted],
1,
[qr/Bad checksums:.*1/],
[qr/checksum verification failed/],
'fails for corrupted data on single relfilenode');
# Utility routine to check that pg_verify_checksums is able to detect # Utility routine to check that pg_verify_checksums is able to detect
# correctly-named relation files filled with some corrupted data. # correctly-named relation files filled with some corrupted data.
...@@ -101,6 +143,9 @@ sub fail_corrupt ...@@ -101,6 +143,9 @@ sub fail_corrupt
return; return;
} }
# Stop instance for the follow-up checks.
$node->stop;
# Authorized relation files filled with corrupted data cause the # Authorized relation files filled with corrupted data cause the
# checksum checks to fail. Make sure to use file names different # checksum checks to fail. Make sure to use file names different
# than the previous ones. # than the previous ones.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment