Commit 4eb77d50 authored by Magnus Hagander's avatar Magnus Hagander

Validate page level checksums in base backups

When base backups are run over the replication protocol (for example
using pg_basebackup), verify the checksums of all data blocks if
checksums are enabled. If checksum failures are encountered, log them
as warnings but don't abort the backup.

This becomes the default behaviour in pg_basebackup (provided checksums
are enabled on the server), so add a switch (-k) to disable the checks
if necessary.

Author: Michael Banck
Reviewed-By: Magnus Hagander, David Steele
Discussion: https://postgr.es/m/20180228180856.GE13784@nighthawk.caipicrew.dd-dns.de
parent 4923550c
......@@ -2382,7 +2382,7 @@ The commands accepted in replication mode are:
</varlistentry>
<varlistentry>
<term><literal>BASE_BACKUP</literal> [ <literal>LABEL</literal> <replaceable>'label'</replaceable> ] [ <literal>PROGRESS</literal> ] [ <literal>FAST</literal> ] [ <literal>WAL</literal> ] [ <literal>NOWAIT</literal> ] [ <literal>MAX_RATE</literal> <replaceable>rate</replaceable> ] [ <literal>TABLESPACE_MAP</literal> ]
<term><literal>BASE_BACKUP</literal> [ <literal>LABEL</literal> <replaceable>'label'</replaceable> ] [ <literal>PROGRESS</literal> ] [ <literal>FAST</literal> ] [ <literal>WAL</literal> ] [ <literal>NOWAIT</literal> ] [ <literal>MAX_RATE</literal> <replaceable>rate</replaceable> ] [ <literal>TABLESPACE_MAP</literal> ] [ <literal>NOVERIFY_CHECKSUMS</literal> ]
<indexterm><primary>BASE_BACKUP</primary></indexterm>
</term>
<listitem>
......@@ -2481,6 +2481,17 @@ The commands accepted in replication mode are:
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>NOVERIFY_CHECKSUMS</literal></term>
<listitem>
<para>
By default, checksums are verified during a base backup if they are
enabled. Specifying <literal>NOVERIFY_CHECKSUMS</literal> disables
this verification.
</para>
</listitem>
</varlistentry>
</variablelist>
</para>
<para>
......
......@@ -506,6 +506,22 @@ PostgreSQL documentation
</listitem>
</varlistentry>
<varlistentry>
<term><option>-k</option></term>
<term><option>--no-verify-checksums</option></term>
<listitem>
<para>
Disables verification of checksums, if they are enabled on the server
the base backup is taken from.
</para>
<para>
By default, checksums are verified and checksum failures will result in
a non-zero exit status. However, the base backup will not be removed in
this case, as if the <literal>--no-clean</literal> option was used.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-v</option></term>
<term><option>--verbose</option></term>
......
......@@ -31,6 +31,8 @@
#include "replication/basebackup.h"
#include "replication/walsender.h"
#include "replication/walsender_private.h"
#include "storage/bufpage.h"
#include "storage/checksum.h"
#include "storage/dsm_impl.h"
#include "storage/fd.h"
#include "storage/ipc.h"
......@@ -70,6 +72,7 @@ static void parse_basebackup_options(List *options, basebackup_options *opt);
static void SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli);
static int compareWalFileNames(const void *a, const void *b);
static void throttle(size_t increment);
static bool is_checksummed_file(const char *fullpath, const char *filename);
/* Was the backup currently in-progress initiated in recovery mode? */
static bool backup_started_in_recovery = false;
......@@ -99,6 +102,15 @@ static TimeOffset elapsed_min_unit;
/* The last check of the transfer rate. */
static TimestampTz throttled_last;
/* The starting XLOG position of the base backup. */
static XLogRecPtr startptr;
/* Total number of checksum failures during base backup. */
static int64 total_checksum_failures;
/* Do not verify checksums. */
static bool noverify_checksums = false;
/*
* The contents of these directories are removed or recreated during server
* start so they are not included in backups. The directories themselves are
......@@ -175,6 +187,18 @@ static const char *excludeFiles[] =
NULL
};
/*
* List of files excluded from checksum validation.
*/
static const char *noChecksumFiles[] = {
"pg_control",
"pg_filenode.map",
"pg_internal.init",
"PG_VERSION",
NULL,
};
/*
* Called when ERROR or FATAL happens in perform_base_backup() after
* we have started the backup - make sure we end it!
......@@ -194,7 +218,6 @@ base_backup_cleanup(int code, Datum arg)
static void
perform_base_backup(basebackup_options *opt)
{
XLogRecPtr startptr;
TimeLineID starttli;
XLogRecPtr endptr;
TimeLineID endtli;
......@@ -210,6 +233,8 @@ perform_base_backup(basebackup_options *opt)
labelfile = makeStringInfo();
tblspc_map_file = makeStringInfo();
total_checksum_failures = 0;
startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
labelfile, &tablespaces,
tblspc_map_file,
......@@ -568,6 +593,17 @@ perform_base_backup(basebackup_options *opt)
pq_putemptymessage('c');
}
SendXlogRecPtrResult(endptr, endtli);
if (total_checksum_failures)
{
if (total_checksum_failures > 1)
ereport(WARNING,
(errmsg("%ld total checksum verification failures", total_checksum_failures)));
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("checksum verification failure during base backup")));
}
}
/*
......@@ -597,6 +633,7 @@ parse_basebackup_options(List *options, basebackup_options *opt)
bool o_wal = false;
bool o_maxrate = false;
bool o_tablespace_map = false;
bool o_noverify_checksums = false;
MemSet(opt, 0, sizeof(*opt));
foreach(lopt, options)
......@@ -676,6 +713,15 @@ parse_basebackup_options(List *options, basebackup_options *opt)
opt->sendtblspcmapfile = true;
o_tablespace_map = true;
}
else if (strcmp(defel->defname, "noverify_checksums") == 0)
{
if (o_noverify_checksums)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("duplicate option \"%s\"", defel->defname)));
noverify_checksums = true;
o_noverify_checksums = true;
}
else
elog(ERROR, "option \"%s\" not recognized",
defel->defname);
......@@ -1257,6 +1303,33 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
return size;
}
/*
* Check if a file should have its checksum validated.
* We validate checksums on files in regular tablespaces
* (including global and default) only, and in those there
* are some files that are explicitly excluded.
*/
static bool
is_checksummed_file(const char *fullpath, const char *filename)
{
const char **f;
/* Check that the file is in a tablespace */
if (strncmp(fullpath, "./global/", 9) == 0 ||
strncmp(fullpath, "./base/", 7) == 0 ||
strncmp(fullpath, "/", 1) == 0)
{
/* Compare file against noChecksumFiles skiplist */
for (f = noChecksumFiles; *f; f++)
if (strcmp(*f, filename) == 0)
return false;
return true;
}
else
return false;
}
/*****
* Functions for handling tar file format
*
......@@ -1277,10 +1350,20 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
bool missing_ok)
{
FILE *fp;
BlockNumber blkno = 0;
bool block_retry = false;
char buf[TAR_SEND_SIZE];
uint16 checksum;
int checksum_failures = 0;
size_t cnt;
int i;
pgoff_t len = 0;
char *page;
size_t pad;
PageHeader phdr;
int segmentno = 0;
char *segmentpath;
bool verify_checksum = false;
fp = AllocateFile(readfilename, "rb");
if (fp == NULL)
......@@ -1294,8 +1377,142 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
_tarWriteHeader(tarfilename, NULL, statbuf, false);
if (!noverify_checksums && DataChecksumsEnabled())
{
char *filename;
/*
* Get the filename (excluding path). As last_dir_separator()
* includes the last directory separator, we chop that off by
* incrementing the pointer.
*/
filename = last_dir_separator(readfilename) + 1;
if (is_checksummed_file(readfilename, filename))
{
verify_checksum = true;
/*
* Cut off at the segment boundary (".") to get the segment number
* in order to mix it into the checksum.
*/
segmentpath = strstr(filename, ".");
if (segmentpath != NULL)
{
segmentno = atoi(segmentpath + 1);
if (segmentno == 0)
ereport(ERROR,
(errmsg("invalid segment number %d in file \"%s\"",
segmentno, filename)));
}
}
}
while ((cnt = fread(buf, 1, Min(sizeof(buf), statbuf->st_size - len), fp)) > 0)
{
if (verify_checksum)
{
/*
* The checksums are verified at block level, so we iterate over
* the buffer in chunks of BLCKSZ, after making sure that
* TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple
* of BLCKSZ bytes.
*/
Assert(TAR_SEND_SIZE % BLCKSZ == 0);
if (cnt % BLCKSZ != 0)
{
ereport(WARNING,
(errmsg("cannot verify checksum in file \"%s\", block "
"%d: read buffer size %d and page size %d "
"differ",
readfilename, blkno, (int) cnt, BLCKSZ)));
verify_checksum = false;
continue;
}
for (i = 0; i < cnt / BLCKSZ; i++)
{
page = buf + BLCKSZ * i;
/*
* Only check pages which have not been modified since the
* start of the base backup. Otherwise, they might have been
* written only halfway and the checksum would not be valid.
* However, replaying WAL would reinstate the correct page in
* this case.
*/
if (PageGetLSN(page) < startptr)
{
checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
phdr = (PageHeader) page;
if (phdr->pd_checksum != checksum)
{
/*
* Retry the block on the first failure. It's
* possible that we read the first 4K page of the
* block just before postgres updated the entire block
* so it ends up looking torn to us. We only need to
* retry once because the LSN should be updated to
* something we can ignore on the next pass. If the
* error happens again then it is a true validation
* failure.
*/
if (block_retry == false)
{
/* Reread the failed block */
if (fseek(fp, -(cnt - BLCKSZ * i), SEEK_CUR) == -1)
{
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fseek in file \"%s\": %m",
readfilename)));
}
if (fread(buf + BLCKSZ * i, 1, BLCKSZ, fp) != BLCKSZ)
{
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not reread block %d of file \"%s\": %m",
blkno, readfilename)));
}
if (fseek(fp, cnt - BLCKSZ * i - BLCKSZ, SEEK_CUR) == -1)
{
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fseek in file \"%s\": %m",
readfilename)));
}
/* Set flag so we know a retry was attempted */
block_retry = true;
/* Reset loop to validate the block again */
i--;
continue;
}
checksum_failures++;
if (checksum_failures <= 5)
ereport(WARNING,
(errmsg("checksum verification failed in "
"file \"%s\", block %d: calculated "
"%X but expected %X",
readfilename, blkno, checksum,
phdr->pd_checksum)));
if (checksum_failures == 5)
ereport(WARNING,
(errmsg("further checksum verification "
"failures in file \"%s\" will not "
"be reported", readfilename)));
}
}
block_retry = false;
blkno++;
}
}
/* Send the chunk as a CopyData message */
if (pq_putmessage('d', buf, cnt))
ereport(ERROR,
......@@ -1341,6 +1558,14 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
FreeFile(fp);
if (checksum_failures > 1)
{
ereport(WARNING,
(errmsg("file \"%s\" has a total of %d checksum verification "
"failures", readfilename, checksum_failures)));
}
total_checksum_failures += checksum_failures;
return true;
}
......
......@@ -77,6 +77,7 @@ static SQLCmd *make_sqlcmd(void);
%token K_MAX_RATE
%token K_WAL
%token K_TABLESPACE_MAP
%token K_NOVERIFY_CHECKSUMS
%token K_TIMELINE
%token K_PHYSICAL
%token K_LOGICAL
......@@ -154,7 +155,7 @@ var_name: IDENT { $$ = $1; }
/*
* BASE_BACKUP [LABEL '<label>'] [PROGRESS] [FAST] [WAL] [NOWAIT]
* [MAX_RATE %d] [TABLESPACE_MAP]
* [MAX_RATE %d] [TABLESPACE_MAP] [NOVERIFY_CHECKSUMS]
*/
base_backup:
K_BASE_BACKUP base_backup_opt_list
......@@ -208,6 +209,11 @@ base_backup_opt:
$$ = makeDefElem("tablespace_map",
(Node *)makeInteger(true), -1);
}
| K_NOVERIFY_CHECKSUMS
{
$$ = makeDefElem("noverify_checksums",
(Node *)makeInteger(true), -1);
}
;
create_replication_slot:
......
......@@ -92,6 +92,7 @@ PROGRESS { return K_PROGRESS; }
MAX_RATE { return K_MAX_RATE; }
WAL { return K_WAL; }
TABLESPACE_MAP { return K_TABLESPACE_MAP; }
NOVERIFY_CHECKSUMS { return K_NOVERIFY_CHECKSUMS; }
TIMELINE { return K_TIMELINE; }
START_REPLICATION { return K_START_REPLICATION; }
CREATE_REPLICATION_SLOT { return K_CREATE_REPLICATION_SLOT; }
......
......@@ -39,6 +39,7 @@
#include "replication/basebackup.h"
#include "streamutil.h"
#define ERRCODE_DATA_CORRUPTED "XX001"
typedef struct TablespaceListCell
{
......@@ -81,6 +82,7 @@ static char *xlog_dir = NULL;
static char format = 'p'; /* p(lain)/t(ar) */
static char *label = "pg_basebackup base backup";
static bool noclean = false;
static bool checksum_failure = false;
static bool showprogress = false;
static int verbose = 0;
static int compresslevel = 0;
......@@ -95,6 +97,7 @@ static char *replication_slot = NULL;
static bool temp_replication_slot = true;
static bool create_slot = false;
static bool no_slot = false;
static bool verify_checksums = true;
static bool success = false;
static bool made_new_pgdata = false;
......@@ -155,7 +158,7 @@ cleanup_directories_atexit(void)
if (success || in_log_streamer)
return;
if (!noclean)
if (!noclean && !checksum_failure)
{
if (made_new_pgdata)
{
......@@ -195,7 +198,7 @@ cleanup_directories_atexit(void)
}
else
{
if (made_new_pgdata || found_existing_pgdata)
if ((made_new_pgdata || found_existing_pgdata) && !checksum_failure)
fprintf(stderr,
_("%s: data directory \"%s\" not removed at user's request\n"),
progname, basedir);
......@@ -206,7 +209,7 @@ cleanup_directories_atexit(void)
progname, xlog_dir);
}
if (made_tablespace_dirs || found_tablespace_dirs)
if ((made_tablespace_dirs || found_tablespace_dirs) && !checksum_failure)
fprintf(stderr,
_("%s: changes to tablespace directories will not be undone\n"),
progname);
......@@ -360,6 +363,8 @@ usage(void)
printf(_(" -P, --progress show progress information\n"));
printf(_(" -S, --slot=SLOTNAME replication slot to use\n"));
printf(_(" --no-slot prevent creation of temporary replication slot\n"));
printf(_(" -k, --no-verify-checksums\n"
" do not verify checksums\n"));
printf(_(" -v, --verbose output verbose messages\n"));
printf(_(" -V, --version output version information, then exit\n"));
printf(_(" -?, --help show this help, then exit\n"));
......@@ -1808,14 +1813,15 @@ BaseBackup(void)
}
basebkp =
psprintf("BASE_BACKUP LABEL '%s' %s %s %s %s %s %s",
psprintf("BASE_BACKUP LABEL '%s' %s %s %s %s %s %s %s",
escaped_label,
showprogress ? "PROGRESS" : "",
includewal == FETCH_WAL ? "WAL" : "",
fastcheckpoint ? "FAST" : "",
includewal == NO_WAL ? "" : "NOWAIT",
maxrate_clause ? maxrate_clause : "",
format == 't' ? "TABLESPACE_MAP" : "");
format == 't' ? "TABLESPACE_MAP" : "",
verify_checksums ? "" : "NOVERIFY_CHECKSUMS");
if (PQsendQuery(conn, basebkp) == 0)
{
......@@ -1970,8 +1976,20 @@ BaseBackup(void)
res = PQgetResult(conn);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
fprintf(stderr, _("%s: final receive failed: %s"),
progname, PQerrorMessage(conn));
const char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
if (sqlstate &&
strcmp(sqlstate, ERRCODE_DATA_CORRUPTED) == 0)
{
fprintf(stderr, _("%s: checksum error occured\n"),
progname);
checksum_failure = true;
}
else
{
fprintf(stderr, _("%s: final receive failed: %s"),
progname, PQerrorMessage(conn));
}
disconnect_and_exit(1);
}
......@@ -2140,6 +2158,7 @@ main(int argc, char **argv)
{"progress", no_argument, NULL, 'P'},
{"waldir", required_argument, NULL, 1},
{"no-slot", no_argument, NULL, 2},
{"no-verify-checksums", no_argument, NULL, 'k'},
{NULL, 0, NULL, 0}
};
int c;
......@@ -2166,7 +2185,7 @@ main(int argc, char **argv)
atexit(cleanup_directories_atexit);
while ((c = getopt_long(argc, argv, "CD:F:r:RS:T:X:l:nNzZ:d:c:h:p:U:s:wWvP",
while ((c = getopt_long(argc, argv, "CD:F:r:RS:T:X:l:nNzZ:d:c:h:p:U:s:wWkvP",
long_options, &option_index)) != -1)
{
switch (c)
......@@ -2308,6 +2327,9 @@ main(int argc, char **argv)
case 'P':
showprogress = true;
break;
case 'k':
verify_checksums = false;
break;
default:
/*
......
......@@ -5,7 +5,7 @@ use Config;
use File::Basename qw(basename dirname);
use PostgresNode;
use TestLib;
use Test::More tests => 93;
use Test::More tests => 104;
program_help_ok('pg_basebackup');
program_version_ok('pg_basebackup');
......@@ -16,7 +16,7 @@ my $tempdir = TestLib::tempdir;
my $node = get_new_node('main');
# Initialize node without replication settings
$node->init;
$node->init(extra => [ '--data-checksums' ]);
$node->start;
my $pgdata = $node->data_dir;
......@@ -402,3 +402,61 @@ like(
slurp_file("$tempdir/backupxs_sl_R/recovery.conf"),
qr/^primary_slot_name = 'slot1'\n/m,
'recovery.conf sets primary_slot_name');
my $checksum = $node->safe_psql('postgres', 'SHOW data_checksums;');
is($checksum, 'on', 'checksums are enabled');
# get relfilenodes of relations to corrupt
my $pg_class = $node->safe_psql('postgres',
q{SELECT pg_relation_filepath('pg_class')}
);
my $pg_index = $node->safe_psql('postgres',
q{SELECT pg_relation_filepath('pg_index')}
);
# induce corruption
open $file, '+<', "$pgdata/$pg_class";
seek($file, 4000, 0);
syswrite($file, '\0\0\0\0\0\0\0\0\0');
close $file;
$node->command_checks_all([ 'pg_basebackup', '-D', "$tempdir/backup_corrupt"],
1,
[qr{^$}],
[qr/^WARNING.*checksum verification failed/s],
'pg_basebackup reports checksum mismatch'
);
# induce further corruption in 5 more blocks
open $file, '+<', "$pgdata/$pg_class";
my @offsets = (12192, 20384, 28576, 36768, 44960);
foreach my $offset (@offsets) {
seek($file, $offset, 0);
syswrite($file, '\0\0\0\0\0\0\0\0\0');
}
close $file;
$node->command_checks_all([ 'pg_basebackup', '-D', "$tempdir/backup_corrupt2"],
1,
[qr{^$}],
[qr/^WARNING.*further.*failures.*will.not.be.reported/s],
'pg_basebackup does not report more than 5 checksum mismatches'
);
# induce corruption in a second file
open $file, '+<', "$pgdata/$pg_index";
seek($file, 4000, 0);
syswrite($file, '\0\0\0\0\0\0\0\0\0');
close $file;
$node->command_checks_all([ 'pg_basebackup', '-D', "$tempdir/backup_corrupt3"],
1,
[qr{^$}],
[qr/^WARNING.*7 total checksum verification failures/s],
'pg_basebackup correctly report the total number of checksum mismatches'
);
# do not verify checksums, should return ok
$node->command_ok(
[ 'pg_basebackup', '-D', "$tempdir/backup_corrupt4", '-k' ],
'pg_basebackup with -k does not report checksum mismatch');
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment