Commit 6b9e875f authored by Magnus Hagander

Track block level checksum failures in pg_stat_database

This adds a column that counts how many checksum failures have occurred
on files belonging to a specific database. Both checksum failures
detected during normal backend processing and those detected while
taking a base backup are counted.

Author: Magnus Hagander
Reviewed by: Julien Rouhaud
parent 3c592630
...@@ -2508,6 +2508,11 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i ...@@ -2508,6 +2508,11 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
<entry><type>bigint</type></entry> <entry><type>bigint</type></entry>
<entry>Number of deadlocks detected in this database</entry> <entry>Number of deadlocks detected in this database</entry>
</row> </row>
<row>
<entry><structfield>checksum_failures</structfield></entry>
<entry><type>bigint</type></entry>
<entry>Number of data page checksum failures detected in this database</entry>
</row>
<row> <row>
<entry><structfield>blk_read_time</structfield></entry> <entry><structfield>blk_read_time</structfield></entry>
<entry><type>double precision</type></entry> <entry><type>double precision</type></entry>
......
...@@ -823,6 +823,7 @@ CREATE VIEW pg_stat_database AS ...@@ -823,6 +823,7 @@ CREATE VIEW pg_stat_database AS
pg_stat_get_db_temp_files(D.oid) AS temp_files, pg_stat_get_db_temp_files(D.oid) AS temp_files,
pg_stat_get_db_temp_bytes(D.oid) AS temp_bytes, pg_stat_get_db_temp_bytes(D.oid) AS temp_bytes,
pg_stat_get_db_deadlocks(D.oid) AS deadlocks, pg_stat_get_db_deadlocks(D.oid) AS deadlocks,
pg_stat_get_db_checksum_failures(D.oid) AS checksum_failures,
pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time, pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time,
pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time, pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time,
pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset
......
...@@ -334,6 +334,7 @@ static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len); ...@@ -334,6 +334,7 @@ static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len); static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len); static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len); static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len); static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
/* ------------------------------------------------------------ /* ------------------------------------------------------------
...@@ -1518,6 +1519,40 @@ pgstat_report_deadlock(void) ...@@ -1518,6 +1519,40 @@ pgstat_report_deadlock(void)
pgstat_send(&msg, sizeof(msg)); pgstat_send(&msg, sizeof(msg));
} }
/* --------
 * pgstat_report_checksum_failures_in_db() -
 *
 *	Send a CHECKSUMFAILURE message to the collector, crediting
 *	'failurecount' checksum failures to the database 'dboid'.
 *
 *	Nothing is sent when the collector socket is invalid or when
 *	pgstat_track_counts is off.
 * --------
 */
void
pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
{
	PgStat_MsgChecksumFailure msg;

	/* Only talk to the collector if it is reachable and counting is on. */
	if (pgStatSock != PGINVALID_SOCKET && pgstat_track_counts)
	{
		pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE);
		msg.m_failurecount = failurecount;
		msg.m_databaseid = dboid;
		pgstat_send(&msg, sizeof(msg));
	}
}
/* --------
 * pgstat_report_checksum_failure() -
 *
 *	Tell the collector about a single checksum failure.
 *
 *	Convenience wrapper around pgstat_report_checksum_failures_in_db()
 *	that reports a count of 1 against MyDatabaseId.
 *
 *	NOTE(review): the failure is always attributed to MyDatabaseId; confirm
 *	this is the intended attribution for pages that may not belong to the
 *	current database (e.g. shared relations).
 * --------
 */
void
pgstat_report_checksum_failure(void)
{
pgstat_report_checksum_failures_in_db(MyDatabaseId, 1);
}
/* -------- /* --------
* pgstat_report_tempfile() - * pgstat_report_tempfile() -
* *
...@@ -4455,6 +4490,10 @@ PgstatCollectorMain(int argc, char *argv[]) ...@@ -4455,6 +4490,10 @@ PgstatCollectorMain(int argc, char *argv[])
pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len); pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len);
break; break;
case PGSTAT_MTYPE_CHECKSUMFAILURE:
pgstat_recv_checksum_failure((PgStat_MsgChecksumFailure *) &msg, len);
break;
default: default:
break; break;
} }
...@@ -4554,6 +4593,7 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry) ...@@ -4554,6 +4593,7 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
dbentry->n_temp_files = 0; dbentry->n_temp_files = 0;
dbentry->n_temp_bytes = 0; dbentry->n_temp_bytes = 0;
dbentry->n_deadlocks = 0; dbentry->n_deadlocks = 0;
dbentry->n_checksum_failures = 0;
dbentry->n_block_read_time = 0; dbentry->n_block_read_time = 0;
dbentry->n_block_write_time = 0; dbentry->n_block_write_time = 0;
...@@ -6196,6 +6236,22 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len) ...@@ -6196,6 +6236,22 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
dbentry->n_deadlocks++; dbentry->n_deadlocks++;
} }
/* ----------
 * pgstat_recv_checksum_failure() -
 *
 *	Process a CHECKSUMFAILURE message: add the reported failure count to
 *	the n_checksum_failures counter of the database named in the message.
 *
 *	'len' is not used here.
 * ----------
 */
static void
pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len)
{
	/*
	 * The result is dereferenced unconditionally; passing 'true'
	 * presumably makes pgstat_get_db_entry() create a missing entry.
	 */
	PgStat_StatDBEntry *entry = pgstat_get_db_entry(msg->m_databaseid, true);

	entry->n_checksum_failures += msg->m_failurecount;
}
/* ---------- /* ----------
* pgstat_recv_tempfile() - * pgstat_recv_tempfile() -
* *
......
...@@ -58,7 +58,7 @@ typedef struct ...@@ -58,7 +58,7 @@ typedef struct
static int64 sendDir(const char *path, int basepathlen, bool sizeonly, static int64 sendDir(const char *path, int basepathlen, bool sizeonly,
List *tablespaces, bool sendtblspclinks); List *tablespaces, bool sendtblspclinks);
static bool sendFile(const char *readfilename, const char *tarfilename, static bool sendFile(const char *readfilename, const char *tarfilename,
struct stat *statbuf, bool missing_ok); struct stat *statbuf, bool missing_ok, Oid dboid);
static void sendFileWithContent(const char *filename, const char *content); static void sendFileWithContent(const char *filename, const char *content);
static int64 _tarWriteHeader(const char *filename, const char *linktarget, static int64 _tarWriteHeader(const char *filename, const char *linktarget,
struct stat *statbuf, bool sizeonly); struct stat *statbuf, bool sizeonly);
...@@ -342,7 +342,7 @@ perform_base_backup(basebackup_options *opt) ...@@ -342,7 +342,7 @@ perform_base_backup(basebackup_options *opt)
(errcode_for_file_access(), (errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", errmsg("could not stat file \"%s\": %m",
XLOG_CONTROL_FILE))); XLOG_CONTROL_FILE)));
sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false); sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false, InvalidOid);
} }
else else
sendTablespace(ti->path, false); sendTablespace(ti->path, false);
...@@ -592,7 +592,7 @@ perform_base_backup(basebackup_options *opt) ...@@ -592,7 +592,7 @@ perform_base_backup(basebackup_options *opt)
(errcode_for_file_access(), (errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", pathbuf))); errmsg("could not stat file \"%s\": %m", pathbuf)));
sendFile(pathbuf, pathbuf, &statbuf, false); sendFile(pathbuf, pathbuf, &statbuf, false, InvalidOid);
/* unconditionally mark file as archived */ /* unconditionally mark file as archived */
StatusFilePath(pathbuf, fname, ".done"); StatusFilePath(pathbuf, fname, ".done");
...@@ -1302,7 +1302,7 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces, ...@@ -1302,7 +1302,7 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
if (!sizeonly) if (!sizeonly)
sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf, sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf,
true); true, isDbDir ? pg_atoi(lastDir + 1, sizeof(Oid), 0) : InvalidOid);
if (sent || sizeonly) if (sent || sizeonly)
{ {
...@@ -1358,12 +1358,15 @@ is_checksummed_file(const char *fullpath, const char *filename) ...@@ -1358,12 +1358,15 @@ is_checksummed_file(const char *fullpath, const char *filename)
* *
* If 'missing_ok' is true, will not throw an error if the file is not found. * If 'missing_ok' is true, will not throw an error if the file is not found.
* *
* If dboid is anything other than InvalidOid then any checksum failures detected
* will get reported to the stats collector.
*
* Returns true if the file was successfully sent, false if 'missing_ok', * Returns true if the file was successfully sent, false if 'missing_ok',
* and the file did not exist. * and the file did not exist.
*/ */
static bool static bool
sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf, sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf,
bool missing_ok) bool missing_ok, Oid dboid)
{ {
FILE *fp; FILE *fp;
BlockNumber blkno = 0; BlockNumber blkno = 0;
...@@ -1580,6 +1583,9 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf ...@@ -1580,6 +1583,9 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
ereport(WARNING, ereport(WARNING,
(errmsg("file \"%s\" has a total of %d checksum verification " (errmsg("file \"%s\" has a total of %d checksum verification "
"failures", readfilename, checksum_failures))); "failures", readfilename, checksum_failures)));
if (dboid != InvalidOid)
pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
} }
total_checksum_failures += checksum_failures; total_checksum_failures += checksum_failures;
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "access/htup_details.h" #include "access/htup_details.h"
#include "access/itup.h" #include "access/itup.h"
#include "access/xlog.h" #include "access/xlog.h"
#include "pgstat.h"
#include "storage/checksum.h" #include "storage/checksum.h"
#include "utils/memdebug.h" #include "utils/memdebug.h"
#include "utils/memutils.h" #include "utils/memutils.h"
...@@ -151,6 +152,8 @@ PageIsVerified(Page page, BlockNumber blkno) ...@@ -151,6 +152,8 @@ PageIsVerified(Page page, BlockNumber blkno)
errmsg("page verification failed, calculated checksum %u but expected %u", errmsg("page verification failed, calculated checksum %u but expected %u",
checksum, p->pd_checksum))); checksum, p->pd_checksum)));
pgstat_report_checksum_failure();
if (header_sane && ignore_checksum_failure) if (header_sane && ignore_checksum_failure)
return true; return true;
} }
......
...@@ -1497,6 +1497,21 @@ pg_stat_get_db_deadlocks(PG_FUNCTION_ARGS) ...@@ -1497,6 +1497,21 @@ pg_stat_get_db_deadlocks(PG_FUNCTION_ARGS)
PG_RETURN_INT64(result); PG_RETURN_INT64(result);
} }
/*
 * pg_stat_get_db_checksum_failures
 *
 * SQL-callable: takes a database OID and returns, as int8, the number of
 * checksum failures recorded for that database.  Returns 0 when no
 * statistics entry exists for the database.
 */
Datum
pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS)
{
	Oid			databaseOid = PG_GETARG_OID(0);
	PgStat_StatDBEntry *entry = pgstat_fetch_stat_dbentry(databaseOid);
	int64		failures = 0;

	if (entry != NULL)
		failures = (int64) (entry->n_checksum_failures);

	PG_RETURN_INT64(failures);
}
Datum Datum
pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS) pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS)
{ {
......
...@@ -53,6 +53,6 @@ ...@@ -53,6 +53,6 @@
*/ */
/* yyyymmddN */ /* yyyymmddN */
#define CATALOG_VERSION_NO 201903063 #define CATALOG_VERSION_NO 201903091
#endif #endif
...@@ -5227,6 +5227,10 @@ ...@@ -5227,6 +5227,10 @@
proname => 'pg_stat_get_db_deadlocks', provolatile => 's', proparallel => 'r', proname => 'pg_stat_get_db_deadlocks', provolatile => 's', proparallel => 'r',
prorettype => 'int8', proargtypes => 'oid', prorettype => 'int8', proargtypes => 'oid',
prosrc => 'pg_stat_get_db_deadlocks' }, prosrc => 'pg_stat_get_db_deadlocks' },
{ oid => '3426', descr => 'statistics: checksum failures detected in database',
proname => 'pg_stat_get_db_checksum_failures', provolatile => 's', proparallel => 'r',
prorettype => 'int8', proargtypes => 'oid',
prosrc => 'pg_stat_get_db_checksum_failures' },
{ oid => '3074', descr => 'statistics: last reset for a database', { oid => '3074', descr => 'statistics: last reset for a database',
proname => 'pg_stat_get_db_stat_reset_time', provolatile => 's', proname => 'pg_stat_get_db_stat_reset_time', provolatile => 's',
proparallel => 'r', prorettype => 'timestamptz', proargtypes => 'oid', proparallel => 'r', prorettype => 'timestamptz', proargtypes => 'oid',
......
...@@ -64,7 +64,8 @@ typedef enum StatMsgType ...@@ -64,7 +64,8 @@ typedef enum StatMsgType
PGSTAT_MTYPE_FUNCPURGE, PGSTAT_MTYPE_FUNCPURGE,
PGSTAT_MTYPE_RECOVERYCONFLICT, PGSTAT_MTYPE_RECOVERYCONFLICT,
PGSTAT_MTYPE_TEMPFILE, PGSTAT_MTYPE_TEMPFILE,
PGSTAT_MTYPE_DEADLOCK PGSTAT_MTYPE_DEADLOCK,
PGSTAT_MTYPE_CHECKSUMFAILURE
} StatMsgType; } StatMsgType;
/* ---------- /* ----------
...@@ -530,6 +531,18 @@ typedef struct PgStat_MsgDeadlock ...@@ -530,6 +531,18 @@ typedef struct PgStat_MsgDeadlock
Oid m_databaseid; Oid m_databaseid;
} PgStat_MsgDeadlock; } PgStat_MsgDeadlock;
/* ----------
 * PgStat_MsgChecksumFailure	Sent by the backend to tell the collector
 *								about checksum failures noticed.
 *
 * One message can carry several failures for one database: the collector
 * adds m_failurecount to that database's n_checksum_failures counter.
 * ----------
 */
typedef struct PgStat_MsgChecksumFailure
{
/* common message header; type is PGSTAT_MTYPE_CHECKSUMFAILURE */
PgStat_MsgHdr m_hdr;
/* database the failures are attributed to */
Oid m_databaseid;
/* number of checksum failures being reported in this message */
int m_failurecount;
} PgStat_MsgChecksumFailure;
/* ---------- /* ----------
* PgStat_Msg Union over all possible messages. * PgStat_Msg Union over all possible messages.
...@@ -593,6 +606,7 @@ typedef struct PgStat_StatDBEntry ...@@ -593,6 +606,7 @@ typedef struct PgStat_StatDBEntry
PgStat_Counter n_temp_files; PgStat_Counter n_temp_files;
PgStat_Counter n_temp_bytes; PgStat_Counter n_temp_bytes;
PgStat_Counter n_deadlocks; PgStat_Counter n_deadlocks;
PgStat_Counter n_checksum_failures;
PgStat_Counter n_block_read_time; /* times in microseconds */ PgStat_Counter n_block_read_time; /* times in microseconds */
PgStat_Counter n_block_write_time; PgStat_Counter n_block_write_time;
...@@ -1200,6 +1214,8 @@ extern void pgstat_report_analyze(Relation rel, ...@@ -1200,6 +1214,8 @@ extern void pgstat_report_analyze(Relation rel,
extern void pgstat_report_recovery_conflict(int reason); extern void pgstat_report_recovery_conflict(int reason);
extern void pgstat_report_deadlock(void); extern void pgstat_report_deadlock(void);
extern void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount);
extern void pgstat_report_checksum_failure(void);
extern void pgstat_initialize(void); extern void pgstat_initialize(void);
extern void pgstat_bestart(void); extern void pgstat_bestart(void);
......
...@@ -1817,6 +1817,7 @@ pg_stat_database| SELECT d.oid AS datid, ...@@ -1817,6 +1817,7 @@ pg_stat_database| SELECT d.oid AS datid,
pg_stat_get_db_temp_files(d.oid) AS temp_files, pg_stat_get_db_temp_files(d.oid) AS temp_files,
pg_stat_get_db_temp_bytes(d.oid) AS temp_bytes, pg_stat_get_db_temp_bytes(d.oid) AS temp_bytes,
pg_stat_get_db_deadlocks(d.oid) AS deadlocks, pg_stat_get_db_deadlocks(d.oid) AS deadlocks,
pg_stat_get_db_checksum_failures(d.oid) AS checksum_failures,
pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time, pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time,
pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time, pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time,
pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment