Commit 8582b4d0 authored by Michael Paquier's avatar Michael Paquier

Improve handling of corrupted two-phase state files at recovery

When a corrupted two-phase state file is found by WAL replay, be it for
crash recovery or archive recovery, then the file is simply skipped and
a WARNING is logged to the user, causing the transaction to be silently
lost.  Facing an on-disk WAL file which is corrupted is as likely to
happen as what is stored in WAL records, but WAL records are already
able to fail hard if there is a CRC mismatch.  On-disk two-phase state
files, on the contrary, are simply ignored if corrupted.  Note that when
restoring the initial two-phase data state at recovery, files newer than
the horizon XID are discarded hence no files present in pg_twophase/
should be torned and have been made durable by a previous checkpoint, so
recovery should never see any corrupted two-phase state file by design.

The situation got better since 978b2f65 which has added two-phase state
information directly in WAL instead of using on-disk files, so the risk
is limited to two-phase transactions which live across at least one
checkpoint for long periods.  Backups having legit two-phase state files
on-disk could also lose silently transactions when restored if things
get corrupted.

This behavior exists since two-phase commit has been introduced, no
back-patch is done for now per the lack of complaints about this
problem.

Author: Michael Paquier
Discussion: https://postgr.es/m/20180709050309.GM1467@paquier.xyz
parent 7b6b167f
...@@ -1207,10 +1207,12 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, ...@@ -1207,10 +1207,12 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info,
* Read and validate the state file for xid. * Read and validate the state file for xid.
* *
* If it looks OK (has a valid magic number and CRC), return the palloc'd * If it looks OK (has a valid magic number and CRC), return the palloc'd
* contents of the file. Otherwise return NULL. * contents of the file, issuing an error when finding corrupted data. If
* missing_ok is true, which indicates that missing files can be safely
* ignored, then return NULL. This state can be reached when doing recovery.
*/ */
static char * static char *
ReadTwoPhaseFile(TransactionId xid, bool give_warnings) ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
{ {
char path[MAXPGPATH]; char path[MAXPGPATH];
char *buf; char *buf;
...@@ -1227,11 +1229,12 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) ...@@ -1227,11 +1229,12 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
if (fd < 0) if (fd < 0)
{ {
if (give_warnings) if (missing_ok && errno == ENOENT)
ereport(WARNING, return NULL;
(errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", path))); ereport(ERROR,
return NULL; (errcode_for_file_access(),
errmsg("could not open file \"%s\": %m", path)));
} }
/* /*
...@@ -1241,35 +1244,27 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) ...@@ -1241,35 +1244,27 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
* even on a valid file. * even on a valid file.
*/ */
if (fstat(fd, &stat)) if (fstat(fd, &stat))
{ ereport(ERROR,
int save_errno = errno; (errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", path)));
CloseTransientFile(fd);
if (give_warnings)
{
errno = save_errno;
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m", path)));
}
return NULL;
}
if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) + if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) + MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) +
sizeof(pg_crc32c)) || sizeof(pg_crc32c)) ||
stat.st_size > MaxAllocSize) stat.st_size > MaxAllocSize)
{ ereport(ERROR,
CloseTransientFile(fd); (errcode(ERRCODE_DATA_CORRUPTED),
return NULL; errmsg_plural("incorrect size of file \"%s\": %zu byte",
} "incorrect size of file \"%s\": %zu bytes",
(Size) stat.st_size, path,
(Size) stat.st_size)));
crc_offset = stat.st_size - sizeof(pg_crc32c); crc_offset = stat.st_size - sizeof(pg_crc32c);
if (crc_offset != MAXALIGN(crc_offset)) if (crc_offset != MAXALIGN(crc_offset))
{ ereport(ERROR,
CloseTransientFile(fd); (errcode(ERRCODE_DATA_CORRUPTED),
return NULL; errmsg("incorrect alignment of CRC offset for file \"%s\"",
} path)));
/* /*
* OK, slurp in the file. * OK, slurp in the file.
...@@ -1280,37 +1275,31 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) ...@@ -1280,37 +1275,31 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
r = read(fd, buf, stat.st_size); r = read(fd, buf, stat.st_size);
if (r != stat.st_size) if (r != stat.st_size)
{ {
int save_errno = errno; if (r < 0)
ereport(ERROR,
pgstat_report_wait_end(); (errcode_for_file_access(),
CloseTransientFile(fd); errmsg("could not read file \"%s\": %m", path)));
if (give_warnings) else
{ ereport(ERROR,
if (r < 0) (errmsg("could not read file \"%s\": read %d of %zu",
{ path, r, (Size) stat.st_size)));
errno = save_errno;
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m", path)));
}
else
ereport(WARNING,
(errmsg("could not read file \"%s\": read %d of %zu",
path, r, (Size) stat.st_size)));
}
pfree(buf);
return NULL;
} }
pgstat_report_wait_end(); pgstat_report_wait_end();
CloseTransientFile(fd); CloseTransientFile(fd);
hdr = (TwoPhaseFileHeader *) buf; hdr = (TwoPhaseFileHeader *) buf;
if (hdr->magic != TWOPHASE_MAGIC || hdr->total_len != stat.st_size) if (hdr->magic != TWOPHASE_MAGIC)
{ ereport(ERROR,
pfree(buf); (errcode(ERRCODE_DATA_CORRUPTED),
return NULL; errmsg("invalid magic number stored in file \"%s\"",
} path)));
if (hdr->total_len != stat.st_size)
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid size stored in file \"%s\"",
path)));
INIT_CRC32C(calc_crc); INIT_CRC32C(calc_crc);
COMP_CRC32C(calc_crc, buf, crc_offset); COMP_CRC32C(calc_crc, buf, crc_offset);
...@@ -1319,10 +1308,10 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings) ...@@ -1319,10 +1308,10 @@ ReadTwoPhaseFile(TransactionId xid, bool give_warnings)
file_crc = *((pg_crc32c *) (buf + crc_offset)); file_crc = *((pg_crc32c *) (buf + crc_offset));
if (!EQ_CRC32C(calc_crc, file_crc)) if (!EQ_CRC32C(calc_crc, file_crc))
{ ereport(ERROR,
pfree(buf); (errcode(ERRCODE_DATA_CORRUPTED),
return NULL; errmsg("calculated CRC checksum does not match value stored in file \"%s\"",
} path)));
return buf; return buf;
} }
...@@ -1431,7 +1420,7 @@ StandbyTransactionIdIsPrepared(TransactionId xid) ...@@ -1431,7 +1420,7 @@ StandbyTransactionIdIsPrepared(TransactionId xid)
return false; /* nothing to do */ return false; /* nothing to do */
/* Read and validate file */ /* Read and validate file */
buf = ReadTwoPhaseFile(xid, false); buf = ReadTwoPhaseFile(xid, true);
if (buf == NULL) if (buf == NULL)
return false; return false;
...@@ -1479,7 +1468,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) ...@@ -1479,7 +1468,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
* to disk if for some reason they have lived for a long time. * to disk if for some reason they have lived for a long time.
*/ */
if (gxact->ondisk) if (gxact->ondisk)
buf = ReadTwoPhaseFile(xid, true); buf = ReadTwoPhaseFile(xid, false);
else else
XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL); XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
...@@ -1874,6 +1863,10 @@ restoreTwoPhaseData(void) ...@@ -1874,6 +1863,10 @@ restoreTwoPhaseData(void)
* write a WAL entry, and so there might be no evidence in WAL of those * write a WAL entry, and so there might be no evidence in WAL of those
* subxact XIDs. * subxact XIDs.
* *
* On corrupted two-phase files, fail immediately. Keeping around broken
* entries and let replay continue causes harm on the system, and a new
* backup should be rolled in.
*
* Our other responsibility is to determine and return the oldest valid XID * Our other responsibility is to determine and return the oldest valid XID
* among the prepared xacts (if none, return ShmemVariableCache->nextXid). * among the prepared xacts (if none, return ShmemVariableCache->nextXid).
* This is needed to synchronize pg_subtrans startup properly. * This is needed to synchronize pg_subtrans startup properly.
...@@ -2164,15 +2157,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, ...@@ -2164,15 +2157,7 @@ ProcessTwoPhaseBuffer(TransactionId xid,
if (fromdisk) if (fromdisk)
{ {
/* Read and validate file */ /* Read and validate file */
buf = ReadTwoPhaseFile(xid, true); buf = ReadTwoPhaseFile(xid, false);
if (buf == NULL)
{
ereport(WARNING,
(errmsg("removing corrupt two-phase state file for transaction %u",
xid)));
RemoveTwoPhaseFile(xid, true);
return NULL;
}
} }
else else
{ {
...@@ -2185,21 +2170,15 @@ ProcessTwoPhaseBuffer(TransactionId xid, ...@@ -2185,21 +2170,15 @@ ProcessTwoPhaseBuffer(TransactionId xid,
if (!TransactionIdEquals(hdr->xid, xid)) if (!TransactionIdEquals(hdr->xid, xid))
{ {
if (fromdisk) if (fromdisk)
{ ereport(ERROR,
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
(errmsg("removing corrupt two-phase state file for transaction %u", errmsg("corrupted two-phase state file for transaction \"%u\"",
xid))); xid)));
RemoveTwoPhaseFile(xid, true);
}
else else
{ ereport(ERROR,
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
(errmsg("removing corrupt two-phase state from memory for transaction %u", errmsg("corrupted two-phase state in memory for transaction \"%u\"",
xid))); xid)));
PrepareRedoRemove(xid, true);
}
pfree(buf);
return NULL;
} }
/* /*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment