Commit 4b0d28de authored by Simon Riggs

Remove secondary checkpoint

Previously the server reserved WAL for the last two checkpoints,
which used too much disk space on small servers.

Bumps PG_CONTROL_VERSION

Author: Simon Riggs <simon@2ndQuadrant.com>
Reviewed-by: Michael Paquier <michael.paquier@gmail.com>
parent 98267ee8
......@@ -568,7 +568,7 @@ tar -cf backup.tar /usr/local/pgsql/data
normally creates just a few segment files and then
<quote>recycles</quote> them by renaming no-longer-needed segment files
to higher segment numbers. It's assumed that segment files whose
contents precede the checkpoint-before-last are no longer of
contents precede the last checkpoint are no longer of
interest and can be recycled.
</para>
......
......@@ -17948,11 +17948,6 @@ SELECT collation for ('foo' COLLATE "de_DE");
<entry><type>pg_lsn</type></entry>
</row>
<row>
<entry><literal>prior_lsn</literal></entry>
<entry><type>pg_lsn</type></entry>
</row>
<row>
<entry><literal>redo_lsn</literal></entry>
<entry><type>pg_lsn</type></entry>
......
......@@ -2221,13 +2221,18 @@ CalculateCheckpointSegments(void)
* Calculate the distance at which to trigger a checkpoint, to avoid
* exceeding max_wal_size_mb. This is based on two assumptions:
*
* a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint.
* a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
* WAL for two checkpoint cycles to allow us to recover from the
* secondary checkpoint if the first checkpoint failed, though we
* only did this on the master anyway, not on standby. Keeping just
* one checkpoint simplifies processing and reduces disk space in
* many smaller databases.)
* b) during checkpoint, we consume checkpoint_completion_target *
* number of segments consumed between checkpoints.
*-------
*/
target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
(2.0 + CheckPointCompletionTarget);
(1.0 + CheckPointCompletionTarget);
/* round down */
CheckPointSegments = (int) target;
......@@ -2279,23 +2284,8 @@ XLOGfileslop(XLogRecPtr PriorRedoPtr)
* To estimate where the next checkpoint will finish, assume that the
* system runs steadily consuming CheckPointDistanceEstimate bytes between
* every checkpoint.
*
* The reason this calculation is done from the prior checkpoint, not the
* one that just finished, is that this behaves better if some checkpoint
* cycles are abnormally short, like if you perform a manual checkpoint
* right after a timed one. The manual checkpoint will make almost a full
* cycle's worth of WAL segments available for recycling, because the
* segments from the prior's prior, fully-sized checkpoint cycle are no
* longer needed. However, the next checkpoint will make only few segments
* available for recycling, the ones generated between the timed
* checkpoint and the manual one right after that. If at the manual
* checkpoint we only retained enough segments to get us to the next timed
* one, and removed the rest, then at the next checkpoint we would not
* have enough segments around for recycling, to get us to the checkpoint
* after that. Basing the calculations on the distance from the prior redo
* pointer largely fixes that problem.
*/
distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
*/
distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
/* add 10% for good measure. */
distance *= 1.10;
......@@ -6593,30 +6583,17 @@ StartupXLOG(void)
(errmsg("checkpoint record is at %X/%X",
(uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
}
else if (StandbyMode)
else
{
/*
* The last valid checkpoint record required for a streaming
* recovery exists in neither standby nor the primary.
* We used to attempt to go back to a secondary checkpoint
* record here, but only when not in standby_mode. We now
* just fail if we can't read the last checkpoint because
* this allows us to simplify processing around checkpoints.
*/
ereport(PANIC,
(errmsg("could not locate a valid checkpoint record")));
}
else
{
checkPointLoc = ControlFile->prevCheckPoint;
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
if (record != NULL)
{
ereport(LOG,
(errmsg("using previous checkpoint record at %X/%X",
(uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
InRecovery = true; /* force recovery even if SHUTDOWNED */
}
else
ereport(PANIC,
(errmsg("could not locate a valid checkpoint record")));
}
memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
}
......@@ -6845,7 +6822,6 @@ StartupXLOG(void)
recoveryTargetTLI)));
ControlFile->state = DB_IN_CRASH_RECOVERY;
}
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = checkPointLoc;
ControlFile->checkPointCopy = checkPoint;
if (InArchiveRecovery)
......@@ -7619,12 +7595,11 @@ StartupXLOG(void)
{
if (fast_promote)
{
checkPointLoc = ControlFile->prevCheckPoint;
checkPointLoc = ControlFile->checkPoint;
/*
* Confirm the last checkpoint is available for us to recover
* from if we fail. Note that we don't check for the secondary
* checkpoint since that isn't available in most base backups.
* from if we fail.
*/
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
if (record != NULL)
......@@ -8090,7 +8065,7 @@ LocalSetXLogInsertAllowed(void)
* Subroutine to try to fetch and validate a prior checkpoint record.
*
* whichChkpt identifies the checkpoint (merely for reporting purposes).
* 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
* 1 for "primary", 0 for "other" (backup_label)
*/
static XLogRecord *
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
......@@ -8110,10 +8085,6 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
ereport(LOG,
(errmsg("invalid primary checkpoint link in control file")));
break;
case 2:
ereport(LOG,
(errmsg("invalid secondary checkpoint link in control file")));
break;
default:
ereport(LOG,
(errmsg("invalid checkpoint link in backup_label file")));
......@@ -8135,10 +8106,6 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
ereport(LOG,
(errmsg("invalid primary checkpoint record")));
break;
case 2:
ereport(LOG,
(errmsg("invalid secondary checkpoint record")));
break;
default:
ereport(LOG,
(errmsg("invalid checkpoint record")));
......@@ -8154,10 +8121,6 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
ereport(LOG,
(errmsg("invalid resource manager ID in primary checkpoint record")));
break;
case 2:
ereport(LOG,
(errmsg("invalid resource manager ID in secondary checkpoint record")));
break;
default:
ereport(LOG,
(errmsg("invalid resource manager ID in checkpoint record")));
......@@ -8175,10 +8138,6 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
ereport(LOG,
(errmsg("invalid xl_info in primary checkpoint record")));
break;
case 2:
ereport(LOG,
(errmsg("invalid xl_info in secondary checkpoint record")));
break;
default:
ereport(LOG,
(errmsg("invalid xl_info in checkpoint record")));
......@@ -8194,10 +8153,6 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
ereport(LOG,
(errmsg("invalid length of primary checkpoint record")));
break;
case 2:
ereport(LOG,
(errmsg("invalid length of secondary checkpoint record")));
break;
default:
ereport(LOG,
(errmsg("invalid length of checkpoint record")));
......@@ -8933,8 +8888,7 @@ CreateCheckPoint(int flags)
(errmsg("concurrent write-ahead log activity while database system is shutting down")));
/*
* Remember the prior checkpoint's redo pointer, used later to determine
* the point where the log can be truncated.
* Remember the prior checkpoint's redo ptr for UpdateCheckPointDistanceEstimate()
*/
PriorRedoPtr = ControlFile->checkPointCopy.redo;
......@@ -8944,7 +8898,6 @@ CreateCheckPoint(int flags)
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (shutdown)
ControlFile->state = DB_SHUTDOWNED;
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ProcLastRecPtr;
ControlFile->checkPointCopy = checkPoint;
ControlFile->time = (pg_time_t) time(NULL);
......@@ -8982,8 +8935,7 @@ CreateCheckPoint(int flags)
smgrpostckpt();
/*
* Delete old log files (those no longer needed even for previous
* checkpoint or the standbys in XLOG streaming).
* Delete old log files and recycle them
*/
if (PriorRedoPtr != InvalidXLogRecPtr)
{
......@@ -8992,7 +8944,8 @@ CreateCheckPoint(int flags)
/* Update the average distance between checkpoints. */
UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
XLByteToSeg(PriorRedoPtr, _logSegNo, wal_segment_size);
/* Trim from the last checkpoint, not the last - 1 */
XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
KeepLogSeg(recptr, &_logSegNo);
_logSegNo--;
RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
......@@ -9258,8 +9211,7 @@ CreateRestartPoint(int flags)
CheckPointGuts(lastCheckPoint.redo, flags);
/*
* Remember the prior checkpoint's redo pointer, used later to determine
* the point at which we can truncate the log.
* Remember the prior checkpoint's redo ptr for UpdateCheckPointDistanceEstimate()
*/
PriorRedoPtr = ControlFile->checkPointCopy.redo;
......@@ -9273,7 +9225,6 @@ CreateRestartPoint(int flags)
if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
{
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = lastCheckPointRecPtr;
ControlFile->checkPointCopy = lastCheckPoint;
ControlFile->time = (pg_time_t) time(NULL);
......
......@@ -93,41 +93,39 @@ pg_control_checkpoint(PG_FUNCTION_ARGS)
tupdesc = CreateTemplateTupleDesc(19, false);
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "checkpoint_lsn",
LSNOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prior_lsn",
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "redo_lsn",
LSNOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "redo_lsn",
LSNOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 4, "redo_wal_file",
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "redo_wal_file",
TEXTOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 5, "timeline_id",
TupleDescInitEntry(tupdesc, (AttrNumber) 4, "timeline_id",
INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 6, "prev_timeline_id",
TupleDescInitEntry(tupdesc, (AttrNumber) 5, "prev_timeline_id",
INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 7, "full_page_writes",
TupleDescInitEntry(tupdesc, (AttrNumber) 6, "full_page_writes",
BOOLOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 8, "next_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 7, "next_xid",
TEXTOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 9, "next_oid",
TupleDescInitEntry(tupdesc, (AttrNumber) 8, "next_oid",
OIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 10, "next_multixact_id",
TupleDescInitEntry(tupdesc, (AttrNumber) 9, "next_multixact_id",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 11, "next_multi_offset",
TupleDescInitEntry(tupdesc, (AttrNumber) 10, "next_multi_offset",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 12, "oldest_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 11, "oldest_xid",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 13, "oldest_xid_dbid",
TupleDescInitEntry(tupdesc, (AttrNumber) 12, "oldest_xid_dbid",
OIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 14, "oldest_active_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 13, "oldest_active_xid",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 15, "oldest_multi_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 14, "oldest_multi_xid",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 16, "oldest_multi_dbid",
TupleDescInitEntry(tupdesc, (AttrNumber) 15, "oldest_multi_dbid",
OIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 17, "oldest_commit_ts_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 16, "oldest_commit_ts_xid",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 18, "newest_commit_ts_xid",
TupleDescInitEntry(tupdesc, (AttrNumber) 17, "newest_commit_ts_xid",
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 19, "checkpoint_time",
TupleDescInitEntry(tupdesc, (AttrNumber) 18, "checkpoint_time",
TIMESTAMPTZOID, -1, 0);
tupdesc = BlessTupleDesc(tupdesc);
......@@ -149,62 +147,59 @@ pg_control_checkpoint(PG_FUNCTION_ARGS)
values[0] = LSNGetDatum(ControlFile->checkPoint);
nulls[0] = false;
values[1] = LSNGetDatum(ControlFile->prevCheckPoint);
values[1] = LSNGetDatum(ControlFile->checkPointCopy.redo);
nulls[1] = false;
values[2] = LSNGetDatum(ControlFile->checkPointCopy.redo);
values[2] = CStringGetTextDatum(xlogfilename);
nulls[2] = false;
values[3] = CStringGetTextDatum(xlogfilename);
values[3] = Int32GetDatum(ControlFile->checkPointCopy.ThisTimeLineID);
nulls[3] = false;
values[4] = Int32GetDatum(ControlFile->checkPointCopy.ThisTimeLineID);
values[4] = Int32GetDatum(ControlFile->checkPointCopy.PrevTimeLineID);
nulls[4] = false;
values[5] = Int32GetDatum(ControlFile->checkPointCopy.PrevTimeLineID);
values[5] = BoolGetDatum(ControlFile->checkPointCopy.fullPageWrites);
nulls[5] = false;
values[6] = BoolGetDatum(ControlFile->checkPointCopy.fullPageWrites);
nulls[6] = false;
values[7] = CStringGetTextDatum(psprintf("%u:%u",
values[6] = CStringGetTextDatum(psprintf("%u:%u",
ControlFile->checkPointCopy.nextXidEpoch,
ControlFile->checkPointCopy.nextXid));
nulls[6] = false;
values[7] = ObjectIdGetDatum(ControlFile->checkPointCopy.nextOid);
nulls[7] = false;
values[8] = ObjectIdGetDatum(ControlFile->checkPointCopy.nextOid);
values[8] = TransactionIdGetDatum(ControlFile->checkPointCopy.nextMulti);
nulls[8] = false;
values[9] = TransactionIdGetDatum(ControlFile->checkPointCopy.nextMulti);
values[9] = TransactionIdGetDatum(ControlFile->checkPointCopy.nextMultiOffset);
nulls[9] = false;
values[10] = TransactionIdGetDatum(ControlFile->checkPointCopy.nextMultiOffset);
values[10] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestXid);
nulls[10] = false;
values[11] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestXid);
values[11] = ObjectIdGetDatum(ControlFile->checkPointCopy.oldestXidDB);
nulls[11] = false;
values[12] = ObjectIdGetDatum(ControlFile->checkPointCopy.oldestXidDB);
values[12] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestActiveXid);
nulls[12] = false;
values[13] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestActiveXid);
values[13] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestMulti);
nulls[13] = false;
values[14] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestMulti);
values[14] = ObjectIdGetDatum(ControlFile->checkPointCopy.oldestMultiDB);
nulls[14] = false;
values[15] = ObjectIdGetDatum(ControlFile->checkPointCopy.oldestMultiDB);
values[15] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestCommitTsXid);
nulls[15] = false;
values[16] = TransactionIdGetDatum(ControlFile->checkPointCopy.oldestCommitTsXid);
values[16] = TransactionIdGetDatum(ControlFile->checkPointCopy.newestCommitTsXid);
nulls[16] = false;
values[17] = TransactionIdGetDatum(ControlFile->checkPointCopy.newestCommitTsXid);
nulls[17] = false;
values[18] = TimestampTzGetDatum(
values[17] = TimestampTzGetDatum(
time_t_to_timestamptz(ControlFile->checkPointCopy.time));
nulls[18] = false;
nulls[17] = false;
htup = heap_form_tuple(tupdesc, values, nulls);
......
......@@ -222,9 +222,6 @@ main(int argc, char *argv[])
printf(_("Latest checkpoint location: %X/%X\n"),
(uint32) (ControlFile->checkPoint >> 32),
(uint32) ControlFile->checkPoint);
printf(_("Prior checkpoint location: %X/%X\n"),
(uint32) (ControlFile->prevCheckPoint >> 32),
(uint32) ControlFile->prevCheckPoint);
printf(_("Latest checkpoint's REDO location: %X/%X\n"),
(uint32) (ControlFile->checkPointCopy.redo >> 32),
(uint32) ControlFile->checkPointCopy.redo);
......
......@@ -876,7 +876,6 @@ RewriteControlFile(void)
ControlFile.state = DB_SHUTDOWNED;
ControlFile.time = (pg_time_t) time(NULL);
ControlFile.checkPoint = ControlFile.checkPointCopy.redo;
ControlFile.prevCheckPoint = 0;
ControlFile.minRecoveryPoint = 0;
ControlFile.minRecoveryPointTLI = 0;
ControlFile.backupStartPoint = 0;
......
......@@ -21,7 +21,7 @@
/* Version identifier for this pg_control format */
#define PG_CONTROL_VERSION 1003
#define PG_CONTROL_VERSION 1100
/* Nonce key length, see below */
#define MOCK_AUTH_NONCE_LEN 32
......@@ -127,7 +127,6 @@ typedef struct ControlFileData
DBState state; /* see enum above */
pg_time_t time; /* time stamp of last pg_control update */
XLogRecPtr checkPoint; /* last check point record ptr */
XLogRecPtr prevCheckPoint; /* previous check point record ptr */
CheckPoint checkPointCopy; /* copy of last check point record */
......
......@@ -5500,7 +5500,7 @@ DESCR("pg_config binary as a function");
DATA(insert OID = 3441 ( pg_control_system PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2249 "" "{23,23,20,1184}" "{o,o,o,o}" "{pg_control_version,catalog_version_no,system_identifier,pg_control_last_modified}" _null_ _null_ pg_control_system _null_ _null_ _null_ ));
DESCR("pg_controldata general state information as a function");
DATA(insert OID = 3442 ( pg_control_checkpoint PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2249 "" "{3220,3220,3220,25,23,23,16,25,26,28,28,28,26,28,28,26,28,28,1184}" "{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}" "{checkpoint_lsn,prior_lsn,redo_lsn,redo_wal_file,timeline_id,prev_timeline_id,full_page_writes,next_xid,next_oid,next_multixact_id,next_multi_offset,oldest_xid,oldest_xid_dbid,oldest_active_xid,oldest_multi_xid,oldest_multi_dbid,oldest_commit_ts_xid,newest_commit_ts_xid,checkpoint_time}" _null_ _null_ pg_control_checkpoint _null_ _null_ _null_ ));
DATA(insert OID = 3442 ( pg_control_checkpoint PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2249 "" "{3220,3220,25,23,23,16,25,26,28,28,28,26,28,28,26,28,28,1184}" "{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}" "{checkpoint_lsn,redo_lsn,redo_wal_file,timeline_id,prev_timeline_id,full_page_writes,next_xid,next_oid,next_multixact_id,next_multi_offset,oldest_xid,oldest_xid_dbid,oldest_active_xid,oldest_multi_xid,oldest_multi_dbid,oldest_commit_ts_xid,newest_commit_ts_xid,checkpoint_time}" _null_ _null_ pg_control_checkpoint _null_ _null_ _null_ ));
DESCR("pg_controldata checkpoint state information as a function");
DATA(insert OID = 3443 ( pg_control_recovery PGNSP PGUID 12 1 0 0 0 f f f f t f v s 0 0 2249 "" "{3220,23,3220,3220,16}" "{o,o,o,o,o}" "{min_recovery_end_lsn,min_recovery_end_timeline,backup_start_lsn,backup_end_lsn,end_of_backup_record_required}" _null_ _null_ pg_control_recovery _null_ _null_ _null_ ));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment