Split work of bgwriter between 2 processes: bgwriter and checkpointer.

bgwriter is now a much less important process, responsible for page cleaning duties only. checkpointer is now responsible for checkpoints and so has a key role in shutdown. Later patches will correct doc references to the now-outdated idea that bgwriter performs checkpoints. This has a beneficial effect on performance at high write rates, but it is mainly a refactoring to more easily allow changes for power reduction, by simplifying the previously tortuous code that was required to allow page cleaning and checkpointing to time-slice in the same process. Patch by me, review by Dickson Guedes.

Split work of bgwriter between 2 processes: bgwriter and checkpointer.
bgwriter is now a much less important process, responsible for page cleaning duties only. checkpointer is now responsible for checkpoints and so has a key role in shutdown. Later patches will correct doc references to the now-outdated idea that bgwriter performs checkpoints. This has a beneficial effect on performance at high write rates, but it is mainly a refactoring to more easily allow changes for power reduction, by simplifying the previously tortuous code that was required to allow page cleaning and checkpointing to time-slice in the same process. Patch by me, review by Dickson Guedes.
806a2aee · Simon Riggs · 589adb86 · 806a2aee · 806a2aee · 806a2aee
Commit 806a2aee authored Nov 01, 2011 by Simon Riggs
11 changed files
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -315,6 +315,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			case BgWriterProcess:
 				statmsg = "writer process";
 				break;
+			case CheckpointerProcess:
+				statmsg = "checkpointer process";
+				break;
 			case WalWriterProcess:
 				statmsg = "wal writer process";
 				break;
@@ -415,6 +418,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			BackgroundWriterMain();
 			proc_exit(1);		/* should never return */
+		case CheckpointerProcess:
+			/* don't set signals, checkpointer has its own agenda */
+			CheckpointerMain();
+			proc_exit(1);		/* should never return */
 		case WalWriterProcess:
 			/* don't set signals, walwriter has its own agenda */
 			InitXLOGAccess();

--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \
-	syslogger.o walwriter.o
+	syslogger.o walwriter.o checkpointer.o
 include $(top_srcdir)/src/backend/common.mk
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -10,20 +10,13 @@
 * still empowered to issue writes if the bgwriter fails to maintain enough
 * clean shared buffers.
 *
- * The bgwriter is also charged with handling all checkpoints.	It will
+ * As of Postgres 9.2 the bgwriter no longer handles checkpoints.
- * automatically dispatch a checkpoint after a certain amount of time has
- * elapsed since the last one, and it can be signaled to perform requested
- * checkpoints as well.  (The GUC parameter that mandates a checkpoint every
- * so many WAL segments is implemented by having backends signal the bgwriter
- * when they fill WAL segments; the bgwriter itself doesn't watch for the
- * condition.)
 *
 * The bgwriter is started by the postmaster as soon as the startup subprocess
 * finishes, or as soon as recovery begins if we are doing archive recovery.
 * It remains alive until the postmaster commands it to terminate.
- * Normal termination is by SIGUSR2, which instructs the bgwriter to execute
+ * Normal termination is by SIGTERM, which instructs the bgwriter to exit(0).
- * a shutdown checkpoint and then exit(0).	(All backends must be stopped
+ * Emergency termination is by SIGQUIT; like any
- * before SIGUSR2 is issued!)  Emergency termination is by SIGQUIT; like any
 * backend, the bgwriter will simply abort and exit on SIGQUIT.
 *
 * If the bgwriter exits unexpectedly, the postmaster treats that the same
@@ -54,7 +47,6 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
-#include "replication/syncrep.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
@@ -67,96 +59,15 @@
 #include "utils/resowner.h"
-/*----------
- * Shared memory area for communication between bgwriter and backends
- *
- * The ckpt counters allow backends to watch for completion of a checkpoint
- * request they send.  Here's how it works:
- *	* At start of a checkpoint, bgwriter reads (and clears) the request flags
- *	  and increments ckpt_started, while holding ckpt_lck.
- *	* On completion of a checkpoint, bgwriter sets ckpt_done to
- *	  equal ckpt_started.
- *	* On failure of a checkpoint, bgwriter increments ckpt_failed
- *	  and sets ckpt_done to equal ckpt_started.
- *
- * The algorithm for backends is:
- *	1. Record current values of ckpt_failed and ckpt_started, and
- *	   set request flags, while holding ckpt_lck.
- *	2. Send signal to request checkpoint.
- *	3. Sleep until ckpt_started changes.  Now you know a checkpoint has
- *	   begun since you started this algorithm (although *not* that it was
- *	   specifically initiated by your signal), and that it is using your flags.
- *	4. Record new value of ckpt_started.
- *	5. Sleep until ckpt_done >= saved value of ckpt_started.  (Use modulo
- *	   arithmetic here in case counters wrap around.)  Now you know a
- *	   checkpoint has started and completed, but not whether it was
- *	   successful.
- *	6. If ckpt_failed is different from the originally saved value,
- *	   assume request failed; otherwise it was definitely successful.
- *
- * ckpt_flags holds the OR of the checkpoint request flags sent by all
- * requesting backends since the last checkpoint start.  The flags are
- * chosen so that OR'ing is the correct way to combine multiple requests.
- *
- * num_backend_writes is used to count the number of buffer writes performed
- * by non-bgwriter processes.  This counter should be wide enough that it
- * can't overflow during a single bgwriter cycle.  num_backend_fsync
- * counts the subset of those writes that also had to do their own fsync,
- * because the background writer failed to absorb their request.
- *
- * The requests array holds fsync requests sent by backends and not yet
- * absorbed by the bgwriter.
- *
- * Unlike the checkpoint fields, num_backend_writes, num_backend_fsync, and
- * the requests fields are protected by BgWriterCommLock.
- *----------
- */
-typedef struct
-{
-	RelFileNodeBackend rnode;
-	ForkNumber	forknum;
-	BlockNumber segno;			/* see md.c for special values */
-	/* might add a real request-type field later; not needed yet */
-} BgWriterRequest;
-typedef struct
-{
-	pid_t		bgwriter_pid;	/* PID of bgwriter (0 if not started) */
-	slock_t		ckpt_lck;		/* protects all the ckpt_* fields */
-	int			ckpt_started;	/* advances when checkpoint starts */
-	int			ckpt_done;		/* advances when checkpoint done */
-	int			ckpt_failed;	/* advances when checkpoint fails */
-	int			ckpt_flags;		/* checkpoint flags, as defined in xlog.h */
-	uint32		num_backend_writes;		/* counts non-bgwriter buffer writes */
-	uint32		num_backend_fsync;		/* counts non-bgwriter fsync calls */
-	int			num_requests;	/* current # of requests */
-	int			max_requests;	/* allocated array size */
-	BgWriterRequest requests[1];	/* VARIABLE LENGTH ARRAY */
-} BgWriterShmemStruct;
-static BgWriterShmemStruct *BgWriterShmem;
-/* interval for calling AbsorbFsyncRequests in CheckpointWriteDelay */
-#define WRITES_PER_ABSORB		1000
 /*
 * GUC parameters
 */
 int			BgWriterDelay = 200;
-int			CheckPointTimeout = 300;
-int			CheckPointWarning = 30;
-double		CheckPointCompletionTarget = 0.5;
 /*
 * Flags set by interrupt handlers for later service in the main loop.
 */
 static volatile sig_atomic_t got_SIGHUP = false;
-static volatile sig_atomic_t checkpoint_requested = false;
 static volatile sig_atomic_t shutdown_requested = false;
 /*
@@ -164,29 +75,14 @@ static volatile sig_atomic_t shutdown_requested = false;
 */
 static bool am_bg_writer = false;
-static bool ckpt_active = false;
-/* these values are valid when ckpt_active is true: */
-static pg_time_t ckpt_start_time;
-static XLogRecPtr ckpt_start_recptr;
-static double ckpt_cached_elapsed;
-static pg_time_t last_checkpoint_time;
-static pg_time_t last_xlog_switch_time;
 /* Prototypes for private functions */
-static void CheckArchiveTimeout(void);
 static void BgWriterNap(void);
-static bool IsCheckpointOnSchedule(double progress);
-static bool ImmediateCheckpointRequested(void);
-static bool CompactBgwriterRequestQueue(void);
 /* Signal handlers */
 static void bg_quickdie(SIGNAL_ARGS);
 static void BgSigHupHandler(SIGNAL_ARGS);
-static void ReqCheckpointHandler(SIGNAL_ARGS);
 static void ReqShutdownHandler(SIGNAL_ARGS);
@@ -202,7 +98,6 @@ BackgroundWriterMain(void)
 	sigjmp_buf	local_sigjmp_buf;
 	MemoryContext bgwriter_context;
-	BgWriterShmem->bgwriter_pid = MyProcPid;
 	am_bg_writer = true;
 	/*
@@ -228,13 +123,13 @@ BackgroundWriterMain(void)
 	 * process to participate in ProcSignal signalling.
 	 */
 	pqsignal(SIGHUP, BgSigHupHandler);	/* set flag to read config file */
-	pqsignal(SIGINT, ReqCheckpointHandler);		/* request checkpoint */
+	pqsignal(SIGINT, SIG_IGN);			/* as of 9.2 no longer requests checkpoint */
-	pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
+	pqsignal(SIGTERM, ReqShutdownHandler); 	/* shutdown */
 	pqsignal(SIGQUIT, bg_quickdie);		/* hard crash time */
 	pqsignal(SIGALRM, SIG_IGN);
 	pqsignal(SIGPIPE, SIG_IGN);
 	pqsignal(SIGUSR1, SIG_IGN);			/* reserve for ProcSignal */
-	pqsignal(SIGUSR2, ReqShutdownHandler);		/* request shutdown */
+	pqsignal(SIGUSR2, SIG_IGN);			/* bgwriter no longer acts on SIGUSR2 */
 	/*
 	 * Reset some signals that are accepted by postmaster but not here
@@ -248,11 +143,6 @@ BackgroundWriterMain(void)
 	/* We allow SIGQUIT (quickdie) at all times */
 	sigdelset(&BlockSig, SIGQUIT);
-	/*
-	 * Initialize so that first time-driven event happens at the correct time.
-	 */
-	last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
 	/*
 	 * Create a resource owner to keep track of our resources (currently only
 	 * buffer pins).
@@ -305,20 +195,6 @@ BackgroundWriterMain(void)
 		AtEOXact_Files();
 		AtEOXact_HashTables(false);
-		/* Warn any waiting backends that the checkpoint failed. */
-		if (ckpt_active)
-		{
-			/* use volatile pointer to prevent code rearrangement */
-			volatile BgWriterShmemStruct *bgs = BgWriterShmem;
-			SpinLockAcquire(&bgs->ckpt_lck);
-			bgs->ckpt_failed++;
-			bgs->ckpt_done = bgs->ckpt_started;
-			SpinLockRelease(&bgs->ckpt_lck);
-			ckpt_active = false;
-		}
 		/*
 		 * Now return to normal top-level context and clear ErrorContext for
 		 * next time.
@@ -361,19 +237,11 @@ BackgroundWriterMain(void)
 	if (RecoveryInProgress())
 		ThisTimeLineID = GetRecoveryTargetTLI();
-	/* Do this once before starting the loop, then just at SIGHUP time. */
-	SyncRepUpdateSyncStandbysDefined();
 	/*
 	 * Loop forever
 	 */
 	for (;;)
 	{
-		bool		do_checkpoint = false;
-		int			flags = 0;
-		pg_time_t	now;
-		int			elapsed_secs;
 		/*
 		 * Emergency bailout if postmaster has died.  This is to avoid the
 		 * necessity for manual cleanup of all postmaster children.
@@ -381,23 +249,11 @@ BackgroundWriterMain(void)
 		if (!PostmasterIsAlive())
 			exit(1);
-		/*
-		 * Process any requests or signals received recently.
-		 */
-		AbsorbFsyncRequests();
 		if (got_SIGHUP)
 		{
 			got_SIGHUP = false;
 			ProcessConfigFile(PGC_SIGHUP);
 			/* update global shmem state for sync rep */
-			SyncRepUpdateSyncStandbysDefined();
-		}
-		if (checkpoint_requested)
-		{
-			checkpoint_requested = false;
-			do_checkpoint = true;
-			BgWriterStats.m_requested_checkpoints++;
 		}
 		if (shutdown_requested)
 		{
@@ -406,203 +262,20 @@ BackgroundWriterMain(void)
 			 * control back to the sigsetjmp block above
 			 */
 			ExitOnAnyError = true;
-			/* Close down the database */
-			ShutdownXLOG(0, 0);
 			/* Normal exit from the bgwriter is here */
 			proc_exit(0);		/* done */
 		}
 		/*
-		 * Force a checkpoint if too much time has elapsed since the last one.
+		 * Do one cycle of dirty-buffer writing.
-		 * Note that we count a timed checkpoint in stats only when this
-		 * occurs without an external request, but we set the CAUSE_TIME flag
-		 * bit even if there is also an external request.
-		 */
-		now = (pg_time_t) time(NULL);
-		elapsed_secs = now - last_checkpoint_time;
-		if (elapsed_secs >= CheckPointTimeout)
-		{
-			if (!do_checkpoint)
-				BgWriterStats.m_timed_checkpoints++;
-			do_checkpoint = true;
-			flags |= CHECKPOINT_CAUSE_TIME;
-		}
-		/*
-		 * Do a checkpoint if requested, otherwise do one cycle of
-		 * dirty-buffer writing.
 		 */
-		if (do_checkpoint)
-		{
-			bool		ckpt_performed = false;
-			bool		do_restartpoint;
-			/* use volatile pointer to prevent code rearrangement */
-			volatile BgWriterShmemStruct *bgs = BgWriterShmem;
-			/*
-			 * Check if we should perform a checkpoint or a restartpoint. As a
-			 * side-effect, RecoveryInProgress() initializes TimeLineID if
-			 * it's not set yet.
-			 */
-			do_restartpoint = RecoveryInProgress();
-			/*
-			 * Atomically fetch the request flags to figure out what kind of a
-			 * checkpoint we should perform, and increase the started-counter
-			 * to acknowledge that we've started a new checkpoint.
-			 */
-			SpinLockAcquire(&bgs->ckpt_lck);
-			flags |= bgs->ckpt_flags;
-			bgs->ckpt_flags = 0;
-			bgs->ckpt_started++;
-			SpinLockRelease(&bgs->ckpt_lck);
-			/*
-			 * The end-of-recovery checkpoint is a real checkpoint that's
-			 * performed while we're still in recovery.
-			 */
-			if (flags & CHECKPOINT_END_OF_RECOVERY)
-				do_restartpoint = false;
-			/*
-			 * We will warn if (a) too soon since last checkpoint (whatever
-			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
-			 * since the last checkpoint start.  Note in particular that this
-			 * implementation will not generate warnings caused by
-			 * CheckPointTimeout < CheckPointWarning.
-			 */
-			if (!do_restartpoint &&
-				(flags & CHECKPOINT_CAUSE_XLOG) &&
-				elapsed_secs < CheckPointWarning)
-				ereport(LOG,
-						(errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
-				"checkpoints are occurring too frequently (%d seconds apart)",
-									   elapsed_secs,
-									   elapsed_secs),
-						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
-			/*
-			 * Initialize bgwriter-private variables used during checkpoint.
-			 */
-			ckpt_active = true;
-			if (!do_restartpoint)
-				ckpt_start_recptr = GetInsertRecPtr();
-			ckpt_start_time = now;
-			ckpt_cached_elapsed = 0;
-			/*
-			 * Do the checkpoint.
-			 */
-			if (!do_restartpoint)
-			{
-				CreateCheckPoint(flags);
-				ckpt_performed = true;
-			}
-			else
-				ckpt_performed = CreateRestartPoint(flags);
-			/*
-			 * After any checkpoint, close all smgr files.	This is so we
-			 * won't hang onto smgr references to deleted files indefinitely.
-			 */
-			smgrcloseall();
-			/*
-			 * Indicate checkpoint completion to any waiting backends.
-			 */
-			SpinLockAcquire(&bgs->ckpt_lck);
-			bgs->ckpt_done = bgs->ckpt_started;
-			SpinLockRelease(&bgs->ckpt_lck);
-			if (ckpt_performed)
-			{
-				/*
-				 * Note we record the checkpoint start time not end time as
-				 * last_checkpoint_time.  This is so that time-driven
-				 * checkpoints happen at a predictable spacing.
-				 */
-				last_checkpoint_time = now;
-			}
-			else
-			{
-				/*
-				 * We were not able to perform the restartpoint (checkpoints
-				 * throw an ERROR in case of error).  Most likely because we
-				 * have not received any new checkpoint WAL records since the
-				 * last restartpoint. Try again in 15 s.
-				 */
-				last_checkpoint_time = now - CheckPointTimeout + 15;
-			}
-			ckpt_active = false;
-		}
-		else
 		BgBufferSync();
-		/* Check for archive_timeout and switch xlog files if necessary. */
-		CheckArchiveTimeout();
 		/* Nap for the configured time. */
 		BgWriterNap();
 	}
 }
-/*
- * CheckArchiveTimeout -- check for archive_timeout and switch xlog files
- *
- * This will switch to a new WAL file and force an archive file write
- * if any activity is recorded in the current WAL file, including just
- * a single checkpoint record.
- */
-static void
-CheckArchiveTimeout(void)
-{
-	pg_time_t	now;
-	pg_time_t	last_time;
-	if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
-		return;
-	now = (pg_time_t) time(NULL);
-	/* First we do a quick check using possibly-stale local state. */
-	if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
-		return;
-	/*
-	 * Update local state ... note that last_xlog_switch_time is the last time
-	 * a switch was performed *or requested*.
-	 */
-	last_time = GetLastSegSwitchTime();
-	last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
-	/* Now we can do the real check */
-	if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
-	{
-		XLogRecPtr	switchpoint;
-		/* OK, it's time to switch */
-		switchpoint = RequestXLogSwitch();
-		/*
-		 * If the returned pointer points exactly to a segment boundary,
-		 * assume nothing happened.
-		 */
-		if ((switchpoint.xrecoff % XLogSegSize) != 0)
-			ereport(DEBUG1,
-				(errmsg("transaction log switch forced (archive_timeout=%d)",
-						XLogArchiveTimeout)));
-		/*
-		 * Update state in any case, so we don't retry constantly when the
-		 * system is idle.
-		 */
-		last_xlog_switch_time = now;
-	}
-}
 /*
 * BgWriterNap -- Nap for the configured time or until a signal is received.
 */
@@ -624,185 +297,24 @@ BgWriterNap(void)
 	 * respond reasonably promptly when someone signals us, break down the
 	 * sleep into 1-second increments, and check for interrupts after each
 	 * nap.
-	 *
-	 * We absorb pending requests after each short sleep.
 	 */
-	if (bgwriter_lru_maxpages > 0 || ckpt_active)
+	if (bgwriter_lru_maxpages > 0)
 		udelay = BgWriterDelay * 1000L;
-	else if (XLogArchiveTimeout > 0)
-		udelay = 1000000L;		/* One second */
 	else
 		udelay = 10000000L;		/* Ten seconds */
 	while (udelay > 999999L)
 	{
-		if (got_SIGHUP || shutdown_requested ||
+		if (got_SIGHUP || shutdown_requested)
-		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
 			break;
 		pg_usleep(1000000L);
-		AbsorbFsyncRequests();
 		udelay -= 1000000L;
 	}
-	if (!(got_SIGHUP || shutdown_requested ||
+	if (!(got_SIGHUP || shutdown_requested))
-	  (ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested)))
 		pg_usleep(udelay);
 }
-/*
- * Returns true if an immediate checkpoint request is pending.	(Note that
- * this does not check the *current* checkpoint's IMMEDIATE flag, but whether
- * there is one pending behind it.)
- */
-static bool
-ImmediateCheckpointRequested(void)
-{
-	if (checkpoint_requested)
-	{
-		volatile BgWriterShmemStruct *bgs = BgWriterShmem;
-		/*
-		 * We don't need to acquire the ckpt_lck in this case because we're
-		 * only looking at a single flag bit.
-		 */
-		if (bgs->ckpt_flags & CHECKPOINT_IMMEDIATE)
-			return true;
-	}
-	return false;
-}
-/*
- * CheckpointWriteDelay -- yield control to bgwriter during a checkpoint
- *
- * This function is called after each page write performed by BufferSync().
- * It is responsible for keeping the bgwriter's normal activities in
- * progress during a long checkpoint, and for throttling BufferSync()'s
- * write rate to hit checkpoint_completion_target.
- *
- * The checkpoint request flags should be passed in; currently the only one
- * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
- *
- * 'progress' is an estimate of how much of the work has been done, as a
- * fraction between 0.0 meaning none, and 1.0 meaning all done.
- */
-void
-CheckpointWriteDelay(int flags, double progress)
-{
-	static int	absorb_counter = WRITES_PER_ABSORB;
-	/* Do nothing if checkpoint is being executed by non-bgwriter process */
-	if (!am_bg_writer)
-		return;
-	/*
-	 * Perform the usual bgwriter duties and take a nap, unless we're behind
-	 * schedule, in which case we just try to catch up as quickly as possible.
-	 */
-	if (!(flags & CHECKPOINT_IMMEDIATE) &&
-		!shutdown_requested &&
-		!ImmediateCheckpointRequested() &&
-		IsCheckpointOnSchedule(progress))
-	{
-		if (got_SIGHUP)
-		{
-			got_SIGHUP = false;
-			ProcessConfigFile(PGC_SIGHUP);
-			/* update global shmem state for sync rep */
-			SyncRepUpdateSyncStandbysDefined();
-		}
-		AbsorbFsyncRequests();
-		absorb_counter = WRITES_PER_ABSORB;
-		BgBufferSync();
-		CheckArchiveTimeout();
-		BgWriterNap();
-	}
-	else if (--absorb_counter <= 0)
-	{
-		/*
-		 * Absorb pending fsync requests after each WRITES_PER_ABSORB write
-		 * operations even when we don't sleep, to prevent overflow of the
-		 * fsync request queue.
-		 */
-		AbsorbFsyncRequests();
-		absorb_counter = WRITES_PER_ABSORB;
-	}
-}
-/*
- * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint
- *		 in time?
- *
- * Compares the current progress against the time/segments elapsed since last
- * checkpoint, and returns true if the progress we've made this far is greater
- * than the elapsed time/segments.
- */
-static bool
-IsCheckpointOnSchedule(double progress)
-{
-	XLogRecPtr	recptr;
-	struct timeval now;
-	double		elapsed_xlogs,
-				elapsed_time;
-	Assert(ckpt_active);
-	/* Scale progress according to checkpoint_completion_target. */
-	progress *= CheckPointCompletionTarget;
-	/*
-	 * Check against the cached value first. Only do the more expensive
-	 * calculations once we reach the target previously calculated. Since
-	 * neither time or WAL insert pointer moves backwards, a freshly
-	 * calculated value can only be greater than or equal to the cached value.
-	 */
-	if (progress < ckpt_cached_elapsed)
-		return false;
-	/*
-	 * Check progress against WAL segments written and checkpoint_segments.
-	 *
-	 * We compare the current WAL insert location against the location
-	 * computed before calling CreateCheckPoint. The code in XLogInsert that
-	 * actually triggers a checkpoint when checkpoint_segments is exceeded
-	 * compares against RedoRecptr, so this is not completely accurate.
-	 * However, it's good enough for our purposes, we're only calculating an
-	 * estimate anyway.
-	 */
-	if (!RecoveryInProgress())
-	{
-		recptr = GetInsertRecPtr();
-		elapsed_xlogs =
-			(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
-			 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
-			CheckPointSegments;
-		if (progress < elapsed_xlogs)
-		{
-			ckpt_cached_elapsed = elapsed_xlogs;
-			return false;
-		}
-	}
-	/*
-	 * Check progress against time elapsed and checkpoint_timeout.
-	 */
-	gettimeofday(&now, NULL);
-	elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) +
-					now.tv_usec / 1000000.0) / CheckPointTimeout;
-	if (progress < elapsed_time)
-	{
-		ckpt_cached_elapsed = elapsed_time;
-		return false;
-	}
-	/* It looks like we're on schedule. */
-	return true;
-}
 /* --------------------------------
 *		signal handler routines
 * --------------------------------
@@ -847,441 +359,9 @@ BgSigHupHandler(SIGNAL_ARGS)
 	got_SIGHUP = true;
 }
-/* SIGINT: set flag to run a normal checkpoint right away */
-static void
-ReqCheckpointHandler(SIGNAL_ARGS)
-{
-	checkpoint_requested = true;
-}
 /* SIGUSR2: set flag to run a shutdown checkpoint and exit */
 static void
 ReqShutdownHandler(SIGNAL_ARGS)
 {
 	shutdown_requested = true;
 }
-/* --------------------------------
- *		communication with backends
- * --------------------------------
- */
-/*
- * BgWriterShmemSize
- *		Compute space needed for bgwriter-related shared memory
- */
-Size
-BgWriterShmemSize(void)
-{
-	Size		size;
-	/*
-	 * Currently, the size of the requests[] array is arbitrarily set equal to
-	 * NBuffers.  This may prove too large or small ...
-	 */
-	size = offsetof(BgWriterShmemStruct, requests);
-	size = add_size(size, mul_size(NBuffers, sizeof(BgWriterRequest)));
-	return size;
-}
-/*
- * BgWriterShmemInit
- *		Allocate and initialize bgwriter-related shared memory
- */
-void
-BgWriterShmemInit(void)
-{
-	bool		found;
-	BgWriterShmem = (BgWriterShmemStruct *)
-		ShmemInitStruct("Background Writer Data",
-						BgWriterShmemSize(),
-						&found);
-	if (!found)
-	{
-		/* First time through, so initialize */
-		MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
-		SpinLockInit(&BgWriterShmem->ckpt_lck);
-		BgWriterShmem->max_requests = NBuffers;
-	}
-}
-/*
- * RequestCheckpoint
- *		Called in backend processes to request a checkpoint
- *
- * flags is a bitwise OR of the following:
- *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
- *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
- *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
- *		ignoring checkpoint_completion_target parameter.
- *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
- *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
- *		CHECKPOINT_END_OF_RECOVERY).
- *	CHECKPOINT_WAIT: wait for completion before returning (otherwise,
- *		just signal bgwriter to do it, and return).
- *	CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
- *		(This affects logging, and in particular enables CheckPointWarning.)
- */
-void
-RequestCheckpoint(int flags)
-{
-	/* use volatile pointer to prevent code rearrangement */
-	volatile BgWriterShmemStruct *bgs = BgWriterShmem;
-	int			ntries;
-	int			old_failed,
-				old_started;
-	/*
-	 * If in a standalone backend, just do it ourselves.
-	 */
-	if (!IsPostmasterEnvironment)
-	{
-		/*
-		 * There's no point in doing slow checkpoints in a standalone backend,
-		 * because there's no other backends the checkpoint could disrupt.
-		 */
-		CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE);
-		/*
-		 * After any checkpoint, close all smgr files.	This is so we won't
-		 * hang onto smgr references to deleted files indefinitely.
-		 */
-		smgrcloseall();
-		return;
-	}
-	/*
-	 * Atomically set the request flags, and take a snapshot of the counters.
-	 * When we see ckpt_started > old_started, we know the flags we set here
-	 * have been seen by bgwriter.
-	 *
-	 * Note that we OR the flags with any existing flags, to avoid overriding
-	 * a "stronger" request by another backend.  The flag senses must be
-	 * chosen to make this work!
-	 */
-	SpinLockAcquire(&bgs->ckpt_lck);
-	old_failed = bgs->ckpt_failed;
-	old_started = bgs->ckpt_started;
-	bgs->ckpt_flags |= flags;
-	SpinLockRelease(&bgs->ckpt_lck);
-	/*
-	 * Send signal to request checkpoint.  It's possible that the bgwriter
-	 * hasn't started yet, or is in process of restarting, so we will retry a
-	 * few times if needed.  Also, if not told to wait for the checkpoint to
-	 * occur, we consider failure to send the signal to be nonfatal and merely
-	 * LOG it.
-	 */
-	for (ntries = 0;; ntries++)
-	{
-		if (BgWriterShmem->bgwriter_pid == 0)
-		{
-			if (ntries >= 20)	/* max wait 2.0 sec */
-			{
-				elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
-				"could not request checkpoint because bgwriter not running");
-				break;
-			}
-		}
-		else if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
-		{
-			if (ntries >= 20)	/* max wait 2.0 sec */
-			{
-				elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
-					 "could not signal for checkpoint: %m");
-				break;
-			}
-		}
-		else
-			break;				/* signal sent successfully */
-		CHECK_FOR_INTERRUPTS();
-		pg_usleep(100000L);		/* wait 0.1 sec, then retry */
-	}
-	/*
-	 * If requested, wait for completion.  We detect completion according to
-	 * the algorithm given above.
-	 */
-	if (flags & CHECKPOINT_WAIT)
-	{
-		int			new_started,
-					new_failed;
-		/* Wait for a new checkpoint to start. */
-		for (;;)
-		{
-			SpinLockAcquire(&bgs->ckpt_lck);
-			new_started = bgs->ckpt_started;
-			SpinLockRelease(&bgs->ckpt_lck);
-			if (new_started != old_started)
-				break;
-			CHECK_FOR_INTERRUPTS();
-			pg_usleep(100000L);
-		}
-		/*
-		 * We are waiting for ckpt_done >= new_started, in a modulo sense.
-		 */
-		for (;;)
-		{
-			int			new_done;
-			SpinLockAcquire(&bgs->ckpt_lck);
-			new_done = bgs->ckpt_done;
-			new_failed = bgs->ckpt_failed;
-			SpinLockRelease(&bgs->ckpt_lck);
-			if (new_done - new_started >= 0)
-				break;
-			CHECK_FOR_INTERRUPTS();
-			pg_usleep(100000L);
-		}
-		if (new_failed != old_failed)
-			ereport(ERROR,
-					(errmsg("checkpoint request failed"),
-					 errhint("Consult recent messages in the server log for details.")));
-	}
-}
-/*
- * ForwardFsyncRequest
- *		Forward a file-fsync request from a backend to the bgwriter
- *
- * Whenever a backend is compelled to write directly to a relation
- * (which should be seldom, if the bgwriter is getting its job done),
- * the backend calls this routine to pass over knowledge that the relation
- * is dirty and must be fsync'd before next checkpoint.  We also use this
- * opportunity to count such writes for statistical purposes.
- *
- * segno specifies which segment (not block!) of the relation needs to be
- * fsync'd.  (Since the valid range is much less than BlockNumber, we can
- * use high values for special flags; that's all internal to md.c, which
- * see for details.)
- *
- * To avoid holding the lock for longer than necessary, we normally write
- * to the requests[] queue without checking for duplicates.  The bgwriter
- * will have to eliminate dups internally anyway.  However, if we discover
- * that the queue is full, we make a pass over the entire queue to compact
- * it.	This is somewhat expensive, but the alternative is for the backend
- * to perform its own fsync, which is far more expensive in practice.  It
- * is theoretically possible a backend fsync might still be necessary, if
- * the queue is full and contains no duplicate entries.  In that case, we
- * let the backend know by returning false.
- */
-bool
-ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
-					BlockNumber segno)
-{
-	BgWriterRequest *request;
-	if (!IsUnderPostmaster)
-		return false;			/* probably shouldn't even get here */
-	if (am_bg_writer)
-		elog(ERROR, "ForwardFsyncRequest must not be called in bgwriter");
-	LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
-	/* Count all backend writes regardless of if they fit in the queue */
-	BgWriterShmem->num_backend_writes++;
-	/*
-	 * If the background writer isn't running or the request queue is full,
-	 * the backend will have to perform its own fsync request.	But before
-	 * forcing that to happen, we can try to compact the background writer
-	 * request queue.
-	 */
-	if (BgWriterShmem->bgwriter_pid == 0 ||
-		(BgWriterShmem->num_requests >= BgWriterShmem->max_requests
-		 && !CompactBgwriterRequestQueue()))
-	{
-		/*
-		 * Count the subset of writes where backends have to do their own
-		 * fsync
-		 */
-		BgWriterShmem->num_backend_fsync++;
-		LWLockRelease(BgWriterCommLock);
-		return false;
-	}
-	request = &BgWriterShmem->requests[BgWriterShmem->num_requests++];
-	request->rnode = rnode;
-	request->forknum = forknum;
-	request->segno = segno;
-	LWLockRelease(BgWriterCommLock);
-	return true;
-}
-/*
- * CompactBgwriterRequestQueue
- *		Remove duplicates from the request queue to avoid backend fsyncs.
- *
- * Although a full fsync request queue is not common, it can lead to severe
- * performance problems when it does happen.  So far, this situation has
- * only been observed to occur when the system is under heavy write load,
- * and especially during the "sync" phase of a checkpoint.	Without this
- * logic, each backend begins doing an fsync for every block written, which
- * gets very expensive and can slow down the whole system.
- *
- * Trying to do this every time the queue is full could lose if there
- * aren't any removable entries.  But should be vanishingly rare in
- * practice: there's one queue entry per shared buffer.
- */
-static bool
-CompactBgwriterRequestQueue()
-{
-	struct BgWriterSlotMapping
-	{
-		BgWriterRequest request;
-		int			slot;
-	};
-	int			n,
-				preserve_count;
-	int			num_skipped = 0;
-	HASHCTL		ctl;
-	HTAB	   *htab;
-	bool	   *skip_slot;
-	/* must hold BgWriterCommLock in exclusive mode */
-	Assert(LWLockHeldByMe(BgWriterCommLock));
-	/* Initialize temporary hash table */
-	MemSet(&ctl, 0, sizeof(ctl));
-	ctl.keysize = sizeof(BgWriterRequest);
-	ctl.entrysize = sizeof(struct BgWriterSlotMapping);
-	ctl.hash = tag_hash;
-	htab = hash_create("CompactBgwriterRequestQueue",
-					   BgWriterShmem->num_requests,
-					   &ctl,
-					   HASH_ELEM | HASH_FUNCTION);
-	/* Initialize skip_slot array */
-	skip_slot = palloc0(sizeof(bool) * BgWriterShmem->num_requests);
-	/*
-	 * The basic idea here is that a request can be skipped if it's followed
-	 * by a later, identical request.  It might seem more sensible to work
-	 * backwards from the end of the queue and check whether a request is
-	 * *preceded* by an earlier, identical request, in the hopes of doing less
-	 * copying.  But that might change the semantics, if there's an
-	 * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so
-	 * we do it this way.  It would be possible to be even smarter if we made
-	 * the code below understand the specific semantics of such requests (it
-	 * could blow away preceding entries that would end up being canceled
-	 * anyhow), but it's not clear that the extra complexity would buy us
-	 * anything.
-	 */
-	for (n = 0; n < BgWriterShmem->num_requests; ++n)
-	{
-		BgWriterRequest *request;
-		struct BgWriterSlotMapping *slotmap;
-		bool		found;
-		request = &BgWriterShmem->requests[n];
-		slotmap = hash_search(htab, request, HASH_ENTER, &found);
-		if (found)
-		{
-			skip_slot[slotmap->slot] = true;
-			++num_skipped;
-		}
-		slotmap->slot = n;
-	}
-	/* Done with the hash table. */
-	hash_destroy(htab);
-	/* If no duplicates, we're out of luck. */
-	if (!num_skipped)
-	{
-		pfree(skip_slot);
-		return false;
-	}
-	/* We found some duplicates; remove them. */
-	for (n = 0, preserve_count = 0; n < BgWriterShmem->num_requests; ++n)
-	{
-		if (skip_slot[n])
-			continue;
-		BgWriterShmem->requests[preserve_count++] = BgWriterShmem->requests[n];
-	}
-	ereport(DEBUG1,
-	   (errmsg("compacted fsync request queue from %d entries to %d entries",
-			   BgWriterShmem->num_requests, preserve_count)));
-	BgWriterShmem->num_requests = preserve_count;
-	/* Cleanup. */
-	pfree(skip_slot);
-	return true;
-}
-/*
- * AbsorbFsyncRequests
- *		Retrieve queued fsync requests and pass them to local smgr.
- *
- * This is exported because it must be called during CreateCheckPoint;
- * we have to be sure we have accepted all pending requests just before
- * we start fsync'ing.  Since CreateCheckPoint sometimes runs in
- * non-bgwriter processes, do nothing if not bgwriter.
- */
-void
-AbsorbFsyncRequests(void)
-{
-	BgWriterRequest *requests = NULL;
-	BgWriterRequest *request;
-	int			n;
-	if (!am_bg_writer)
-		return;
-	/*
-	 * We have to PANIC if we fail to absorb all the pending requests (eg,
-	 * because our hashtable runs out of memory).  This is because the system
-	 * cannot run safely if we are unable to fsync what we have been told to
-	 * fsync.  Fortunately, the hashtable is so small that the problem is
-	 * quite unlikely to arise in practice.
-	 */
-	START_CRIT_SECTION();
-	/*
-	 * We try to avoid holding the lock for a long time by copying the request
-	 * array.
-	 */
-	LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
-	/* Transfer write count into pending pgstats message */
-	BgWriterStats.m_buf_written_backend += BgWriterShmem->num_backend_writes;
-	BgWriterStats.m_buf_fsync_backend += BgWriterShmem->num_backend_fsync;
-	BgWriterShmem->num_backend_writes = 0;
-	BgWriterShmem->num_backend_fsync = 0;
-	n = BgWriterShmem->num_requests;
-	if (n > 0)
-	{
-		requests = (BgWriterRequest *) palloc(n * sizeof(BgWriterRequest));
-		memcpy(requests, BgWriterShmem->requests, n * sizeof(BgWriterRequest));
-	}
-	BgWriterShmem->num_requests = 0;
-	LWLockRelease(BgWriterCommLock);
-	for (request = requests; n > 0; request++, n--)
-		RememberFsyncRequest(request->rnode, request->forknum, request->segno);
-	if (requests)
-		pfree(requests);
-	END_CRIT_SECTION();
-}
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -208,6 +208,7 @@ char 		*output_config_variable = NULL;
 /* PIDs of special child processes; 0 when not running */
 static pid_t StartupPID = 0,
 			BgWriterPID = 0,
+			CheckpointerPID = 0,
 			WalWriterPID = 0,
 			WalReceiverPID = 0,
 			AutoVacPID = 0,
@@ -279,7 +280,7 @@ typedef enum
 	PM_WAIT_BACKUP,				/* waiting for online backup mode to end */
 	PM_WAIT_READONLY,			/* waiting for read only backends to exit */
 	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
-	PM_SHUTDOWN,				/* waiting for bgwriter to do shutdown ckpt */
+	PM_SHUTDOWN,				/* waiting for checkpointer to do shutdown ckpt */
 	PM_SHUTDOWN_2,				/* waiting for archiver and walsenders to
 								 * finish */
 	PM_WAIT_DEAD_END,			/* waiting for dead_end children to exit */
@@ -465,6 +466,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define StartupDataBase()		StartChildProcess(StartupProcess)
 #define StartBackgroundWriter() StartChildProcess(BgWriterProcess)
+#define StartCheckpointer()		StartChildProcess(CheckpointerProcess)
 #define StartWalWriter()		StartChildProcess(WalWriterProcess)
 #define StartWalReceiver()		StartChildProcess(WalReceiverProcess)
@@ -1028,8 +1030,8 @@ PostmasterMain(int argc, char *argv[])
 	 * CAUTION: when changing this list, check for side-effects on the signal
 	 * handling setup of child processes.  See tcop/postgres.c,
 	 * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c,
-	 * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, and
+	 * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c,
-	 * postmaster/syslogger.c.
+	 * postmaster/syslogger.c, and postmaster/checkpointer.c.
 	 */
 	pqinitmask();
 	PG_SETMASK(&BlockSig);
@@ -1366,10 +1368,14 @@ ServerLoop(void)
 		 * state that prevents it, start one.  It doesn't matter if this
 		 * fails, we'll just try again later.
 		 */
-		if (BgWriterPID == 0 &&
+		if (pmState == PM_RUN || pmState == PM_RECOVERY ||
-			(pmState == PM_RUN || pmState == PM_RECOVERY ||
+			 pmState == PM_HOT_STANDBY)
-			 pmState == PM_HOT_STANDBY))
+		{
+			if (BgWriterPID == 0)
 				BgWriterPID = StartBackgroundWriter();
+			if (CheckpointerPID == 0)
+				CheckpointerPID = StartCheckpointer();
+		}
 		/*
 		 * Likewise, if we have lost the walwriter process, try to start a new
@@ -2047,6 +2053,8 @@ SIGHUP_handler(SIGNAL_ARGS)
 			signal_child(StartupPID, SIGHUP);
 		if (BgWriterPID != 0)
 			signal_child(BgWriterPID, SIGHUP);
+		if (CheckpointerPID != 0)
+			signal_child(CheckpointerPID, SIGHUP);
 		if (WalWriterPID != 0)
 			signal_child(WalWriterPID, SIGHUP);
 		if (WalReceiverPID != 0)
@@ -2119,6 +2127,8 @@ pmdie(SIGNAL_ARGS)
 				/* and the walwriter too */
 				if (WalWriterPID != 0)
 					signal_child(WalWriterPID, SIGTERM);
+				if (BgWriterPID != 0)
+					signal_child(BgWriterPID, SIGTERM);
 				/*
 				 * If we're in recovery, we can't kill the startup process
@@ -2159,9 +2169,11 @@ pmdie(SIGNAL_ARGS)
 				signal_child(StartupPID, SIGTERM);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGTERM);
+			if (BgWriterPID != 0)
+				signal_child(BgWriterPID, SIGTERM);
 			if (pmState == PM_RECOVERY)
 			{
-				/* only bgwriter is active in this state */
+				/* only checkpointer is active in this state */
 				pmState = PM_WAIT_BACKENDS;
 			}
 			else if (pmState == PM_RUN ||
@@ -2206,6 +2218,8 @@ pmdie(SIGNAL_ARGS)
 				signal_child(StartupPID, SIGQUIT);
 			if (BgWriterPID != 0)
 				signal_child(BgWriterPID, SIGQUIT);
+			if (CheckpointerPID != 0)
+				signal_child(CheckpointerPID, SIGQUIT);
 			if (WalWriterPID != 0)
 				signal_child(WalWriterPID, SIGQUIT);
 			if (WalReceiverPID != 0)
@@ -2336,12 +2350,14 @@ reaper(SIGNAL_ARGS)
 			}
 			/*
-			 * Crank up the background writer, if we didn't do that already
+			 * Crank up background tasks, if we didn't do that already
 			 * when we entered consistent recovery state.  It doesn't matter
 			 * if this fails, we'll just try again later.
 			 */
 			if (BgWriterPID == 0)
 				BgWriterPID = StartBackgroundWriter();
+			if (CheckpointerPID == 0)
+				CheckpointerPID = StartCheckpointer();
 			/*
 			 * Likewise, start other special children as needed.  In a restart
@@ -2369,10 +2385,22 @@ reaper(SIGNAL_ARGS)
 		if (pid == BgWriterPID)
 		{
 			BgWriterPID = 0;
+			if (!EXIT_STATUS_0(exitstatus))
+				HandleChildCrash(pid, exitstatus,
+								 _("background writer process"));
+			continue;
+		}
+		/*
+		 * Was it the checkpointer?
+		 */
+		if (pid == CheckpointerPID)
+		{
+			CheckpointerPID = 0;
 			if (EXIT_STATUS_0(exitstatus) && pmState == PM_SHUTDOWN)
 			{
 				/*
-				 * OK, we saw normal exit of the bgwriter after it's been told
+				 * OK, we saw normal exit of the checkpointer after it's been told
 				 * to shut down.  We expect that it wrote a shutdown
 				 * checkpoint.	(If for some reason it didn't, recovery will
 				 * occur on next postmaster start.)
@@ -2409,11 +2437,11 @@ reaper(SIGNAL_ARGS)
 			else
 			{
 				/*
-				 * Any unexpected exit of the bgwriter (including FATAL exit)
+				 * Any unexpected exit of the checkpointer (including FATAL exit)
 				 * is treated as a crash.
 				 */
 				HandleChildCrash(pid, exitstatus,
-								 _("background writer process"));
+								 _("checkpointer process"));
 			}
 			continue;
@@ -2597,8 +2625,8 @@ CleanupBackend(int pid,
 }
 /*
- * HandleChildCrash -- cleanup after failed backend, bgwriter, walwriter,
+ * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer,
- * or autovacuum.
+ * walwriter or autovacuum.
 *
 * The objectives here are to clean up our local state about the child
 * process, and to signal all other remaining children to quickdie.
@@ -2691,6 +2719,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
 	}
+	/* Take care of the checkpointer too */
+	if (pid == CheckpointerPID)
+		CheckpointerPID = 0;
+	else if (CheckpointerPID != 0 && !FatalError)
+	{
+		ereport(DEBUG2,
+				(errmsg_internal("sending %s to process %d",
+								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+								 (int) CheckpointerPID)));
+		signal_child(CheckpointerPID, (SendStop ? SIGSTOP : SIGQUIT));
+	}
 	/* Take care of the walwriter too */
 	if (pid == WalWriterPID)
 		WalWriterPID = 0;
@@ -2887,9 +2927,10 @@ PostmasterStateMachine(void)
 	{
 		/*
 		 * PM_WAIT_BACKENDS state ends when we have no regular backends
-		 * (including autovac workers) and no walwriter or autovac launcher.
+		 * (including autovac workers) and no walwriter, autovac launcher
-		 * If we are doing crash recovery then we expect the bgwriter to exit
+		 * or bgwriter.  If we are doing crash recovery then we expect the
-		 * too, otherwise not.	The archiver, stats, and syslogger processes
+		 * checkpointer to exit as well, otherwise not.
+		 * The archiver, stats, and syslogger processes
 		 * are disregarded since they are not connected to shared memory; we
 		 * also disregard dead_end children here. Walsenders are also
 		 * disregarded, they will be terminated later after writing the
@@ -2898,7 +2939,8 @@ PostmasterStateMachine(void)
 		if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 &&
 			StartupPID == 0 &&
 			WalReceiverPID == 0 &&
-			(BgWriterPID == 0 || !FatalError) &&
+			BgWriterPID == 0 &&
+			(CheckpointerPID == 0 || !FatalError) &&
 			WalWriterPID == 0 &&
 			AutoVacPID == 0)
 		{
@@ -2920,22 +2962,22 @@ PostmasterStateMachine(void)
 				/*
 				 * If we get here, we are proceeding with normal shutdown. All
 				 * the regular children are gone, and it's time to tell the
-				 * bgwriter to do a shutdown checkpoint.
+				 * checkpointer to do a shutdown checkpoint.
 				 */
 				Assert(Shutdown > NoShutdown);
-				/* Start the bgwriter if not running */
+				/* Start the checkpointer if not running */
-				if (BgWriterPID == 0)
+				if (CheckpointerPID == 0)
-					BgWriterPID = StartBackgroundWriter();
+					CheckpointerPID = StartCheckpointer();
 				/* And tell it to shut down */
-				if (BgWriterPID != 0)
+				if (CheckpointerPID != 0)
 				{
-					signal_child(BgWriterPID, SIGUSR2);
+					signal_child(CheckpointerPID, SIGUSR2);
 					pmState = PM_SHUTDOWN;
 				}
 				else
 				{
 					/*
-					 * If we failed to fork a bgwriter, just shut down. Any
+					 * If we failed to fork a checkpointer, just shut down. Any
 					 * required cleanup will happen at next restart. We set
 					 * FatalError so that an "abnormal shutdown" message gets
 					 * logged when we exit.
@@ -2994,6 +3036,7 @@ PostmasterStateMachine(void)
 			Assert(StartupPID == 0);
 			Assert(WalReceiverPID == 0);
 			Assert(BgWriterPID == 0);
+			Assert(CheckpointerPID == 0);
 			Assert(WalWriterPID == 0);
 			Assert(AutoVacPID == 0);
 			/* syslogger is not considered here */
@@ -4173,6 +4216,8 @@ sigusr1_handler(SIGNAL_ARGS)
 		 */
 		Assert(BgWriterPID == 0);
 		BgWriterPID = StartBackgroundWriter();
+		Assert(CheckpointerPID == 0);
+		CheckpointerPID = StartCheckpointer();
 		pmState = PM_RECOVERY;
 	}
@@ -4459,6 +4504,10 @@ StartChildProcess(AuxProcType type)
 				ereport(LOG,
 				   (errmsg("could not fork background writer process: %m")));
 				break;
+			case CheckpointerProcess:
+				ereport(LOG,
+				   (errmsg("could not fork checkpointer process: %m")));
+				break;
 			case WalWriterProcess:
 				ereport(LOG,
 						(errmsg("could not fork WAL writer process: %m")));

--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1278,11 +1278,9 @@ BufferSync(int flags)
 					break;
 				/*
-				 * Perform normal bgwriter duties and sleep to throttle our
+				 * Sleep to throttle our I/O rate.
-				 * I/O rate.
 				 */
-				CheckpointWriteDelay(flags,
+				CheckpointWriteDelay(flags, (double) num_written / num_to_write);
-									 (double) num_written / num_to_write);
 			}
 		}

--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -38,7 +38,7 @@
 /*
 * Special values for the segno arg to RememberFsyncRequest.
 *
- * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an
+ * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
 * fsync request from the queue if an identical, subsequent request is found.
 * See comments there before making changes here.
 */
@@ -77,7 +77,7 @@
 *	Inactive segments are those that once contained data but are currently
 *	not needed because of an mdtruncate() operation.  The reason for leaving
 *	them present at size zero, rather than unlinking them, is that other
- *	backends and/or the bgwriter might be holding open file references to
+ *	backends and/or the checkpointer might be holding open file references to
 *	such segments.	If the relation expands again after mdtruncate(), such
 *	that a deactivated segment becomes active again, it is important that
 *	such file references still be valid --- else data might get written
@@ -111,7 +111,7 @@ static MemoryContext MdCxt;		/* context for all md.c allocations */
 /*
- * In some contexts (currently, standalone backends and the bgwriter process)
+ * In some contexts (currently, standalone backends and the checkpointer process)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
@@ -123,7 +123,7 @@ static MemoryContext MdCxt;		/* context for all md.c allocations */
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * (Regular backends do not track pending operations locally, but forward
- * them to the bgwriter.)
+ * them to the checkpointer.)
 */
 typedef struct
 {
@@ -194,7 +194,7 @@ mdinit(void)
 	 * Create pending-operations hashtable if we need it.  Currently, we need
 	 * it if we are standalone (not under a postmaster) OR if we are a
 	 * bootstrap-mode subprocess of a postmaster (that is, a startup or
-	 * bgwriter process).
+	 * checkpointer process).
 	 */
 	if (!IsUnderPostmaster || IsBootstrapProcessingMode())
 	{
@@ -214,10 +214,10 @@ mdinit(void)
 }
 /*
- * In archive recovery, we rely on bgwriter to do fsyncs, but we will have
+ * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
 * already created the pendingOpsTable during initialization of the startup
 * process.  Calling this function drops the local pendingOpsTable so that
- * subsequent requests will be forwarded to bgwriter.
+ * subsequent requests will be forwarded to checkpointer.
 */
 void
 SetForwardFsyncRequests(void)
@@ -765,9 +765,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 	 * NOTE: this assumption could only be wrong if another backend has
 	 * truncated the relation.	We rely on higher code levels to handle that
 	 * scenario by closing and re-opening the md fd, which is handled via
-	 * relcache flush.	(Since the bgwriter doesn't participate in relcache
+	 * relcache flush.	(Since the checkpointer doesn't participate in relcache
 	 * flush, it could have segment chain entries for inactive segments;
-	 * that's OK because the bgwriter never needs to compute relation size.)
+	 * that's OK because the checkpointer never needs to compute relation size.)
 	 */
 	while (v->mdfd_chain != NULL)
 	{
@@ -957,7 +957,7 @@ mdsync(void)
 		elog(ERROR, "cannot sync without a pendingOpsTable");
 	/*
-	 * If we are in the bgwriter, the sync had better include all fsync
+	 * If we are in the checkpointer, the sync had better include all fsync
 	 * requests that were queued by backends up to this point.	The tightest
 	 * race condition that could occur is that a buffer that must be written
 	 * and fsync'd for the checkpoint could have been dumped by a backend just
@@ -1033,7 +1033,7 @@ mdsync(void)
 			int			failures;
 			/*
-			 * If in bgwriter, we want to absorb pending requests every so
+			 * If in checkpointer, we want to absorb pending requests every so
 			 * often to prevent overflow of the fsync request queue.  It is
 			 * unspecified whether newly-added entries will be visited by
 			 * hash_seq_search, but we don't care since we don't need to
@@ -1070,9 +1070,9 @@ mdsync(void)
 				 * say "but an unreferenced SMgrRelation is still a leak!" Not
 				 * really, because the only case in which a checkpoint is done
 				 * by a process that isn't about to shut down is in the
-				 * bgwriter, and it will periodically do smgrcloseall(). This
+				 * checkpointer, and it will periodically do smgrcloseall(). This
 				 * fact justifies our not closing the reln in the success path
-				 * either, which is a good thing since in non-bgwriter cases
+				 * either, which is a good thing since in non-checkpointer cases
 				 * we couldn't safely do that.)  Furthermore, in many cases
 				 * the relation will have been dirtied through this same smgr
 				 * relation, and so we can save a file open/close cycle.
@@ -1301,7 +1301,7 @@ register_unlink(RelFileNodeBackend rnode)
 	else
 	{
 		/*
-		 * Notify the bgwriter about it.  If we fail to queue the request
+		 * Notify the checkpointer about it.  If we fail to queue the request
 		 * message, we have to sleep and try again, because we can't simply
 		 * delete the file now.  Ugly, but hopefully won't happen often.
 		 *
@@ -1315,10 +1315,10 @@ register_unlink(RelFileNodeBackend rnode)
 }
 /*
- * RememberFsyncRequest() -- callback from bgwriter side of fsync request
+ * RememberFsyncRequest() -- callback from checkpointer side of fsync request
 *
 * We stuff most fsync requests into the local hash table for execution
- * during the bgwriter's next checkpoint.  UNLINK requests go into a
+ * during the checkpointer's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * The range of possible segment numbers is way less than the range of
@@ -1460,20 +1460,20 @@ ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum)
 	else if (IsUnderPostmaster)
 	{
 		/*
-		 * Notify the bgwriter about it.  If we fail to queue the revoke
+		 * Notify the checkpointer about it.  If we fail to queue the revoke
 		 * message, we have to sleep and try again ... ugly, but hopefully
 		 * won't happen often.
 		 *
 		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
 		 * error would leave the no-longer-used file still present on disk,
-		 * which would be bad, so I'm inclined to assume that the bgwriter
+		 * which would be bad, so I'm inclined to assume that the checkpointer
 		 * will always empty the queue soon.
 		 */
 		while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
 			pg_usleep(10000L);	/* 10 msec seems a good number */
 		/*
-		 * Note we don't wait for the bgwriter to actually absorb the revoke
+		 * Note we don't wait for the checkpointer to actually absorb the revoke
 		 * message; see mdsync() for the implications.
 		 */
 	}

--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -256,7 +256,7 @@ typedef struct RmgrData
 extern const RmgrData RmgrTable[];
 /*
- * Exported to support xlog switching from bgwriter
+ * Exported to support xlog switching from checkpointer
 */
 extern pg_time_t GetLastSegSwitchTime(void);
 extern XLogRecPtr RequestXLogSwitch(void);

--- a/src/include/bootstrap/bootstrap.h
+++ b/src/include/bootstrap/bootstrap.h
@@ -22,6 +22,7 @@ typedef enum
 	BootstrapProcess,
 	StartupProcess,
 	BgWriterProcess,
+	CheckpointerProcess,
 	WalWriterProcess,
 	WalReceiverProcess,

--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -23,6 +23,7 @@ extern int	CheckPointWarning;
 extern double CheckPointCompletionTarget;
 extern void BackgroundWriterMain(void);
+extern void CheckpointerMain(void);
 extern void RequestCheckpoint(int flags);
 extern void CheckpointWriteDelay(int flags, double progress);

--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -190,11 +190,11 @@ extern PROC_HDR *ProcGlobal;
 * We set aside some extra PGPROC structures for auxiliary processes,
 * ie things that aren't full-fledged backends but need shmem access.
 *
- * Background writer and WAL writer run during normal operation. Startup
+ * Background writer, checkpointer and WAL writer run during normal operation.
- * process and WAL receiver also consume 2 slots, but WAL writer is
+ * Startup process and WAL receiver also consume 2 slots, but WAL writer is
- * launched only after startup has exited, so we only need 3 slots.
+ * launched only after startup has exited, so we only need 4 slots.
 */
-#define NUM_AUXILIARY_PROCS		3
+#define NUM_AUXILIARY_PROCS		4
 /* configurable options */

--- a/src/include/storage/procsignal.h
+++ b/src/include/storage/procsignal.h
@@ -19,7 +19,7 @@
 /*
 * Reasons for signalling a Postgres child process (a backend or an auxiliary
- * process, like bgwriter).  We can cope with concurrent signals for different
+ * process, like checkpointer).  We can cope with concurrent signals for different
 * reasons.  However, if the same reason is signaled multiple times in quick
 * succession, the process is likely to observe only one notification of it.
 * This is okay for the present uses.