Commit 9cd00c45 authored by Andres Freund

Checkpoint sorting and balancing.

Up to now, buffers were written out during checkpoints in the order
they appear in the BufferDescriptors. That's nearly random in a lot of
cases, which performs badly on rotating media, but even on SSDs it
causes slowdowns.

To avoid that, sort the to-be-checkpointed buffers before writing them
out. We currently sort by tablespace, relfilenode, fork and block number.

One of the major reasons this wasn't done previously was fear of
imbalance between tablespaces. To address that, balance writes between
tablespaces.

The other prime concern was that the relatively large allocation to sort
the buffers in might fail, preventing checkpoints from happening. Thus,
pre-allocate the required memory in shared memory at server startup.

In particular, this makes it more efficient to have checkpoint flushing
enabled, because sorting will often result in a lot of writes that can be
coalesced into one flush.

Discussion: alpine.DEB.2.10.1506011320000.28433@sto
Author: Fabien Coelho and Andres Freund
parent 428b1d6b
...@@ -267,11 +267,6 @@ only needs to take the lock long enough to read the variable value, not ...@@ -267,11 +267,6 @@ only needs to take the lock long enough to read the variable value, not
while scanning the buffers. (This is a very substantial improvement in while scanning the buffers. (This is a very substantial improvement in
the contention cost of the writer compared to PG 8.0.) the contention cost of the writer compared to PG 8.0.)
During a checkpoint, the writer's strategy must be to write every dirty
buffer (pinned or not!). We may as well make it start this scan from
nextVictimBuffer, however, so that the first-to-be-written pages are the
ones that backends might otherwise have to write for themselves soon.
The background writer takes shared content lock on a buffer while writing it The background writer takes shared content lock on a buffer while writing it
out (and anyone else who flushes buffer contents to disk must do so too). out (and anyone else who flushes buffer contents to disk must do so too).
This ensures that the page image transferred to disk is reasonably consistent. This ensures that the page image transferred to disk is reasonably consistent.
......
...@@ -24,6 +24,7 @@ LWLockMinimallyPadded *BufferIOLWLockArray = NULL; ...@@ -24,6 +24,7 @@ LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
LWLockTranche BufferIOLWLockTranche; LWLockTranche BufferIOLWLockTranche;
LWLockTranche BufferContentLWLockTranche; LWLockTranche BufferContentLWLockTranche;
WritebackContext BackendWritebackContext; WritebackContext BackendWritebackContext;
CkptSortItem *CkptBufferIds;
/* /*
...@@ -70,7 +71,8 @@ InitBufferPool(void) ...@@ -70,7 +71,8 @@ InitBufferPool(void)
{ {
bool foundBufs, bool foundBufs,
foundDescs, foundDescs,
foundIOLocks; foundIOLocks,
foundBufCkpt;
/* Align descriptors to a cacheline boundary. */ /* Align descriptors to a cacheline boundary. */
BufferDescriptors = (BufferDescPadded *) BufferDescriptors = (BufferDescPadded *)
...@@ -104,10 +106,21 @@ InitBufferPool(void) ...@@ -104,10 +106,21 @@ InitBufferPool(void)
LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT, LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
&BufferContentLWLockTranche); &BufferContentLWLockTranche);
if (foundDescs || foundBufs || foundIOLocks) /*
* The array used to sort to-be-checkpointed buffer ids is located in
* shared memory, to avoid having to allocate significant amounts of
* memory at runtime. As that'd be in the middle of a checkpoint, or when
* the checkpointer is restarted, memory allocation failures would be
* painful.
*/
CkptBufferIds = (CkptSortItem *)
ShmemInitStruct("Checkpoint BufferIds",
NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt)
{ {
/* should find all of these, or none of them */ /* should find all of these, or none of them */
Assert(foundDescs && foundBufs && foundIOLocks); Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt);
/* note: this path is only taken in EXEC_BACKEND case */ /* note: this path is only taken in EXEC_BACKEND case */
} }
else else
...@@ -190,5 +203,8 @@ BufferShmemSize(void) ...@@ -190,5 +203,8 @@ BufferShmemSize(void)
/* to allow aligning the above */ /* to allow aligning the above */
size = add_size(size, PG_CACHE_LINE_SIZE); size = add_size(size, PG_CACHE_LINE_SIZE);
/* size of checkpoint sort array in bufmgr.c */
size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
return size; return size;
} }
This diff is collapsed.
...@@ -239,6 +239,24 @@ extern PGDLLIMPORT WritebackContext BackendWritebackContext; ...@@ -239,6 +239,24 @@ extern PGDLLIMPORT WritebackContext BackendWritebackContext;
/* in localbuf.c */ /* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors; extern BufferDesc *LocalBufferDescriptors;
/* in bufmgr.c */
/*
 * Structure to sort buffers per file on checkpoints.
 *
 * One entry describes one to-be-checkpointed buffer. An array of these
 * (CkptBufferIds, NBuffers entries) is pre-allocated in shared memory at
 * startup, so that no large allocation is needed while a checkpoint is
 * running.
 *
 * This structure is allocated per buffer in shared memory, so it should be
 * kept as small as possible.
 *
 * NOTE(review): the field order below appears to mirror the sort order
 * (tablespace, relfilenode, fork, block number); the comparator itself
 * lives in bufmgr.c and is not visible here -- confirm there.
 */
typedef struct CkptSortItem
{
/* tablespace the buffer's page belongs to (presumably the tablespace OID) */
Oid tsId;
/* relfilenode of the relation the page belongs to */
Oid relNode;
/* fork of the relation the page belongs to */
ForkNumber forkNum;
/* block number of the page within that fork */
BlockNumber blockNum;
/* id of the buffer holding the page (presumably the buffer descriptor index) */
int buf_id;
} CkptSortItem;
extern CkptSortItem *CkptBufferIds;
/* /*
* Internal buffer management routines * Internal buffer management routines
......
...@@ -283,6 +283,8 @@ CheckpointerRequest ...@@ -283,6 +283,8 @@ CheckpointerRequest
CheckpointerShmemStruct CheckpointerShmemStruct
Chromosome Chromosome
City City
CkptSortItem
CkptTsStatus
ClientAuthentication_hook_type ClientAuthentication_hook_type
ClientData ClientData
ClonePtr ClonePtr
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment