Commit d8725104 authored by Thomas Munro's avatar Thomas Munro

Replace buffer I/O locks with condition variables.

1.  Backends waiting for buffer I/O are now interruptible.

2.  If something goes wrong in a backend that is currently performing
I/O, waiting backends no longer wake up until that backend reaches
AbortBufferIO() and broadcasts on the CV.  Previously, any waiters would
wake up (because the I/O lock was automatically released) and then
busy-loop until AbortBufferIO() cleared BM_IO_IN_PROGRESS.

3.  LWLockMinimallyPadded is removed, as it would now be unused.

Author: Robert Haas <robertmhaas@gmail.com>
Reviewed-by: default avatarThomas Munro <thomas.munro@gmail.com>
Reviewed-by: default avatarJulien Rouhaud <rjuju123@gmail.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (earlier version, 2016)
Discussion: https://postgr.es/m/CA%2BhUKGJ8nBFrjLuCTuqKN0pd2PQOwj9b_jnsiGFFMDvUxahj_A%40mail.gmail.com
Discussion: https://postgr.es/m/CA+Tgmoaj2aPti0yho7FeEf2qt-JgQPRWb0gci_o1Hfr=C56Xng@mail.gmail.com
parent c3ffe348
......@@ -1586,6 +1586,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry>Waiting for the page number needed to continue a parallel B-tree
scan to become available.</entry>
</row>
<row>
<entry><literal>BufferIO</literal></entry>
<entry>Waiting for buffer I/O to complete.</entry>
</row>
<row>
<entry><literal>CheckpointDone</literal></entry>
<entry>Waiting for a checkpoint to complete.</entry>
......@@ -1876,10 +1880,6 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry><literal>BufferContent</literal></entry>
<entry>Waiting to access a data page in memory.</entry>
</row>
<row>
<entry><literal>BufferIO</literal></entry>
<entry>Waiting for I/O on a data page.</entry>
</row>
<row>
<entry><literal>BufferMapping</literal></entry>
<entry>Waiting to associate a data block with a buffer in the buffer
......
......@@ -4010,6 +4010,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
case WAIT_EVENT_BTREE_PAGE:
event_name = "BtreePage";
break;
case WAIT_EVENT_BUFFER_IO:
event_name = "BufferIO";
break;
case WAIT_EVENT_CHECKPOINT_DONE:
event_name = "CheckpointDone";
break;
......
......@@ -145,14 +145,11 @@ held within the buffer. Each buffer header also contains an LWLock, the
"buffer content lock", that *does* represent the right to access the data
in the buffer. It is used per the rules above.
There is yet another set of per-buffer LWLocks, the io_in_progress locks,
that are used to wait for I/O on a buffer to complete. The process doing
a read or write takes exclusive lock for the duration, and processes that
need to wait for completion try to take shared locks (which they release
immediately upon obtaining). XXX on systems where an LWLock represents
nontrivial resources, it's fairly annoying to need so many locks. Possibly
we could use per-backend LWLocks instead (a buffer header would then contain
a field to show which backend is doing its I/O).
* The BM_IO_IN_PROGRESS flag acts as a kind of lock, used to wait for I/O on a
buffer to complete (and in releases before 14, it was accompanied by a
per-buffer LWLock). The process doing a read or write sets the flag for the
duration, and processes that need to wait for it to be cleared sleep on a
condition variable.
Normal Buffer Replacement Strategy
......
......@@ -19,7 +19,7 @@
BufferDescPadded *BufferDescriptors;
char *BufferBlocks;
LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
ConditionVariableMinimallyPadded *BufferIOCVArray;
WritebackContext BackendWritebackContext;
CkptSortItem *CkptBufferIds;
......@@ -68,7 +68,7 @@ InitBufferPool(void)
{
bool foundBufs,
foundDescs,
foundIOLocks,
foundIOCV,
foundBufCkpt;
/* Align descriptors to a cacheline boundary. */
......@@ -81,11 +81,11 @@ InitBufferPool(void)
ShmemInitStruct("Buffer Blocks",
NBuffers * (Size) BLCKSZ, &foundBufs);
/* Align lwlocks to cacheline boundary */
BufferIOLWLockArray = (LWLockMinimallyPadded *)
ShmemInitStruct("Buffer IO Locks",
NBuffers * (Size) sizeof(LWLockMinimallyPadded),
&foundIOLocks);
/* Align condition variables to cacheline boundary. */
BufferIOCVArray = (ConditionVariableMinimallyPadded *)
ShmemInitStruct("Buffer IO Condition Variables",
NBuffers * sizeof(ConditionVariableMinimallyPadded),
&foundIOCV);
/*
* The array used to sort to-be-checkpointed buffer ids is located in
......@@ -98,10 +98,10 @@ InitBufferPool(void)
ShmemInitStruct("Checkpoint BufferIds",
NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt)
if (foundDescs || foundBufs || foundIOCV || foundBufCkpt)
{
/* should find all of these, or none of them */
Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt);
Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt);
/* note: this path is only taken in EXEC_BACKEND case */
}
else
......@@ -131,8 +131,7 @@ InitBufferPool(void)
LWLockInitialize(BufferDescriptorGetContentLock(buf),
LWTRANCHE_BUFFER_CONTENT);
LWLockInitialize(BufferDescriptorGetIOLock(buf),
LWTRANCHE_BUFFER_IO);
ConditionVariableInit(BufferDescriptorGetIOCV(buf));
}
/* Correct last entry of linked list */
......@@ -169,16 +168,9 @@ BufferShmemSize(void)
/* size of stuff controlled by freelist.c */
size = add_size(size, StrategyShmemSize());
/*
* It would be nice to include the I/O locks in the BufferDesc, but that
* would increase the size of a BufferDesc to more than one cache line,
* and benchmarking has shown that keeping every BufferDesc aligned on a
* cache line boundary is important for performance. So, instead, the
* array of I/O locks is allocated in a separate tranche. Because those
* locks are not highly contended, we lay out the array with minimal
* padding.
*/
size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
/* size of I/O condition variables */
size = add_size(size, mul_size(NBuffers,
sizeof(ConditionVariableMinimallyPadded)));
/* to allow aligning the above */
size = add_size(size, PG_CACHE_LINE_SIZE);
......
......@@ -1352,9 +1352,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
LWLockRelease(newPartitionLock);
/*
* Buffer contents are currently invalid. Try to get the io_in_progress
* lock. If StartBufferIO returns false, then someone else managed to
* read it before we did, so there's nothing left for BufferAlloc() to do.
* Buffer contents are currently invalid. Try to obtain the right to
* start I/O. If StartBufferIO returns false, then someone else managed
* to read it before we did, so there's nothing left for BufferAlloc() to
* do.
*/
if (StartBufferIO(buf, true))
*foundPtr = false;
......@@ -1777,9 +1778,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
*/
VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
/* I'd better not still hold any locks on the buffer */
/* I'd better not still hold the buffer content lock */
Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
/*
* Decrement the shared reference count.
......@@ -2742,9 +2742,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
uint32 buf_state;
/*
* Acquire the buffer's io_in_progress lock. If StartBufferIO returns
* false, then someone else flushed the buffer before we could, so we need
* not do anything.
* Try to start an I/O operation. If StartBufferIO returns false, then
* someone else flushed the buffer before we could, so we need not do
* anything.
*/
if (!StartBufferIO(buf, false))
return;
......@@ -2800,7 +2800,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
/*
* Now it's safe to write buffer to disk. Note that no one else should
* have been able to write it while we were busy with log flushing because
* we have the io_in_progress lock.
* only one process at a time can set the BM_IO_IN_PROGRESS bit.
*/
bufBlock = BufHdrGetBlock(buf);
......@@ -2835,7 +2835,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
/*
* Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
* end the io_in_progress state.
* end the BM_IO_IN_PROGRESS state.
*/
TerminateBufferIO(buf, true, 0);
......@@ -4271,7 +4271,7 @@ IsBufferCleanupOK(Buffer buffer)
* Functions for buffer I/O handling
*
* Note: We assume that nested buffer I/O never occurs.
* i.e at most one io_in_progress lock is held per proc.
* i.e at most one BM_IO_IN_PROGRESS bit is set per proc.
*
* Also note that these are used only for shared buffers, not local ones.
*/
......@@ -4282,13 +4282,9 @@ IsBufferCleanupOK(Buffer buffer)
static void
WaitIO(BufferDesc *buf)
{
/*
* Changed to wait until there's no IO - Inoue 01/13/2000
*
* Note this is *necessary* because an error abort in the process doing
* I/O could release the io_in_progress_lock prematurely. See
* AbortBufferIO.
*/
ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
ConditionVariablePrepareToSleep(cv);
for (;;)
{
uint32 buf_state;
......@@ -4303,9 +4299,9 @@ WaitIO(BufferDesc *buf)
if (!(buf_state & BM_IO_IN_PROGRESS))
break;
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
LWLockRelease(BufferDescriptorGetIOLock(buf));
ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
}
ConditionVariableCancelSleep();
}
/*
......@@ -4317,7 +4313,7 @@ WaitIO(BufferDesc *buf)
* In some scenarios there are race conditions in which multiple backends
* could attempt the same I/O operation concurrently. If someone else
* has already started I/O on this buffer then we will block on the
* io_in_progress lock until he's done.
* I/O condition variable until he's done.
*
* Input operations are only attempted on buffers that are not BM_VALID,
* and output operations only on buffers that are BM_VALID and BM_DIRTY,
......@@ -4335,25 +4331,11 @@ StartBufferIO(BufferDesc *buf, bool forInput)
for (;;)
{
/*
* Grab the io_in_progress lock so that other processes can wait for
* me to finish the I/O.
*/
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
buf_state = LockBufHdr(buf);
if (!(buf_state & BM_IO_IN_PROGRESS))
break;
/*
* The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
* lock isn't held is if the process doing the I/O is recovering from
* an error (see AbortBufferIO). If that's the case, we must wait for
* him to get unwedged.
*/
UnlockBufHdr(buf, buf_state);
LWLockRelease(BufferDescriptorGetIOLock(buf));
WaitIO(buf);
}
......@@ -4363,7 +4345,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
{
/* someone else already did the I/O */
UnlockBufHdr(buf, buf_state);
LWLockRelease(BufferDescriptorGetIOLock(buf));
return false;
}
......@@ -4381,7 +4362,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
* (Assumptions)
* My process is executing IO for the buffer
* BM_IO_IN_PROGRESS bit is set for the buffer
* We hold the buffer's io_in_progress lock
* The buffer is Pinned
*
* If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
......@@ -4413,7 +4393,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
InProgressBuf = NULL;
LWLockRelease(BufferDescriptorGetIOLock(buf));
ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
}
/*
......@@ -4434,14 +4414,6 @@ AbortBufferIO(void)
{
uint32 buf_state;
/*
* Since LWLockReleaseAll has already been called, we're not holding
* the buffer's io_in_progress_lock. We have to re-acquire it so that
* we can use TerminateBufferIO. Anyone who's executing WaitIO on the
* buffer will be in a busy spin until we succeed in doing this.
*/
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
buf_state = LockBufHdr(buf);
Assert(buf_state & BM_IO_IN_PROGRESS);
if (IsForInput)
......
......@@ -146,8 +146,6 @@ static const char *const BuiltinTrancheNames[] = {
"WALInsert",
/* LWTRANCHE_BUFFER_CONTENT: */
"BufferContent",
/* LWTRANCHE_BUFFER_IO: */
"BufferIO",
/* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
"ReplicationOriginState",
/* LWTRANCHE_REPLICATION_SLOT_IO: */
......@@ -469,8 +467,7 @@ CreateLWLocks(void)
StaticAssertStmt(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
"MAX_BACKENDS too big for lwlock.c");
StaticAssertStmt(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
StaticAssertStmt(sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
"Miscalculated LWLock padding");
if (!IsUnderPostmaster)
......
......@@ -971,6 +971,7 @@ typedef enum
WAIT_EVENT_BGWORKER_SHUTDOWN,
WAIT_EVENT_BGWORKER_STARTUP,
WAIT_EVENT_BTREE_PAGE,
WAIT_EVENT_BUFFER_IO,
WAIT_EVENT_CHECKPOINT_DONE,
WAIT_EVENT_CHECKPOINT_START,
WAIT_EVENT_EXECUTE_GATHER,
......
......@@ -18,6 +18,7 @@
#include "port/atomics.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/condition_variable.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
......@@ -184,7 +185,6 @@ typedef struct BufferDesc
int wait_backend_pid; /* backend PID of pin-count waiter */
int freeNext; /* link in freelist chain */
LWLock content_lock; /* to lock access to buffer contents */
} BufferDesc;
......@@ -221,12 +221,12 @@ typedef union BufferDescPadded
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
#define BufferDescriptorGetIOLock(bdesc) \
(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
#define BufferDescriptorGetIOCV(bdesc) \
(&(BufferIOCVArray[(bdesc)->buf_id]).cv)
#define BufferDescriptorGetContentLock(bdesc) \
((LWLock*) (&(bdesc)->content_lock))
extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
/*
* The freeNext field is either the index of the next freelist entry,
......
......@@ -31,6 +31,17 @@ typedef struct
proclist_head wakeup; /* list of wake-able processes */
} ConditionVariable;
/*
* Pad a condition variable to a power-of-two size so that an array of
* condition variables does not cross a cache line boundary.
*/
#define CV_MINIMAL_SIZE (sizeof(ConditionVariable) <= 16 ? 16 : 32)
typedef union ConditionVariableMinimallyPadded
{
ConditionVariable cv;
char pad[CV_MINIMAL_SIZE];
} ConditionVariableMinimallyPadded;
/* Initialize a condition variable. */
extern void ConditionVariableInit(ConditionVariable *cv);
......
......@@ -48,29 +48,8 @@ typedef struct LWLock
* even more padding so that each LWLock takes up an entire cache line; this is
* useful, for example, in the main LWLock array, where the overall number of
* locks is small but some are heavily contended.
*
* When allocating a tranche that contains data other than LWLocks, it is
* probably best to include a bare LWLock and then pad the resulting structure
* as necessary for performance. For an array that contains only LWLocks,
* LWLockMinimallyPadded can be used for cases where we just want to ensure
* that we don't cross cache line boundaries within a single lock, while
* LWLockPadded can be used for cases where we want each lock to be an entire
* cache line.
*
* An LWLockMinimallyPadded might contain more than the absolute minimum amount
* of padding required to keep a lock from crossing a cache line boundary,
* because an unpadded LWLock will normally fit into 16 bytes. We ignore that
* possibility when determining the minimal amount of padding. Older releases
* had larger LWLocks, so 32 really was the minimum, and packing them in
* tighter might hurt performance.
*
* LWLOCK_MINIMAL_SIZE should be 32 on basically all common platforms, but
* because pg_atomic_uint32 is more than 4 bytes on some obscure platforms, we
* allow for the possibility that it might be 64. Even on those platforms,
* we probably won't exceed 32 bytes unless LOCK_DEBUG is defined.
*/
#define LWLOCK_PADDED_SIZE PG_CACHE_LINE_SIZE
#define LWLOCK_MINIMAL_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
/* LWLock, padded to a full cache line size */
typedef union LWLockPadded
......@@ -79,13 +58,6 @@ typedef union LWLockPadded
char pad[LWLOCK_PADDED_SIZE];
} LWLockPadded;
/* LWLock, minimally padded */
typedef union LWLockMinimallyPadded
{
LWLock lock;
char pad[LWLOCK_MINIMAL_SIZE];
} LWLockMinimallyPadded;
extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
/* struct for storing named tranche information */
......@@ -202,7 +174,6 @@ typedef enum BuiltinTrancheIds
LWTRANCHE_SERIAL_BUFFER,
LWTRANCHE_WAL_INSERT,
LWTRANCHE_BUFFER_CONTENT,
LWTRANCHE_BUFFER_IO,
LWTRANCHE_REPLICATION_ORIGIN_STATE,
LWTRANCHE_REPLICATION_SLOT_IO,
LWTRANCHE_LOCK_FASTPATH,
......
......@@ -398,6 +398,7 @@ CompressionAlgorithm
CompressorState
ComputeXidHorizonsResult
ConditionVariable
ConditionVariableMinimallyPadded
ConditionalStack
ConfigData
ConfigVariable
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment