Commit 6150a1b0 authored by Robert Haas

Move buffer I/O and content LWLocks out of the main tranche.

Move the content lock directly into the BufferDesc, so that locking and
pinning a buffer touches only one cache line rather than two.  Adjust
the definition of BufferDesc slightly so that this doesn't make the
BufferDesc any larger than one cache line (at least on platforms where
a spinlock is only 1 or 2 bytes).

We can't fit the I/O locks into the BufferDesc and stay within one
cache line, so move those to a completely separate tranche.  This
leaves a relatively limited number of LWLocks in the main tranche, so
increase the padding of those remaining locks to a full cache line,
rather than allowing adjacent locks to share a cache line, hopefully
reducing false sharing.

Performance testing shows that these changes make little difference
on laptop-class machines, but help significantly on larger servers,
especially those with more than 2 sockets.

Andres Freund, originally based on an earlier patch by Simon Riggs.
Review and cosmetic adjustments (including heavy rewriting of the
comments) by me.
parent 3fed4174
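
The BufferDescPadded union that the hunks below rely on uses a standard padding idiom: wrap the struct in a union with a fixed-size char array so that every array element occupies exactly one cache line. A minimal standalone sketch of the idiom (the names, field sizes, and the 64-byte line size are illustrative assumptions, not PostgreSQL source):

#include <assert.h>

#define CACHE_LINE_SIZE 64          /* assumed line size; common on x86-64 */

typedef struct Descriptor
{
    int     id;
    char    payload[40];            /* hypothetical fields, total < 64 bytes */
} Descriptor;

typedef union DescriptorPadded
{
    Descriptor  desc;
    char        pad[CACHE_LINE_SIZE];   /* forces sizeof() up to 64 */
} DescriptorPadded;

int
main(void)
{
    /*
     * Elements of a DescriptorPadded[] array start at 64-byte strides, so
     * two adjacent descriptors never share a cache line, provided the base
     * address itself is cache-line aligned (cf. CACHELINEALIGN below).
     */
    assert(sizeof(DescriptorPadded) == CACHE_LINE_SIZE);
    return 0;
}
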
@@ -20,6 +20,9 @@

 BufferDescPadded *BufferDescriptors;
 char       *BufferBlocks;
+LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
+LWLockTranche BufferIOLWLockTranche;
+LWLockTranche BufferContentLWLockTranche;

 /*
@@ -65,22 +68,45 @@ void
 InitBufferPool(void)
 {
     bool        foundBufs,
-                foundDescs;
+                foundDescs,
+                foundIOLocks;

     /* Align descriptors to a cacheline boundary. */
-    BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN(
-                        ShmemInitStruct("Buffer Descriptors",
-                            NBuffers * sizeof(BufferDescPadded) + PG_CACHE_LINE_SIZE,
-                            &foundDescs));
+    BufferDescriptors = (BufferDescPadded *)
+        CACHELINEALIGN(
+                       ShmemInitStruct("Buffer Descriptors",
+                                       NBuffers * sizeof(BufferDescPadded)
+                                       + PG_CACHE_LINE_SIZE,
+                                       &foundDescs));

     BufferBlocks = (char *)
         ShmemInitStruct("Buffer Blocks",
                         NBuffers * (Size) BLCKSZ, &foundBufs);

-    if (foundDescs || foundBufs)
+    /* Align lwlocks to cacheline boundary */
+    BufferIOLWLockArray = (LWLockMinimallyPadded *)
+        CACHELINEALIGN(ShmemInitStruct("Buffer IO Locks",
+                                       NBuffers * (Size) sizeof(LWLockMinimallyPadded)
+                                       + PG_CACHE_LINE_SIZE,
+                                       &foundIOLocks));
+
+    BufferIOLWLockTranche.name = "Buffer IO Locks";
+    BufferIOLWLockTranche.array_base = BufferIOLWLockArray;
+    BufferIOLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
+    LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS,
+                          &BufferIOLWLockTranche);
+
+    BufferContentLWLockTranche.name = "Buffer Content Locks";
+    BufferContentLWLockTranche.array_base =
+        ((char *) BufferDescriptors) + offsetof(BufferDesc, content_lock);
+    BufferContentLWLockTranche.array_stride = sizeof(BufferDescPadded);
+    LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
+                          &BufferContentLWLockTranche);
+
+    if (foundDescs || foundBufs || foundIOLocks)
     {
-        /* both should be present or neither */
-        Assert(foundDescs && foundBufs);
+        /* should find all of these, or none of them */
+        Assert(foundDescs && foundBufs && foundIOLocks);
         /* note: this path is only taken in EXEC_BACKEND case */
     }
     else
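
The tranche API this hunk uses (LWLockTranche, LWLockRegisterTranche, LWLockInitialize) can serve any array of locks. A hedged sketch of a hypothetical extension registering its own minimally padded tranche with the same pre-9.6 API; MY_NLOCKS, MyLockArray, and its shared-memory allocation are assumptions:

#define MY_NLOCKS 16                        /* hypothetical lock count */

static LWLockMinimallyPadded *MyLockArray;  /* assumed allocated in shmem */
static LWLockTranche MyLockTranche;

static void
register_my_lock_tranche(void)
{
    int         tranche_id = LWLockNewTrancheId();
    int         i;

    MyLockTranche.name = "My Locks";
    MyLockTranche.array_base = MyLockArray;
    MyLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
    LWLockRegisterTranche(tranche_id, &MyLockTranche);

    for (i = 0; i < MY_NLOCKS; i++)
        LWLockInitialize(&MyLockArray[i].lock, tranche_id);
}
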
@@ -110,8 +136,11 @@ InitBufferPool(void)
          */
         buf->freeNext = i + 1;

-        buf->io_in_progress_lock = LWLockAssign();
-        buf->content_lock = LWLockAssign();
+        LWLockInitialize(BufferDescriptorGetContentLock(buf),
+                         LWTRANCHE_BUFFER_CONTENT);
+
+        LWLockInitialize(BufferDescriptorGetIOLock(buf),
+                         LWTRANCHE_BUFFER_IO_IN_PROGRESS);
     }

     /* Correct last entry of linked list */
@@ -144,5 +173,17 @@ BufferShmemSize(void)
     /* size of stuff controlled by freelist.c */
     size = add_size(size, StrategyShmemSize());

+    /*
+     * It would be nice to include the I/O locks in the BufferDesc, but that
+     * would increase the size of a BufferDesc to more than one cache line,
+     * and benchmarking has shown that keeping every BufferDesc aligned on a
+     * cache line boundary is important for performance.  So, instead, the
+     * array of I/O locks is allocated in a separate tranche.  Because those
+     * locks are not highly contended, we lay out the array with minimal
+     * padding.
+     */
+    size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
+    /* to allow aligning the above */
+    size = add_size(size, PG_CACHE_LINE_SIZE);
+
     return size;
 }
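
The sizing code added above follows the usual shared-memory estimation pattern: overflow-checked arithmetic via add_size()/mul_size(), plus one extra cache line of slack so the eventual allocation can be rounded up with CACHELINEALIGN() without overrunning. Condensed into a sketch (my_lock_array_size is a hypothetical helper, not from the commit):

/* Sketch: estimate space for nlocks minimally padded LWLocks, plus
 * alignment slack so CACHELINEALIGN() cannot run past the allocation. */
static Size
my_lock_array_size(int nlocks)
{
    Size        size = 0;

    size = add_size(size, mul_size(nlocks, sizeof(LWLockMinimallyPadded)));
    size = add_size(size, PG_CACHE_LINE_SIZE);
    return size;
}
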
@@ -738,7 +738,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     if (!isLocalBuf)
     {
         if (mode == RBM_ZERO_AND_LOCK)
-            LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+            LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+                          LW_EXCLUSIVE);
         else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
             LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
     }
@@ -879,7 +880,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
         !isLocalBuf)
     {
-        LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+        LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
     }

     if (isLocalBuf)
@@ -1045,7 +1046,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
          * happens to be trying to split the page the first one got from
          * StrategyGetBuffer.)
          */
-        if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
+        if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+                                     LW_SHARED))
         {
             /*
              * If using a nondefault strategy, and writing the buffer
@@ -1067,7 +1069,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                 StrategyRejectBuffer(strategy, buf))
             {
                 /* Drop lock/pin and loop around for another buffer */
-                LWLockRelease(buf->content_lock);
+                LWLockRelease(BufferDescriptorGetContentLock(buf));
                 UnpinBuffer(buf, true);
                 continue;
             }
@@ -1080,7 +1082,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                                               smgr->smgr_rnode.node.relNode);

             FlushBuffer(buf, NULL);
-            LWLockRelease(buf->content_lock);
+            LWLockRelease(BufferDescriptorGetContentLock(buf));

             TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
                                                smgr->smgr_rnode.node.spcNode,
@@ -1395,7 +1397,7 @@ MarkBufferDirty(Buffer buffer)
     Assert(BufferIsPinned(buffer));
     /* unfortunately we can't check if the lock is held exclusively */
-    Assert(LWLockHeldByMe(bufHdr->content_lock));
+    Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));

     LockBufHdr(bufHdr);
@@ -1595,8 +1597,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
     if (ref->refcount == 0)
     {
         /* I'd better not still hold any locks on the buffer */
-        Assert(!LWLockHeldByMe(buf->content_lock));
-        Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
+        Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+        Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));

         LockBufHdr(buf);
@@ -2116,11 +2118,11 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
      * buffer is clean by the time we've locked it.)
      */
     PinBuffer_Locked(bufHdr);
-    LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+    LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);

     FlushBuffer(bufHdr, NULL);

-    LWLockRelease(bufHdr->content_lock);
+    LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
     UnpinBuffer(bufHdr, true);

     return result | BUF_WRITTEN;
@@ -2926,9 +2928,9 @@ FlushRelationBuffers(Relation rel)
             (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
         {
             PinBuffer_Locked(bufHdr);
-            LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+            LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
             FlushBuffer(bufHdr, rel->rd_smgr);
-            LWLockRelease(bufHdr->content_lock);
+            LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr, true);
         }
         else
@@ -2978,9 +2980,9 @@ FlushDatabaseBuffers(Oid dbid)
             (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
         {
             PinBuffer_Locked(bufHdr);
-            LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+            LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
             FlushBuffer(bufHdr, NULL);
-            LWLockRelease(bufHdr->content_lock);
+            LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
             UnpinBuffer(bufHdr, true);
         }
         else
@@ -3004,7 +3006,7 @@ FlushOneBuffer(Buffer buffer)
     bufHdr = GetBufferDescriptor(buffer - 1);

-    Assert(LWLockHeldByMe(bufHdr->content_lock));
+    Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));

     FlushBuffer(bufHdr, NULL);
 }
@@ -3101,7 +3103,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
         Assert(GetPrivateRefCount(buffer) > 0);
         /* here, either share or exclusive lock is OK */
-        Assert(LWLockHeldByMe(bufHdr->content_lock));
+        Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));

         /*
          * This routine might get called many times on the same page, if we are
@@ -3254,11 +3256,11 @@ LockBuffer(Buffer buffer, int mode)
     buf = GetBufferDescriptor(buffer - 1);

     if (mode == BUFFER_LOCK_UNLOCK)
-        LWLockRelease(buf->content_lock);
+        LWLockRelease(BufferDescriptorGetContentLock(buf));
     else if (mode == BUFFER_LOCK_SHARE)
-        LWLockAcquire(buf->content_lock, LW_SHARED);
+        LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
     else if (mode == BUFFER_LOCK_EXCLUSIVE)
-        LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
+        LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
     else
         elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -3279,7 +3281,8 @@ ConditionalLockBuffer(Buffer buffer)
     buf = GetBufferDescriptor(buffer - 1);

-    return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+    return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+                                    LW_EXCLUSIVE);
 }

 /*
@@ -3489,8 +3492,8 @@ WaitIO(BufferDesc *buf)
         UnlockBufHdr(buf);
         if (!(sv_flags & BM_IO_IN_PROGRESS))
             break;
-        LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
-        LWLockRelease(buf->io_in_progress_lock);
+        LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
+        LWLockRelease(BufferDescriptorGetIOLock(buf));
     }
 }
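
WaitIO() uses the I/O lock purely as a wait primitive: the backend performing the I/O holds it exclusively for the duration, and anyone needing to wait acquires it in shared mode and immediately releases it. A simplified schematic of that idiom, not the real WaitIO() (which also rechecks BM_IO_IN_PROGRESS in a loop); do_io() and both function names are hypothetical:

/* Schematic of the wait-via-lock idiom used by WaitIO(). */
static void
io_worker(LWLock *io_lock)
{
    LWLockAcquire(io_lock, LW_EXCLUSIVE);   /* announce: I/O in progress */
    do_io();
    LWLockRelease(io_lock);                 /* release wakes all waiters */
}

static void
io_waiter(LWLock *io_lock)
{
    LWLockAcquire(io_lock, LW_SHARED);      /* blocks until the worker is done */
    LWLockRelease(io_lock);                 /* nothing to hold; we only waited */
}
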
@@ -3523,7 +3526,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
          * Grab the io_in_progress lock so that other processes can wait for
          * me to finish the I/O.
          */
-        LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+        LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);

         LockBufHdr(buf);
@@ -3537,7 +3540,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
          * him to get unwedged.
          */
         UnlockBufHdr(buf);
-        LWLockRelease(buf->io_in_progress_lock);
+        LWLockRelease(BufferDescriptorGetIOLock(buf));
         WaitIO(buf);
     }
@@ -3547,7 +3550,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
     {
         /* someone else already did the I/O */
         UnlockBufHdr(buf);
-        LWLockRelease(buf->io_in_progress_lock);
+        LWLockRelease(BufferDescriptorGetIOLock(buf));
         return false;
     }
@@ -3595,7 +3598,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
     InProgressBuf = NULL;

-    LWLockRelease(buf->io_in_progress_lock);
+    LWLockRelease(BufferDescriptorGetIOLock(buf));
 }

 /*
@@ -3620,7 +3623,7 @@ AbortBufferIO(void)
          * we can use TerminateBufferIO.  Anyone who's executing WaitIO on the
          * buffer will be in a busy spin until we succeed in doing this.
          */
-        LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+        LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);

         LockBufHdr(buf);
         Assert(buf->flags & BM_IO_IN_PROGRESS);
...
@@ -344,18 +344,15 @@ NumLWLocks(void)
     int         numLocks;

     /*
-     * Possibly this logic should be spread out among the affected modules,
-     * the same way that shmem space estimation is done.  But for now, there
-     * are few enough users of LWLocks that we can get away with just keeping
-     * the knowledge here.
+     * Many users of LWLocks no longer reserve space in the main array here,
+     * but instead allocate separate tranches.  The latter approach has the
+     * advantage of allowing LWLOCK_STATS and LOCK_DEBUG output to produce
+     * more useful output.
      */

     /* Predefined LWLocks */
     numLocks = NUM_FIXED_LWLOCKS;

-    /* bufmgr.c needs two for each shared buffer */
-    numLocks += 2 * NBuffers;
-
     /* proc.c needs one for each backend or auxiliary process */
     numLocks += MaxBackends + NUM_AUXILIARY_PROCS;
@@ -423,6 +420,10 @@ CreateLWLocks(void)
     StaticAssertExpr(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
                      "MAX_BACKENDS too big for lwlock.c");

+    StaticAssertExpr(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
+                     sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
+                     "Miscalculated LWLock padding");
+
     if (!IsUnderPostmaster)
     {
         int         numLocks = NumLWLocks();
...
@@ -115,8 +115,8 @@ typedef struct buftag
  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
  * changes after initialization, so does not need locking.  freeNext is
- * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLocks can take
- * care of themselves.  The buf_hdr_lock is *not* used to control access to
+ * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
+ * take care of itself.  The buf_hdr_lock is *not* used to control access to
  * the data in the buffer!
  *
  * An exception is that if we have the buffer pinned, its tag can't change
@@ -133,22 +133,24 @@ typedef struct buftag
  *
  * We use this same struct for local buffer headers, but the lock fields
  * are not used and not all of the flag bits are useful either.
+ *
+ * Be careful to avoid increasing the size of the struct when adding or
+ * reordering members.  Keeping it below 64 bytes (the most common CPU
+ * cache line size) is fairly important for performance.
  */
 typedef struct BufferDesc
 {
     BufferTag   tag;            /* ID of page contained in buffer */
     BufFlags    flags;          /* see bit definitions above */
-    uint16      usage_count;    /* usage counter for clock sweep code */
+    uint8       usage_count;    /* usage counter for clock sweep code */
+    slock_t     buf_hdr_lock;   /* protects a subset of fields, see above */
     unsigned    refcount;       /* # of backends holding pins on buffer */
     int         wait_backend_pid;       /* backend PID of pin-count waiter */

-    slock_t     buf_hdr_lock;   /* protects the above fields */
-
     int         buf_id;         /* buffer's index number (from 0) */
     int         freeNext;       /* link in freelist chain */

-    LWLock     *io_in_progress_lock;    /* to wait for I/O to complete */
-    LWLock     *content_lock;   /* to lock access to buffer contents */
+    LWLock      content_lock;   /* to lock access to buffer contents */
 } BufferDesc;

 /*
@@ -184,6 +186,13 @@ typedef union BufferDescPadded

 #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)

+#define BufferDescriptorGetIOLock(bdesc) \
+    (&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
+#define BufferDescriptorGetContentLock(bdesc) \
+    ((LWLock*) (&(bdesc)->content_lock))
+
+extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
+
 /*
  * The freeNext field is either the index of the next freelist entry,
  * or one of these special values:
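
Embedding content_lock in the descriptor works because tranche bookkeeping identifies a lock only by pointer arithmetic against array_base and array_stride: with array_base pointing at descriptor 0's content_lock and a stride of sizeof(BufferDescPadded), each embedded lock still maps back to its buffer index. A simplified sketch of that mapping, modeled on lwlock.c's T_ID logic of this era (the helper itself is illustrative):

/* Sketch: recover a lock's index within its tranche.  Works for the
 * embedded content locks because they recur at BufferDescPadded strides. */
static int
tranche_lock_index(const LWLockTranche *tranche, const LWLock *lock)
{
    return (int) (((const char *) lock - (const char *) tranche->array_base) /
                  tranche->array_stride);
}
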
...
@@ -25,6 +25,12 @@
 struct PGPROC;

 /*
+ * Prior to PostgreSQL 9.4, every lightweight lock in the system was stored
+ * in a single array.  For convenience and for compatibility with past
+ * releases, we still have a main array, but it's now also permissible to
+ * store LWLocks elsewhere in the main shared memory segment or in a dynamic
+ * shared memory segment.  Each array of lwlocks forms a separate "tranche".
+ *
  * It's occasionally necessary to identify a particular LWLock "by name"; e.g.
  * because we wish to report the lock to dtrace.  We could store a name or
  * other identifying information in the lock itself, but since it's common
@@ -65,30 +71,51 @@ typedef struct LWLock
 } LWLock;

 /*
- * Prior to PostgreSQL 9.4, every lightweight lock in the system was stored
- * in a single array.  For convenience and for compatibility with past
- * releases, we still have a main array, but it's now also permissible to
- * store LWLocks elsewhere in the main shared memory segment or in a dynamic
- * shared memory segment.  In the main array, we force the array stride to
- * be a power of 2, which saves a few cycles in indexing, but more importantly
- * also ensures that individual LWLocks don't cross cache line boundaries.
- * This reduces cache contention problems, especially on AMD Opterons.
- * (Of course, we have to also ensure that the array start address is suitably
- * aligned.)
+ * In most cases, it's desirable to force each tranche of LWLocks to be aligned
+ * on a cache line boundary and make the array stride a power of 2.  This saves
+ * a few cycles in indexing, but more importantly ensures that individual
+ * LWLocks don't cross cache line boundaries.  This reduces cache contention
+ * problems, especially on AMD Opterons.  In some cases, it's useful to add
+ * even more padding so that each LWLock takes up an entire cache line; this is
+ * useful, for example, in the main LWLock array, where the overall number of
+ * locks is small but some are heavily contended.
+ *
+ * When allocating a tranche that contains data other than LWLocks, it is
+ * probably best to include a bare LWLock and then pad the resulting structure
+ * as necessary for performance.  For an array that contains only LWLocks,
+ * LWLockMinimallyPadded can be used for cases where we just want to ensure
+ * that we don't cross cache line boundaries within a single lock, while
+ * LWLockPadded can be used for cases where we want each lock to be an entire
+ * cache line.
  *
- * On a 32-bit platforms a LWLock will these days fit into 16 bytes, but since
- * that didn't use to be the case and cramming more lwlocks into a cacheline
- * might be detrimental performancewise we still use 32 byte alignment
- * there.  So, both on 32 and 64 bit platforms, it should fit into 32 bytes
- * unless slock_t is really big.  We allow for that just in case.
+ * On 32-bit platforms, an LWLockMinimallyPadded might actually contain more
+ * than the absolute minimum amount of padding required to keep a lock from
+ * crossing a cache line boundary, because an unpadded LWLock might fit into
+ * 16 bytes.  We ignore that possibility when determining the minimal amount
+ * of padding.  Older releases had larger LWLocks, so 32 really was the
+ * minimum, and packing them in tighter might hurt performance.
+ *
+ * LWLOCK_MINIMAL_SIZE should be 32 on basically all common platforms, but
+ * because slock_t is more than 2 bytes on some obscure platforms, we allow
+ * for the possibility that it might be 64.
  */
-#define LWLOCK_PADDED_SIZE  (sizeof(LWLock) <= 32 ? 32 : 64)
+#define LWLOCK_PADDED_SIZE  PG_CACHE_LINE_SIZE
+#define LWLOCK_MINIMAL_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)

+/* LWLock, padded to a full cache line size */
 typedef union LWLockPadded
 {
     LWLock      lock;
     char        pad[LWLOCK_PADDED_SIZE];
 } LWLockPadded;
+
+/* LWLock, minimally padded */
+typedef union LWLockMinimallyPadded
+{
+    LWLock      lock;
+    char        pad[LWLOCK_MINIMAL_SIZE];
+} LWLockMinimallyPadded;
+
 extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
 extern char *MainLWLockNames[];
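
The LWLOCK_MINIMAL_SIZE rule above — pad to 32 bytes when the lock fits, else to 64 — is easy to observe in isolation. A runnable standalone sketch with a stand-in lock type (FakeLWLock and its 24-byte body are assumptions for illustration, not the real LWLock layout):

#include <stdio.h>

typedef struct FakeLWLock
{
    char        state[24];      /* stand-in for LWLock's real members */
} FakeLWLock;

/* Same selection rule as LWLOCK_MINIMAL_SIZE above */
#define FAKE_MINIMAL_SIZE (sizeof(FakeLWLock) <= 32 ? 32 : 64)

typedef union FakeLWLockMinimallyPadded
{
    FakeLWLock  lock;
    char        pad[FAKE_MINIMAL_SIZE];
} FakeLWLockMinimallyPadded;

int
main(void)
{
    /* prints 32: a 24-byte lock is padded up to the 32-byte stride */
    printf("stride: %zu bytes\n", sizeof(FakeLWLockMinimallyPadded));
    return 0;
}
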
@@ -184,6 +211,8 @@ typedef enum BuiltinTrancheIds
 {
     LWTRANCHE_MAIN,
     LWTRANCHE_WAL_INSERT,
+    LWTRANCHE_BUFFER_CONTENT,
+    LWTRANCHE_BUFFER_IO_IN_PROGRESS,
     LWTRANCHE_FIRST_USER_DEFINED
 } BuiltinTrancheIds;
...
@@ -1008,6 +1008,7 @@ LSEG
 LVRelStats
 LWLock
 LWLockHandle
+LWLockMinimallyPadded
 LWLockMode
 LWLockPadded
 LWLockTranche
...