Move buffer I/O and content LWLocks out of the main tranche.

Move the content lock directly into the BufferDesc, so that locking and
pinning a buffer touches only one cache line rather than two.  Adjust
the definition of BufferDesc slightly so that this doesn't make the
BufferDesc any larger than one cache line (at least on platforms where
a spinlock is only 1 or 2 bytes).
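
For a rough sense of why this fits, the following is a minimal standalone sketch of the field-size arithmetic. It is not PostgreSQL code; the 24-byte LWLock (2-byte tranche id, 4-byte atomic state, 16-byte dlist_head, LOCK_DEBUG off) and the 1-byte slock_t on an LP64 platform are assumptions:

/*
 * Illustrative sketch only: mock types sized like the post-patch
 * BufferDesc on a common LP64 platform.  Assumptions: 4-byte Oid and
 * BlockNumber, 1-byte slock_t, 24-byte LWLock with LOCK_DEBUG off.
 */
#include <assert.h>
#include <stdint.h>

typedef struct MockBufferTag	/* RelFileNode (3 OIDs) + fork + block */
{
	uint32_t	spcNode, dbNode, relNode;
	int32_t		forkNum;
	uint32_t	blockNum;
} MockBufferTag;				/* 20 bytes */

typedef struct MockLWLock
{
	uint16_t	tranche;		/* 2 bytes (+ 2 bytes padding) */
	uint32_t	state;			/* 4 bytes */
	void	   *head, *tail;	/* dlist_head: 16 bytes on 64-bit */
} MockLWLock;					/* 24 bytes */

typedef struct MockBufferDesc
{
	MockBufferTag tag;			/* 20 */
	uint16_t	flags;			/* 2  */
	uint8_t		usage_count;	/* 1, was uint16 before the patch */
	uint8_t		buf_hdr_lock;	/* 1, slock_t moved up to fill the gap */
	unsigned	refcount;		/* 4  */
	int			wait_backend_pid;	/* 4 */
	int			buf_id;			/* 4  */
	int			freeNext;		/* 4  */
	MockLWLock	content_lock;	/* 24, embedded instead of a pointer */
} MockBufferDesc;				/* 20+2+1+1+4+4+4+4+24 = 64 bytes */

static_assert(sizeof(MockBufferDesc) <= 64,
			  "mock BufferDesc no longer fits in one 64-byte cache line");

int
main(void)
{
	return 0;
}

Under these assumptions, shrinking usage_count to uint8 and packing it next to the spinlock is what keeps the 8-byte-aligned embedded LWLock at offset 40 rather than 48, and hence the whole struct within 64 bytes.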

We can't fit the I/O locks into the BufferDesc and stay within one
cache line, so move those to a completely separate tranche.  This
leaves a relatively limited number of LWLocks in the main tranche, so
increase the padding of those remaining locks to a full cache line,
rather than allowing adjacent locks to share a cache line, hopefully
reducing false sharing.
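
The registration pattern for such a separate tranche appears in the buf_init.c hunk below; as a rough sketch of the same steps using this patch's LWLockTranche/LWLockRegisterTranche API, with hypothetical names (MyLockArray, MyLockTranche, my_tranche_id, nlocks):

/*
 * Sketch only: allocate and register a tranche of minimally padded
 * LWLocks, following the pattern used for the buffer I/O locks below.
 */
#include "postgres.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"

static LWLockMinimallyPadded *MyLockArray;
static LWLockTranche MyLockTranche;

static void
my_locks_init(int nlocks, int my_tranche_id)
{
	bool		found;

	/* Allocate the lock array in shared memory, cache-line aligned. */
	MyLockArray = (LWLockMinimallyPadded *)
		CACHELINEALIGN(ShmemInitStruct("My Locks",
									   nlocks * (Size) sizeof(LWLockMinimallyPadded)
									   + PG_CACHE_LINE_SIZE,
									   &found));

	/* Describe the array so LWLOCK_STATS / LOCK_DEBUG can name these locks. */
	MyLockTranche.name = "My Locks";
	MyLockTranche.array_base = MyLockArray;
	MyLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
	LWLockRegisterTranche(my_tranche_id, &MyLockTranche);

	if (!found)
	{
		int			i;

		for (i = 0; i < nlocks; i++)
			LWLockInitialize(&MyLockArray[i].lock, my_tranche_id);
	}
}

With minimal padding, each of these rarely contended locks occupies 32 bytes rather than a full cache line.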

Performance testing shows that these changes make little difference
on laptop-class machines, but help significantly on larger servers,
especially those with more than 2 sockets.

Andres Freund, originally based on an earlier patch by Simon Riggs.
Review and cosmetic adjustments (including heavy rewriting of the
comments) by me.
Robert Haas 2015-12-15 13:32:54 -05:00
parent 3fed417452
commit 6150a1b08a
6 changed files with 151 additions and 67 deletions

src/backend/storage/buffer/buf_init.c

@@ -20,6 +20,9 @@
BufferDescPadded *BufferDescriptors;
char *BufferBlocks;
LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
LWLockTranche BufferIOLWLockTranche;
LWLockTranche BufferContentLWLockTranche;
/*
@@ -65,22 +68,45 @@ void
InitBufferPool(void)
{
bool foundBufs,
foundDescs;
foundDescs,
foundIOLocks;
/* Align descriptors to a cacheline boundary. */
BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN(
ShmemInitStruct("Buffer Descriptors",
NBuffers * sizeof(BufferDescPadded) + PG_CACHE_LINE_SIZE,
&foundDescs));
BufferDescriptors = (BufferDescPadded *)
CACHELINEALIGN(
ShmemInitStruct("Buffer Descriptors",
NBuffers * sizeof(BufferDescPadded)
+ PG_CACHE_LINE_SIZE,
&foundDescs));
BufferBlocks = (char *)
ShmemInitStruct("Buffer Blocks",
NBuffers * (Size) BLCKSZ, &foundBufs);
if (foundDescs || foundBufs)
/* Align lwlocks to cacheline boundary */
BufferIOLWLockArray = (LWLockMinimallyPadded *)
CACHELINEALIGN(ShmemInitStruct("Buffer IO Locks",
NBuffers * (Size) sizeof(LWLockMinimallyPadded)
+ PG_CACHE_LINE_SIZE,
&foundIOLocks));
BufferIOLWLockTranche.name = "Buffer IO Locks";
BufferIOLWLockTranche.array_base = BufferIOLWLockArray;
BufferIOLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS,
&BufferIOLWLockTranche);
BufferContentLWLockTranche.name = "Buffer Content Locks";
BufferContentLWLockTranche.array_base =
((char *) BufferDescriptors) + offsetof(BufferDesc, content_lock);
BufferContentLWLockTranche.array_stride = sizeof(BufferDescPadded);
LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
&BufferContentLWLockTranche);
if (foundDescs || foundBufs || foundIOLocks)
{
/* both should be present or neither */
Assert(foundDescs && foundBufs);
/* should find all of these, or none of them */
Assert(foundDescs && foundBufs && foundIOLocks);
/* note: this path is only taken in EXEC_BACKEND case */
}
else
@@ -110,8 +136,11 @@ InitBufferPool(void)
*/
buf->freeNext = i + 1;
buf->io_in_progress_lock = LWLockAssign();
buf->content_lock = LWLockAssign();
LWLockInitialize(BufferDescriptorGetContentLock(buf),
LWTRANCHE_BUFFER_CONTENT);
LWLockInitialize(BufferDescriptorGetIOLock(buf),
LWTRANCHE_BUFFER_IO_IN_PROGRESS);
}
/* Correct last entry of linked list */
@@ -144,5 +173,17 @@ BufferShmemSize(void)
/* size of stuff controlled by freelist.c */
size = add_size(size, StrategyShmemSize());
/*
* It would be nice to include the I/O locks in the BufferDesc, but that
* would increase the size of a BufferDesc to more than one cache line, and
* benchmarking has shown that keeping every BufferDesc aligned on a cache
* line boundary is important for performance. So, instead, the array of
* I/O locks is allocated in a separate tranche. Because those locks are
not highly contended, we lay out the array with minimal padding.
*/
size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
/* to allow aligning the above */
size = add_size(size, PG_CACHE_LINE_SIZE);
return size;
}

src/backend/storage/buffer/bufmgr.c

@@ -738,7 +738,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
if (!isLocalBuf)
{
if (mode == RBM_ZERO_AND_LOCK)
LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
LW_EXCLUSIVE);
else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
}
@@ -879,7 +880,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
!isLocalBuf)
{
LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
}
if (isLocalBuf)
@@ -1045,7 +1046,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* happens to be trying to split the page the first one got from
* StrategyGetBuffer.)
*/
if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
LW_SHARED))
{
/*
* If using a nondefault strategy, and writing the buffer
@@ -1067,7 +1069,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
StrategyRejectBuffer(strategy, buf))
{
/* Drop lock/pin and loop around for another buffer */
LWLockRelease(buf->content_lock);
LWLockRelease(BufferDescriptorGetContentLock(buf));
UnpinBuffer(buf, true);
continue;
}
@@ -1080,7 +1082,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
smgr->smgr_rnode.node.relNode);
FlushBuffer(buf, NULL);
LWLockRelease(buf->content_lock);
LWLockRelease(BufferDescriptorGetContentLock(buf));
TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
@@ -1395,7 +1397,7 @@ MarkBufferDirty(Buffer buffer)
Assert(BufferIsPinned(buffer));
/* unfortunately we can't check if the lock is held exclusively */
Assert(LWLockHeldByMe(bufHdr->content_lock));
Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
LockBufHdr(bufHdr);
@@ -1595,8 +1597,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
if (ref->refcount == 0)
{
/* I'd better not still hold any locks on the buffer */
Assert(!LWLockHeldByMe(buf->content_lock));
Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
LockBufHdr(buf);
@@ -2116,11 +2118,11 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
* buffer is clean by the time we've locked it.)
*/
PinBuffer_Locked(bufHdr);
LWLockAcquire(bufHdr->content_lock, LW_SHARED);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, NULL);
LWLockRelease(bufHdr->content_lock);
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr, true);
return result | BUF_WRITTEN;
@@ -2926,9 +2928,9 @@ FlushRelationBuffers(Relation rel)
(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
PinBuffer_Locked(bufHdr);
LWLockAcquire(bufHdr->content_lock, LW_SHARED);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, rel->rd_smgr);
LWLockRelease(bufHdr->content_lock);
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr, true);
}
else
@@ -2978,9 +2980,9 @@ FlushDatabaseBuffers(Oid dbid)
(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
PinBuffer_Locked(bufHdr);
LWLockAcquire(bufHdr->content_lock, LW_SHARED);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, NULL);
LWLockRelease(bufHdr->content_lock);
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr, true);
}
else
@@ -3004,7 +3006,7 @@ FlushOneBuffer(Buffer buffer)
bufHdr = GetBufferDescriptor(buffer - 1);
Assert(LWLockHeldByMe(bufHdr->content_lock));
Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
FlushBuffer(bufHdr, NULL);
}
@@ -3101,7 +3103,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Assert(GetPrivateRefCount(buffer) > 0);
/* here, either share or exclusive lock is OK */
Assert(LWLockHeldByMe(bufHdr->content_lock));
Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
/*
* This routine might get called many times on the same page, if we are
@@ -3254,11 +3256,11 @@ LockBuffer(Buffer buffer, int mode)
buf = GetBufferDescriptor(buffer - 1);
if (mode == BUFFER_LOCK_UNLOCK)
LWLockRelease(buf->content_lock);
LWLockRelease(BufferDescriptorGetContentLock(buf));
else if (mode == BUFFER_LOCK_SHARE)
LWLockAcquire(buf->content_lock, LW_SHARED);
LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
else if (mode == BUFFER_LOCK_EXCLUSIVE)
LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
else
elog(ERROR, "unrecognized buffer lock mode: %d", mode);
}
@@ -3279,7 +3281,8 @@ ConditionalLockBuffer(Buffer buffer)
buf = GetBufferDescriptor(buffer - 1);
return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
LW_EXCLUSIVE);
}
/*
@@ -3489,8 +3492,8 @@ WaitIO(BufferDesc *buf)
UnlockBufHdr(buf);
if (!(sv_flags & BM_IO_IN_PROGRESS))
break;
LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
LWLockRelease(buf->io_in_progress_lock);
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
LWLockRelease(BufferDescriptorGetIOLock(buf));
}
}
@@ -3523,7 +3526,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
* Grab the io_in_progress lock so that other processes can wait for
* me to finish the I/O.
*/
LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
LockBufHdr(buf);
@@ -3537,7 +3540,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
* him to get unwedged.
*/
UnlockBufHdr(buf);
LWLockRelease(buf->io_in_progress_lock);
LWLockRelease(BufferDescriptorGetIOLock(buf));
WaitIO(buf);
}
@@ -3547,7 +3550,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
{
/* someone else already did the I/O */
UnlockBufHdr(buf);
LWLockRelease(buf->io_in_progress_lock);
LWLockRelease(BufferDescriptorGetIOLock(buf));
return false;
}
@@ -3595,7 +3598,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
InProgressBuf = NULL;
LWLockRelease(buf->io_in_progress_lock);
LWLockRelease(BufferDescriptorGetIOLock(buf));
}
/*
@@ -3620,7 +3623,7 @@ AbortBufferIO(void)
* we can use TerminateBufferIO. Anyone who's executing WaitIO on the
* buffer will be in a busy spin until we succeed in doing this.
*/
LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
LockBufHdr(buf);
Assert(buf->flags & BM_IO_IN_PROGRESS);

src/backend/storage/lmgr/lwlock.c

@@ -344,18 +344,15 @@ NumLWLocks(void)
int numLocks;
/*
* Possibly this logic should be spread out among the affected modules,
* the same way that shmem space estimation is done. But for now, there
* are few enough users of LWLocks that we can get away with just keeping
* the knowledge here.
* Many users of LWLocks no longer reserve space in the main array here,
* but instead allocate separate tranches. The latter approach has the
* advantage of allowing LWLOCK_STATS and LOCK_DEBUG output to produce
* more useful output.
*/
/* Predefined LWLocks */
numLocks = NUM_FIXED_LWLOCKS;
/* bufmgr.c needs two for each shared buffer */
numLocks += 2 * NBuffers;
/* proc.c needs one for each backend or auxiliary process */
numLocks += MaxBackends + NUM_AUXILIARY_PROCS;
@@ -423,6 +420,10 @@ CreateLWLocks(void)
StaticAssertExpr(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
"MAX_BACKENDS too big for lwlock.c");
StaticAssertExpr(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
"Miscalculated LWLock padding");
if (!IsUnderPostmaster)
{
int numLocks = NumLWLocks();

src/include/storage/buf_internals.h

@@ -115,8 +115,8 @@ typedef struct buftag
* Note: buf_hdr_lock must be held to examine or change the tag, flags,
* usage_count, refcount, or wait_backend_pid fields. buf_id field never
* changes after initialization, so does not need locking. freeNext is
* protected by the buffer_strategy_lock not buf_hdr_lock. The LWLocks can take
* care of themselves. The buf_hdr_lock is *not* used to control access to
* protected by the buffer_strategy_lock not buf_hdr_lock. The LWLock can
* take care of itself. The buf_hdr_lock is *not* used to control access to
* the data in the buffer!
*
* An exception is that if we have the buffer pinned, its tag can't change
@@ -133,22 +133,24 @@ typedef struct buftag
*
* We use this same struct for local buffer headers, but the lock fields
* are not used and not all of the flag bits are useful either.
*
* Be careful to avoid increasing the size of the struct when adding or
* reordering members. Keeping it below 64 bytes (the most common CPU
* cache line size) is fairly important for performance.
*/
typedef struct BufferDesc
{
BufferTag tag; /* ID of page contained in buffer */
BufFlags flags; /* see bit definitions above */
uint16 usage_count; /* usage counter for clock sweep code */
uint8 usage_count; /* usage counter for clock sweep code */
slock_t buf_hdr_lock; /* protects a subset of fields, see above */
unsigned refcount; /* # of backends holding pins on buffer */
int wait_backend_pid; /* backend PID of pin-count waiter */
slock_t buf_hdr_lock; /* protects the above fields */
int buf_id; /* buffer's index number (from 0) */
int freeNext; /* link in freelist chain */
LWLock *io_in_progress_lock; /* to wait for I/O to complete */
LWLock *content_lock; /* to lock access to buffer contents */
LWLock content_lock; /* to lock access to buffer contents */
} BufferDesc;
/*
@@ -184,6 +186,13 @@ typedef union BufferDescPadded
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
#define BufferDescriptorGetIOLock(bdesc) \
(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
#define BufferDescriptorGetContentLock(bdesc) \
((LWLock*) (&(bdesc)->content_lock))
extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
/*
* The freeNext field is either the index of the next freelist entry,
* or one of these special values:

src/include/storage/lwlock.h

@@ -25,6 +25,12 @@
struct PGPROC;
/*
* Prior to PostgreSQL 9.4, every lightweight lock in the system was stored
* in a single array. For convenience and for compatibility with past
* releases, we still have a main array, but it's now also permissible to
* store LWLocks elsewhere in the main shared memory segment or in a dynamic
* shared memory segment. Each array of lwlocks forms a separate "tranche".
*
* It's occasionally necessary to identify a particular LWLock "by name"; e.g.
* because we wish to report the lock to dtrace. We could store a name or
* other identifying information in the lock itself, but since it's common
@@ -65,30 +71,51 @@ typedef struct LWLock
} LWLock;
/*
* Prior to PostgreSQL 9.4, every lightweight lock in the system was stored
* in a single array. For convenience and for compatibility with past
* releases, we still have a main array, but it's now also permissible to
* store LWLocks elsewhere in the main shared memory segment or in a dynamic
* shared memory segment. In the main array, we force the array stride to
* be a power of 2, which saves a few cycles in indexing, but more importantly
* also ensures that individual LWLocks don't cross cache line boundaries.
* This reduces cache contention problems, especially on AMD Opterons.
* (Of course, we have to also ensure that the array start address is suitably
* aligned.)
* In most cases, it's desirable to force each tranche of LWLocks to be aligned
* on a cache line boundary and make the array stride a power of 2. This saves
* a few cycles in indexing, but more importantly ensures that individual
* LWLocks don't cross cache line boundaries. This reduces cache contention
* problems, especially on AMD Opterons. In some cases, it's useful to add
* even more padding so that each LWLock takes up an entire cache line; this is
* useful, for example, in the main LWLock array, where the overall number of
* locks is small but some are heavily contended.
*
* On a 32-bit platforms a LWLock will these days fit into 16 bytes, but since
* that didn't use to be the case and cramming more lwlocks into a cacheline
* might be detrimental performancewise we still use 32 byte alignment
* there. So, both on 32 and 64 bit platforms, it should fit into 32 bytes
* unless slock_t is really big. We allow for that just in case.
* When allocating a tranche that contains data other than LWLocks, it is
* probably best to include a bare LWLock and then pad the resulting structure
* as necessary for performance. For an array that contains only LWLocks,
* LWLockMinimallyPadded can be used for cases where we just want to ensure
* that we don't cross cache line boundaries within a single lock, while
* LWLockPadded can be used for cases where we want each lock to be an entire
* cache line.
*
* On 32-bit platforms, an LWLockMinimallyPadded might actually contain more
* than the absolute minimum amount of padding required to keep a lock from
* crossing a cache line boundary, because an unpadded LWLock might fit into
* 16 bytes. We ignore that possibility when determining the minimal amount
* of padding. Older releases had larger LWLocks, so 32 really was the
* minimum, and packing them in tighter might hurt performance.
*
* LWLOCK_MINIMAL_SIZE should be 32 on basically all common platforms, but
* because slock_t is more than 2 bytes on some obscure platforms, we allow
* for the possibility that it might be 64.
*/
#define LWLOCK_PADDED_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
#define LWLOCK_PADDED_SIZE PG_CACHE_LINE_SIZE
#define LWLOCK_MINIMAL_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
/* LWLock, padded to a full cache line size */
typedef union LWLockPadded
{
LWLock lock;
char pad[LWLOCK_PADDED_SIZE];
} LWLockPadded;
/* LWLock, minimally padded */
typedef union LWLockMinimallyPadded
{
LWLock lock;
char pad[LWLOCK_MINIMAL_SIZE];
} LWLockMinimallyPadded;
extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
extern char *MainLWLockNames[];
@@ -184,6 +211,8 @@ typedef enum BuiltinTrancheIds
{
LWTRANCHE_MAIN,
LWTRANCHE_WAL_INSERT,
LWTRANCHE_BUFFER_CONTENT,
LWTRANCHE_BUFFER_IO_IN_PROGRESS,
LWTRANCHE_FIRST_USER_DEFINED
} BuiltinTrancheIds;

src/tools/pgindent/typedefs.list

@@ -1008,6 +1008,7 @@ LSEG
LVRelStats
LWLock
LWLockHandle
LWLockMinimallyPadded
LWLockMode
LWLockPadded
LWLockTranche