diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 3335d71eba..1ba813bbb9 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -1586,6 +1586,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
       <entry>Waiting for the page number needed to continue a parallel B-tree
        scan to become available.</entry>
      </row>
+     <row>
+      <entry><literal>BufferIO</literal></entry>
+      <entry>Waiting for buffer I/O to complete.</entry>
+     </row>
      <row>
       <entry><literal>CheckpointDone</literal></entry>
       <entry>Waiting for a checkpoint to complete.</entry>
@@ -1876,10 +1880,6 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
       <entry><literal>BufferContent</literal></entry>
       <entry>Waiting to access a data page in memory.</entry>
      </row>
-     <row>
-      <entry><literal>BufferIO</literal></entry>
-      <entry>Waiting for I/O on a data page.</entry>
-     </row>
      <row>
       <entry><literal>BufferMapping</literal></entry>
       <entry>Waiting to associate a data block with a buffer in the buffer
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 05d5ce0064..68eefb9722 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4010,6 +4010,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
 		case WAIT_EVENT_BTREE_PAGE:
 			event_name = "BtreePage";
 			break;
+		case WAIT_EVENT_BUFFER_IO:
+			event_name = "BufferIO";
+			break;
 		case WAIT_EVENT_CHECKPOINT_DONE:
 			event_name = "CheckpointDone";
 			break;
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index bc80777bb1..a775276ff2 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -145,14 +145,11 @@ held within the buffer.  Each buffer header also contains an LWLock, the
 "buffer content lock", that *does* represent the right to access the data
 in the buffer.  It is used per the rules above.
 
-There is yet another set of per-buffer LWLocks, the io_in_progress locks,
-that are used to wait for I/O on a buffer to complete.  The process doing
-a read or write takes exclusive lock for the duration, and processes that
-need to wait for completion try to take shared locks (which they release
-immediately upon obtaining).  XXX on systems where an LWLock represents
-nontrivial resources, it's fairly annoying to need so many locks.  Possibly
-we could use per-backend LWLocks instead (a buffer header would then contain
-a field to show which backend is doing its I/O).
+The BM_IO_IN_PROGRESS flag acts as a kind of lock, used to wait for I/O on a
+buffer to complete (and in releases before 14, it was accompanied by a
+per-buffer LWLock).  The process doing a read or write sets the flag for the
+duration, and processes that need to wait for it to be cleared sleep on a
+condition variable.
 
 
 Normal Buffer Replacement Strategy
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index e9e4f35bb5..a299be1043 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -19,7 +19,7 @@
 
 BufferDescPadded *BufferDescriptors;
 char	   *BufferBlocks;
-LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
+ConditionVariableMinimallyPadded *BufferIOCVArray;
 WritebackContext BackendWritebackContext;
 CkptSortItem *CkptBufferIds;
 
@@ -68,7 +68,7 @@ InitBufferPool(void)
 {
 	bool		foundBufs,
 				foundDescs,
-				foundIOLocks,
+				foundIOCV,
 				foundBufCkpt;
 
 	/* Align descriptors to a cacheline boundary. */
@@ -81,11 +81,11 @@ InitBufferPool(void)
 		ShmemInitStruct("Buffer Blocks",
 						NBuffers * (Size) BLCKSZ, &foundBufs);
 
-	/* Align lwlocks to cacheline boundary */
-	BufferIOLWLockArray = (LWLockMinimallyPadded *)
-		ShmemInitStruct("Buffer IO Locks",
-						NBuffers * (Size) sizeof(LWLockMinimallyPadded),
-						&foundIOLocks);
+	/* Align condition variables to cacheline boundary. */
+	BufferIOCVArray = (ConditionVariableMinimallyPadded *)
+		ShmemInitStruct("Buffer IO Condition Variables",
+						NBuffers * sizeof(ConditionVariableMinimallyPadded),
+						&foundIOCV);
 
 	/*
 	 * The array used to sort to-be-checkpointed buffer ids is located in
@@ -98,10 +98,10 @@ InitBufferPool(void)
 		ShmemInitStruct("Checkpoint BufferIds",
 						NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
 
-	if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt)
+	if (foundDescs || foundBufs || foundIOCV || foundBufCkpt)
 	{
 		/* should find all of these, or none of them */
-		Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt);
+		Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt);
 		/* note: this path is only taken in EXEC_BACKEND case */
 	}
 	else
@@ -131,8 +131,7 @@ InitBufferPool(void)
 			LWLockInitialize(BufferDescriptorGetContentLock(buf),
 							 LWTRANCHE_BUFFER_CONTENT);
 
-			LWLockInitialize(BufferDescriptorGetIOLock(buf),
-							 LWTRANCHE_BUFFER_IO);
+			ConditionVariableInit(BufferDescriptorGetIOCV(buf));
 		}
 
 		/* Correct last entry of linked list */
@@ -169,16 +168,9 @@ BufferShmemSize(void)
 	/* size of stuff controlled by freelist.c */
 	size = add_size(size, StrategyShmemSize());
 
-	/*
-	 * It would be nice to include the I/O locks in the BufferDesc, but that
-	 * would increase the size of a BufferDesc to more than one cache line,
-	 * and benchmarking has shown that keeping every BufferDesc aligned on a
-	 * cache line boundary is important for performance.  So, instead, the
-	 * array of I/O locks is allocated in a separate tranche.  Because those
-	 * locks are not highly contended, we lay out the array with minimal
-	 * padding.
-	 */
-	size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
+	/* size of I/O condition variables */
+	size = add_size(size, mul_size(NBuffers,
+								   sizeof(ConditionVariableMinimallyPadded)));
 
 	/* to allow aligning the above */
 	size = add_size(size, PG_CACHE_LINE_SIZE);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 561c212092..4c1d5eceb4 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1352,9 +1352,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	LWLockRelease(newPartitionLock);
 
 	/*
-	 * Buffer contents are currently invalid.  Try to get the io_in_progress
-	 * lock.  If StartBufferIO returns false, then someone else managed to
-	 * read it before we did, so there's nothing left for BufferAlloc() to do.
+	 * Buffer contents are currently invalid.  Try to obtain the right to
+	 * start I/O.  If StartBufferIO returns false, then someone else managed
+	 * to read it before we did, so there's nothing left for BufferAlloc() to
+	 * do.
 	 */
 	if (StartBufferIO(buf, true))
 		*foundPtr = false;
@@ -1777,9 +1778,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
 		 */
 		VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
 
-		/* I'd better not still hold any locks on the buffer */
+		/* I'd better not still hold the buffer content lock */
 		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
-		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
 
 		/*
 		 * Decrement the shared reference count.
@@ -2742,9 +2742,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 	uint32		buf_state;
 
 	/*
-	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
-	 * false, then someone else flushed the buffer before we could, so we need
-	 * not do anything.
+	 * Try to start an I/O operation.  If StartBufferIO returns false, then
+	 * someone else flushed the buffer before we could, so we need not do
+	 * anything.
 	 */
 	if (!StartBufferIO(buf, false))
 		return;
@@ -2800,7 +2800,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 	/*
 	 * Now it's safe to write buffer to disk. Note that no one else should
 	 * have been able to write it while we were busy with log flushing because
-	 * we have the io_in_progress lock.
+	 * only one process at a time can set the BM_IO_IN_PROGRESS bit.
 	 */
 	bufBlock = BufHdrGetBlock(buf);
 
@@ -2835,7 +2835,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
 
 	/*
 	 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
-	 * end the io_in_progress state.
+	 * end the BM_IO_IN_PROGRESS state.
 	 */
 	TerminateBufferIO(buf, true, 0);
 
@@ -4271,7 +4271,7 @@ IsBufferCleanupOK(Buffer buffer)
  *	Functions for buffer I/O handling
  *
  *	Note: We assume that nested buffer I/O never occurs.
- *	i.e at most one io_in_progress lock is held per proc.
+ *	i.e at most one BM_IO_IN_PROGRESS bit is set per proc.
  *
  *	Also note that these are used only for shared buffers, not local ones.
  */
@@ -4282,13 +4282,9 @@ IsBufferCleanupOK(Buffer buffer)
 static void
 WaitIO(BufferDesc *buf)
 {
-	/*
-	 * Changed to wait until there's no IO - Inoue 01/13/2000
-	 *
-	 * Note this is *necessary* because an error abort in the process doing
-	 * I/O could release the io_in_progress_lock prematurely. See
-	 * AbortBufferIO.
-	 */
+	ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
+
+	ConditionVariablePrepareToSleep(cv);
 	for (;;)
 	{
 		uint32		buf_state;
@@ -4303,9 +4299,9 @@ WaitIO(BufferDesc *buf)
 
 		if (!(buf_state & BM_IO_IN_PROGRESS))
 			break;
-		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
-		LWLockRelease(BufferDescriptorGetIOLock(buf));
+		ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
 	}
+	ConditionVariableCancelSleep();
 }
 
 /*
@@ -4317,7 +4313,7 @@ WaitIO(BufferDesc *buf)
 * In some scenarios there are race conditions in which multiple backends
 * could attempt the same I/O operation concurrently.  If someone else
 * has already started I/O on this buffer then we will block on the
- * io_in_progress lock until he's done.
+ * I/O condition variable until he's done.
 *
 * Input operations are only attempted on buffers that are not BM_VALID,
 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
@@ -4335,25 +4331,11 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 
 	for (;;)
 	{
-		/*
-		 * Grab the io_in_progress lock so that other processes can wait for
-		 * me to finish the I/O.
-		 */
-		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
-
 		buf_state = LockBufHdr(buf);
 
 		if (!(buf_state & BM_IO_IN_PROGRESS))
 			break;
-
-		/*
-		 * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
-		 * lock isn't held is if the process doing the I/O is recovering from
-		 * an error (see AbortBufferIO).  If that's the case, we must wait for
-		 * him to get unwedged.
-		 */
 		UnlockBufHdr(buf, buf_state);
-		LWLockRelease(BufferDescriptorGetIOLock(buf));
 		WaitIO(buf);
 	}
 
@@ -4363,7 +4345,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 	{
 		/* someone else already did the I/O */
 		UnlockBufHdr(buf, buf_state);
-		LWLockRelease(BufferDescriptorGetIOLock(buf));
 		return false;
 	}
 
@@ -4381,7 +4362,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 * (Assumptions)
 *	My process is executing IO for the buffer
 *	BM_IO_IN_PROGRESS bit is set for the buffer
- *	We hold the buffer's io_in_progress lock
 *	The buffer is Pinned
 *
 * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
@@ -4413,7 +4393,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 
 	InProgressBuf = NULL;
 
-	LWLockRelease(BufferDescriptorGetIOLock(buf));
+	ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
 }
 
 /*
@@ -4434,14 +4414,6 @@ AbortBufferIO(void)
 	{
 		uint32		buf_state;
 
-		/*
-		 * Since LWLockReleaseAll has already been called, we're not holding
-		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
-		 * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
-		 * buffer will be in a busy spin until we succeed in doing this.
-		 */
-		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
-
 		buf_state = LockBufHdr(buf);
 		Assert(buf_state & BM_IO_IN_PROGRESS);
 		if (IsForInput)
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 8cb6a6f042..adf19aba75 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -146,8 +146,6 @@ static const char *const BuiltinTrancheNames[] = {
 	"WALInsert",
 	/* LWTRANCHE_BUFFER_CONTENT: */
 	"BufferContent",
-	/* LWTRANCHE_BUFFER_IO: */
-	"BufferIO",
 	/* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
 	"ReplicationOriginState",
 	/* LWTRANCHE_REPLICATION_SLOT_IO: */
@@ -469,8 +467,7 @@ CreateLWLocks(void)
 	StaticAssertStmt(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
 					 "MAX_BACKENDS too big for lwlock.c");
 
-	StaticAssertStmt(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
-					 sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
+	StaticAssertStmt(sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
 					 "Miscalculated LWLock padding");
 
 	if (!IsUnderPostmaster)
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 2ccd60f38c..f9166b8655 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -971,6 +971,7 @@ typedef enum
 	WAIT_EVENT_BGWORKER_SHUTDOWN,
 	WAIT_EVENT_BGWORKER_STARTUP,
 	WAIT_EVENT_BTREE_PAGE,
+	WAIT_EVENT_BUFFER_IO,
 	WAIT_EVENT_CHECKPOINT_DONE,
 	WAIT_EVENT_CHECKPOINT_START,
 	WAIT_EVENT_EXECUTE_GATHER,
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index f6b5782965..9cbf1fc3fa 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -18,6 +18,7 @@
 #include "port/atomics.h"
 #include "storage/buf.h"
 #include "storage/bufmgr.h"
+#include "storage/condition_variable.h"
 #include "storage/latch.h"
 #include "storage/lwlock.h"
 #include "storage/shmem.h"
@@ -184,7 +185,6 @@ typedef struct BufferDesc
 	int			wait_backend_pid;	/* backend PID of pin-count waiter */
 	int			freeNext;		/* link in freelist chain */
-
 	LWLock		content_lock;	/* to lock access to buffer contents */
 } BufferDesc;
 
@@ -221,12 +221,12 @@ typedef union BufferDescPadded
 
 #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
 
-#define BufferDescriptorGetIOLock(bdesc) \
-	(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
+#define BufferDescriptorGetIOCV(bdesc) \
+	(&(BufferIOCVArray[(bdesc)->buf_id]).cv)
 #define BufferDescriptorGetContentLock(bdesc) \
 	((LWLock*) (&(bdesc)->content_lock))
 
-extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
+extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
 
 /*
  * The freeNext field is either the index of the next freelist entry,
diff --git a/src/include/storage/condition_variable.h b/src/include/storage/condition_variable.h
index 0b7578f8c4..6310ca230a 100644
--- a/src/include/storage/condition_variable.h
+++ b/src/include/storage/condition_variable.h
@@ -31,6 +31,17 @@ typedef struct
 	proclist_head wakeup;		/* list of wake-able processes */
 } ConditionVariable;
 
+/*
+ * Pad a condition variable to a power-of-two size so that an array of
+ * condition variables does not cross a cache line boundary.
+ */
+#define CV_MINIMAL_SIZE		(sizeof(ConditionVariable) <= 16 ? 16 : 32)
+typedef union ConditionVariableMinimallyPadded
+{
+	ConditionVariable cv;
+	char		pad[CV_MINIMAL_SIZE];
+} ConditionVariableMinimallyPadded;
+
 /* Initialize a condition variable. */
 extern void ConditionVariableInit(ConditionVariable *cv);
 
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index cbf2510fbf..a8f052e484 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -48,29 +48,8 @@ typedef struct LWLock
 * even more padding so that each LWLock takes up an entire cache line; this is
 * useful, for example, in the main LWLock array, where the overall number of
 * locks is small but some are heavily contended.
- *
- * When allocating a tranche that contains data other than LWLocks, it is
- * probably best to include a bare LWLock and then pad the resulting structure
- * as necessary for performance.  For an array that contains only LWLocks,
- * LWLockMinimallyPadded can be used for cases where we just want to ensure
- * that we don't cross cache line boundaries within a single lock, while
- * LWLockPadded can be used for cases where we want each lock to be an entire
- * cache line.
- *
- * An LWLockMinimallyPadded might contain more than the absolute minimum amount
- * of padding required to keep a lock from crossing a cache line boundary,
- * because an unpadded LWLock will normally fit into 16 bytes.  We ignore that
- * possibility when determining the minimal amount of padding.  Older releases
- * had larger LWLocks, so 32 really was the minimum, and packing them in
- * tighter might hurt performance.
- *
- * LWLOCK_MINIMAL_SIZE should be 32 on basically all common platforms, but
- * because pg_atomic_uint32 is more than 4 bytes on some obscure platforms, we
- * allow for the possibility that it might be 64.  Even on those platforms,
- * we probably won't exceed 32 bytes unless LOCK_DEBUG is defined.
 */
 #define LWLOCK_PADDED_SIZE	PG_CACHE_LINE_SIZE
-#define LWLOCK_MINIMAL_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
 
 /* LWLock, padded to a full cache line size */
 typedef union LWLockPadded
@@ -79,13 +58,6 @@ typedef union LWLockPadded
 	char		pad[LWLOCK_PADDED_SIZE];
 } LWLockPadded;
 
-/* LWLock, minimally padded */
-typedef union LWLockMinimallyPadded
-{
-	LWLock		lock;
-	char		pad[LWLOCK_MINIMAL_SIZE];
-} LWLockMinimallyPadded;
-
 extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
 
 /* struct for storing named tranche information */
@@ -202,7 +174,6 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_SERIAL_BUFFER,
 	LWTRANCHE_WAL_INSERT,
 	LWTRANCHE_BUFFER_CONTENT,
-	LWTRANCHE_BUFFER_IO,
 	LWTRANCHE_REPLICATION_ORIGIN_STATE,
 	LWTRANCHE_REPLICATION_SLOT_IO,
 	LWTRANCHE_LOCK_FASTPATH,
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index e4d2debb3c..574a8a94fa 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -398,6 +398,7 @@ CompressionAlgorithm
 CompressorState
 ComputeXidHorizonsResult
 ConditionVariable
+ConditionVariableMinimallyPadded
 ConditionalStack
 ConfigData
 ConfigVariable
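
For illustration only (not part of the patch), here is a minimal, self-contained C sketch of the wait protocol the patch adopts, written with POSIX threads: a BM_IO_IN_PROGRESS-style flag guarded by a lock, plus a condition variable that waiters sleep on and that the process doing the I/O broadcasts after clearing the flag.  All names here (FakeBuffer, start_io, terminate_io, worker) are invented for the example; pthreads merely stands in for the buffer header spinlock and the per-buffer ConditionVariable.

/*
 * Sketch, not PostgreSQL code: the IO_IN_PROGRESS flag plays the role of
 * BM_IO_IN_PROGRESS, the mutex stands in for the buffer header lock, and
 * the condition variable stands in for BufferDescriptorGetIOCV(buf).
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define IO_IN_PROGRESS	0x01

typedef struct
{
	pthread_mutex_t hdr_lock;	/* analogue of the buffer header lock */
	pthread_cond_t	io_cv;		/* analogue of the per-buffer CV */
	unsigned int	buf_state;	/* flag bits, including IO_IN_PROGRESS */
} FakeBuffer;

static FakeBuffer shared_buf = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
};

/* Claim the right to do I/O, waiting out any I/O already in progress. */
static void
start_io(FakeBuffer *buf)
{
	pthread_mutex_lock(&buf->hdr_lock);
	while (buf->buf_state & IO_IN_PROGRESS)
	{
		/* like WaitIO(): sleep, then recheck the flag after waking up */
		pthread_cond_wait(&buf->io_cv, &buf->hdr_lock);
	}
	buf->buf_state |= IO_IN_PROGRESS;
	pthread_mutex_unlock(&buf->hdr_lock);
}

/* Finish I/O: clear the flag, then wake all waiters, like TerminateBufferIO(). */
static void
terminate_io(FakeBuffer *buf)
{
	pthread_mutex_lock(&buf->hdr_lock);
	buf->buf_state &= ~IO_IN_PROGRESS;
	pthread_mutex_unlock(&buf->hdr_lock);
	pthread_cond_broadcast(&buf->io_cv);	/* several waiters may exist */
}

static void *
worker(void *arg)
{
	FakeBuffer *buf = arg;

	start_io(buf);
	printf("doing the I/O\n");
	usleep(10000);				/* pretend to read or write the page */
	terminate_io(buf);
	return NULL;
}

int
main(void)
{
	pthread_t	threads[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&threads[i], NULL, worker, &shared_buf);
	for (int i = 0; i < 4; i++)
		pthread_join(threads[i], NULL);
	return 0;
}

One intentional difference from the patch: pthread_cond_wait() atomically releases the mutex while sleeping, whereas ConditionVariableSleep() is called without holding the buffer header spinlock.  That is why WaitIO() re-takes the header lock and rechecks BM_IO_IN_PROGRESS on every iteration, and why the loop is bracketed by ConditionVariablePrepareToSleep() and ConditionVariableCancelSleep(), which register the waiter before the flag is tested so that a broadcast arriving between the test and the sleep cannot be lost.  TerminateBufferIO() uses a broadcast rather than a single wakeup because any number of backends may be waiting on the same buffer.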