diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 3335d71eba..1ba813bbb9 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -1586,6 +1586,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
Waiting for the page number needed to continue a parallel B-tree
scan to become available.
+
+ BufferIO
+ Waiting for buffer I/O to complete.
+
CheckpointDone
Waiting for a checkpoint to complete.
@@ -1876,10 +1880,6 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
BufferContent
Waiting to access a data page in memory.
-
- BufferIO
- Waiting for I/O on a data page.
-
BufferMapping
Waiting to associate a data block with a buffer in the buffer
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 05d5ce0064..68eefb9722 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4010,6 +4010,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
case WAIT_EVENT_BTREE_PAGE:
event_name = "BtreePage";
break;
+ case WAIT_EVENT_BUFFER_IO:
+ event_name = "BufferIO";
+ break;
case WAIT_EVENT_CHECKPOINT_DONE:
event_name = "CheckpointDone";
break;
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index bc80777bb1..a775276ff2 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -145,14 +145,11 @@ held within the buffer. Each buffer header also contains an LWLock, the
"buffer content lock", that *does* represent the right to access the data
in the buffer. It is used per the rules above.
-There is yet another set of per-buffer LWLocks, the io_in_progress locks,
-that are used to wait for I/O on a buffer to complete. The process doing
-a read or write takes exclusive lock for the duration, and processes that
-need to wait for completion try to take shared locks (which they release
-immediately upon obtaining). XXX on systems where an LWLock represents
-nontrivial resources, it's fairly annoying to need so many locks. Possibly
-we could use per-backend LWLocks instead (a buffer header would then contain
-a field to show which backend is doing its I/O).
+* The BM_IO_IN_PROGRESS flag acts as a kind of lock, used to wait for I/O on a
+buffer to complete (and in releases before 14, it was accompanied by a
+per-buffer LWLock). The process doing a read or write sets the flag for the
+duration, and processes that need to wait for it to be cleared sleep on a
+condition variable.
Normal Buffer Replacement Strategy
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index e9e4f35bb5..a299be1043 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -19,7 +19,7 @@
BufferDescPadded *BufferDescriptors;
char *BufferBlocks;
-LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
+ConditionVariableMinimallyPadded *BufferIOCVArray;
WritebackContext BackendWritebackContext;
CkptSortItem *CkptBufferIds;
@@ -68,7 +68,7 @@ InitBufferPool(void)
{
bool foundBufs,
foundDescs,
- foundIOLocks,
+ foundIOCV,
foundBufCkpt;
/* Align descriptors to a cacheline boundary. */
@@ -81,11 +81,11 @@ InitBufferPool(void)
ShmemInitStruct("Buffer Blocks",
NBuffers * (Size) BLCKSZ, &foundBufs);
- /* Align lwlocks to cacheline boundary */
- BufferIOLWLockArray = (LWLockMinimallyPadded *)
- ShmemInitStruct("Buffer IO Locks",
- NBuffers * (Size) sizeof(LWLockMinimallyPadded),
- &foundIOLocks);
+ /* Align condition variables to cacheline boundary. */
+ BufferIOCVArray = (ConditionVariableMinimallyPadded *)
+ ShmemInitStruct("Buffer IO Condition Variables",
+ NBuffers * sizeof(ConditionVariableMinimallyPadded),
+ &foundIOCV);
/*
* The array used to sort to-be-checkpointed buffer ids is located in
@@ -98,10 +98,10 @@ InitBufferPool(void)
ShmemInitStruct("Checkpoint BufferIds",
NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
- if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt)
+ if (foundDescs || foundBufs || foundIOCV || foundBufCkpt)
{
/* should find all of these, or none of them */
- Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt);
+ Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt);
/* note: this path is only taken in EXEC_BACKEND case */
}
else
@@ -131,8 +131,7 @@ InitBufferPool(void)
LWLockInitialize(BufferDescriptorGetContentLock(buf),
LWTRANCHE_BUFFER_CONTENT);
- LWLockInitialize(BufferDescriptorGetIOLock(buf),
- LWTRANCHE_BUFFER_IO);
+ ConditionVariableInit(BufferDescriptorGetIOCV(buf));
}
/* Correct last entry of linked list */
@@ -169,16 +168,9 @@ BufferShmemSize(void)
/* size of stuff controlled by freelist.c */
size = add_size(size, StrategyShmemSize());
- /*
- * It would be nice to include the I/O locks in the BufferDesc, but that
- * would increase the size of a BufferDesc to more than one cache line,
- * and benchmarking has shown that keeping every BufferDesc aligned on a
- * cache line boundary is important for performance. So, instead, the
- * array of I/O locks is allocated in a separate tranche. Because those
- * locks are not highly contended, we lay out the array with minimal
- * padding.
- */
- size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
+ /* size of I/O condition variables */
+ size = add_size(size, mul_size(NBuffers,
+ sizeof(ConditionVariableMinimallyPadded)));
/* to allow aligning the above */
size = add_size(size, PG_CACHE_LINE_SIZE);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 561c212092..4c1d5eceb4 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1352,9 +1352,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
LWLockRelease(newPartitionLock);
/*
- * Buffer contents are currently invalid. Try to get the io_in_progress
- * lock. If StartBufferIO returns false, then someone else managed to
- * read it before we did, so there's nothing left for BufferAlloc() to do.
+ * Buffer contents are currently invalid. Try to obtain the right to
+ * start I/O. If StartBufferIO returns false, then someone else managed
+ * to read it before we did, so there's nothing left for BufferAlloc() to
+ * do.
*/
if (StartBufferIO(buf, true))
*foundPtr = false;
@@ -1777,9 +1778,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
*/
VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
- /* I'd better not still hold any locks on the buffer */
+ /* I'd better not still hold the buffer content lock */
Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
- Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
/*
* Decrement the shared reference count.
@@ -2742,9 +2742,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
uint32 buf_state;
/*
- * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
- * false, then someone else flushed the buffer before we could, so we need
- * not do anything.
+ * Try to start an I/O operation. If StartBufferIO returns false, then
+ * someone else flushed the buffer before we could, so we need not do
+ * anything.
*/
if (!StartBufferIO(buf, false))
return;
@@ -2800,7 +2800,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
/*
* Now it's safe to write buffer to disk. Note that no one else should
* have been able to write it while we were busy with log flushing because
- * we have the io_in_progress lock.
+ * only one process at a time can set the BM_IO_IN_PROGRESS bit.
*/
bufBlock = BufHdrGetBlock(buf);
@@ -2835,7 +2835,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
/*
* Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
- * end the io_in_progress state.
+ * end the BM_IO_IN_PROGRESS state.
*/
TerminateBufferIO(buf, true, 0);
@@ -4271,7 +4271,7 @@ IsBufferCleanupOK(Buffer buffer)
* Functions for buffer I/O handling
*
* Note: We assume that nested buffer I/O never occurs.
- * i.e at most one io_in_progress lock is held per proc.
+ * i.e. at most one BM_IO_IN_PROGRESS bit is set per proc.
*
* Also note that these are used only for shared buffers, not local ones.
*/
@@ -4282,13 +4282,9 @@ IsBufferCleanupOK(Buffer buffer)
static void
WaitIO(BufferDesc *buf)
{
- /*
- * Changed to wait until there's no IO - Inoue 01/13/2000
- *
- * Note this is *necessary* because an error abort in the process doing
- * I/O could release the io_in_progress_lock prematurely. See
- * AbortBufferIO.
- */
+ ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
+
+ ConditionVariablePrepareToSleep(cv);
for (;;)
{
uint32 buf_state;
@@ -4303,9 +4299,9 @@ WaitIO(BufferDesc *buf)
if (!(buf_state & BM_IO_IN_PROGRESS))
break;
- LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
- LWLockRelease(BufferDescriptorGetIOLock(buf));
+ ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
}
+ ConditionVariableCancelSleep();
}
/*
@@ -4317,7 +4313,7 @@ WaitIO(BufferDesc *buf)
* In some scenarios there are race conditions in which multiple backends
* could attempt the same I/O operation concurrently. If someone else
* has already started I/O on this buffer then we will block on the
- * io_in_progress lock until he's done.
+ * I/O condition variable until it's done.
*
* Input operations are only attempted on buffers that are not BM_VALID,
* and output operations only on buffers that are BM_VALID and BM_DIRTY,
@@ -4335,25 +4331,11 @@ StartBufferIO(BufferDesc *buf, bool forInput)
for (;;)
{
- /*
- * Grab the io_in_progress lock so that other processes can wait for
- * me to finish the I/O.
- */
- LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
-
buf_state = LockBufHdr(buf);
if (!(buf_state & BM_IO_IN_PROGRESS))
break;
-
- /*
- * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
- * lock isn't held is if the process doing the I/O is recovering from
- * an error (see AbortBufferIO). If that's the case, we must wait for
- * him to get unwedged.
- */
UnlockBufHdr(buf, buf_state);
- LWLockRelease(BufferDescriptorGetIOLock(buf));
WaitIO(buf);
}
@@ -4363,7 +4345,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
{
/* someone else already did the I/O */
UnlockBufHdr(buf, buf_state);
- LWLockRelease(BufferDescriptorGetIOLock(buf));
return false;
}
@@ -4381,7 +4362,6 @@ StartBufferIO(BufferDesc *buf, bool forInput)
* (Assumptions)
* My process is executing IO for the buffer
* BM_IO_IN_PROGRESS bit is set for the buffer
- * We hold the buffer's io_in_progress lock
* The buffer is Pinned
*
* If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
@@ -4413,7 +4393,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
InProgressBuf = NULL;
- LWLockRelease(BufferDescriptorGetIOLock(buf));
+ ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
}
/*
@@ -4434,14 +4414,6 @@ AbortBufferIO(void)
{
uint32 buf_state;
- /*
- * Since LWLockReleaseAll has already been called, we're not holding
- * the buffer's io_in_progress_lock. We have to re-acquire it so that
- * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
- * buffer will be in a busy spin until we succeed in doing this.
- */
- LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
-
buf_state = LockBufHdr(buf);
Assert(buf_state & BM_IO_IN_PROGRESS);
if (IsForInput)
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 8cb6a6f042..adf19aba75 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -146,8 +146,6 @@ static const char *const BuiltinTrancheNames[] = {
"WALInsert",
/* LWTRANCHE_BUFFER_CONTENT: */
"BufferContent",
- /* LWTRANCHE_BUFFER_IO: */
- "BufferIO",
/* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
"ReplicationOriginState",
/* LWTRANCHE_REPLICATION_SLOT_IO: */
@@ -469,8 +467,7 @@ CreateLWLocks(void)
StaticAssertStmt(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
"MAX_BACKENDS too big for lwlock.c");
- StaticAssertStmt(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
- sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
+ StaticAssertStmt(sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
"Miscalculated LWLock padding");
if (!IsUnderPostmaster)
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 2ccd60f38c..f9166b8655 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -971,6 +971,7 @@ typedef enum
WAIT_EVENT_BGWORKER_SHUTDOWN,
WAIT_EVENT_BGWORKER_STARTUP,
WAIT_EVENT_BTREE_PAGE,
+ WAIT_EVENT_BUFFER_IO,
WAIT_EVENT_CHECKPOINT_DONE,
WAIT_EVENT_CHECKPOINT_START,
WAIT_EVENT_EXECUTE_GATHER,
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index f6b5782965..9cbf1fc3fa 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -18,6 +18,7 @@
#include "port/atomics.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
+#include "storage/condition_variable.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
@@ -184,7 +185,6 @@ typedef struct BufferDesc
int wait_backend_pid; /* backend PID of pin-count waiter */
int freeNext; /* link in freelist chain */
-
LWLock content_lock; /* to lock access to buffer contents */
} BufferDesc;
@@ -221,12 +221,12 @@ typedef union BufferDescPadded
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
-#define BufferDescriptorGetIOLock(bdesc) \
- (&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
+#define BufferDescriptorGetIOCV(bdesc) \
+ (&(BufferIOCVArray[(bdesc)->buf_id]).cv)
#define BufferDescriptorGetContentLock(bdesc) \
((LWLock*) (&(bdesc)->content_lock))
-extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
+extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
/*
* The freeNext field is either the index of the next freelist entry,
diff --git a/src/include/storage/condition_variable.h b/src/include/storage/condition_variable.h
index 0b7578f8c4..6310ca230a 100644
--- a/src/include/storage/condition_variable.h
+++ b/src/include/storage/condition_variable.h
@@ -31,6 +31,17 @@ typedef struct
proclist_head wakeup; /* list of wake-able processes */
} ConditionVariable;
+/*
+ * Pad a condition variable to a power-of-two size so that an array of
+ * condition variables does not cross a cache line boundary.
+ */
+#define CV_MINIMAL_SIZE (sizeof(ConditionVariable) <= 16 ? 16 : 32)
+typedef union ConditionVariableMinimallyPadded
+{
+ ConditionVariable cv;
+ char pad[CV_MINIMAL_SIZE];
+} ConditionVariableMinimallyPadded;
+
/* Initialize a condition variable. */
extern void ConditionVariableInit(ConditionVariable *cv);
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index cbf2510fbf..a8f052e484 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -48,29 +48,8 @@ typedef struct LWLock
* even more padding so that each LWLock takes up an entire cache line; this is
* useful, for example, in the main LWLock array, where the overall number of
* locks is small but some are heavily contended.
- *
- * When allocating a tranche that contains data other than LWLocks, it is
- * probably best to include a bare LWLock and then pad the resulting structure
- * as necessary for performance. For an array that contains only LWLocks,
- * LWLockMinimallyPadded can be used for cases where we just want to ensure
- * that we don't cross cache line boundaries within a single lock, while
- * LWLockPadded can be used for cases where we want each lock to be an entire
- * cache line.
- *
- * An LWLockMinimallyPadded might contain more than the absolute minimum amount
- * of padding required to keep a lock from crossing a cache line boundary,
- * because an unpadded LWLock will normally fit into 16 bytes. We ignore that
- * possibility when determining the minimal amount of padding. Older releases
- * had larger LWLocks, so 32 really was the minimum, and packing them in
- * tighter might hurt performance.
- *
- * LWLOCK_MINIMAL_SIZE should be 32 on basically all common platforms, but
- * because pg_atomic_uint32 is more than 4 bytes on some obscure platforms, we
- * allow for the possibility that it might be 64. Even on those platforms,
- * we probably won't exceed 32 bytes unless LOCK_DEBUG is defined.
*/
#define LWLOCK_PADDED_SIZE PG_CACHE_LINE_SIZE
-#define LWLOCK_MINIMAL_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
/* LWLock, padded to a full cache line size */
typedef union LWLockPadded
@@ -79,13 +58,6 @@ typedef union LWLockPadded
char pad[LWLOCK_PADDED_SIZE];
} LWLockPadded;
-/* LWLock, minimally padded */
-typedef union LWLockMinimallyPadded
-{
- LWLock lock;
- char pad[LWLOCK_MINIMAL_SIZE];
-} LWLockMinimallyPadded;
-
extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
/* struct for storing named tranche information */
@@ -202,7 +174,6 @@ typedef enum BuiltinTrancheIds
LWTRANCHE_SERIAL_BUFFER,
LWTRANCHE_WAL_INSERT,
LWTRANCHE_BUFFER_CONTENT,
- LWTRANCHE_BUFFER_IO,
LWTRANCHE_REPLICATION_ORIGIN_STATE,
LWTRANCHE_REPLICATION_SLOT_IO,
LWTRANCHE_LOCK_FASTPATH,
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index e4d2debb3c..574a8a94fa 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -398,6 +398,7 @@ CompressionAlgorithm
CompressorState
ComputeXidHorizonsResult
ConditionVariable
+ConditionVariableMinimallyPadded
ConditionalStack
ConfigData
ConfigVariable