postgresql/src/include/storage/buf_internals.h

469 lines
16 KiB
C

/*-------------------------------------------------------------------------
*
* buf_internals.h
* Internal definitions for buffer manager and the buffer replacement
* strategy.
*
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/buf_internals.h
*
*-------------------------------------------------------------------------
*/
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H
#include "pgstat.h"
#include "port/atomics.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/condition_variable.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/relcache.h"
#include "utils/resowner.h"
/*
* Buffer state is a single 32-bit variable where following data is combined.
*
* - 18 bits refcount
* - 4 bits usage count
* - 10 bits of flags
*
* Combining these values allows to perform some operations without locking
* the buffer header, by modifying them together with a CAS loop.
*
* The definition of buffer state components is below.
*/
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK 0x003C0000U
#define BUF_USAGECOUNT_ONE (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BUF_FLAG_MASK 0xFFC00000U
/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
/*
* Flags for buffer descriptors
*
* Note: BM_TAG_VALID essentially means that there is a buffer hashtable
* entry associated with the buffer's tag.
*/
#define BM_LOCKED (1U << 22) /* buffer header is locked */
#define BM_DIRTY (1U << 23) /* data needs writing */
#define BM_VALID (1U << 24) /* data is valid */
#define BM_TAG_VALID (1U << 25) /* tag is assigned */
#define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */
#define BM_IO_ERROR (1U << 27) /* previous I/O failed */
#define BM_JUST_DIRTIED (1U << 28) /* dirtied since write started */
#define BM_PIN_COUNT_WAITER (1U << 29) /* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */
#define BM_PERMANENT (1U << 31) /* permanent buffer (not unlogged,
* or init fork) */
/*
* The maximum allowed value of usage_count represents a tradeoff between
* accuracy and speed of the clock-sweep buffer management algorithm. A
* large value (comparable to NBuffers) would approximate LRU semantics.
* But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
* clock sweeps to find a free buffer, so in practice we don't want the
* value to be very large.
*/
#define BM_MAX_USAGE_COUNT 5
/*
* Buffer tag identifies which disk block the buffer contains.
*
* Note: the BufferTag data must be sufficient to determine where to write the
* block, without reference to pg_class or pg_tablespace entries. It's
* possible that the backend flushing the buffer doesn't even believe the
* relation is visible yet (its xact may have started before the xact that
* created the rel). The storage manager must be able to cope anyway.
*
* Note: if there's any pad bytes in the struct, InitBufferTag will have
* to be fixed to zero them, since this struct is used as a hash key.
*/
typedef struct buftag
{
Oid spcOid; /* tablespace oid */
Oid dbOid; /* database oid */
RelFileNumber relNumber; /* relation file number */
ForkNumber forkNum; /* fork number */
BlockNumber blockNum; /* blknum relative to begin of reln */
} BufferTag;
static inline RelFileNumber
BufTagGetRelNumber(const BufferTag *tag)
{
return tag->relNumber;
}
static inline ForkNumber
BufTagGetForkNum(const BufferTag *tag)
{
return tag->forkNum;
}
static inline void
BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber,
ForkNumber forknum)
{
tag->relNumber = relnumber;
tag->forkNum = forknum;
}
static inline RelFileLocator
BufTagGetRelFileLocator(const BufferTag *tag)
{
RelFileLocator rlocator;
rlocator.spcOid = tag->spcOid;
rlocator.dbOid = tag->dbOid;
rlocator.relNumber = BufTagGetRelNumber(tag);
return rlocator;
}
static inline void
ClearBufferTag(BufferTag *tag)
{
tag->spcOid = InvalidOid;
tag->dbOid = InvalidOid;
BufTagSetRelForkDetails(tag, InvalidRelFileNumber, InvalidForkNumber);
tag->blockNum = InvalidBlockNumber;
}
static inline void
InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator,
ForkNumber forkNum, BlockNumber blockNum)
{
tag->spcOid = rlocator->spcOid;
tag->dbOid = rlocator->dbOid;
BufTagSetRelForkDetails(tag, rlocator->relNumber, forkNum);
tag->blockNum = blockNum;
}
static inline bool
BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
{
return (tag1->spcOid == tag2->spcOid) &&
(tag1->dbOid == tag2->dbOid) &&
(tag1->relNumber == tag2->relNumber) &&
(tag1->blockNum == tag2->blockNum) &&
(tag1->forkNum == tag2->forkNum);
}
static inline bool
BufTagMatchesRelFileLocator(const BufferTag *tag,
const RelFileLocator *rlocator)
{
return (tag->spcOid == rlocator->spcOid) &&
(tag->dbOid == rlocator->dbOid) &&
(BufTagGetRelNumber(tag) == rlocator->relNumber);
}
/*
* The shared buffer mapping table is partitioned to reduce contention.
* To determine which partition lock a given tag requires, compute the tag's
* hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
* NB: NUM_BUFFER_PARTITIONS must be a power of 2!
*/
static inline uint32
BufTableHashPartition(uint32 hashcode)
{
return hashcode % NUM_BUFFER_PARTITIONS;
}
static inline LWLock *
BufMappingPartitionLock(uint32 hashcode)
{
return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET +
BufTableHashPartition(hashcode)].lock;
}
static inline LWLock *
BufMappingPartitionLockByIndex(uint32 index)
{
return &MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + index].lock;
}
/*
* BufferDesc -- shared descriptor/state data for a single shared buffer.
*
* Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
* tag, state or wait_backend_pgprocno fields. In general, buffer header lock
* is a spinlock which is combined with flags, refcount and usagecount into
* single atomic variable. This layout allow us to do some operations in a
* single atomic operation, without actually acquiring and releasing spinlock;
* for instance, increase or decrease refcount. buf_id field never changes
* after initialization, so does not need locking. freeNext is protected by
* the buffer_strategy_lock not buffer header lock. The LWLock can take care
* of itself. The buffer header lock is *not* used to control access to the
* data in the buffer!
*
* It's assumed that nobody changes the state field while buffer header lock
* is held. Thus buffer header lock holder can do complex updates of the
* state variable in single write, simultaneously with lock release (cleaning
* BM_LOCKED flag). On the other hand, updating of state without holding
* buffer header lock is restricted to CAS, which insure that BM_LOCKED flag
* is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
*
* An exception is that if we have the buffer pinned, its tag can't change
* underneath us, so we can examine the tag without locking the buffer header.
* Also, in places we do one-time reads of the flags without bothering to
* lock the buffer header; this is generally for situations where we don't
* expect the flag bit being tested to be changing.
*
* We can't physically remove items from a disk page if another backend has
* the buffer pinned. Hence, a backend may need to wait for all other pins
* to go away. This is signaled by storing its own pgprocno into
* wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present,
* there can be only one such waiter per buffer.
*
* We use this same struct for local buffer headers, but the locks are not
* used and not all of the flag bits are useful either. To avoid unnecessary
* overhead, manipulations of the state field should be done without actual
* atomic operations (i.e. only pg_atomic_read_u32() and
* pg_atomic_unlocked_write_u32()).
*
* Be careful to avoid increasing the size of the struct when adding or
* reordering members. Keeping it below 64 bytes (the most common CPU
* cache line size) is fairly important for performance.
*
* Per-buffer I/O condition variables are currently kept outside this struct in
* a separate array. They could be moved in here and still fit within that
* limit on common systems, but for now that is not done.
*/
typedef struct BufferDesc
{
BufferTag tag; /* ID of page contained in buffer */
int buf_id; /* buffer's index number (from 0) */
/* state of the tag, containing flags, refcount and usagecount */
pg_atomic_uint32 state;
int wait_backend_pgprocno; /* backend of pin-count waiter */
int freeNext; /* link in freelist chain */
LWLock content_lock; /* to lock access to buffer contents */
} BufferDesc;
/*
* Concurrent access to buffer headers has proven to be more efficient if
* they're cache line aligned. So we force the start of the BufferDescriptors
* array to be on a cache line boundary and force the elements to be cache
* line sized.
*
* XXX: As this is primarily matters in highly concurrent workloads which
* probably all are 64bit these days, and the space wastage would be a bit
* more noticeable on 32bit systems, we don't force the stride to be cache
* line sized on those. If somebody does actual performance testing, we can
* reevaluate.
*
* Note that local buffer descriptors aren't forced to be aligned - as there's
* no concurrent access to those it's unlikely to be beneficial.
*
* We use a 64-byte cache line size here, because that's the most common
* size. Making it bigger would be a waste of memory. Even if running on a
* platform with either 32 or 128 byte line sizes, it's good to align to
* boundaries and avoid false sharing.
*/
#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1)
typedef union BufferDescPadded
{
BufferDesc bufferdesc;
char pad[BUFFERDESC_PAD_TO_SIZE];
} BufferDescPadded;
/*
* The PendingWriteback & WritebackContext structure are used to keep
* information about pending flush requests to be issued to the OS.
*/
typedef struct PendingWriteback
{
/* could store different types of pending flushes here */
BufferTag tag;
} PendingWriteback;
/* struct forward declared in bufmgr.h */
typedef struct WritebackContext
{
/* pointer to the max number of writeback requests to coalesce */
int *max_pending;
/* current number of pending writeback requests */
int nr_pending;
/* pending requests */
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
} WritebackContext;
/* in buf_init.c */
extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
extern PGDLLIMPORT WritebackContext BackendWritebackContext;
/* in localbuf.c */
extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;
static inline BufferDesc *
GetBufferDescriptor(uint32 id)
{
return &(BufferDescriptors[id]).bufferdesc;
}
static inline BufferDesc *
GetLocalBufferDescriptor(uint32 id)
{
return &LocalBufferDescriptors[id];
}
static inline Buffer
BufferDescriptorGetBuffer(const BufferDesc *bdesc)
{
return (Buffer) (bdesc->buf_id + 1);
}
static inline ConditionVariable *
BufferDescriptorGetIOCV(const BufferDesc *bdesc)
{
return &(BufferIOCVArray[bdesc->buf_id]).cv;
}
static inline LWLock *
BufferDescriptorGetContentLock(const BufferDesc *bdesc)
{
return (LWLock *) (&bdesc->content_lock);
}
/*
* The freeNext field is either the index of the next freelist entry,
* or one of these special values:
*/
#define FREENEXT_END_OF_LIST (-1)
#define FREENEXT_NOT_IN_LIST (-2)
/*
* Functions for acquiring/releasing a shared buffer header's spinlock. Do
* not apply these to local buffers!
*/
extern uint32 LockBufHdr(BufferDesc *desc);
static inline void
UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
{
pg_write_barrier();
pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
}
/* in bufmgr.c */
/*
* Structure to sort buffers per file on checkpoints.
*
* This structure is allocated per buffer in shared memory, so it should be
* kept as small as possible.
*/
typedef struct CkptSortItem
{
Oid tsId;
RelFileNumber relNumber;
ForkNumber forkNum;
BlockNumber blockNum;
int buf_id;
} CkptSortItem;
extern PGDLLIMPORT CkptSortItem *CkptBufferIds;
/* ResourceOwner callbacks to hold buffer I/Os and pins */
extern const ResourceOwnerDesc buffer_io_resowner_desc;
extern const ResourceOwnerDesc buffer_pin_resowner_desc;
/* Convenience wrappers over ResourceOwnerRemember/Forget */
static inline void
ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
{
ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
}
static inline void
ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
{
ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_pin_resowner_desc);
}
static inline void
ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
{
ResourceOwnerRemember(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
}
static inline void
ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
{
ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
}
/*
* Internal buffer management routines
*/
/* bufmgr.c */
extern void WritebackContextInit(WritebackContext *context, int *max_pending);
extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
IOContext io_context, BufferTag *tag);
/* freelist.c */
extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
uint32 *buf_state, bool *from_ring);
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
BufferDesc *buf, bool from_ring);
extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);
/* buf_table.c */
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
/* localbuf.c */
extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
extern void UnpinLocalBuffer(Buffer buffer);
extern void UnpinLocalBufferNoOwner(Buffer buffer);
extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
ForkNumber forkNum,
BlockNumber blockNum);
extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
BlockNumber blockNum, bool *foundPtr);
extern BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr,
ForkNumber fork,
uint32 flags,
uint32 extend_by,
BlockNumber extend_upto,
Buffer *buffers,
uint32 *extended_by);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void DropRelationLocalBuffers(RelFileLocator rlocator,
ForkNumber forkNum,
BlockNumber firstDelBlock);
extern void DropRelationAllLocalBuffers(RelFileLocator rlocator);
extern void AtEOXact_LocalBuffers(bool isCommit);
#endif /* BUFMGR_INTERNALS_H */