/*-------------------------------------------------------------------------
 *
 * buf_internals.h
 *	  Internal definitions for buffer manager and the buffer replacement
 *	  strategy.
 *
 *
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/buf_internals.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H

#include "storage/buf.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/relcache.h"


/*
 * Flags for buffer descriptors
 *
 * Note: TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
#define BM_DIRTY				(1 << 0)	/* data needs writing */
#define BM_VALID				(1 << 1)	/* data is valid */
#define BM_TAG_VALID			(1 << 2)	/* tag is assigned */
#define BM_IO_IN_PROGRESS		(1 << 3)	/* read or write in progress */
#define BM_IO_ERROR				(1 << 4)	/* previous I/O failed */
#define BM_JUST_DIRTIED			(1 << 5)	/* dirtied since write started */
#define BM_PIN_COUNT_WAITER		(1 << 6)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED	(1 << 7)	/* must write for checkpoint */
#define BM_PERMANENT			(1 << 8)	/* permanent relation (not unlogged) */

typedef bits16 BufFlags;

/*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm.  A
 * large value (comparable to NBuffers) would approximate LRU semantics.
 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
 * clock sweeps to find a free buffer, so in practice we don't want the
 * value to be very large.
 */
#define BM_MAX_USAGE_COUNT	5

/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries.  It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel).  The storage manager must be able to cope anyway.
 *
 * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
typedef struct buftag
{
	RelFileNode rnode;			/* physical relation identifier */
	ForkNumber	forkNum;
	BlockNumber blockNum;		/* blknum relative to begin of reln */
} BufferTag;

#define CLEAR_BUFFERTAG(a) \
( \
	(a).rnode.spcNode = InvalidOid, \
	(a).rnode.dbNode = InvalidOid, \
	(a).rnode.relNode = InvalidOid, \
	(a).forkNum = InvalidForkNumber, \
	(a).blockNum = InvalidBlockNumber \
)

#define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \
( \
	(a).rnode = (xx_rnode), \
	(a).forkNum = (xx_forkNum), \
	(a).blockNum = (xx_blockNum) \
)

#define BUFFERTAGS_EQUAL(a,b) \
( \
	RelFileNodeEquals((a).rnode, (b).rnode) && \
	(a).blockNum == (b).blockNum && \
	(a).forkNum == (b).forkNum \
)

/*
 * The shared buffer mapping table is partitioned to reduce contention.
 * To determine which partition lock a given tag requires, compute the tag's
 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
 * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
 */
#define BufTableHashPartition(hashcode) \
	((hashcode) % NUM_BUFFER_PARTITIONS)
#define BufMappingPartitionLock(hashcode) \
	((LWLockId) (FirstBufMappingLock + BufTableHashPartition(hashcode)))
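
/*
 * Illustrative sketch (not part of the original header): the typical
 * lookup sequence a caller such as bufmgr.c would follow using the tag
 * and partition macros above.  The local variable names here are
 * hypothetical.
 *
 *		BufferTag	tag;
 *		uint32		hash;
 *		LWLockId	partitionLock;
 *		int			buf_id;
 *
 *		INIT_BUFFERTAG(tag, rnode, forkNum, blockNum);
 *		hash = BufTableHashCode(&tag);
 *		partitionLock = BufMappingPartitionLock(hash);
 *
 *		LWLockAcquire(partitionLock, LW_SHARED);
 *		buf_id = BufTableLookup(&tag, hash);
 *		... pin the buffer if buf_id >= 0 ...
 *		LWLockRelease(partitionLock);
 */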

/*
 *	BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
 * Note: buf_hdr_lock must be held to examine or change the tag, flags,
 * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
 * changes after initialization, so does not need locking.  freeNext is
 * protected by the BufFreelistLock not buf_hdr_lock.  The LWLocks can take
 * care of themselves.  The buf_hdr_lock is *not* used to control access to
 * the data in the buffer!
 *
 * An exception is that if we have the buffer pinned, its tag can't change
 * underneath us, so we can examine the tag without locking the spinlock.
 * Also, in places we do one-time reads of the flags without bothering to
 * lock the spinlock; this is generally for situations where we don't expect
 * the flag bit being tested to be changing.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned.  Hence, a backend may need to wait for all other pins
 * to go away.  This is signaled by storing its own PID into
 * wait_backend_pid and setting flag bit BM_PIN_COUNT_WAITER.  At present,
 * there can be only one such waiter per buffer.
 *
 * We use this same struct for local buffer headers, but the lock fields
 * are not used and not all of the flag bits are useful either.
 */
typedef struct sbufdesc
{
	BufferTag	tag;			/* ID of page contained in buffer */
	BufFlags	flags;			/* see bit definitions above */
	uint16		usage_count;	/* usage counter for clock sweep code */
	unsigned	refcount;		/* # of backends holding pins on buffer */
	int			wait_backend_pid;	/* backend PID of pin-count waiter */

	slock_t		buf_hdr_lock;	/* protects the above fields */

	int			buf_id;			/* buffer's index number (from 0) */
	int			freeNext;		/* link in freelist chain */

	LWLockId	io_in_progress_lock;	/* to wait for I/O to complete */
	LWLockId	content_lock;	/* to lock access to buffer contents */
} BufferDesc;

#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)

/*
 * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
 */
#define FREENEXT_END_OF_LIST	(-1)
#define FREENEXT_NOT_IN_LIST	(-2)

/*
 * Macros for acquiring/releasing a shared buffer header's spinlock.
 * Do not apply these to local buffers!
 *
 * Note: as a general coding rule, if you are using these then you probably
 * need to be using a volatile-qualified pointer to the buffer header, to
 * ensure that the compiler doesn't rearrange accesses to the header to
 * occur before or after the spinlock is acquired/released.
 */
#define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
#define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
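
/*
 * Illustrative sketch (not part of the original header): taking a pin
 * under the header spinlock, in the style of PinBuffer() in bufmgr.c.
 * Note the volatile-qualified pointer, per the coding rule above;
 * "buf" is a hypothetical pointer into the BufferDescriptors array.
 *
 *		volatile BufferDesc *buf = &BufferDescriptors[buf_id];
 *
 *		LockBufHdr(buf);
 *		buf->refcount++;
 *		if (buf->usage_count < BM_MAX_USAGE_COUNT)
 *			buf->usage_count++;
 *		UnlockBufHdr(buf);
 */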

/* in buf_init.c */
extern PGDLLIMPORT BufferDesc *BufferDescriptors;

/* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors;


/*
 * Internal routines: only called by bufmgr
 */

/* freelist.c */
extern volatile BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
				  bool *lock_held);
extern void StrategyFreeBuffer(volatile BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
					 volatile BufferDesc *buf);

extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);

/* buf_table.c */
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);

/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
					BlockNumber blockNum);
extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
				 BlockNumber blockNum, bool *foundPtr);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
							BlockNumber firstDelBlock);
extern void AtEOXact_LocalBuffers(bool isCommit);

#endif   /* BUFMGR_INTERNALS_H */
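
/*
 * Illustrative sketch (not part of the original header): how the
 * buffer-allocation path in bufmgr.c might drive the freelist.c routines
 * declared above.  "strategy" is a hypothetical BufferAccessStrategy
 * (NULL selects the default clock-sweep behavior).
 *
 *		bool		lock_held;
 *		volatile BufferDesc *buf;
 *
 *		buf = StrategyGetBuffer(strategy, &lock_held);
 *		... buf comes back with its header spinlock held; pin it and
 *		... drop the spinlock, then release the freelist lock:
 *		if (lock_held)
 *			LWLockRelease(BufFreelistLock);
 */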