postgresql/src/include/storage/buf_internals.h

236 lines
6.7 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* buf_internals.h--
* Internal definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
From: Dan McGuirk <mcguirk@indirect.com> Reply-To: hackers@hub.org, Dan McGuirk <mcguirk@indirect.com> To: hackers@hub.org Subject: [HACKERS] tmin writeback optimization I was doing some profiling of the backend, and noticed that during a certain benchmark I was running somewhere between 30% and 75% of the backend's CPU time was being spent in calls to TransactionIdDidCommit() from HeapTupleSatisfiesNow() or HeapTupleSatisfiesItself() to determine that changed rows' transactions had in fact been committed even though the rows' tmin values had not yet been set. When a query looks at a given row, it needs to figure out whether the transaction that changed the row has been committed and hence it should pay attention to the row, or whether on the other hand the transaction is still in progress or has been aborted and hence the row should be ignored. If a tmin value is set, it is known definitively that the row's transaction has been committed. However, if tmin is not set, the transaction referred to in xmin must be looked up in pg_log, and this is what the backend was spending a lot of time doing during my benchmark. So, implementing a method suggested by Vadim, I created the following patch that, the first time a query finds a committed row whose tmin value is not set, sets it, and marks the buffer where the row is stored as dirty. (It works for tmax, too.) This doesn't result in the boost in real time performance I was hoping for, however it does decrease backend CPU usage by up to two-thirds in certain situations, so it could be rather beneficial in high-concurrency settings.
1997-03-28 08:06:53 +01:00
* $Id: buf_internals.h,v 1.12 1997/03/28 07:06:48 scrappy Exp $
*
* NOTE
* If BUFFERPAGE0 is defined, then 0 will be used as a
* valid buffer page number.
*
*-------------------------------------------------------------------------
*/
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H
1996-11-05 07:11:08 +01:00
#include <storage/lmgr.h>
#include <storage/buf.h>
/* Buf Mgr constants */
/* in bufmgr.c */
extern int NBuffers;
extern int Data_Descriptors;
extern int Free_List_Descriptor;
extern int Lookup_List_Descriptor;
extern int Num_Descriptors;
/*
* Flags for buffer descriptors
*/
#define BM_DIRTY (1 << 0)
#define BM_PRIVATE (1 << 1)
#define BM_VALID (1 << 2)
#define BM_DELETED (1 << 3)
#define BM_FREE (1 << 4)
#define BM_IO_IN_PROGRESS (1 << 5)
#define BM_IO_ERROR (1 << 6)
#define BM_JUST_DIRTIED (1 << 7)
typedef bits16 BufFlags;
typedef struct sbufdesc BufferDesc;
typedef struct sbufdesc BufferHdr;
typedef struct buftag BufferTag;
/* long * so alignment will be correct */
typedef long **BufferBlock;
struct buftag{
LRelId relId;
BlockNumber blockNum; /* blknum relative to begin of reln */
};
#define CLEAR_BUFFERTAG(a)\
(a)->relId.dbId = InvalidOid; \
(a)->relId.relId = InvalidOid; \
(a)->blockNum = InvalidBlockNumber
#define INIT_BUFFERTAG(a,xx_reln,xx_blockNum) \
{ \
(a)->blockNum = xx_blockNum;\
(a)->relId = RelationGetLRelId(xx_reln); \
}
#define COPY_BUFFERTAG(a,b)\
{ \
(a)->blockNum = (b)->blockNum;\
LRelIdAssign(*(a),*(b));\
}
#define EQUAL_BUFFERTAG(a,b) \
(((a)->blockNum == (b)->blockNum) &&\
(OID_Equal((a)->relId.relId,(b)->relId.relId)))
#define BAD_BUFFER_ID(bid) ((bid<1) || (bid>(NBuffers)))
#define INVALID_DESCRIPTOR (-3)
/*
* bletch hack -- anyplace that we declare space for relation or
* database names, we just use '16', not a symbolic constant, to
* specify their lengths. BM_NAMESIZE is the length of these names,
* and is used in the buffer manager code. somebody with lots of
* spare time should do this for all the other modules, too.
*/
#define BM_NAMESIZE 16
/*
* struct sbufdesc -- shared buffer cache metadata for a single
* shared buffer descriptor.
*
* We keep the name of the database and relation in which this
* buffer appears in order to avoid a catalog lookup on cache
* flush if we don't have the reldesc in the cache. It is also
* possible that the relation to which this buffer belongs is
* not visible to all backends at the time that it gets flushed.
* Dbname, relname, dbid, and relid are enough to determine where
* to put the buffer, for all storage managers.
*/
#define PADDED_SBUFDESC_SIZE 128
/* DO NOT CHANGE THIS NEXT STRUCTURE:
It is used only to get padding information for the real sbufdesc structure
It should match the sbufdesc structure exactly except for a missing sb_pad
*/
struct sbufdesc_unpadded {
Buffer freeNext;
Buffer freePrev;
SHMEM_OFFSET data;
BufferTag tag;
int buf_id;
BufFlags flags;
int16 bufsmgr;
unsigned refcount;
#ifdef HAS_TEST_AND_SET
slock_t io_in_progress_lock;
#endif /* HAS_TEST_AND_SET */
char sb_dbname[NAMEDATALEN+1];
/* NOTE NO PADDING OF THE MEMBER HERE */
char sb_relname[NAMEDATALEN+1];
};
/* THE REAL STRUCTURE - the structure above must match it, minus sb_pad */
struct sbufdesc {
Buffer freeNext; /* link for freelist chain */
Buffer freePrev;
SHMEM_OFFSET data; /* pointer to data in buf pool */
/* tag and id must be together for table lookup to work */
BufferTag tag; /* file/block identifier */
int buf_id; /* maps global desc to local desc */
BufFlags flags; /* described below */
int16 bufsmgr; /* storage manager id for buffer */
unsigned refcount; /* # of times buffer is pinned */
#ifdef HAS_TEST_AND_SET
/* can afford a dedicated lock if test-and-set locks are available */
slock_t io_in_progress_lock;
#endif /* HAS_TEST_AND_SET */
char sb_dbname[NAMEDATALEN+1]; /* name of db in which buf belongs */
/*
* I padded this structure to a power of 2 (PADDED_SBUFDESC_SIZE) because
* BufferDescriptorGetBuffer is called a billion times and it does an
* C pointer subtraction (i.e., "x - y" -> array index of x relative
* to y, which is calculated using division by struct size). Integer
* ".div" hits you for 35 cycles, as opposed to a 1-cycle "sra" ...
* this hack cut 10% off of the time to create the Wisconsin database!
* It eats up more shared memory, of course, but we're (allegedly)
* going to make some of these types bigger soon anyway... -pma 1/2/93
*/
/* please, don't take the sizeof() this member and use it for
something important */
char sb_relname[NAMEDATALEN+1+ /* name of reln */
PADDED_SBUFDESC_SIZE-sizeof(struct sbufdesc_unpadded)];
};
/*
* mao tracing buffer allocation
*/
/*#define BMTRACE*/
#ifdef BMTRACE
typedef struct _bmtrace {
int bmt_pid;
long bmt_buf;
long bmt_dbid;
long bmt_relid;
int bmt_blkno;
int bmt_op;
#define BMT_NOTUSED 0
#define BMT_ALLOCFND 1
#define BMT_ALLOCNOTFND 2
#define BMT_DEALLOC 3
} bmtrace;
#endif /* BMTRACE */
/*
* Bufmgr Interface:
*/
/* Internal routines: only called by buf.c */
/*freelist.c*/
extern void AddBufferToFreelist(BufferDesc *bf);
extern void PinBuffer(BufferDesc *buf);
extern void PinBuffer_Debug(char *file, int line, BufferDesc *buf);
extern void UnpinBuffer(BufferDesc *buf);
extern void UnpinBuffer_Debug(char *file, int line, BufferDesc *buf);
extern BufferDesc *GetFreeBuffer(void);
extern void InitFreeList(bool init);
extern void DBG_FreeListCheck(int nfree);
/* buf_table.c */
extern void InitBufTable(void);
extern BufferDesc *BufTableLookup(BufferTag *tagPtr);
extern bool BufTableDelete(BufferDesc *buf);
extern bool BufTableInsert(BufferDesc *buf);
extern void DBG_LookupListCheck(int nlookup);
/* bufmgr.c */
extern BufferDesc *BufferDescriptors;
extern BufferBlock BufferBlocks;
extern long *PrivateRefCount;
extern long *LastRefCount;
From: Dan McGuirk <mcguirk@indirect.com> Reply-To: hackers@hub.org, Dan McGuirk <mcguirk@indirect.com> To: hackers@hub.org Subject: [HACKERS] tmin writeback optimization I was doing some profiling of the backend, and noticed that during a certain benchmark I was running somewhere between 30% and 75% of the backend's CPU time was being spent in calls to TransactionIdDidCommit() from HeapTupleSatisfiesNow() or HeapTupleSatisfiesItself() to determine that changed rows' transactions had in fact been committed even though the rows' tmin values had not yet been set. When a query looks at a given row, it needs to figure out whether the transaction that changed the row has been committed and hence it should pay attention to the row, or whether on the other hand the transaction is still in progress or has been aborted and hence the row should be ignored. If a tmin value is set, it is known definitively that the row's transaction has been committed. However, if tmin is not set, the transaction referred to in xmin must be looked up in pg_log, and this is what the backend was spending a lot of time doing during my benchmark. So, implementing a method suggested by Vadim, I created the following patch that, the first time a query finds a committed row whose tmin value is not set, sets it, and marks the buffer where the row is stored as dirty. (It works for tmax, too.) This doesn't result in the boost in real time performance I was hoping for, however it does decrease backend CPU usage by up to two-thirds in certain situations, so it could be rather beneficial in high-concurrency settings.
1997-03-28 08:06:53 +01:00
extern long *CommitInfoNeedsSave;
extern SPINLOCK BufMgrLock;
/* localbuf.c */
extern long *LocalRefCount;
extern BufferDesc *LocalBufferDescriptors;
extern int NLocBuffer;
extern BufferDesc *LocalBufferAlloc(Relation reln, BlockNumber blockNum,
bool *foundPtr);
extern int WriteLocalBuffer(Buffer buffer, bool release);
extern int FlushLocalBuffer(Buffer buffer, bool release);
extern void InitLocalBuffer(void);
extern void LocalBufferSync(void);
extern void ResetLocalBufferPool(void);
#endif /* BUFMGR_INTERNALS_H */