postgresql/src/include/storage/buf_internals.h

/*-------------------------------------------------------------------------
 *
 * buf_internals.h--
 *    Internal definitions.
 *
 *
 * Copyright (c) 1994, Regents of the University of California
 *
 * $Id: buf_internals.h,v 1.12 1997/03/28 07:06:48 scrappy Exp $
 *
 * NOTE
 *	If BUFFERPAGE0 is defined, then 0 will be used as a
 *	valid buffer page number.
 *
 *-------------------------------------------------------------------------
 */
#ifndef	BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H

#include <storage/lmgr.h>
#include <storage/buf.h>

/*
 * Buffer manager globals.
 *
 * Only declared here; per the original note the definitions are in
 * bufmgr.c.  Despite the historical "constants" label these are plain,
 * non-const ints.
 */
/* upper bound (inclusive) of valid buffer ids, see BAD_BUFFER_ID */
extern int NBuffers;
extern int Data_Descriptors;
extern int Free_List_Descriptor;
extern int Lookup_List_Descriptor;
extern int Num_Descriptors;

/*
 * Buffer descriptor flag bits.
 *
 * Every BM_* name is a distinct single-bit mask; they are OR'ed together
 * in the "flags" member (type BufFlags) of struct sbufdesc.
 */
#define BM_DIRTY		0x0001
#define BM_PRIVATE		0x0002
#define BM_VALID		0x0004
#define BM_DELETED		0x0008
#define BM_FREE			0x0010
#define BM_IO_IN_PROGRESS	0x0020
#define BM_IO_ERROR		0x0040
#define BM_JUST_DIRTIED		0x0080

/* Storage type for the BM_* flag bits kept in a buffer descriptor. */
typedef bits16 BufFlags;

/* BufferDesc and BufferHdr are two names for the same struct sbufdesc. */
typedef struct sbufdesc BufferDesc;
typedef struct sbufdesc BufferHdr;
typedef struct buftag BufferTag;
/*
 * long * so alignment will be correct
 *
 * NOTE(review): the typedef below is "long **", one level of indirection
 * more than the comment above says -- confirm which one is intended.
 */
typedef long **BufferBlock;

/*
 * struct buftag -- the "file/block identifier" stored in every buffer
 *		    descriptor (member "tag" of struct sbufdesc).
 *
 *	The tag macros in this header access relId.dbId, relId.relId and
 *	blockNum.
 */
struct buftag{
  /* relation identifier; carries both a dbId and a relId member */
  LRelId	relId;
  BlockNumber   blockNum;  /* blknum relative to begin of reln */
};

/*
 * CLEAR_BUFFERTAG
 *	Reset the tag that "a" points to: relId.dbId and relId.relId become
 *	InvalidOid and blockNum becomes InvalidBlockNumber.
 *
 *	The body is wrapped in do { } while (0) so the expansion is exactly
 *	one statement.  Written as three bare statements, an unbraced
 *	"if (x) CLEAR_BUFFERTAG(t);" would guard only the first assignment
 *	and an "if ... else" around the macro would not compile.
 *
 *	"a" is evaluated three times, so it must not have side effects.
 *	Callers terminate the invocation with ';' as before.
 */
#define CLEAR_BUFFERTAG(a) \
  do { \
    (a)->relId.dbId = InvalidOid; \
    (a)->relId.relId = InvalidOid; \
    (a)->blockNum = InvalidBlockNumber; \
  } while (0)

/*
 * INIT_BUFFERTAG
 *	Fill the tag that "a" points to: blockNum is set to xx_blockNum and
 *	relId to whatever RelationGetLRelId(xx_reln) yields.
 *
 *	The body is wrapped in do { } while (0) instead of a bare { } block:
 *	with a bare block, "if (x) INIT_BUFFERTAG(...); else ..." fails to
 *	compile because the ';' after the closing brace ends the if.
 *	xx_blockNum is parenthesized so an expression argument cannot be
 *	re-associated by the assignment.
 *
 *	Callers terminate the invocation with ';'.
 */
#define INIT_BUFFERTAG(a,xx_reln,xx_blockNum) \
  do { \
    (a)->blockNum = (xx_blockNum); \
    (a)->relId = RelationGetLRelId(xx_reln); \
  } while (0)

/*
 * COPY_BUFFERTAG
 *	Copy the tag that "b" points to into the tag that "a" points to:
 *	blockNum is assigned directly, the relation id part is delegated to
 *	LRelIdAssign.
 *
 *	The body is wrapped in do { } while (0) instead of a bare { } block
 *	so the macro can be used as the body of an if/else and still be
 *	followed by ';'.
 *
 *	NOTE(review): LRelIdAssign is not defined in this header.  It is
 *	handed the dereferenced tags (*(a), *(b)), not their relId members;
 *	confirm that this matches its definition.
 */
#define COPY_BUFFERTAG(a,b) \
  do { \
    (a)->blockNum = (b)->blockNum; \
    LRelIdAssign(*(a),*(b)); \
  } while (0)

/*
 * EQUAL_BUFFERTAG
 *	True when the tags that "a" and "b" point to have the same blockNum
 *	and their relId.relId values compare equal under OID_Equal.
 *	blockNum is compared first; each argument is evaluated twice.
 *
 *	NOTE(review): relId.dbId is not part of the comparison even though
 *	CLEAR_BUFFERTAG treats it as part of the tag -- confirm that tags
 *	differing only in dbId are meant to compare equal.
 */
#define EQUAL_BUFFERTAG(a,b) \
  (((a)->blockNum == (b)->blockNum) &&\
   (OID_Equal((a)->relId.relId,(b)->relId.relId)))


/*
 * BAD_BUFFER_ID
 *	True when bid lies outside 1 .. NBuffers (both ends inclusive).
 *
 *	The argument is parenthesized so that an expression built from
 *	operators that bind less tightly than '<' (for instance ?: or &)
 *	is compared as a whole.  It is evaluated twice, so it must not
 *	have side effects.
 */
#define BAD_BUFFER_ID(bid) (((bid) < 1) || ((bid) > (NBuffers)))
#define INVALID_DESCRIPTOR (-3)

/*
 *  bletch hack -- anyplace that we declare space for relation or
 *  database names, we just use '16', not a symbolic constant, to
 *  specify their lengths.  BM_NAMESIZE is the length of these names,
 *  and is used in the buffer manager code.  Somebody with lots of
 *  spare time should do this for all the other modules, too.
 *
 *  NOTE(review): the descriptor structures in this header size their
 *  name members with NAMEDATALEN+1, not BM_NAMESIZE -- confirm whether
 *  this constant is still in use and still the right length.
 */
#define BM_NAMESIZE	16

/*
 *  struct sbufdesc -- shared buffer cache metadata for a single
 *		       shared buffer descriptor.
 *
 *	We keep the name of the database and relation in which this
 *	buffer appears in order to avoid a catalog lookup on cache
 *	flush if we don't have the reldesc in the cache.  It is also
 *	possible that the relation to which this buffer belongs is
 *	not visible to all backends at the time that it gets flushed.
 *	Dbname, relname, dbid, and relid are enough to determine where
 *	to put the buffer, for all storage managers.
 */

/*
 * Target size in bytes of one struct sbufdesc.  The real structure
 * enlarges its last member (sb_relname) by
 * PADDED_SBUFDESC_SIZE - sizeof(struct sbufdesc_unpadded) to reach it.
 */
#define PADDED_SBUFDESC_SIZE 	128

/*
 * DO NOT CHANGE THIS NEXT STRUCTURE:
 * It is used only to get padding information for the real sbufdesc
 * structure: sizeof(struct sbufdesc_unpadded) tells how big a descriptor
 * is before any padding is added.  It must match struct sbufdesc member
 * for member, in the same order and under the same #ifdef; the only
 * difference is that sb_relname here carries no extra padding bytes.
 */
struct sbufdesc_unpadded {
    Buffer		freeNext;
    Buffer		freePrev;
    SHMEM_OFFSET	data;
    BufferTag		tag;
    int			buf_id;
    BufFlags		flags;
    int16		bufsmgr;
    unsigned		refcount;
#ifdef HAS_TEST_AND_SET
    slock_t	io_in_progress_lock;
#endif /* HAS_TEST_AND_SET */
    char sb_dbname[NAMEDATALEN+1];

    /* NOTE NO PADDING OF THE MEMBER HERE */
    char sb_relname[NAMEDATALEN+1];
};

/*
 * THE REAL STRUCTURE - struct sbufdesc_unpadded above must match it member
 * for member; the only difference is the padding folded into sb_relname.
 */
struct sbufdesc {
    Buffer		freeNext;	/* link for freelist chain */
    Buffer		freePrev;
    SHMEM_OFFSET	data;		/* pointer to data in buf pool */

    /* tag and id must be together for table lookup to work */
    BufferTag		tag;		/* file/block identifier */
    int			buf_id;		/* maps global desc to local desc */

    BufFlags		flags;    	/* BM_* bits, described above */
    int16		bufsmgr;	/* storage manager id for buffer */
    unsigned		refcount;	/* # of times buffer is pinned */

#ifdef HAS_TEST_AND_SET
    /* can afford a dedicated lock if test-and-set locks are available */
    slock_t	io_in_progress_lock;
#endif /* HAS_TEST_AND_SET */

    char sb_dbname[NAMEDATALEN+1];	/* name of db in which buf belongs */

    /*
     * I padded this structure to a power of 2 (PADDED_SBUFDESC_SIZE) because
     * BufferDescriptorGetBuffer is called a billion times and it does a
     * C pointer subtraction (i.e., "x - y" -> array index of x relative
     * to y, which is calculated using division by struct size).  Integer
     * ".div" hits you for 35 cycles, as opposed to a 1-cycle "sra" ...
     * this hack cut 10% off of the time to create the Wisconsin database!
     * It eats up more shared memory, of course, but we're (allegedly)
     * going to make some of these types bigger soon anyway... -pma 1/2/93
     */

    /*
     * Please, don't take the sizeof() this member and use it for
     * something important: the array holds the relation name plus
     * however many filler bytes are needed to reach PADDED_SBUFDESC_SIZE.
     *
     * Nothing here checks the arithmetic.  If
     * sizeof(struct sbufdesc_unpadded) is larger than
     * PADDED_SBUFDESC_SIZE the member ends up with fewer than
     * NAMEDATALEN+1 bytes, and since sizeof is unsigned the expression
     * wraps to a huge value if it is larger still.
     */
	
    char sb_relname[NAMEDATALEN+1+	/* name of reln */
		PADDED_SBUFDESC_SIZE-sizeof(struct sbufdesc_unpadded)];
};

/*
 *  mao tracing buffer allocation
 */

/*#define BMTRACE*/
#ifdef BMTRACE

/*
 * Operation codes for buffer-allocation tracing.  They were originally
 * declared right after bmt_op inside the struct body, so presumably they
 * are the values stored in that member.
 */
#define BMT_NOTUSED	0
#define BMT_ALLOCFND	1
#define BMT_ALLOCNOTFND	2
#define	BMT_DEALLOC	3

/*
 * bmtrace -- one buffer-allocation trace record.
 */
typedef struct _bmtrace
{
    int		bmt_pid;
    long	bmt_buf;
    long	bmt_dbid;
    long	bmt_relid;
    int		bmt_blkno;
    int		bmt_op;
} bmtrace;

#endif /* BMTRACE */


/* 
 * Bufmgr Interface:
 */

/* Internal routines: only called by buf.c */

/*
 * freelist.c
 *
 * NOTE(review): only the prototypes are visible in this header; the notes
 * below are read off the names and signatures and should be confirmed
 * against freelist.c.
 */
extern void AddBufferToFreelist(BufferDesc *bf);
extern void PinBuffer(BufferDesc *buf);
/* _Debug variants also take a file name and line number, presumably the call site */
extern void PinBuffer_Debug(char *file, int line, BufferDesc *buf);
extern void UnpinBuffer(BufferDesc *buf);
extern void UnpinBuffer_Debug(char *file, int line, BufferDesc *buf);
extern BufferDesc *GetFreeBuffer(void);
extern void InitFreeList(bool init);
extern void DBG_FreeListCheck(int nfree);

/*
 * buf_table.c
 *
 * Lookup is keyed by a BufferTag pointer and returns a BufferDesc
 * pointer; insert and delete take the descriptor itself and return a
 * bool.
 * NOTE(review): what the bool return values mean is not visible from
 * this header -- confirm against buf_table.c before relying on them.
 */
extern void InitBufTable(void);
extern BufferDesc *BufTableLookup(BufferTag *tagPtr);
extern bool BufTableDelete(BufferDesc *buf);
extern bool BufTableInsert(BufferDesc *buf);
extern void DBG_LookupListCheck(int nlookup);

/*
 * bufmgr.c
 *
 * Declarations only; the objects themselves are defined outside this
 * header.
 * NOTE(review): the three long * variables look like per-buffer arrays
 * (compare LocalRefCount for local buffers) -- confirm their length and
 * indexing in bufmgr.c.
 */
extern BufferDesc 	*BufferDescriptors;
extern BufferBlock 	BufferBlocks;
extern long		*PrivateRefCount;
extern long		*LastRefCount;
extern long             *CommitInfoNeedsSave;
extern SPINLOCK		BufMgrLock;

/*
 * localbuf.c
 *
 * Globals and entry points for local buffers; they use the same
 * BufferDesc type as the descriptors declared for bufmgr.c.
 * NOTE(review): implementations are not visible here; confirm the
 * meaning of the int return values and of the "release" flag in
 * localbuf.c.
 */
extern long *LocalRefCount;
extern BufferDesc *LocalBufferDescriptors;
extern int NLocBuffer;

/* returns a descriptor for (reln, blockNum); *foundPtr is an output flag */
extern BufferDesc *LocalBufferAlloc(Relation reln, BlockNumber blockNum,
				    bool *foundPtr);
extern int WriteLocalBuffer(Buffer buffer, bool release);
extern int FlushLocalBuffer(Buffer buffer, bool release);
extern void InitLocalBuffer(void);
extern void LocalBufferSync(void);
extern void ResetLocalBufferPool(void);
     
#endif	/* BUFMGR_INTERNALS_H */
More cleanups of the include files - centralizing to simplify the -I's required to compile 1996-08-28 03:59:28 +02:00			`/*-------------------------------------------------------------------------`
			`*`
			`* buf_internals.h--`
			`* Internal definitions.`
			`*`
			`*`
			`* Copyright (c) 1994, Regents of the University of California`
			`*`
From: Dan McGuirk <mcguirk@indirect.com> Reply-To: hackers@hub.org, Dan McGuirk <mcguirk@indirect.com> To: hackers@hub.org Subject: [HACKERS] tmin writeback optimization I was doing some profiling of the backend, and noticed that during a certain benchmark I was running somewhere between 30% and 75% of the backend's CPU time was being spent in calls to TransactionIdDidCommit() from HeapTupleSatisfiesNow() or HeapTupleSatisfiesItself() to determine that changed rows' transactions had in fact been committed even though the rows' tmin values had not yet been set. When a query looks at a given row, it needs to figure out whether the transaction that changed the row has been committed and hence it should pay attention to the row, or whether on the other hand the transaction is still in progress or has been aborted and hence the row should be ignored. If a tmin value is set, it is known definitively that the row's transaction has been committed. However, if tmin is not set, the transaction referred to in xmin must be looked up in pg_log, and this is what the backend was spending a lot of time doing during my benchmark. So, implementing a method suggested by Vadim, I created the following patch that, the first time a query finds a committed row whose tmin value is not set, sets it, and marks the buffer where the row is stored as dirty. (It works for tmax, too.) This doesn't result in the boost in real time performance I was hoping for, however it does decrease backend CPU usage by up to two-thirds in certain situations, so it could be rather beneficial in high-concurrency settings. 1997-03-28 08:06:53 +01:00			`* $Id: buf_internals.h,v 1.12 1997/03/28 07:06:48 scrappy Exp $`
More cleanups of the include files - centralizing to simplify the -I's required to compile 1996-08-28 03:59:28 +02:00			`*`
			`* NOTE`
			`* If BUFFERPAGE0 is defined, then 0 will be used as a`
			`* valid buffer page number.`
			`*`
			`*-------------------------------------------------------------------------`
			`*/`
			`#ifndef BUFMGR_INTERNALS_H`
			`#define BUFMGR_INTERNALS_H`

Another directory totally cleaned out 1996-11-05 07:11:08 +01:00			`#include <storage/lmgr.h>`
			`#include <storage/buf.h>`
More cleanups of the include files - centralizing to simplify the -I's required to compile 1996-08-28 03:59:28 +02:00
			`/* Buf Mgr constants */`
			`/* in bufmgr.c */`
			`extern int NBuffers;`
			`extern int Data_Descriptors;`
			`extern int Free_List_Descriptor;`
			`extern int Lookup_List_Descriptor;`
			`extern int Num_Descriptors;`

			`/*`
			`* Flags for buffer descriptors`
			`*/`
			`#define BM_DIRTY (1 << 0)`
			`#define BM_PRIVATE (1 << 1)`
			`#define BM_VALID (1 << 2)`
			`#define BM_DELETED (1 << 3)`
			`#define BM_FREE (1 << 4)`
			`#define BM_IO_IN_PROGRESS (1 << 5)`
			`#define BM_IO_ERROR (1 << 6)`
New flag for BufferDesc - BM_JUST_DIRTIED, - to prevent losing data changes. 1997-01-20 05:06:13 +01:00			`#define BM_JUST_DIRTIED (1 << 7)`
More cleanups of the include files - centralizing to simplify the -I's required to compile 1996-08-28 03:59:28 +02:00
			`typedef bits16 BufFlags;`

			`typedef struct sbufdesc BufferDesc;`
			`typedef struct sbufdesc BufferHdr;`
			`typedef struct buftag BufferTag;`
			`/* long * so alignment will be correct */`
			`typedef long **BufferBlock;`

			`struct buftag{`
			`LRelId relId;`
			`BlockNumber blockNum; /* blknum relative to begin of reln */`
			`};`

			`#define CLEAR_BUFFERTAG(a)\`
			`(a)->relId.dbId = InvalidOid; \`
			`(a)->relId.relId = InvalidOid; \`
			`(a)->blockNum = InvalidBlockNumber`

			`#define INIT_BUFFERTAG(a,xx_reln,xx_blockNum) \`
			`{ \`
			`(a)->blockNum = xx_blockNum;\`
			`(a)->relId = RelationGetLRelId(xx_reln); \`
			`}`

			`#define COPY_BUFFERTAG(a,b)\`
			`{ \`
			`(a)->blockNum = (b)->blockNum;\`
			`LRelIdAssign(*(a),*(b));\`
			`}`

			`#define EQUAL_BUFFERTAG(a,b) \`
			`(((a)->blockNum == (b)->blockNum) &&\`
			`(OID_Equal((a)->relId.relId,(b)->relId.relId)))`


			`#define BAD_BUFFER_ID(bid) ((bid<1) || (bid>(NBuffers)))`
			`#define INVALID_DESCRIPTOR (-3)`

			`/*`
			`* bletch hack -- anyplace that we declare space for relation or`
			`* database names, we just use '16', not a symbolic constant, to`
			`* specify their lengths. BM_NAMESIZE is the length of these names,`
			`* and is used in the buffer manager code. somebody with lots of`
			`* spare time should do this for all the other modules, too.`
			`*/`
			`#define BM_NAMESIZE 16`

			`/*`
			`* struct sbufdesc -- shared buffer cache metadata for a single`
			`* shared buffer descriptor.`
			`*`
			`* We keep the name of the database and relation in which this`
			`* buffer appears in order to avoid a catalog lookup on cache`
			`* flush if we don't have the reldesc in the cache. It is also`
			`* possible that the relation to which this buffer belongs is`
			`* not visible to all backends at the time that it gets flushed.`
			`* Dbname, relname, dbid, and relid are enough to determine where`
			`* to put the buffer, for all storage managers.`
			`*/`

Remove SB_PAD. Compute padding at compile time. 1997-01-23 19:15:29 +01:00			`#define PADDED_SBUFDESC_SIZE 128`

			`/* DO NOT CHANGE THIS NEXT STRUCTURE:`
			`It is used only to get padding information for the real sbufdesc structure`
			`It should match the sbufdesc structure exactly except for a missing sb_pad`
			`*/`
			`struct sbufdesc_unpadded {`
			`Buffer freeNext;`
			`Buffer freePrev;`
			`SHMEM_OFFSET data;`
			`BufferTag tag;`
			`int buf_id;`
			`BufFlags flags;`
			`int16 bufsmgr;`
			`unsigned refcount;`
			`#ifdef HAS_TEST_AND_SET`
			`slock_t io_in_progress_lock;`
			`#endif /* HAS_TEST_AND_SET */`
Restructure padding to handle structure already 128 bytes(alpha). 1997-01-25 04:09:33 +01:00			`char sb_dbname[NAMEDATALEN+1];`

			`/* NOTE NO PADDING OF THE MEMBER HERE */`
			`char sb_relname[NAMEDATALEN+1];`
Remove SB_PAD. Compute padding at compile time. 1997-01-23 19:15:29 +01:00			`};`

			`/* THE REAL STRUCTURE - the structure above must match it, minus sb_pad */`
More cleanups of the include files - centralizing to simplify the -I's required to compile 1996-08-28 03:59:28 +02:00			`struct sbufdesc {`
			`Buffer freeNext; /* link for freelist chain */`
			`Buffer freePrev;`
			`SHMEM_OFFSET data; /* pointer to data in buf pool */`

			`/* tag and id must be together for table lookup to work */`
			`BufferTag tag; /* file/block identifier */`
			`int buf_id; /* maps global desc to local desc */`

			`BufFlags flags; /* described below */`
			`int16 bufsmgr; /* storage manager id for buffer */`
			`unsigned refcount; /* # of times buffer is pinned */`

			`#ifdef HAS_TEST_AND_SET`
			`/* can afford a dedicated lock if test-and-set locks are available */`
			`slock_t io_in_progress_lock;`
			`#endif /* HAS_TEST_AND_SET */`

Restructure padding to handle structure already 128 bytes(alpha). 1997-01-25 04:09:33 +01:00			`char sb_dbname[NAMEDATALEN+1]; /* name of db in which buf belongs */`

More cleanups of the include files - centralizing to simplify the -I's required to compile 1996-08-28 03:59:28 +02:00			`/*`
Remove SB_PAD. Compute padding at compile time. 1997-01-23 19:15:29 +01:00			`* I padded this structure to a power of 2 (PADDED_SBUFDESC_SIZE) because`
More cleanups of the include files - centralizing to simplify the -I's required to compile 1996-08-28 03:59:28 +02:00			`* BufferDescriptorGetBuffer is called a billion times and it does an`
			`* C pointer subtraction (i.e., "x - y" -> array index of x relative`
			`* to y, which is calculated using division by struct size). Integer`
			`* ".div" hits you for 35 cycles, as opposed to a 1-cycle "sra" ...`
			`* this hack cut 10% off of the time to create the Wisconsin database!`
			`* It eats up more shared memory, of course, but we're (allegedly)`
			`* going to make some of these types bigger soon anyway... -pma 1/2/93`
			`*/`

Restructure padding to handle structure already 128 bytes(alpha). 1997-01-25 04:09:33 +01:00			`/* please, don't take the sizeof() this member and use it for`
			`something important */`

			`char sb_relname[NAMEDATALEN+1+ /* name of reln */`
			`PADDED_SBUFDESC_SIZE-sizeof(struct sbufdesc_unpadded)];`
More cleanups of the include files - centralizing to simplify the -I's required to compile 1996-08-28 03:59:28 +02:00			`};`

			`/*`
			`* mao tracing buffer allocation`
			`*/`

			`/*#define BMTRACE*/`
			`#ifdef BMTRACE`

			`typedef struct _bmtrace {`
			`int bmt_pid;`
			`long bmt_buf;`
			`long bmt_dbid;`
			`long bmt_relid;`
			`int bmt_blkno;`
			`int bmt_op;`

			`#define BMT_NOTUSED 0`
			`#define BMT_ALLOCFND 1`
			`#define BMT_ALLOCNOTFND 2`
			`#define BMT_DEALLOC 3`

			`} bmtrace;`

			`#endif /* BMTRACE */`


			`/*`
			`* Bufmgr Interface:`
			`*/`

			`/* Internal routines: only called by buf.c */`

			`/*freelist.c*/`
			`extern void AddBufferToFreelist(BufferDesc *bf);`
			`extern void PinBuffer(BufferDesc *buf);`
			`extern void PinBuffer_Debug(char *file, int line, BufferDesc *buf);`
			`extern void UnpinBuffer(BufferDesc *buf);`
			`extern void UnpinBuffer_Debug(char *file, int line, BufferDesc *buf);`
			`extern BufferDesc *GetFreeBuffer(void);`
			`extern void InitFreeList(bool init);`
			`extern void DBG_FreeListCheck(int nfree);`

			`/* buf_table.c */`
			`extern void InitBufTable(void);`
			`extern BufferDesc *BufTableLookup(BufferTag *tagPtr);`
			`extern bool BufTableDelete(BufferDesc *buf);`
			`extern bool BufTableInsert(BufferDesc *buf);`
			`extern void DBG_LookupListCheck(int nlookup);`

			`/* bufmgr.c */`
			`extern BufferDesc *BufferDescriptors;`
			`extern BufferBlock BufferBlocks;`
			`extern long *PrivateRefCount;`
			`extern long *LastRefCount;`
From: Dan McGuirk <mcguirk@indirect.com> Reply-To: hackers@hub.org, Dan McGuirk <mcguirk@indirect.com> To: hackers@hub.org Subject: [HACKERS] tmin writeback optimization I was doing some profiling of the backend, and noticed that during a certain benchmark I was running somewhere between 30% and 75% of the backend's CPU time was being spent in calls to TransactionIdDidCommit() from HeapTupleSatisfiesNow() or HeapTupleSatisfiesItself() to determine that changed rows' transactions had in fact been committed even though the rows' tmin values had not yet been set. When a query looks at a given row, it needs to figure out whether the transaction that changed the row has been committed and hence it should pay attention to the row, or whether on the other hand the transaction is still in progress or has been aborted and hence the row should be ignored. If a tmin value is set, it is known definitively that the row's transaction has been committed. However, if tmin is not set, the transaction referred to in xmin must be looked up in pg_log, and this is what the backend was spending a lot of time doing during my benchmark. So, implementing a method suggested by Vadim, I created the following patch that, the first time a query finds a committed row whose tmin value is not set, sets it, and marks the buffer where the row is stored as dirty. (It works for tmax, too.) This doesn't result in the boost in real time performance I was hoping for, however it does decrease backend CPU usage by up to two-thirds in certain situations, so it could be rather beneficial in high-concurrency settings. 1997-03-28 08:06:53 +01:00			`extern long *CommitInfoNeedsSave;`
More cleanups of the include files - centralizing to simplify the -I's required to compile 1996-08-28 03:59:28 +02:00			`extern SPINLOCK BufMgrLock;`

			`/* localbuf.c */`
			`extern long *LocalRefCount;`
			`extern BufferDesc *LocalBufferDescriptors;`
			`extern int NLocBuffer;`

			`extern BufferDesc *LocalBufferAlloc(Relation reln, BlockNumber blockNum,`
			`bool *foundPtr);`
			`extern int WriteLocalBuffer(Buffer buffer, bool release);`
SetBufferWriteMode () added; FlushLocalBuffer () fixed (shouldn't release buffer if called from WriteNoReleaseBuffer ()) 1997-01-16 08:53:27 +01:00			`extern int FlushLocalBuffer(Buffer buffer, bool release);`
All external function definitions now have prototypes that are checked. 1996-11-10 04:06:38 +01:00			`extern void InitLocalBuffer(void);`
			`extern void LocalBufferSync(void);`
			`extern void ResetLocalBufferPool(void);`
More cleanups of the include files - centralizing to simplify the -I's required to compile 1996-08-28 03:59:28 +02:00
			`#endif /* BUFMGR_INTERNALS_H */`