postgresql/src/backend/access/heap/heapam.c


/*-------------------------------------------------------------------------
*
* heapam.c--
* heap access method code
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.11 1997/03/28 07:04:11 scrappy Exp $
*
*
* INTERFACE ROUTINES
* heapgettup - fetch next heap tuple from a scan
* heap_open - open a heap relation by relationId
* heap_openr - open a heap relation by name
* heap_close - close a heap relation
* heap_beginscan - begin relation scan
* heap_rescan - restart a relation scan
* heap_endscan - end relation scan
* heap_getnext - retrieve next tuple in scan
* heap_fetch - retrieve tuple with tid
* heap_insert - insert tuple into a relation
* heap_delete - delete a tuple from a relation
* heap_replace - replace a tuple in a relation with another tuple
* heap_markpos - mark scan position
* heap_restrpos - restore position to marked location
*
* NOTES
* This file contains the heap_ routines which implement
* the POSTGRES heap access method used for all POSTGRES
* relations.
*
* OLD COMMENTS
* struct relscan hints: (struct should be made AM independent?)
*
* rs_ctid is the tid of the last tuple returned by getnext.
* rs_ptid and rs_ntid are the tids of the previous and next tuples
* returned by getnext, respectively. NULL indicates an end of
* scan (either direction); NON indicates an unknown value.
*
* possible combinations:
* rs_p rs_c rs_n interpretation
* NULL NULL NULL empty scan
* NULL NULL NON at beginning of scan
* NULL NULL t1 at beginning of scan (with cached tid)
* NON NULL NULL at end of scan
* t1 NULL NULL at end of scan (with cached tid)
* NULL t1 NULL just returned only tuple
* NULL t1 NON just returned first tuple
* NULL t1 t2 returned first tuple (with cached tid)
* NON t1 NULL just returned last tuple
* t2 t1 NULL returned last tuple (with cached tid)
* t1 t2 NON in the middle of a forward scan
* NON t2 t1 in the middle of a reverse scan
* ti tj tk in the middle of a scan (w cached tid)
*
* Here NULL is ...tup == NULL && ...buf == InvalidBuffer,
* and NON is ...tup == NULL && ...buf == UnknownBuffer.
*
* Currently, the NONTID values are not cached with their actual
* values by getnext. Values may be cached by markpos since it stores
* all three tids.
*
* NOTE: the calls to elog() must stop. Should decide on an interface
* between the general and specific AM calls.
*
* XXX probably do not need a free tuple routine for heaps.
* Huh? Free tuple is not necessary for tuples returned by scans, but
* is necessary for tuples which are returned by
* RelationGetTupleByItemPointer. -hirohama
*
*-------------------------------------------------------------------------
*/
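/* ----------------
* Illustrative usage (a sketch only, not part of the interface proper):
* a typical forward sequential scan over a relation looks roughly like
* the following. The relation name "pg_example" and the local variable
* names are assumptions made for this example.
*
*	Relation	reln = heap_openr("pg_example");
*	HeapScanDesc	scan = heap_beginscan(reln, 0, NowTimeQual,
*					      0, (ScanKey) NULL);
*	HeapTuple	tuple;
*	Buffer		buf;
*
*	while ((tuple = heap_getnext(scan, 0, &buf)) != NULL) {
*		... examine tuple; buf stays pinned while it is current ...
*	}
*	heap_endscan(scan);
*	heap_close(reln);
* ----------------
*/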
#include <postgres.h>
#include <storage/bufpage.h>
#include <access/heapam.h>
#include <miscadmin.h>
#include <utils/relcache.h>
#include <access/valid.h>
#include <access/hio.h>
#include <storage/lmgr.h>
#include <storage/smgr.h>
#include <catalog/catalog.h>
#include <access/transam.h>
#include <access/xact.h>
#include <utils/inval.h>
#include <utils/memutils.h>
#ifndef HAVE_MEMMOVE
# include <regex/utils.h>
#else
# include <string.h>
#endif
static bool ImmediateInvalidation;
/* ----------------------------------------------------------------
* heap support routines
* ----------------------------------------------------------------
*/
/* ----------------
* initsdesc - sdesc code common to heap_beginscan and heap_rescan
* ----------------
*/
static void
initsdesc(HeapScanDesc sdesc,
Relation relation,
int atend,
unsigned nkeys,
ScanKey key)
{
if (!RelationGetNumberOfBlocks(relation)) {
/* ----------------
* relation is empty
* ----------------
*/
sdesc->rs_ntup = sdesc->rs_ctup = sdesc->rs_ptup = NULL;
sdesc->rs_nbuf = sdesc->rs_cbuf = sdesc->rs_pbuf = InvalidBuffer;
} else if (atend) {
/* ----------------
* reverse scan
* ----------------
*/
sdesc->rs_ntup = sdesc->rs_ctup = NULL;
sdesc->rs_nbuf = sdesc->rs_cbuf = InvalidBuffer;
sdesc->rs_ptup = NULL;
sdesc->rs_pbuf = UnknownBuffer;
} else {
/* ----------------
* forward scan
* ----------------
*/
sdesc->rs_ctup = sdesc->rs_ptup = NULL;
sdesc->rs_cbuf = sdesc->rs_pbuf = InvalidBuffer;
sdesc->rs_ntup = NULL;
sdesc->rs_nbuf = UnknownBuffer;
} /* invalid too */
/* we don't have a marked position... */
ItemPointerSetInvalid(&(sdesc->rs_mptid));
ItemPointerSetInvalid(&(sdesc->rs_mctid));
ItemPointerSetInvalid(&(sdesc->rs_mntid));
ItemPointerSetInvalid(&(sdesc->rs_mcd));
/* ----------------
* copy the scan key, if appropriate
* ----------------
*/
if (key != NULL)
memmove(sdesc->rs_key, key, nkeys * sizeof(ScanKeyData));
}
/* ----------------
* unpinsdesc - code common to heap_rescan and heap_endscan
* ----------------
*/
static void
unpinsdesc(HeapScanDesc sdesc)
{
if (BufferIsValid(sdesc->rs_pbuf)) {
ReleaseBuffer(sdesc->rs_pbuf);
}
/* ------------------------------------
* Scan will pin the buffer once for each non-NULL tuple pointer
* (ptup, ctup, ntup), so they have to be unpinned multiple
* times.
* ------------------------------------
*/
if (BufferIsValid(sdesc->rs_cbuf)) {
ReleaseBuffer(sdesc->rs_cbuf);
}
if (BufferIsValid(sdesc->rs_nbuf)) {
ReleaseBuffer(sdesc->rs_nbuf);
}
}
/* ------------------------------------------
* nextpage
*
* figure out the next page to scan after the current page
* taking into account possible adjustment of the degree of
* parallelism
* ------------------------------------------
*/
static int
nextpage(int page, int dir)
{
return((dir<0)?page-1:page+1);
}
/* ----------------
* heapgettup - fetch next heap tuple
*
* routine used by heap_getnext() which does most of the
* real work in scanning tuples.
* ----------------
*/
static HeapTuple
heapgettup(Relation relation,
ItemPointer tid,
int dir,
Buffer *b,
TimeQual timeQual,
int nkeys,
ScanKey key)
{
ItemId lpp;
Page dp;
int page;
int pages;
int lines;
HeapTuple rtup;
OffsetNumber lineoff;
int linesleft;
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_heapgettup);
IncrHeapAccessStat(global_heapgettup);
/* ----------------
* debugging stuff
*
* check validity of arguments, here and for other functions too
* Note: no locking manipulations needed--this is a local function
* ----------------
*/
#ifdef HEAPDEBUGALL
if (ItemPointerIsValid(tid)) {
elog(DEBUG, "heapgettup(%.16s, tid=0x%x[%d,%d], dir=%d, ...)",
RelationGetRelationName(relation), tid, tid->ip_blkid,
tid->ip_posid, dir);
} else {
elog(DEBUG, "heapgettup(%.16s, tid=0x%x, dir=%d, ...)",
RelationGetRelationName(relation), tid, dir);
}
elog(DEBUG, "heapgettup(..., b=0x%x, timeQ=0x%x, nkeys=%d, key=0x%x",
b, timeQual, nkeys, key);
if (timeQual == SelfTimeQual) {
elog(DEBUG, "heapgettup: relation(%c)=`%.16s', SelfTimeQual",
relation->rd_rel->relkind, &relation->rd_rel->relname);
} else {
elog(DEBUG, "heapgettup: relation(%c)=`%.16s', timeQual=%d",
relation->rd_rel->relkind, &relation->rd_rel->relname,
timeQual);
}
#endif /* !defined(HEAPDEBUGALL) */
if (!ItemPointerIsValid(tid)) {
Assert(!PointerIsValid(tid));
}
/* ----------------
* return null immediately if relation is empty
* ----------------
*/
if (!(pages = relation->rd_nblocks))
return (NULL);
/* ----------------
* calculate next starting lineoff, given scan direction
* ----------------
*/
if (!dir) {
/* ----------------
* ``no movement'' scan direction
* ----------------
*/
/* assume it is a valid TID XXX */
if (ItemPointerIsValid(tid) == false) {
*b = InvalidBuffer;
return (NULL);
}
*b = RelationGetBufferWithBuffer(relation,
ItemPointerGetBlockNumber(tid),
*b);
#ifndef NO_BUFFERISVALID
if (!BufferIsValid(*b)) {
elog(WARN, "heapgettup: failed ReadBuffer");
}
#endif
dp = (Page) BufferGetPage(*b);
lineoff = ItemPointerGetOffsetNumber(tid);
lpp = PageGetItemId(dp, lineoff);
rtup = (HeapTuple)PageGetItem((Page) dp, lpp);
return (rtup);
} else if (dir < 0) {
/* ----------------
* reverse scan direction
* ----------------
*/
if (ItemPointerIsValid(tid) == false) {
tid = NULL;
}
if (tid == NULL) {
page = pages - 1; /* final page */
} else {
page = ItemPointerGetBlockNumber(tid); /* current page */
}
if (page < 0) {
*b = InvalidBuffer;
return (NULL);
}
*b = RelationGetBufferWithBuffer(relation, page, *b);
#ifndef NO_BUFFERISVALID
if (!BufferIsValid(*b)) {
elog(WARN, "heapgettup: failed ReadBuffer");
}
#endif
dp = (Page) BufferGetPage(*b);
lines = PageGetMaxOffsetNumber(dp);
if (tid == NULL) {
lineoff = lines; /* final offnum */
} else {
lineoff = /* previous offnum */
OffsetNumberPrev(ItemPointerGetOffsetNumber(tid));
}
/* page and lineoff now reference the physically previous tid */
} else {
/* ----------------
* forward scan direction
* ----------------
*/
if (ItemPointerIsValid(tid) == false) {
page = 0; /* first page */
lineoff = FirstOffsetNumber; /* first offnum */
} else {
page = ItemPointerGetBlockNumber(tid); /* current page */
lineoff = /* next offnum */
OffsetNumberNext(ItemPointerGetOffsetNumber(tid));
}
if (page >= pages) {
*b = InvalidBuffer;
return (NULL);
}
/* page and lineoff now reference the physically next tid */
*b = RelationGetBufferWithBuffer(relation, page, *b);
#ifndef NO_BUFFERISVALID
if (!BufferIsValid(*b)) {
elog(WARN, "heapgettup: failed ReadBuffer");
}
#endif
dp = (Page) BufferGetPage(*b);
lines = PageGetMaxOffsetNumber(dp);
}
/* 'dir' is now non-zero */
/* ----------------
* calculate line pointer and number of remaining items
* to check on this page.
* ----------------
*/
lpp = PageGetItemId(dp, lineoff);
if (dir < 0) {
linesleft = lineoff - 1;
} else {
linesleft = lines - lineoff;
}
/* ----------------
* advance the scan until we find a qualifying tuple or
* run out of stuff to scan
* ----------------
*/
for (;;) {
while (linesleft >= 0) {
/* ----------------
* if current tuple qualifies, return it.
* ----------------
*/
if ((rtup = heap_tuple_satisfies(lpp, relation, *b, (PageHeader) dp,
timeQual, nkeys, key)) != NULL) {
ItemPointer iptr = &(rtup->t_ctid);
if (ItemPointerGetBlockNumber(iptr) != page) {
/*
* set block id to the correct page number
* --- this is a hack to support the virtual fragment
* concept
*/
ItemPointerSetBlockNumber(iptr, page);
}
return (rtup);
}
/* ----------------
* otherwise move to the next item on the page
* ----------------
*/
--linesleft;
if (dir < 0) {
--lpp; /* move back in this page's ItemId array */
} else {
++lpp; /* move forward in this page's ItemId array */
}
}
/* ----------------
* if we get here, it means we've exhausted the items on
* this page and it's time to move to the next..
* ----------------
*/
page = nextpage(page, dir);
/* ----------------
* return NULL if we've exhausted all the pages..
* ----------------
*/
if (page < 0 || page >= pages) {
if (BufferIsValid(*b))
ReleaseBuffer(*b);
*b = InvalidBuffer;
return (NULL);
}
*b = ReleaseAndReadBuffer(*b, relation, page);
#ifndef NO_BUFFERISVALID
if (!BufferIsValid(*b)) {
elog(WARN, "heapgettup: failed ReadBuffer");
}
#endif
dp = (Page) BufferGetPage(*b);
lines = lineoff = PageGetMaxOffsetNumber((Page) dp);
linesleft = lines - 1;
if (dir < 0) {
lpp = PageGetItemId(dp, lineoff);
} else {
lpp = PageGetItemId(dp, FirstOffsetNumber);
}
}
}
void
doinsert(Relation relation, HeapTuple tup)
{
RelationPutHeapTupleAtEnd(relation, tup);
return;
}
/*
* HeapScanIsValid is now a macro in relscan.h -cim 4/27/91
*/
/* ----------------
* SetHeapAccessMethodImmediateInvalidation
* ----------------
*/
void
SetHeapAccessMethodImmediateInvalidation(bool on)
{
ImmediateInvalidation = on;
}
/* ----------------------------------------------------------------
* heap access method interface
* ----------------------------------------------------------------
*/
/* ----------------
* heap_open - open a heap relation by relationId
*
* presently the relcache routines do all the work we need
* to open/close heap relations.
* ----------------
*/
Relation
heap_open(Oid relationId)
{
Relation r;
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_open);
IncrHeapAccessStat(global_open);
r = (Relation) RelationIdGetRelation(relationId);
if (RelationIsValid(r) && r->rd_rel->relkind == RELKIND_INDEX) {
elog(WARN, "%s is an index relation", r->rd_rel->relname.data);
}
return (r);
}
/* ----------------
* heap_openr - open a heap relation by name
*
* presently the relcache routines do all the work we need
* to open/close heap relations.
* ----------------
*/
Relation
heap_openr(char *relationName)
{
Relation r;
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_openr);
IncrHeapAccessStat(global_openr);
r = RelationNameGetRelation(relationName);
if (RelationIsValid(r) && r->rd_rel->relkind == RELKIND_INDEX) {
elog(WARN, "%s is an index relation", r->rd_rel->relname.data);
}
return (r);
}
/* ----------------
* heap_close - close a heap relation
*
* presently the relcache routines do all the work we need
* to open/close heap relations.
* ----------------
*/
void
heap_close(Relation relation)
{
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_close);
IncrHeapAccessStat(global_close);
(void) RelationClose(relation);
}
/* ----------------
* heap_beginscan - begin relation scan
* ----------------
*/
HeapScanDesc
heap_beginscan(Relation relation,
int atend,
TimeQual timeQual,
unsigned nkeys,
ScanKey key)
{
HeapScanDesc sdesc;
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_beginscan);
IncrHeapAccessStat(global_beginscan);
/* ----------------
* sanity checks
* ----------------
*/
if (RelationIsValid(relation) == false)
elog(WARN, "heap_beginscan: !RelationIsValid(relation)");
/* ----------------
* set relation level read lock
* ----------------
*/
RelationSetLockForRead(relation);
/* XXX someday assert SelfTimeQual if relkind == RELKIND_UNCATALOGED */
if (relation->rd_rel->relkind == RELKIND_UNCATALOGED) {
timeQual = SelfTimeQual;
}
/* ----------------
* increment relation ref count while scanning relation
* ----------------
*/
RelationIncrementReferenceCount(relation);
/* ----------------
* allocate and initialize scan descriptor
* ----------------
*/
sdesc = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
relation->rd_nblocks = smgrnblocks(relation->rd_rel->relsmgr, relation);
sdesc->rs_rd = relation;
if (nkeys) {
/*
* we do this here instead of in initsdesc() because heap_rescan also
* calls initsdesc() and we don't want to allocate memory again
*/
sdesc->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
} else {
sdesc->rs_key = NULL;
}
initsdesc(sdesc, relation, atend, nkeys, key);
sdesc->rs_atend = atend;
sdesc->rs_tr = timeQual;
sdesc->rs_nkeys = (short)nkeys;
return (sdesc);
}
/* ----------------
* heap_rescan - restart a relation scan
* ----------------
*/
void
heap_rescan(HeapScanDesc sdesc,
bool scanFromEnd,
ScanKey key)
{
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_rescan);
IncrHeapAccessStat(global_rescan);
/* Note: set relation level read lock is still set */
/* ----------------
* unpin scan buffers
* ----------------
*/
unpinsdesc(sdesc);
/* ----------------
* reinitialize scan descriptor
* ----------------
*/
initsdesc(sdesc, sdesc->rs_rd, scanFromEnd, sdesc->rs_nkeys, key);
sdesc->rs_atend = (bool) scanFromEnd;
}
/* ----------------
* heap_endscan - end relation scan
*
* See how to integrate with index scans.
* Check handling of reldesc caching.
* ----------------
*/
void
heap_endscan(HeapScanDesc sdesc)
{
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_endscan);
IncrHeapAccessStat(global_endscan);
/* Note: no locking manipulations needed */
/* ----------------
* unpin scan buffers
* ----------------
*/
unpinsdesc(sdesc);
/* ----------------
* decrement relation reference count and free scan descriptor storage
* ----------------
*/
RelationDecrementReferenceCount(sdesc->rs_rd);
/* ----------------
* Non 2-phase read locks on catalog relations
* ----------------
*/
if ( IsSystemRelationName(RelationGetRelationName(sdesc->rs_rd)->data) )
RelationUnsetLockForRead(sdesc->rs_rd);
pfree(sdesc); /* XXX */
}
/* ----------------
* heap_getnext - retrieve next tuple in scan
*
* Fix to work with index relations.
* ----------------
*/
#ifdef HEAPDEBUGALL
#define HEAPDEBUG_1 \
elog(DEBUG, "heap_getnext([%s,nkeys=%d],backw=%d,0x%x) called", \
sdesc->rs_rd->rd_rel->relname.data, sdesc->rs_nkeys, backw, b)
#define HEAPDEBUG_2 \
elog(DEBUG, "heap_getnext called with backw (no tracing yet)")
#define HEAPDEBUG_3 \
elog(DEBUG, "heap_getnext returns NULL at end")
#define HEAPDEBUG_4 \
elog(DEBUG, "heap_getnext valid buffer UNPIN'd")
#define HEAPDEBUG_5 \
elog(DEBUG, "heap_getnext next tuple was cached")
#define HEAPDEBUG_6 \
elog(DEBUG, "heap_getnext returning EOS")
#define HEAPDEBUG_7 \
elog(DEBUG, "heap_getnext returning tuple")
#else
#define HEAPDEBUG_1
#define HEAPDEBUG_2
#define HEAPDEBUG_3
#define HEAPDEBUG_4
#define HEAPDEBUG_5
#define HEAPDEBUG_6
#define HEAPDEBUG_7
#endif /* !defined(HEAPDEBUGALL) */
HeapTuple
heap_getnext(HeapScanDesc scandesc,
int backw,
Buffer *b)
{
register HeapScanDesc sdesc = scandesc;
Buffer localb;
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_getnext);
IncrHeapAccessStat(global_getnext);
/* Note: no locking manipulations needed */
/* ----------------
* argument checks
* ----------------
*/
if (sdesc == NULL)
elog(WARN, "heap_getnext: NULL relscan");
/* ----------------
* initialize return buffer to InvalidBuffer
* ----------------
*/
if (! PointerIsValid(b)) b = &localb;
(*b) = InvalidBuffer;
HEAPDEBUG_1; /* heap_getnext( info ) */
if (backw) {
/* ----------------
* handle reverse scan
* ----------------
*/
HEAPDEBUG_2; /* heap_getnext called with backw */
if (sdesc->rs_ptup == sdesc->rs_ctup &&
BufferIsInvalid(sdesc->rs_pbuf))
{
if (BufferIsValid(sdesc->rs_nbuf))
ReleaseBuffer(sdesc->rs_nbuf);
return (NULL);
}
/*
* Copy the "current" tuple/buffer
* to "next". Pin/unpin the buffers
* accordingly
*/
if (sdesc->rs_nbuf != sdesc->rs_cbuf) {
if (BufferIsValid(sdesc->rs_nbuf))
ReleaseBuffer(sdesc->rs_nbuf);
if (BufferIsValid(sdesc->rs_cbuf))
IncrBufferRefCount(sdesc->rs_cbuf);
}
sdesc->rs_ntup = sdesc->rs_ctup;
sdesc->rs_nbuf = sdesc->rs_cbuf;
if (sdesc->rs_ptup != NULL) {
if (sdesc->rs_cbuf != sdesc->rs_pbuf) {
if (BufferIsValid(sdesc->rs_cbuf))
ReleaseBuffer(sdesc->rs_cbuf);
if (BufferIsValid(sdesc->rs_pbuf))
IncrBufferRefCount(sdesc->rs_pbuf);
}
sdesc->rs_ctup = sdesc->rs_ptup;
sdesc->rs_cbuf = sdesc->rs_pbuf;
} else { /* NONTUP */
ItemPointer iptr;
iptr = (sdesc->rs_ctup != NULL) ?
&(sdesc->rs_ctup->t_ctid) : (ItemPointer) NULL;
/* Don't release sdesc->rs_cbuf at this point, because
heapgettup doesn't increase PrivateRefCount if it
is already set. On a backward scan, both rs_ctup and rs_ntup
usually point to the same buffer page, so
PrivateRefCount[rs_cbuf] should be 2 (or more, if for instance
ctup is stored in a TupleTableSlot). - 01/09/94 */
sdesc->rs_ctup = (HeapTuple)
heapgettup(sdesc->rs_rd,
iptr,
-1,
&(sdesc->rs_cbuf),
sdesc->rs_tr,
sdesc->rs_nkeys,
sdesc->rs_key);
}
if (sdesc->rs_ctup == NULL && !BufferIsValid(sdesc->rs_cbuf))
{
if (BufferIsValid(sdesc->rs_pbuf))
ReleaseBuffer(sdesc->rs_pbuf);
sdesc->rs_ptup = NULL;
sdesc->rs_pbuf = InvalidBuffer;
if (BufferIsValid(sdesc->rs_nbuf))
ReleaseBuffer(sdesc->rs_nbuf);
sdesc->rs_ntup = NULL;
sdesc->rs_nbuf = InvalidBuffer;
return (NULL);
}
if (BufferIsValid(sdesc->rs_pbuf))
ReleaseBuffer(sdesc->rs_pbuf);
sdesc->rs_ptup = NULL;
sdesc->rs_pbuf = UnknownBuffer;
} else {
/* ----------------
* handle forward scan
* ----------------
*/
if (sdesc->rs_ctup == sdesc->rs_ntup &&
BufferIsInvalid(sdesc->rs_nbuf)) {
if (BufferIsValid(sdesc->rs_pbuf))
ReleaseBuffer(sdesc->rs_pbuf);
HEAPDEBUG_3; /* heap_getnext returns NULL at end */
return (NULL);
}
/*
* Copy the "current" tuple/buffer
* to "previous". Pin/unpin the buffers
* accordingly
*/
if (sdesc->rs_pbuf != sdesc->rs_cbuf) {
if (BufferIsValid(sdesc->rs_pbuf))
ReleaseBuffer(sdesc->rs_pbuf);
if (BufferIsValid(sdesc->rs_cbuf))
IncrBufferRefCount(sdesc->rs_cbuf);
}
sdesc->rs_ptup = sdesc->rs_ctup;
sdesc->rs_pbuf = sdesc->rs_cbuf;
if (sdesc->rs_ntup != NULL) {
if (sdesc->rs_cbuf != sdesc->rs_nbuf) {
if (BufferIsValid(sdesc->rs_cbuf))
ReleaseBuffer(sdesc->rs_cbuf);
if (BufferIsValid(sdesc->rs_nbuf))
IncrBufferRefCount(sdesc->rs_nbuf);
}
sdesc->rs_ctup = sdesc->rs_ntup;
sdesc->rs_cbuf = sdesc->rs_nbuf;
HEAPDEBUG_5; /* heap_getnext next tuple was cached */
} else { /* NONTUP */
ItemPointer iptr;
iptr = (sdesc->rs_ctup != NULL) ?
&sdesc->rs_ctup->t_ctid : (ItemPointer) NULL;
/* Don't release sdesc->rs_cbuf at this point, because
heapgettup doesn't increase PrivateRefCount if it
is already set. On a forward scan, both rs_ctup and rs_ptup
usually point to the same buffer page, so
PrivateRefCount[rs_cbuf] should be 2 (or more, if for instance
ctup is stored in a TupleTableSlot). - 01/09/93 */
sdesc->rs_ctup = (HeapTuple)
heapgettup(sdesc->rs_rd,
iptr,
1,
&sdesc->rs_cbuf,
sdesc->rs_tr,
sdesc->rs_nkeys,
sdesc->rs_key);
}
if (sdesc->rs_ctup == NULL && !BufferIsValid(sdesc->rs_cbuf)) {
if (BufferIsValid(sdesc->rs_nbuf))
ReleaseBuffer(sdesc->rs_nbuf);
sdesc->rs_ntup = NULL;
sdesc->rs_nbuf = InvalidBuffer;
if (BufferIsValid(sdesc->rs_pbuf))
ReleaseBuffer(sdesc->rs_pbuf);
sdesc->rs_ptup = NULL;
sdesc->rs_pbuf = InvalidBuffer;
HEAPDEBUG_6; /* heap_getnext returning EOS */
return (NULL);
}
if (BufferIsValid(sdesc->rs_nbuf))
ReleaseBuffer(sdesc->rs_nbuf);
sdesc->rs_ntup = NULL;
sdesc->rs_nbuf = UnknownBuffer;
}
/* ----------------
* if we get here it means we have a new current scan tuple, so
* point to the proper return buffer and return the tuple.
* ----------------
*/
(*b) = sdesc->rs_cbuf;
HEAPDEBUG_7; /* heap_getnext returning tuple */
return (sdesc->rs_ctup);
}
/* ----------------
* heap_fetch - retrieve tuple with tid
*
* Currently ignores LP_IVALID during processing!
* ----------------
*/
HeapTuple
heap_fetch(Relation relation,
TimeQual timeQual,
ItemPointer tid,
Buffer *b)
{
ItemId lp;
Buffer buffer;
PageHeader dp;
HeapTuple tuple;
OffsetNumber offnum;
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_fetch);
IncrHeapAccessStat(global_fetch);
/*
* Note: This is colossally expensive - does two system calls per
* indexscan tuple fetch. Not good, and since we should be doing
* page level locking by the scanner anyway, it is commented out.
*/
/* RelationSetLockForTupleRead(relation, tid); */
/* ----------------
* get the buffer from the relation descriptor
* Note that this does a buffer pin.
* ----------------
*/
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
#ifndef NO_BUFFERISVALID
if (!BufferIsValid(buffer)) {
elog(WARN, "heap_fetch: %s relation: ReadBuffer(%lx) failed",
&relation->rd_rel->relname, (long)tid);
}
#endif
/* ----------------
* get the item line pointer corresponding to the requested tid
* ----------------
*/
dp = (PageHeader) BufferGetPage(buffer);
offnum = ItemPointerGetOffsetNumber(tid);
lp = PageGetItemId(dp, offnum);
/* ----------------
* more sanity checks
* ----------------
*/
Assert(ItemIdIsUsed(lp));
/* ----------------
* check time qualification of tid
* ----------------
*/
tuple = heap_tuple_satisfies(lp, relation, buffer, dp,
timeQual, 0,(ScanKey)NULL);
if (tuple == NULL)
{
ReleaseBuffer(buffer);
return (NULL);
}
/* ----------------
* all checks passed, now either return a copy of the tuple
* or pin the buffer page and return a pointer, depending on
* whether caller gave us a valid b.
* ----------------
*/
if (PointerIsValid(b)) {
*b = buffer;
} else {
tuple = heap_copytuple(tuple);
ReleaseBuffer(buffer);
}
return (tuple);
}
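/* ----------------
* Illustrative use of heap_fetch (a sketch only): "reln" and "tid" are
* assumed to be a previously opened relation and a tid obtained, e.g.,
* from an index scan or a saved t_ctid. When a Buffer pointer is
* supplied, the returned tuple points into the pinned page and the
* caller must ReleaseBuffer it; when NULL is passed instead, a
* palloc'd copy is returned and no pin is held.
*
*	Buffer		buf;
*	HeapTuple	tuple;
*
*	tuple = heap_fetch(reln, NowTimeQual, tid, &buf);
*	if (HeapTupleIsValid(tuple)) {
*		... use tuple while buf remains pinned ...
*		ReleaseBuffer(buf);
*	}
* ----------------
*/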
/* ----------------
* heap_insert - insert tuple
*
* The assignment of t_min (and thus the others) should be
* removed eventually.
*
* Currently places the tuple onto the last page. If there is no room,
* it is placed on new pages. (Heap relations)
* Note that concurrent inserts during a scan will probably have
* unexpected results, though this will be fixed eventually.
*
* Fix to work with indexes.
* ----------------
*/
Oid
heap_insert(Relation relation, HeapTuple tup)
{
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_insert);
IncrHeapAccessStat(global_insert);
/* ----------------
* set relation level write lock. If this is a "local" relation (not
* visible to others), we don't need to set a write lock.
* ----------------
*/
if (!relation->rd_islocal)
RelationSetLockForWrite(relation);
/* ----------------
* If the object id of this tuple has already been assigned, trust
* the caller. There are a couple of ways this can happen. At initial
* db creation, the backend program sets oids for tuples. When we
* define an index, we set the oid. Finally, in the future, we may
* allow users to set their own object ids in order to support a
* persistent object store (objects need to contain pointers to one
* another).
* ----------------
*/
if (!OidIsValid(tup->t_oid)) {
tup->t_oid = newoid();
LastOidProcessed = tup->t_oid;
}
else
CheckMaxObjectId(tup->t_oid);
TransactionIdStore(GetCurrentTransactionId(), &(tup->t_xmin));
tup->t_cmin = GetCurrentCommandId();
StoreInvalidTransactionId(&(tup->t_xmax));
tup->t_tmin = INVALID_ABSTIME;
tup->t_tmax = CURRENT_ABSTIME;
doinsert(relation, tup);
if ( IsSystemRelationName(RelationGetRelationName(relation)->data)) {
RelationUnsetLockForWrite(relation);
/* ----------------
* invalidate caches (only works for system relations)
* ----------------
*/
SetRefreshWhenInvalidate(ImmediateInvalidation);
RelationInvalidateHeapTuple(relation, tup);
SetRefreshWhenInvalidate((bool)!ImmediateInvalidation);
}
return(tup->t_oid);
}
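/* ----------------
* Illustrative call (a sketch only): "reln" is assumed to be a relation
* opened for writing and "tup" a fully formed HeapTuple built elsewhere
* (the tuple construction routines are not part of this file). The
* returned Oid identifies the newly inserted tuple.
*
*	Oid	newTupleOid;
*
*	newTupleOid = heap_insert(reln, tup);
* ----------------
*/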
/* ----------------
* heap_delete - delete a tuple
*
* Must decide how to handle errors.
* ----------------
*/
void
heap_delete(Relation relation, ItemPointer tid)
{
ItemId lp;
HeapTuple tp;
PageHeader dp;
Buffer b;
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_delete);
IncrHeapAccessStat(global_delete);
/* ----------------
* sanity check
* ----------------
*/
Assert(ItemPointerIsValid(tid));
/* ----------------
* set relation level write lock
* ----------------
*/
RelationSetLockForWrite(relation);
b = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
#ifndef NO_BUFFERISVALID
if (!BufferIsValid(b)) { /* XXX L_SH better ??? */
elog(WARN, "heap_delete: failed ReadBuffer");
}
#endif /* NO_BUFFERISVALID */
dp = (PageHeader) BufferGetPage(b);
lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid));
/* ----------------
* check that we're deleting a valid item
* ----------------
*/
if (!(tp = heap_tuple_satisfies(lp, relation, b, dp,
NowTimeQual, 0, (ScanKey) NULL))) {
/* XXX call something else */
ReleaseBuffer(b);
elog(WARN, "heap_delete: (am)invalid tid");
}
/* ----------------
* get the tuple and tell the buffer manager we want
* exclusive access to the page
* ----------------
*/
/* ----------------
* store transaction information of xact deleting the tuple
* ----------------
*/
TransactionIdStore(GetCurrentTransactionId(), &(tp->t_xmax));
tp->t_cmax = GetCurrentCommandId();
ItemPointerSetInvalid(&tp->t_chain);
/* ----------------
* invalidate caches
* ----------------
*/
SetRefreshWhenInvalidate(ImmediateInvalidation);
RelationInvalidateHeapTuple(relation, tp);
SetRefreshWhenInvalidate((bool)!ImmediateInvalidation);
WriteBuffer(b);
if ( IsSystemRelationName(RelationGetRelationName(relation)->data) )
RelationUnsetLockForWrite(relation);
}
/* ----------------
* heap_replace - replace a tuple
*
* Must decide how to handle errors.
*
* Fix arguments, work with indexes.
*
* 12/30/93 - modified the return value to be 1 when
* a non-functional update is detected. This
* prevents the calling routine from updating
* indices unnecessarily. -kw
*
* ----------------
*/
int
heap_replace(Relation relation, ItemPointer otid, HeapTuple tup)
{
ItemId lp;
HeapTuple tp;
Page dp;
Buffer buffer;
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_replace);
IncrHeapAccessStat(global_replace);
/* ----------------
* sanity checks
* ----------------
*/
Assert(ItemPointerIsValid(otid));
/* ----------------
* set relation level write lock
* ----------------
*/
if (!relation->rd_islocal)
RelationSetLockForWrite(relation);
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid));
#ifndef NO_BUFFERISVALID
if (!BufferIsValid(buffer)) {
/* XXX L_SH better ??? */
elog(WARN, "amreplace: failed ReadBuffer");
}
#endif /* NO_BUFFERISVALID */
dp = (Page) BufferGetPage(buffer);
lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(otid));
/* ----------------
* logically delete old item
* ----------------
*/
tp = (HeapTuple) PageGetItem(dp, lp);
Assert(HeapTupleIsValid(tp));
/* -----------------
* the following test should be able to catch all non-functional
* update attempts and shut out all ghost tuples.
* XXX In the future, Spyros may need to update the rule lock on a tuple
* more than once within the same command and same transaction.
* He will have to introduce a new flag to override the following check.
* -- Wei
*
* -----------------
*/
if (TupleUpdatedByCurXactAndCmd(tp)) {
elog(NOTICE, "Non-functional update, only first update is performed");
if ( IsSystemRelationName(RelationGetRelationName(relation)->data) )
RelationUnsetLockForWrite(relation);
ReleaseBuffer(buffer);
return(1);
}
/* ----------------
* check that we're replacing a valid item -
*
* NOTE that this check must follow the non-functional update test
* above as it can happen that we try to 'replace' the same tuple
* twice in a single transaction. The second time around the
* tuple will fail the NowTimeQual. We don't want to abort the
* xact, we only want to flag the 'non-functional' NOTICE. -mer
* ----------------
*/
if (!heap_tuple_satisfies(lp,
relation,
buffer,
(PageHeader)dp,
NowTimeQual,
0,
(ScanKey)NULL))
{
ReleaseBuffer(buffer);
elog(WARN, "heap_replace: (am)invalid otid");
}
/* XXX order problems if not atomic assignment ??? */
tup->t_oid = tp->t_oid;
TransactionIdStore(GetCurrentTransactionId(), &(tup->t_xmin));
tup->t_cmin = GetCurrentCommandId();
StoreInvalidTransactionId(&(tup->t_xmax));
tup->t_tmin = INVALID_ABSTIME;
tup->t_tmax = CURRENT_ABSTIME;
ItemPointerSetInvalid(&tup->t_chain);
/* ----------------
* insert new item
* ----------------
*/
if ((unsigned)DOUBLEALIGN(tup->t_len) <= PageGetFreeSpace((Page) dp)) {
RelationPutHeapTuple(relation, BufferGetBlockNumber(buffer), tup);
} else {
/* ----------------
* new item won't fit on same page as old item, have to look
* for a new place to put it.
* ----------------
*/
doinsert(relation, tup);
}
/* ----------------
* new item in place, now record transaction information
* ----------------
*/
TransactionIdStore(GetCurrentTransactionId(), &(tp->t_xmax));
tp->t_cmax = GetCurrentCommandId();
tp->t_chain = tup->t_ctid;
/* ----------------
* invalidate caches
* ----------------
*/
SetRefreshWhenInvalidate(ImmediateInvalidation);
RelationInvalidateHeapTuple(relation, tp);
SetRefreshWhenInvalidate((bool)!ImmediateInvalidation);
WriteBuffer(buffer);
if ( IsSystemRelationName(RelationGetRelationName(relation)->data) )
RelationUnsetLockForWrite(relation);
return(0);
}
/* ----------------
* heap_markpos - mark scan position
*
* Note:
* Only one mark should be maintained per scan at one time.
* Check if this can be done generally--say calls to get the
* next/previous tuple and NEVER pass struct scandesc to the
* user AM's. Now, the mark is sent to the executor for safekeeping.
* Probably can store this info into a GENERAL scan structure.
*
* May be best to change this call to store the marked position
* (up to 2?) in the scan structure itself.
* Fix to use the proper caching structure.
* ----------------
*/
void
heap_markpos(HeapScanDesc sdesc)
{
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_markpos);
IncrHeapAccessStat(global_markpos);
/* Note: no locking manipulations needed */
if (sdesc->rs_ptup == NULL &&
BufferIsUnknown(sdesc->rs_pbuf)) { /* == NONTUP */
sdesc->rs_ptup = (HeapTuple)
heapgettup(sdesc->rs_rd,
(sdesc->rs_ctup == NULL) ?
(ItemPointer)NULL : &sdesc->rs_ctup->t_ctid,
-1,
&sdesc->rs_pbuf,
sdesc->rs_tr,
sdesc->rs_nkeys,
sdesc->rs_key);
} else if (sdesc->rs_ntup == NULL &&
BufferIsUnknown(sdesc->rs_nbuf)) { /* == NONTUP */
sdesc->rs_ntup = (HeapTuple)
heapgettup(sdesc->rs_rd,
(sdesc->rs_ctup == NULL) ?
(ItemPointer)NULL : &sdesc->rs_ctup->t_ctid,
1,
&sdesc->rs_nbuf,
sdesc->rs_tr,
sdesc->rs_nkeys,
sdesc->rs_key);
}
/* ----------------
* Should not unpin the buffer pages. They may still be in use.
* ----------------
*/
if (sdesc->rs_ptup != NULL) {
sdesc->rs_mptid = sdesc->rs_ptup->t_ctid;
} else {
ItemPointerSetInvalid(&sdesc->rs_mptid);
}
if (sdesc->rs_ctup != NULL) {
sdesc->rs_mctid = sdesc->rs_ctup->t_ctid;
} else {
ItemPointerSetInvalid(&sdesc->rs_mctid);
}
if (sdesc->rs_ntup != NULL) {
sdesc->rs_mntid = sdesc->rs_ntup->t_ctid;
} else {
ItemPointerSetInvalid(&sdesc->rs_mntid);
}
}
/* ----------------
* heap_restrpos - restore position to marked location
*
* Note: there are bad side effects here. If we were past the end
* of a relation when heap_markpos is called, then if the relation is
* extended via insert, the next call to heap_restrpos will
* cause the added tuples to be visible when the scan continues.
* Problems also arise if the TID's are rearranged!!!
*
* Now pins buffer once for each valid tuple pointer (rs_ptup,
* rs_ctup, rs_ntup) referencing it.
* - 01/13/94
*
* XXX might be better to do direct access instead of
* using the generality of heapgettup().
*
* XXX It is very possible that when a scan is restored, a tuple
* XXX which previously qualified may fail for time range purposes, unless
* XXX some form of locking exists (i.e., portals currently can act funny).
* ----------------
*/
void
heap_restrpos(HeapScanDesc sdesc)
{
/* ----------------
* increment access statistics
* ----------------
*/
IncrHeapAccessStat(local_restrpos);
IncrHeapAccessStat(global_restrpos);
/* XXX no amrestrpos checking that ammarkpos called */
/* Note: no locking manipulations needed */
unpinsdesc(sdesc);
/* force heapgettup to pin buffer for each loaded tuple */
sdesc->rs_pbuf = InvalidBuffer;
sdesc->rs_cbuf = InvalidBuffer;
sdesc->rs_nbuf = InvalidBuffer;
if (!ItemPointerIsValid(&sdesc->rs_mptid)) {
sdesc->rs_ptup = NULL;
} else {
sdesc->rs_ptup = (HeapTuple)
heapgettup(sdesc->rs_rd,
&sdesc->rs_mptid,
0,
&sdesc->rs_pbuf,
NowTimeQual,
0,
(ScanKey) NULL);
}
if (!ItemPointerIsValid(&sdesc->rs_mctid)) {
sdesc->rs_ctup = NULL;
} else {
sdesc->rs_ctup = (HeapTuple)
heapgettup(sdesc->rs_rd,
&sdesc->rs_mctid,
0,
&sdesc->rs_cbuf,
NowTimeQual,
0,
(ScanKey) NULL);
}
if (!ItemPointerIsValid(&sdesc->rs_mntid)) {
sdesc->rs_ntup = NULL;
} else {
sdesc->rs_ntup = (HeapTuple)
heapgettup(sdesc->rs_rd,
&sdesc->rs_mntid,
0,
&sdesc->rs_nbuf,
NowTimeQual,
0,
(ScanKey) NULL);
}
}
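/* ----------------
* Illustrative mark/restore usage (a sketch only): "scan" is assumed to
* be an open scan already positioned by earlier heap_getnext calls.
* Only one mark is kept per scan, so a second heap_markpos simply
* overwrites the first.
*
*	heap_markpos(scan);		... remember current position ...
*	... read ahead with further heap_getnext calls ...
*	heap_restrpos(scan);		... return to the marked tuple ...
* ----------------
*/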