Tweak indexscan and seqscan code to arrange that steps from one page to
the next are handled by ReleaseAndReadBuffer rather than separate
ReleaseBuffer and ReadBuffer calls.  This cuts the number of acquisitions
of the BufMgrLock by a factor of 2 (possibly more, if an indexscan happens
to pull successive rows from the same heap page).  Unfortunately this
doesn't seem enough to get us out of the recently discussed context-switch
storm problem, but it's surely worth doing anyway.
This commit is contained in:
Tom Lane 2004-04-21 18:24:26 +00:00
parent 95a03e9cdf
commit 37fa3b6c89
11 changed files with 132 additions and 65 deletions

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.164 2004/04/01 21:28:43 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.165 2004/04/21 18:24:23 tgl Exp $
*
*
* INTERFACE ROUTINES
@ -888,6 +888,28 @@ heap_fetch(Relation relation,
Buffer *userbuf,
bool keep_buf,
PgStat_Info *pgstat_info)
{
/* Assume *userbuf is undefined on entry */
*userbuf = InvalidBuffer;
return heap_release_fetch(relation, snapshot, tuple,
userbuf, keep_buf, pgstat_info);
}
/*
* heap_release_fetch - retrieve tuple with given tid
*
* This has the same API as heap_fetch except that if *userbuf is not
* InvalidBuffer on entry, that buffer will be released before reading
* the new page. This saves a separate ReleaseBuffer step and hence
* one entry into the bufmgr when looping through multiple fetches.
*/
bool
heap_release_fetch(Relation relation,
Snapshot snapshot,
HeapTuple tuple,
Buffer *userbuf,
bool keep_buf,
PgStat_Info *pgstat_info)
{
ItemPointer tid = &(tuple->t_self);
ItemId lp;
@ -898,9 +920,10 @@ heap_fetch(Relation relation,
/*
* get the buffer from the relation descriptor. Note that this does a
* buffer pin.
* buffer pin, and releases the old *userbuf if not InvalidBuffer.
*/
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
buffer = ReleaseAndReadBuffer(*userbuf, relation,
ItemPointerGetBlockNumber(tid));
if (!BufferIsValid(buffer))
elog(ERROR, "ReadBuffer(\"%s\", %lu) failed",

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.72 2003/11/29 19:51:40 pgsql Exp $
* $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.73 2004/04/21 18:24:24 tgl Exp $
*
* INTERFACE ROUTINES
* index_open - open an index relation by relation OID
@ -316,6 +316,13 @@ index_rescan(IndexScanDesc scan, ScanKey key)
SCAN_CHECKS;
GET_SCAN_PROCEDURE(rescan, amrescan);
/* Release any held pin on a heap page */
if (BufferIsValid(scan->xs_cbuf))
{
ReleaseBuffer(scan->xs_cbuf);
scan->xs_cbuf = InvalidBuffer;
}
scan->kill_prior_tuple = false; /* for safety */
scan->keys_are_unique = false; /* may be set by index AM */
scan->got_tuple = false;
@ -419,13 +426,6 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
SCAN_CHECKS;
/* Release any previously held pin */
if (BufferIsValid(scan->xs_cbuf))
{
ReleaseBuffer(scan->xs_cbuf);
scan->xs_cbuf = InvalidBuffer;
}
/*
* If we already got a tuple and it must be unique, there's no need to
* make the index AM look through any additional tuples. (This can
@ -508,14 +508,22 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
scan->kill_prior_tuple = false;
if (!found)
{
/* Release any held pin on a heap page */
if (BufferIsValid(scan->xs_cbuf))
{
ReleaseBuffer(scan->xs_cbuf);
scan->xs_cbuf = InvalidBuffer;
}
return NULL; /* failure exit */
}
/*
* Fetch the heap tuple and see if it matches the snapshot.
*/
if (heap_fetch(scan->heapRelation, scan->xs_snapshot,
heapTuple, &scan->xs_cbuf, true,
&scan->xs_pgstat_info))
if (heap_release_fetch(scan->heapRelation, scan->xs_snapshot,
heapTuple, &scan->xs_cbuf, true,
&scan->xs_pgstat_info))
break;
/* Skip if no tuple at this location */
@ -527,7 +535,7 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
* if the tuple is dead to all transactions. If so, signal the
* index AM to not return it on future indexscans.
*
* We told heap_fetch to keep a pin on the buffer, so we can
* We told heap_release_fetch to keep a pin on the buffer, so we can
* re-access the tuple here. But we must re-lock the buffer
* first. Also, it's just barely possible for an update of hint
* bits to occur here.
@ -542,8 +550,6 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
if (sv_infomask != heapTuple->t_data->t_infomask)
SetBufferCommitInfoNeedsSave(scan->xs_cbuf);
LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(scan->xs_cbuf);
scan->xs_cbuf = InvalidBuffer;
}
/* Success exit */

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.111 2004/01/07 18:56:24 neilc Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.112 2004/04/21 18:24:25 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -297,9 +297,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
for (;;)
{
nblkno = opaque->btpo_next;
if (nbuf != InvalidBuffer)
_bt_relbuf(rel, nbuf);
nbuf = _bt_getbuf(rel, nblkno, BT_READ);
nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
page = BufferGetPage(nbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (!P_IGNORE(opaque))
@ -454,9 +452,7 @@ _bt_insertonpg(Relation rel,
{
BlockNumber rblkno = lpageop->btpo_next;
if (rbuf != InvalidBuffer)
_bt_relbuf(rel, rbuf);
rbuf = _bt_getbuf(rel, rblkno, BT_WRITE);
rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
page = BufferGetPage(rbuf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
if (!P_IGNORE(lpageop))
@ -1357,7 +1353,7 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
if (P_RIGHTMOST(opaque))
{
_bt_relbuf(rel, buf);
return (InvalidBuffer);
return InvalidBuffer;
}
blkno = opaque->btpo_next;
start = InvalidOffsetNumber;

View File

@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.74 2003/12/21 01:23:06 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.75 2004/04/21 18:24:25 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@ -262,11 +262,15 @@ _bt_getroot(Relation rel, int access)
Assert(rootblkno != P_NONE);
rootlevel = metad->btm_fastlevel;
_bt_relbuf(rel, metabuf); /* done with the meta page */
/*
* We are done with the metapage; arrange to release it via
* first _bt_relandgetbuf call
*/
rootbuf = metabuf;
for (;;)
{
rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
rootpage = BufferGetPage(rootbuf);
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
@ -278,8 +282,6 @@ _bt_getroot(Relation rel, int access)
elog(ERROR, "no live root page found in \"%s\"",
RelationGetRelationName(rel));
rootblkno = rootopaque->btpo_next;
_bt_relbuf(rel, rootbuf);
}
/* Note: can't check btpo.level on deleted pages */
@ -352,11 +354,15 @@ _bt_gettrueroot(Relation rel)
rootblkno = metad->btm_root;
rootlevel = metad->btm_level;
_bt_relbuf(rel, metabuf); /* done with the meta page */
/*
* We are done with the metapage; arrange to release it via
* first _bt_relandgetbuf call
*/
rootbuf = metabuf;
for (;;)
{
rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
rootpage = BufferGetPage(rootbuf);
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
@ -368,8 +374,6 @@ _bt_gettrueroot(Relation rel)
elog(ERROR, "no live root page found in \"%s\"",
RelationGetRelationName(rel));
rootblkno = rootopaque->btpo_next;
_bt_relbuf(rel, rootbuf);
}
/* Note: can't check btpo.level on deleted pages */
@ -492,6 +496,28 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
return buf;
}
/*
 *	_bt_relandgetbuf() -- swap a locked buffer for a different page.
 *
 *		Behaves like calling _bt_relbuf(rel, obuf) and then
 *		_bt_getbuf(rel, blkno, access), except that:
 *
 *		  - blkno must name an existing page; P_NEW is not accepted here.
 *		  - obuf may be InvalidBuffer, in which case nothing is released
 *			and the call amounts to a plain _bt_getbuf.  Permitting this
 *			lets loops that start without a buffer call us unconditionally.
 *
 *		The release of the old pin and the read of the new page are handed
 *		to the buffer manager as one ReleaseAndReadBuffer request, so the
 *		bufmgr is entered once instead of twice.
 *
 *		rel		index relation the page belongs to
 *		obuf	buffer currently held locked by the caller, or InvalidBuffer
 *		blkno	block number of the page wanted next
 *		access	lock mode to take on the new page (callers pass BT_READ
 *				or BT_WRITE)
 *
 *		Returns the buffer for blkno, locked in the requested mode.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
	Buffer		newbuf;

	Assert(blkno != P_NEW);

	/*
	 * Give up our lock on the old page, if there is one.  Its pin is not
	 * dropped here; ReleaseAndReadBuffer takes care of that below.
	 */
	if (BufferIsValid(obuf))
	{
		LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
	}

	/* Release the old pin and read the requested page in one bufmgr call */
	newbuf = ReleaseAndReadBuffer(obuf, rel, blkno);

	/* The caller expects the new page to come back locked */
	LockBuffer(newbuf, access);

	return newbuf;
}
/*
* _bt_relbuf() -- release a locked buffer.
*

View File

@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.113 2004/02/10 03:42:43 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.114 2004/04/21 18:24:26 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -919,9 +919,7 @@ _bt_restscan(IndexScanDesc scan)
for (;;)
{
blkno = opaque->btpo_next;
if (nextbuf != InvalidBuffer)
_bt_relbuf(rel, nextbuf);
nextbuf = _bt_getbuf(rel, blkno, BT_READ);
nextbuf = _bt_relandgetbuf(rel, nextbuf, blkno, BT_READ);
page = BufferGetPage(nextbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (!P_IGNORE(opaque))

View File

@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.86 2003/12/21 17:52:34 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.87 2004/04/21 18:24:26 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -108,8 +108,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
new_stack->bts_parent = stack_in;
/* drop the read lock on the parent page, acquire one on the child */
_bt_relbuf(rel, *bufP);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
/* okay, all set to move down a level */
stack_in = new_stack;
@ -178,8 +177,7 @@ _bt_moveright(Relation rel,
/* step right one page */
BlockNumber rblkno = opaque->btpo_next;
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, rblkno, access);
buf = _bt_relandgetbuf(rel, buf, rblkno, access);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
}
@ -933,8 +931,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
}
/* step right one page */
blkno = opaque->btpo_next;
_bt_relbuf(rel, *bufP);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (!P_IGNORE(opaque))
@ -1041,8 +1038,7 @@ _bt_walk_left(Relation rel, Buffer buf)
obknum = BufferGetBlockNumber(buf);
/* step left */
blkno = lblkno = opaque->btpo_prev;
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@ -1069,15 +1065,13 @@ _bt_walk_left(Relation rel, Buffer buf)
if (P_RIGHTMOST(opaque) || ++tries > 4)
break;
blkno = opaque->btpo_next;
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
}
/* Return to the original page to see what's up */
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, obknum, BT_READ);
buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (P_ISDELETED(opaque))
@ -1094,8 +1088,7 @@ _bt_walk_left(Relation rel, Buffer buf)
elog(ERROR, "fell off the end of \"%s\"",
RelationGetRelationName(rel));
blkno = opaque->btpo_next;
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (!P_ISDELETED(opaque))
@ -1177,8 +1170,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
if (blkno == P_NONE)
elog(ERROR, "fell off the end of \"%s\"",
RelationGetRelationName(rel));
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
}
@ -1199,8 +1191,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
itup = &(btitem->bti_itup);
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
}

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/executor/nodeIndexscan.c,v 1.92 2004/02/28 19:46:05 tgl Exp $
* $PostgreSQL: pgsql/src/backend/executor/nodeIndexscan.c,v 1.93 2004/04/21 18:24:26 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -128,6 +128,15 @@ IndexNext(IndexScanState *node)
slot = node->ss.ss_ScanTupleSlot;
scanrelid = ((IndexScan *) node->ss.ps.plan)->scan.scanrelid;
/*
* Clear any reference to the previously returned tuple. The idea here
* is to not have the tuple slot be the last holder of a pin on that
* tuple's buffer; if it is, we'll need a separate visit to the bufmgr
* to release the buffer. By clearing here, we get to have the release
* done by ReleaseAndReadBuffer inside index_getnext.
*/
ExecClearTuple(slot);
/*
* Check if we are evaluating PlanQual for tuple of this relation.
* Additional checking is not good, but no other way for now. We could
@ -139,7 +148,6 @@ IndexNext(IndexScanState *node)
{
List *qual;
ExecClearTuple(slot);
if (estate->es_evTupleNull[scanrelid - 1])
return slot; /* return empty slot */

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/executor/nodeSeqscan.c,v 1.47 2003/11/29 19:51:48 pgsql Exp $
* $PostgreSQL: pgsql/src/backend/executor/nodeSeqscan.c,v 1.48 2004/04/21 18:24:26 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -61,6 +61,15 @@ SeqNext(SeqScanState *node)
direction = estate->es_direction;
slot = node->ss_ScanTupleSlot;
/*
* Clear any reference to the previously returned tuple. The idea here
* is to not have the tuple slot be the last holder of a pin on that
* tuple's buffer; if it is, we'll need a separate visit to the bufmgr
* to release the buffer. By clearing here, we get to have the release
* done by ReleaseAndReadBuffer inside heap_getnext.
*/
ExecClearTuple(slot);
/*
* Check if we are evaluating PlanQual for tuple of this relation.
* Additional checking is not good, but no other way for now. We could
@ -70,7 +79,6 @@ SeqNext(SeqScanState *node)
if (estate->es_evTuple != NULL &&
estate->es_evTuple[scanrelid - 1] != NULL)
{
ExecClearTuple(slot);
if (estate->es_evTupleNull[scanrelid - 1])
return slot; /* return empty slot */

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/executor/nodeTidscan.c,v 1.36 2003/11/29 19:51:48 pgsql Exp $
* $PostgreSQL: pgsql/src/backend/executor/nodeTidscan.c,v 1.37 2004/04/21 18:24:26 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -106,6 +106,13 @@ TidNext(TidScanState *node)
slot = node->ss.ss_ScanTupleSlot;
scanrelid = ((TidScan *) node->ss.ps.plan)->scan.scanrelid;
/*
* Clear any reference to the previously returned tuple. This doesn't
* offer any great performance benefit, but it keeps this code in sync
* with SeqNext and IndexNext.
*/
ExecClearTuple(slot);
/*
* Check if we are evaluating PlanQual for tuple of this relation.
* Additional checking is not good, but no other way for now. We could
@ -115,7 +122,6 @@ TidNext(TidScanState *node)
if (estate->es_evTuple != NULL &&
estate->es_evTuple[scanrelid - 1] != NULL)
{
ExecClearTuple(slot);
if (estate->es_evTupleNull[scanrelid - 1])
return slot; /* return empty slot */

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.88 2004/04/01 21:28:45 tgl Exp $
* $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.89 2004/04/21 18:24:26 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -150,6 +150,9 @@ extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction);
extern bool heap_fetch(Relation relation, Snapshot snapshot,
HeapTuple tuple, Buffer *userbuf, bool keep_buf,
PgStat_Info *pgstat_info);
extern bool heap_release_fetch(Relation relation, Snapshot snapshot,
HeapTuple tuple, Buffer *userbuf, bool keep_buf,
PgStat_Info *pgstat_info);
extern ItemPointer heap_get_latest_tid(Relation relation, Snapshot snapshot,
ItemPointer tid);

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.76 2004/02/03 17:34:03 tgl Exp $
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.77 2004/04/21 18:24:26 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -446,6 +446,8 @@ extern void _bt_metapinit(Relation rel, bool markvalid);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_wrtbuf(Relation rel, Buffer buf);
extern void _bt_wrtnorelbuf(Relation rel, Buffer buf);