This patch makes some improvements to the rtree index implementation:

(1) Keep a pin on the scan's current buffer and mark buffer. This
avoids the need to do a ReadBuffer() for each tuple produced by the
scan. Since ReadBuffer() is expensive, this is a significant win.

(2) Convert a ReleaseBuffer(); ReadBuffer() pair into
ReleaseAndReadBuffer(). Surely not a huge win, but it saves a lock
acquire/release...

(3) Remove a bunch of duplicated code in rtget.c; make rtnext() handle
both the "initial result" and "subsequent result" cases.

(4) Add support for index tuple killing.

(5) Remove rtscancache(): it is dead code, for the same reason that
gistscancache() is dead code (an index scan ought not be invoked with
NoMovementScanDirection).

The end result is about a 10% improvement in rtree index scan
performance, according to contrib/rtree_gist/bench.
This commit is contained in:
Neil Conway 2005-01-18 23:25:55 +00:00
parent 1f5299bc3f
commit b4297c177c
4 changed files with 159 additions and 156 deletions

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/rtree/rtget.c,v 1.33 2004/12/31 21:59:26 pgsql Exp $
* $PostgreSQL: pgsql/src/backend/access/rtree/rtget.c,v 1.34 2005/01/18 23:25:43 neilc Exp $
*
*-------------------------------------------------------------------------
*/
@ -19,10 +19,8 @@
#include "access/relscan.h"
#include "access/rtree.h"
static OffsetNumber findnext(IndexScanDesc s, Page p, OffsetNumber n,
static OffsetNumber findnext(IndexScanDesc s, OffsetNumber n,
ScanDirection dir);
static bool rtscancache(IndexScanDesc s, ScanDirection dir);
static bool rtfirst(IndexScanDesc s, ScanDirection dir);
static bool rtnext(IndexScanDesc s, ScanDirection dir);
@ -31,138 +29,106 @@ rtgettuple(PG_FUNCTION_ARGS)
{
IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0);
ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
bool res;
/* if we have it cached in the scan desc, just return the value */
if (rtscancache(s, dir))
PG_RETURN_BOOL(true);
/* not cached, so we'll have to do some work */
if (ItemPointerIsValid(&(s->currentItemData)))
res = rtnext(s, dir);
else
res = rtfirst(s, dir);
PG_RETURN_BOOL(res);
}
static bool
rtfirst(IndexScanDesc s, ScanDirection dir)
{
Buffer b;
Page p;
OffsetNumber n;
OffsetNumber maxoff;
RTreePageOpaque po;
Page page;
OffsetNumber offnum;
RTreeScanOpaque so;
RTSTACK *stk;
BlockNumber blk;
IndexTuple it;
b = ReadBuffer(s->indexRelation, P_ROOT);
p = BufferGetPage(b);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
so = (RTreeScanOpaque) s->opaque;
/*
* If we've already produced a tuple and the executor has informed
* us that it should be marked "killed", do so now.
*/
if (s->kill_prior_tuple && ItemPointerIsValid(&(s->currentItemData)))
{
offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
page = BufferGetPage(so->curbuf);
PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
SetBufferCommitInfoNeedsSave(so->curbuf);
}
/*
* Get the next tuple that matches the search key; if asked to
* skip killed tuples, find the first non-killed tuple that
* matches. Return as soon as we've run out of matches or we've
* found an acceptable match.
*/
for (;;)
{
maxoff = PageGetMaxOffsetNumber(p);
if (ScanDirectionIsBackward(dir))
n = findnext(s, p, maxoff, dir);
else
n = findnext(s, p, FirstOffsetNumber, dir);
bool res = rtnext(s, dir);
while (n < FirstOffsetNumber || n > maxoff)
if (res == true && s->ignore_killed_tuples)
{
ReleaseBuffer(b);
if (so->s_stack == NULL)
return false;
stk = so->s_stack;
b = ReadBuffer(s->indexRelation, stk->rts_blk);
p = BufferGetPage(b);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
maxoff = PageGetMaxOffsetNumber(p);
if (ScanDirectionIsBackward(dir))
n = OffsetNumberPrev(stk->rts_child);
else
n = OffsetNumberNext(stk->rts_child);
so->s_stack = stk->rts_parent;
pfree(stk);
n = findnext(s, p, n, dir);
offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
page = BufferGetPage(so->curbuf);
if (ItemIdDeleted(PageGetItemId(page, offnum)))
continue;
}
if (po->flags & F_LEAF)
{
ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
s->xs_ctup.t_self = it->t_tid;
ReleaseBuffer(b);
return true;
}
else
{
stk = (RTSTACK *) palloc(sizeof(RTSTACK));
stk->rts_child = n;
stk->rts_blk = BufferGetBlockNumber(b);
stk->rts_parent = so->s_stack;
so->s_stack = stk;
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
blk = ItemPointerGetBlockNumber(&(it->t_tid));
ReleaseBuffer(b);
b = ReadBuffer(s->indexRelation, blk);
p = BufferGetPage(b);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
}
PG_RETURN_BOOL(res);
}
}
static bool
rtnext(IndexScanDesc s, ScanDirection dir)
{
Buffer b;
Page p;
OffsetNumber n;
OffsetNumber maxoff;
RTreePageOpaque po;
RTreeScanOpaque so;
RTSTACK *stk;
BlockNumber blk;
IndexTuple it;
blk = ItemPointerGetBlockNumber(&(s->currentItemData));
n = ItemPointerGetOffsetNumber(&(s->currentItemData));
if (ScanDirectionIsForward(dir))
n = OffsetNumberNext(n);
else
n = OffsetNumberPrev(n);
b = ReadBuffer(s->indexRelation, blk);
p = BufferGetPage(b);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
so = (RTreeScanOpaque) s->opaque;
if (!ItemPointerIsValid(&(s->currentItemData)))
{
/* first call: start at the root */
Assert(BufferIsValid(so->curbuf) == false);
so->curbuf = ReadBuffer(s->indexRelation, P_ROOT);
}
p = BufferGetPage(so->curbuf);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
if (!ItemPointerIsValid(&(s->currentItemData)))
{
/* first call: start at first/last offset */
if (ScanDirectionIsForward(dir))
n = FirstOffsetNumber;
else
n = PageGetMaxOffsetNumber(p);
}
else
{
/* go on to the next offset */
n = ItemPointerGetOffsetNumber(&(s->currentItemData));
if (ScanDirectionIsForward(dir))
n = OffsetNumberNext(n);
else
n = OffsetNumberPrev(n);
}
for (;;)
{
maxoff = PageGetMaxOffsetNumber(p);
n = findnext(s, p, n, dir);
IndexTuple it;
RTSTACK *stk;
while (n < FirstOffsetNumber || n > maxoff)
n = findnext(s, n, dir);
/* no match on this page, so read in the next stack entry */
if (n == InvalidOffsetNumber)
{
ReleaseBuffer(b);
/* if out of stack entries, we're done */
if (so->s_stack == NULL)
{
ReleaseBuffer(so->curbuf);
so->curbuf = InvalidBuffer;
return false;
}
stk = so->s_stack;
b = ReadBuffer(s->indexRelation, stk->rts_blk);
p = BufferGetPage(b);
maxoff = PageGetMaxOffsetNumber(p);
so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation,
stk->rts_blk);
p = BufferGetPage(so->curbuf);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
if (ScanDirectionIsBackward(dir))
@ -172,33 +138,41 @@ rtnext(IndexScanDesc s, ScanDirection dir)
so->s_stack = stk->rts_parent;
pfree(stk);
n = findnext(s, p, n, dir);
continue;
}
if (po->flags & F_LEAF)
{
ItemPointerSet(&(s->currentItemData), BufferGetBlockNumber(b), n);
ItemPointerSet(&(s->currentItemData),
BufferGetBlockNumber(so->curbuf),
n);
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
s->xs_ctup.t_self = it->t_tid;
ReleaseBuffer(b);
return true;
}
else
{
BlockNumber blk;
stk = (RTSTACK *) palloc(sizeof(RTSTACK));
stk->rts_child = n;
stk->rts_blk = BufferGetBlockNumber(b);
stk->rts_blk = BufferGetBlockNumber(so->curbuf);
stk->rts_parent = so->s_stack;
so->s_stack = stk;
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
blk = ItemPointerGetBlockNumber(&(it->t_tid));
ReleaseBuffer(b);
b = ReadBuffer(s->indexRelation, blk);
p = BufferGetPage(b);
/*
* Note that we release the pin on the page as we descend
* down the tree, even though there's a good chance we'll
* eventually need to re-read the buffer later in this
* scan. This may or may not be optimal, but it doesn't
* seem likely to make a huge performance difference
* either way.
*/
so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation, blk);
p = BufferGetPage(so->curbuf);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
if (ScanDirectionIsBackward(dir))
@ -209,17 +183,26 @@ rtnext(IndexScanDesc s, ScanDirection dir)
}
}
/*
* Return the offset of the next matching index entry. We begin the
* search at offset "n" and search for matches in the direction
* "dir". If no more matching entries are found on the page,
* InvalidOffsetNumber is returned.
*/
static OffsetNumber
findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir)
findnext(IndexScanDesc s, OffsetNumber n, ScanDirection dir)
{
OffsetNumber maxoff;
IndexTuple it;
RTreePageOpaque po;
RTreeScanOpaque so;
Page p;
so = (RTreeScanOpaque) s->opaque;
p = BufferGetPage(so->curbuf);
maxoff = PageGetMaxOffsetNumber(p);
po = (RTreePageOpaque) PageGetSpecialPointer(p);
so = (RTreeScanOpaque) s->opaque;
/*
* If we modified the index during the scan, we may have a pointer to
@ -256,28 +239,8 @@ findnext(IndexScanDesc s, Page p, OffsetNumber n, ScanDirection dir)
n = OffsetNumberNext(n);
}
return n;
}
static bool
rtscancache(IndexScanDesc s, ScanDirection dir)
{
Buffer b;
Page p;
OffsetNumber n;
IndexTuple it;
if (!(ScanDirectionIsNoMovement(dir)
&& ItemPointerIsValid(&(s->currentItemData))))
return false;
b = ReadBuffer(s->indexRelation,
ItemPointerGetBlockNumber(&(s->currentItemData)));
p = BufferGetPage(b);
n = ItemPointerGetOffsetNumber(&(s->currentItemData));
it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
s->xs_ctup.t_self = it->t_tid;
ReleaseBuffer(b);
return true;
if (n >= FirstOffsetNumber && n <= maxoff)
return n; /* found a match on this page */
else
return InvalidOffsetNumber; /* no match, go to next page */
}

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/rtree/rtree.c,v 1.85 2004/12/31 21:59:26 pgsql Exp $
* $PostgreSQL: pgsql/src/backend/access/rtree/rtree.c,v 1.86 2005/01/18 23:25:47 neilc Exp $
*
*-------------------------------------------------------------------------
*/
@ -280,12 +280,8 @@ rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate)
do
{
/* let go of current buffer before getting next */
if (buffer != InvalidBuffer)
ReleaseBuffer(buffer);
/* get next buffer */
buffer = ReadBuffer(r, blk);
/* release the current buffer, read in the next one */
buffer = ReleaseAndReadBuffer(buffer, r, blk);
page = (Page) BufferGetPage(buffer);
opaque = (RTreePageOpaque) PageGetSpecialPointer(page);

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.56 2004/12/31 21:59:26 pgsql Exp $
* $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.57 2005/01/18 23:25:48 neilc Exp $
*
*-------------------------------------------------------------------------
*/
@ -89,12 +89,24 @@ rtrescan(PG_FUNCTION_ARGS)
freestack(p->s_markstk);
p->s_stack = p->s_markstk = NULL;
p->s_flags = 0x0;
/* drop pins on buffers -- no locks held */
if (BufferIsValid(p->curbuf))
{
ReleaseBuffer(p->curbuf);
p->curbuf = InvalidBuffer;
}
if (BufferIsValid(p->markbuf))
{
ReleaseBuffer(p->markbuf);
p->markbuf = InvalidBuffer;
}
}
else
{
/* initialize opaque data */
p = (RTreeScanOpaque) palloc(sizeof(RTreeScanOpaqueData));
p->s_stack = p->s_markstk = NULL;
p->curbuf = p->markbuf = InvalidBuffer;
p->s_internalNKey = s->numberOfKeys;
p->s_flags = 0x0;
s->opaque = p;
@ -175,6 +187,18 @@ rtmarkpos(PG_FUNCTION_ARGS)
freestack(p->s_markstk);
p->s_markstk = o;
/* Update markbuf: make sure to bump ref count on curbuf */
if (BufferIsValid(p->markbuf))
{
ReleaseBuffer(p->markbuf);
p->markbuf = InvalidBuffer;
}
if (BufferIsValid(p->curbuf))
{
IncrBufferRefCount(p->curbuf);
p->markbuf = p->curbuf;
}
PG_RETURN_VOID();
}
@ -211,6 +235,18 @@ rtrestrpos(PG_FUNCTION_ARGS)
freestack(p->s_stack);
p->s_stack = o;
/* Update curbuf; be sure to bump ref count on markbuf */
if (BufferIsValid(p->curbuf))
{
ReleaseBuffer(p->curbuf);
p->curbuf = InvalidBuffer;
}
if (BufferIsValid(p->markbuf))
{
IncrBufferRefCount(p->markbuf);
p->curbuf = p->markbuf;
}
PG_RETURN_VOID();
}
@ -226,11 +262,14 @@ rtendscan(PG_FUNCTION_ARGS)
{
freestack(p->s_stack);
freestack(p->s_markstk);
if (BufferIsValid(p->curbuf))
ReleaseBuffer(p->curbuf);
if (BufferIsValid(p->markbuf))
ReleaseBuffer(p->markbuf);
pfree(s->opaque);
}
rtdropscan(s);
/* XXX don't unset read lock -- two-phase locking */
PG_RETURN_VOID();
}

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/access/rtree.h,v 1.36 2004/12/31 22:03:21 pgsql Exp $
* $PostgreSQL: pgsql/src/include/access/rtree.h,v 1.37 2005/01/18 23:25:55 neilc Exp $
*
*-------------------------------------------------------------------------
*/
@ -59,11 +59,14 @@ typedef struct RTSTACK
/*
* When we're doing a scan, we need to keep track of the parent stack
* for the marked and current items. Also, rtrees have the following
* property: if you're looking for the box (1,1,2,2), on the internal
* nodes you have to search for all boxes that *contain* (1,1,2,2), and
* not the ones that match it. We have a private scan key for internal
* nodes in the opaque structure for rtrees for this reason. See
* access/index-rtree/rtscan.c and rtstrat.c for how it gets initialized.
* property: if you're looking for the box (1,1,2,2), on the internal
* nodes you have to search for all boxes that *contain* (1,1,2,2),
* and not the ones that match it. We have a private scan key for
* internal nodes in the opaque structure for rtrees for this reason.
* See access/rtree/rtscan.c and rtstrat.c for how it gets
* initialized. We also keep pins on the scan's current buffer and
* marked buffer, if any: this avoids the need to invoke ReadBuffer()
* for each tuple produced by the index scan.
*/
typedef struct RTreeScanOpaqueData
@ -73,6 +76,8 @@ typedef struct RTreeScanOpaqueData
uint16 s_flags;
int s_internalNKey;
ScanKey s_internalKey;
Buffer curbuf;
Buffer markbuf;
} RTreeScanOpaqueData;
typedef RTreeScanOpaqueData *RTreeScanOpaque;