postgresql/src/backend/access/nbtree/nbtsearch.c

1050 lines
28 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* nbtsearch.c
* search code for postgres btrees.
*
*
2002-06-20 22:29:54 +02:00
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
2002-06-20 22:29:54 +02:00
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.72 2002/06/20 20:29:25 momjian Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/nbtree.h"
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
/*
* _bt_search() -- Search the tree for a particular scankey,
* or more precisely for the first leaf page it could be on.
*
* Return value is a stack of parent-page pointers. *bufP is set to the
* address of the leaf-page buffer, which is read-locked and pinned.
* No locks are held on the parent pages, however!
*
* NOTE that the returned buffer is read-locked regardless of the access
* parameter. However, access = BT_WRITE will allow an empty root page
2001-03-22 05:01:46 +01:00
* to be created and returned. When access = BT_READ, an empty index
* will result in *bufP being set to InvalidBuffer.
*/
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey,
Buffer *bufP, int access)
{
2001-03-22 05:01:46 +01:00
BTStack stack_in = NULL;
/* Get the root page to start with */
*bufP = _bt_getroot(rel, access);
/* If index is empty and access = BT_READ, no root page is created. */
2001-03-22 05:01:46 +01:00
if (!BufferIsValid(*bufP))
return (BTStack) NULL;
/* Loop iterates once per level descended in the tree */
for (;;)
{
Page page;
BTPageOpaque opaque;
OffsetNumber offnum;
ItemId itemid;
BTItem btitem;
IndexTuple itup;
BlockNumber blkno;
BlockNumber par_blkno;
BTStack new_stack;
/* if this is a leaf page, we're done */
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (P_ISLEAF(opaque))
break;
/*
* Find the appropriate item on the internal page, and get the
* child page that it points to.
*/
offnum = _bt_binsrch(rel, *bufP, keysz, scankey);
itemid = PageGetItemId(page, offnum);
btitem = (BTItem) PageGetItem(page, itemid);
itup = &(btitem->bti_itup);
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
par_blkno = BufferGetBlockNumber(*bufP);
/*
2001-03-22 05:01:46 +01:00
* We need to save the bit image of the index entry we chose in
* the parent page on a stack. In case we split the tree, we'll
* use this bit image to figure out what our real parent page is,
* in case the parent splits while we're working lower in the
* tree. See the paper by Lehman and Yao for how this is detected
* and handled. (We use the child link to disambiguate duplicate
* keys in the index -- Lehman and Yao disallow duplicate keys.)
*/
new_stack = (BTStack) palloc(sizeof(BTStackData));
new_stack->bts_blkno = par_blkno;
new_stack->bts_offset = offnum;
memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));
new_stack->bts_parent = stack_in;
/* drop the read lock on the parent page, acquire one on the child */
_bt_relbuf(rel, *bufP);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
/*
2001-03-22 05:01:46 +01:00
* Race -- the page we just grabbed may have split since we read
* its pointer in the parent. If it has, we may need to move
* right to its new sibling. Do that.
*/
*bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
/* okay, all set to move down a level */
stack_in = new_stack;
}
return stack_in;
}
/*
* _bt_moveright() -- move right in the btree if necessary.
*
* When we drop and reacquire a pointer to a page, it is possible that
* the page has changed in the meanwhile. If this happens, we're
* guaranteed that the page has "split right" -- that is, that any
* data that appeared on the page originally is either on the page
* or strictly to the right of it.
*
* This routine decides whether or not we need to move right in the
* tree by examining the high key entry on the page. If that entry
* is strictly less than one we expect to be on the page, then our
* picture of the page is incorrect and we need to move right.
*
* On entry, we have the buffer pinned and a lock of the proper type.
* If we move right, we release the buffer and lock and acquire the
2001-03-22 05:01:46 +01:00
* same on the right sibling. Return value is the buffer we stop at.
*/
Buffer
_bt_moveright(Relation rel,
Buffer buf,
int keysz,
ScanKey scankey,
int access)
{
Page page;
BTPageOpaque opaque;
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/*
* If the scan key that brought us to this page is > the high key
* stored on the page, then the page has split and we need to move
* right. (If the scan key is equal to the high key, we might or
* might not need to move right; have to scan the page first anyway.)
* It could even have split more than once, so scan as far as needed.
*/
while (!P_RIGHTMOST(opaque) &&
_bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)
{
/* step right one page */
2001-03-22 05:01:46 +01:00
BlockNumber rblkno = opaque->btpo_next;
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, rblkno, access);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
}
1998-09-01 05:29:17 +02:00
return buf;
}
/*
* _bt_binsrch() -- Do a binary search for a key on a particular page.
*
* The scankey we get has the compare function stored in the procedure
* entry of each data struct. We invoke this regproc to do the
* comparison for every key in the scankey.
*
* On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
* key >= given scankey. (NOTE: in particular, this means it is possible
* to return a value 1 greater than the number of keys on the page,
* if the scankey is > all keys on the page.)
*
* On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
* of the last key < given scankey. (Since _bt_compare treats the first
* data key of such a page as minus infinity, there will be at least one
* key < scankey, so the result always points at one of the keys on the
* page.) This key indicates the right place to descend to be sure we
* find all leaf keys >= given scankey.
*
* This procedure is not responsible for walking right, it just examines
2001-03-22 05:01:46 +01:00
* the given page. _bt_binsrch() has no lock or refcount side effects
* on the buffer.
*/
OffsetNumber
_bt_binsrch(Relation rel,
Buffer buf,
int keysz,
ScanKey scankey)
{
TupleDesc itupdesc;
Page page;
BTPageOpaque opaque;
OffsetNumber low,
high;
int32 result;
1998-09-01 05:29:17 +02:00
itupdesc = RelationGetDescr(rel);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
low = P_FIRSTDATAKEY(opaque);
high = PageGetMaxOffsetNumber(page);
/*
* If there are no keys on the page, return the first available slot.
* Note this covers two cases: the page is really empty (no keys), or
* it contains only a high key. The latter case is possible after
* vacuuming. This can never happen on an internal page, however,
* since they are never empty (an internal page must have children).
*/
if (high < low)
1998-09-01 05:29:17 +02:00
return low;
/*
* Binary search to find the first key on the page >= scan key. Loop
* invariant: all slots before 'low' are < scan key, all slots at or
* after 'high' are >= scan key. We can fall out when high == low.
*/
high++; /* establish the loop invariant for high */
while (high > low)
{
OffsetNumber mid = low + ((high - low) / 2);
/* We have low <= mid < high, so mid points at a real slot */
result = _bt_compare(rel, keysz, scankey, page, mid);
if (result > 0)
low = mid + 1;
else
high = mid;
}
/*
* At this point we have high == low, but be careful: they could point
* past the last slot on the page.
*
* On a leaf page, we always return the first key >= scan key (which
* could be the last slot + 1).
*/
if (P_ISLEAF(opaque))
return low;
/*
* On a non-leaf page, return the last key < scan key. There must be
* one if _bt_compare() is playing by the rules.
*/
Assert(low > P_FIRSTDATAKEY(opaque));
return OffsetNumberPrev(low);
}
/*----------
* _bt_compare() -- Compare scankey to a particular tuple on the page.
*
* keysz: number of key conditions to be checked (might be less than the
* total length of the scan key!)
* page/offnum: location of btree item to be compared to.
*
* This routine returns:
* <0 if scankey < tuple at offnum;
* 0 if scankey == tuple at offnum;
* >0 if scankey > tuple at offnum.
* NULLs in the keys are treated as sortable values. Therefore
* "equality" does not necessarily mean that the item should be
* returned to the caller as a matching key!
*
* CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
* "minus infinity": this routine will always claim it is less than the
* scankey. The actual key value stored (if any, which there probably isn't)
* does not matter. This convention allows us to implement the Lehman and
* Yao convention that the first down-link pointer is before the first key.
* See backend/access/nbtree/README for details.
*----------
*/
int32
_bt_compare(Relation rel,
int keysz,
ScanKey scankey,
Page page,
OffsetNumber offnum)
{
TupleDesc itupdesc = RelationGetDescr(rel);
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
BTItem btitem;
IndexTuple itup;
int i;
/*
* Force result ">" if target item is first data item on an internal
* page --- see NOTE above.
*/
2001-03-22 05:01:46 +01:00
if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
1998-09-01 05:29:17 +02:00
return 1;
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
itup = &(btitem->bti_itup);
/*
* The scan key is set up with the attribute number associated with
* each term in the key. It is important that, if the index is
* multi-key, the scan contain the first k key attributes, and that
* they be in order. If you think about how multi-key ordering works,
* you'll understand why this is.
*
* We don't test for violation of this condition here, however. The
* initial setup for the index scan had better have gotten it right
* (see _bt_first).
*/
for (i = 0; i < keysz; i++)
{
ScanKey entry = &scankey[i];
Datum datum;
bool isNull;
int32 result;
datum = index_getattr(itup, entry->sk_attno, itupdesc, &isNull);
/* see comments about NULLs handling in btbuild */
2001-03-22 05:01:46 +01:00
if (entry->sk_flags & SK_ISNULL) /* key is NULL */
{
if (isNull)
result = 0; /* NULL "=" NULL */
else
result = 1; /* NULL ">" NOT_NULL */
}
else if (isNull) /* key is NOT_NULL and item is NULL */
{
result = -1; /* NOT_NULL "<" NULL */
}
else
{
result = DatumGetInt32(FunctionCall2(&entry->sk_func,
entry->sk_argument,
datum));
}
/* if the keys are unequal, return the difference */
if (result != 0)
1998-09-01 05:29:17 +02:00
return result;
}
/* if we get here, the keys are equal */
1998-09-01 05:29:17 +02:00
return 0;
}
/*
* _bt_next() -- Get the next item in a scan.
*
* On entry, we have a valid currentItemData in the scan, and a
* read lock and pin count on the page that contains that item.
* We return the next item in the scan, or false if no more.
* On successful exit, the page containing the new item is locked
* and pinned; on failure exit, no lock or pin is held.
*/
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
Relation rel;
Buffer buf;
Page page;
OffsetNumber offnum;
ItemPointer current;
BTItem btitem;
IndexTuple itup;
BTScanOpaque so;
bool continuescan;
rel = scan->indexRelation;
so = (BTScanOpaque) scan->opaque;
current = &(scan->currentItemData);
/* we still have the buffer pinned and locked */
buf = so->btso_curbuf;
Assert(BufferIsValid(buf));
do
{
/* step one tuple in the appropriate direction */
if (!_bt_step(scan, &buf, dir))
return false;
/* current is the next candidate tuple to return */
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
itup = &btitem->bti_itup;
if (_bt_checkkeys(scan, itup, dir, &continuescan))
{
/* tuple passes all scan key conditions, so return it */
scan->xs_ctup.t_self = itup->t_tid;
return true;
}
/* This tuple doesn't pass, but there might be more that do */
} while (continuescan);
/* No more items, so close down the current-item info */
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
_bt_relbuf(rel, buf);
return false;
}
/*
* _bt_first() -- Find the first item in a scan.
*
* We need to be clever about the type of scan, the operation it's
* performing, and the tree ordering. We find the
* first item in the tree that satisfies the qualification
* associated with the scan descriptor. On exit, the page containing
* the current index tuple is read locked and pinned, and the scan's
* opaque data entry is updated to include the buffer.
*/
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Buffer buf;
Page page;
BTStack stack;
OffsetNumber offnum;
BTItem btitem;
IndexTuple itup;
ItemPointer current;
BlockNumber blkno;
StrategyNumber strat;
bool res;
int32 result;
bool scanFromEnd;
bool continuescan;
ScanKey scankeys = NULL;
int keysCount = 0;
int *nKeyIs = NULL;
int i,
j;
StrategyNumber strat_total;
/*
* Order the scan keys in our canonical fashion and eliminate any
* redundant keys.
*/
_bt_orderkeys(scan);
/*
2001-03-22 05:01:46 +01:00
* Quit now if _bt_orderkeys() discovered that the scan keys can never
* be satisfied (eg, x == 1 AND x > 2).
*/
2001-03-22 05:01:46 +01:00
if (!so->qual_ok)
return false;
/*
* Examine the scan keys to discover where we need to start the scan.
*/
scanFromEnd = false;
strat_total = BTEqualStrategyNumber;
if (so->numberOfKeys > 0)
{
nKeyIs = (int *) palloc(so->numberOfKeys * sizeof(int));
for (i = 0; i < so->numberOfKeys; i++)
{
AttrNumber attno = so->keyData[i].sk_attno;
/* ignore keys for already-determined attrs */
if (attno <= keysCount)
continue;
/* if we didn't find a boundary for the preceding attr, quit */
if (attno > keysCount + 1)
break;
strat = _bt_getstrat(rel, attno,
so->keyData[i].sk_procedure);
2001-03-22 05:01:46 +01:00
/*
* Can we use this key as a starting boundary for this attr?
*
2001-03-22 05:01:46 +01:00
* We can use multiple keys if they look like, say, = >= = but we
* have to stop after accepting a > or < boundary.
*/
if (strat == strat_total ||
strat == BTEqualStrategyNumber)
nKeyIs[keysCount++] = i;
else if (ScanDirectionIsBackward(dir) &&
(strat == BTLessStrategyNumber ||
strat == BTLessEqualStrategyNumber))
{
nKeyIs[keysCount++] = i;
strat_total = strat;
if (strat == BTLessStrategyNumber)
break;
}
else if (ScanDirectionIsForward(dir) &&
(strat == BTGreaterStrategyNumber ||
strat == BTGreaterEqualStrategyNumber))
{
nKeyIs[keysCount++] = i;
strat_total = strat;
if (strat == BTGreaterStrategyNumber)
break;
}
}
if (keysCount == 0)
scanFromEnd = true;
}
else
scanFromEnd = true;
/* if we just need to walk down one edge of the tree, do that */
if (scanFromEnd)
{
if (nKeyIs)
pfree(nKeyIs);
1998-09-01 05:29:17 +02:00
return _bt_endpoint(scan, dir);
}
/*
* We want to start the scan somewhere within the index. Set up a
* scankey we can use to search for the correct starting point.
*/
scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData));
for (i = 0; i < keysCount; i++)
{
FmgrInfo *procinfo;
j = nKeyIs[i];
2001-03-22 05:01:46 +01:00
/*
* _bt_orderkeys disallows it, but it's place to add some code
* later
*/
if (so->keyData[j].sk_flags & SK_ISNULL)
{
pfree(nKeyIs);
pfree(scankeys);
elog(ERROR, "_bt_first: btree doesn't support is(not)null, yet");
return false;
}
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
ScanKeyEntryInitializeWithInfo(scankeys + i,
so->keyData[j].sk_flags,
i + 1,
procinfo,
CurrentMemoryContext,
so->keyData[j].sk_argument);
}
if (nKeyIs)
pfree(nKeyIs);
current = &(scan->currentItemData);
/*
* Use the manufactured scan key to descend the tree and position
* ourselves on the target leaf page.
*/
stack = _bt_search(rel, keysCount, scankeys, &buf, BT_READ);
/* don't need to keep the stack around... */
_bt_freestack(stack);
2001-03-22 05:01:46 +01:00
if (!BufferIsValid(buf))
{
/* Only get here if index is completely empty */
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
pfree(scankeys);
return false;
1997-05-30 20:35:40 +02:00
}
/* remember which buffer we have pinned */
so->btso_curbuf = buf;
blkno = BufferGetBlockNumber(buf);
page = BufferGetPage(buf);
/* position to the precise item on the page */
offnum = _bt_binsrch(rel, buf, keysCount, scankeys);
ItemPointerSet(current, blkno, offnum);
/*
* At this point we are positioned at the first item >= scan key, or
* possibly at the end of a page on which all the existing items are
* greater than the scan key and we know that everything on later
* pages is less than or equal to scan key.
*
* We could step forward in the latter case, but that'd be a waste of
* time if we want to scan backwards. So, it's now time to examine
* the scan strategy to find the exact place to start the scan.
*
* Note: if _bt_step fails (meaning we fell off the end of the index in
* one direction or the other), we either return false (no matches) or
* call _bt_endpoint() to set up a scan starting at that index
* endpoint, as appropriate for the desired scan type.
*
* it's yet other place to add some code later for is(not)null ...
*/
switch (strat_total)
1997-05-30 20:35:40 +02:00
{
case BTLessStrategyNumber:
2001-03-22 05:01:46 +01:00
/*
* Back up one to arrive at last item < scankey
*/
if (!_bt_step(scan, &buf, BackwardScanDirection))
{
pfree(scankeys);
return false;
}
break;
case BTLessEqualStrategyNumber:
2001-03-22 05:01:46 +01:00
/*
* We need to find the last item <= scankey, so step forward
* till we find one > scankey, then step back one.
*/
if (offnum > PageGetMaxOffsetNumber(page))
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return _bt_endpoint(scan, dir);
}
}
for (;;)
{
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
if (result < 0)
break;
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return _bt_endpoint(scan, dir);
}
}
if (!_bt_step(scan, &buf, BackwardScanDirection))
{
pfree(scankeys);
return false;
}
break;
case BTEqualStrategyNumber:
2001-03-22 05:01:46 +01:00
/*
2001-03-22 05:01:46 +01:00
* Make sure we are on the first equal item; might have to
* step forward if currently at end of page.
*/
if (offnum > PageGetMaxOffsetNumber(page))
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return false;
}
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
}
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
if (result != 0)
2001-03-22 05:01:46 +01:00
goto nomatches; /* no equal items! */
/*
* If a backward scan was specified, need to start with last
* equal item not first one.
*/
if (ScanDirectionIsBackward(dir))
{
do
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return _bt_endpoint(scan, dir);
}
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
} while (result == 0);
if (!_bt_step(scan, &buf, BackwardScanDirection))
elog(ERROR, "_bt_first: equal items disappeared?");
}
break;
case BTGreaterEqualStrategyNumber:
2001-03-22 05:01:46 +01:00
/*
* We want the first item >= scankey, which is where we are...
* unless we're not anywhere at all...
*/
if (offnum > PageGetMaxOffsetNumber(page))
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return false;
}
}
break;
case BTGreaterStrategyNumber:
2001-03-22 05:01:46 +01:00
/*
2001-03-22 05:01:46 +01:00
* We want the first item > scankey, so make sure we are on an
* item and then step over any equal items.
*/
if (offnum > PageGetMaxOffsetNumber(page))
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return false;
}
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
}
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
while (result == 0)
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return false;
}
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
}
break;
1997-05-30 20:35:40 +02:00
}
/* okay, current item pointer for the scan is right */
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
itup = &btitem->bti_itup;
/* is the first item actually acceptable? */
if (_bt_checkkeys(scan, itup, dir, &continuescan))
1997-05-30 20:35:40 +02:00
{
/* yes, return it */
scan->xs_ctup.t_self = itup->t_tid;
res = true;
}
else if (continuescan)
{
/* no, but there might be another one that is */
res = _bt_next(scan, dir);
}
else
{
/* no tuples in the index match this scan key */
nomatches:
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
_bt_relbuf(rel, buf);
res = false;
}
pfree(scankeys);
1998-09-01 05:29:17 +02:00
return res;
}
/*
* _bt_step() -- Step one item in the requested direction in a scan on
* the tree.
*
* *bufP is the current buffer (read-locked and pinned). If we change
* pages, it's updated appropriately.
*
* If successful, update scan's currentItemData and return true.
* If no adjacent record exists in the requested direction,
* release buffer pin/locks and return false.
*/
bool
1997-09-08 22:59:27 +02:00
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
Relation rel = scan->indexRelation;
ItemPointer current = &(scan->currentItemData);
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Page page;
BTPageOpaque opaque;
OffsetNumber offnum,
maxoff;
BlockNumber blkno;
BlockNumber obknum;
1999-05-25 18:15:34 +02:00
/*
1999-05-25 18:15:34 +02:00
* Don't use ItemPointerGetOffsetNumber or you risk to get assertion
* due to ability of ip_posid to be equal 0.
*/
offnum = current->ip_posid;
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
if (ScanDirectionIsForward(dir))
{
if (!PageIsEmpty(page) && offnum < maxoff)
offnum = OffsetNumberNext(offnum);
else
{
/* walk right to the next page with data */
for (;;)
{
/* if we're at end of scan, release the buffer and return */
if (P_RIGHTMOST(opaque))
{
_bt_relbuf(rel, *bufP);
ItemPointerSetInvalid(current);
*bufP = so->btso_curbuf = InvalidBuffer;
return false;
}
/* step right one page */
blkno = opaque->btpo_next;
_bt_relbuf(rel, *bufP);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
/* done if it's not empty */
offnum = P_FIRSTDATAKEY(opaque);
if (!PageIsEmpty(page) && offnum <= maxoff)
break;
}
}
}
else
{
if (offnum > P_FIRSTDATAKEY(opaque))
offnum = OffsetNumberPrev(offnum);
else
{
/* walk left to the next page with data */
for (;;)
{
/* if we're at end of scan, release the buffer and return */
if (P_LEFTMOST(opaque))
{
_bt_relbuf(rel, *bufP);
ItemPointerSetInvalid(current);
*bufP = so->btso_curbuf = InvalidBuffer;
return false;
}
/* step left */
obknum = BufferGetBlockNumber(*bufP);
blkno = opaque->btpo_prev;
_bt_relbuf(rel, *bufP);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2001-03-22 05:01:46 +01:00
/*
* If the adjacent page just split, then we have to walk
2001-03-22 05:01:46 +01:00
* right to find the block that's now adjacent to where we
* were. Because pages only split right, we don't have to
* worry about this failing to terminate.
*/
while (opaque->btpo_next != obknum)
{
blkno = opaque->btpo_next;
_bt_relbuf(rel, *bufP);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
}
/* done if it's not empty */
maxoff = PageGetMaxOffsetNumber(page);
offnum = maxoff;
if (!PageIsEmpty(page) && maxoff >= P_FIRSTDATAKEY(opaque))
break;
}
}
}
/* Update scan state */
so->btso_curbuf = *bufP;
blkno = BufferGetBlockNumber(*bufP);
ItemPointerSet(current, blkno, offnum);
1998-09-01 05:29:17 +02:00
return true;
}
/*
* _bt_endpoint() -- Find the first or last key in the index.
*
* This is used by _bt_first() to set up a scan when we've determined
* that the scan must start at the beginning or end of the index (for
* a forward or backward scan respectively).
*/
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
Relation rel;
Buffer buf;
Page page;
BTPageOpaque opaque;
ItemPointer current;
OffsetNumber offnum,
maxoff;
OffsetNumber start;
BlockNumber blkno;
BTItem btitem;
IndexTuple itup;
BTScanOpaque so;
bool res;
bool continuescan;
rel = scan->indexRelation;
current = &(scan->currentItemData);
so = (BTScanOpaque) scan->opaque;
/*
* Scan down to the leftmost or rightmost leaf page. This is a
2001-03-22 05:01:46 +01:00
* simplified version of _bt_search(). We don't maintain a stack
* since we know we won't need it.
*/
buf = _bt_getroot(rel, BT_READ);
2001-03-22 05:01:46 +01:00
if (!BufferIsValid(buf))
{
/* empty index... */
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
return false;
}
blkno = BufferGetBlockNumber(buf);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
for (;;)
{
if (P_ISLEAF(opaque))
break;
if (ScanDirectionIsForward(dir))
offnum = P_FIRSTDATAKEY(opaque);
else
offnum = PageGetMaxOffsetNumber(page);
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
itup = &(btitem->bti_itup);
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/*
* Race condition: If the child page we just stepped onto was just
* split, we need to make sure we're all the way at the right edge
* of the tree. See the paper by Lehman and Yao.
*/
if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque))
{
do
{
blkno = opaque->btpo_next;
_bt_relbuf(rel, buf);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
} while (!P_RIGHTMOST(opaque));
}
}
/* okay, we've got the {left,right}-most page in the tree */
maxoff = PageGetMaxOffsetNumber(page);
if (ScanDirectionIsForward(dir))
{
Assert(P_LEFTMOST(opaque));
start = P_FIRSTDATAKEY(opaque);
}
else if (ScanDirectionIsBackward(dir))
{
Assert(P_RIGHTMOST(opaque));
start = PageGetMaxOffsetNumber(page);
2001-03-22 05:01:46 +01:00
if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty
* page */
start = P_FIRSTDATAKEY(opaque);
}
else
{
elog(ERROR, "Illegal scan direction %d", dir);
start = 0; /* keep compiler quiet */
}
ItemPointerSet(current, blkno, start);
/* remember which buffer we have pinned */
so->btso_curbuf = buf;
/*
2001-03-22 05:01:46 +01:00
* Left/rightmost page could be empty due to deletions, if so step
* till we find a nonempty page.
*/
if (start > maxoff)
{
if (!_bt_step(scan, &buf, dir))
return false;
start = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
}
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
itup = &(btitem->bti_itup);
/* see if we picked a winner */
if (_bt_checkkeys(scan, itup, dir, &continuescan))
{
/* yes, return it */
scan->xs_ctup.t_self = itup->t_tid;
res = true;
}
else if (continuescan)
{
/* no, but there might be another one that is */
res = _bt_next(scan, dir);
}
else
{
/* no tuples in the index match this scan key */
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
_bt_relbuf(rel, buf);
res = false;
}
1998-09-01 05:29:17 +02:00
return res;
}