postgresql/src/backend/access/nbtree/nbtsearch.c

1037 lines
28 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* nbtsearch.c
* search code for postgres btrees.
*
*
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.61 2000/07/21 06:42:32 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/nbtree.h"
static RetrieveIndexResult _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
/*
* _bt_search() -- Search the tree for a particular scankey,
* or more precisely for the first leaf page it could be on.
*
* Return value is a stack of parent-page pointers. *bufP is set to the
* address of the leaf-page buffer, which is read-locked and pinned.
* No locks are held on the parent pages, however!
*
* NOTE that the returned buffer is read-locked regardless of the access
* parameter. However, access = BT_WRITE will allow an empty root page
* to be created and returned. When access = BT_READ, an empty index
* will result in *bufP being set to InvalidBuffer.
*/
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey,
Buffer *bufP, int access)
{
BTStack stack_in = NULL;
/* Get the root page to start with */
*bufP = _bt_getroot(rel, access);
/* If index is empty and access = BT_READ, no root page is created. */
if (! BufferIsValid(*bufP))
return (BTStack) NULL;
/* Loop iterates once per level descended in the tree */
for (;;)
{
Page page;
BTPageOpaque opaque;
OffsetNumber offnum;
ItemId itemid;
BTItem btitem;
IndexTuple itup;
BlockNumber blkno;
BlockNumber par_blkno;
BTStack new_stack;
/* if this is a leaf page, we're done */
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (P_ISLEAF(opaque))
break;
/*
* Find the appropriate item on the internal page, and get the
* child page that it points to.
*/
offnum = _bt_binsrch(rel, *bufP, keysz, scankey);
itemid = PageGetItemId(page, offnum);
btitem = (BTItem) PageGetItem(page, itemid);
itup = &(btitem->bti_itup);
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
par_blkno = BufferGetBlockNumber(*bufP);
/*
* We need to save the bit image of the index entry we chose in the
* parent page on a stack. In case we split the tree, we'll use this
* bit image to figure out what our real parent page is, in case the
* parent splits while we're working lower in the tree. See the paper
* by Lehman and Yao for how this is detected and handled. (We use the
* child link to disambiguate duplicate keys in the index -- Lehman
* and Yao disallow duplicate keys.)
*/
new_stack = (BTStack) palloc(sizeof(BTStackData));
new_stack->bts_blkno = par_blkno;
new_stack->bts_offset = offnum;
memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));
new_stack->bts_parent = stack_in;
/* drop the read lock on the parent page, acquire one on the child */
_bt_relbuf(rel, *bufP, BT_READ);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
/*
* Race -- the page we just grabbed may have split since we read its
* pointer in the parent. If it has, we may need to move right to its
* new sibling. Do that.
*/
*bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
/* okay, all set to move down a level */
stack_in = new_stack;
}
return stack_in;
}
/*
* _bt_moveright() -- move right in the btree if necessary.
*
* When we drop and reacquire a pointer to a page, it is possible that
* the page has changed in the meanwhile. If this happens, we're
* guaranteed that the page has "split right" -- that is, that any
* data that appeared on the page originally is either on the page
* or strictly to the right of it.
*
* This routine decides whether or not we need to move right in the
* tree by examining the high key entry on the page. If that entry
* is strictly less than one we expect to be on the page, then our
* picture of the page is incorrect and we need to move right.
*
* On entry, we have the buffer pinned and a lock of the proper type.
* If we move right, we release the buffer and lock and acquire the
* same on the right sibling. Return value is the buffer we stop at.
*/
Buffer
_bt_moveright(Relation rel,
Buffer buf,
int keysz,
ScanKey scankey,
int access)
{
Page page;
BTPageOpaque opaque;
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/*
* If the scan key that brought us to this page is > the high key
* stored on the page, then the page has split and we need to move
* right. (If the scan key is equal to the high key, we might or
* might not need to move right; have to scan the page first anyway.)
* It could even have split more than once, so scan as far as needed.
*/
while (!P_RIGHTMOST(opaque) &&
_bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)
{
/* step right one page */
BlockNumber rblkno = opaque->btpo_next;
_bt_relbuf(rel, buf, access);
buf = _bt_getbuf(rel, rblkno, access);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
}
1998-09-01 05:29:17 +02:00
return buf;
}
/*
* _bt_binsrch() -- Do a binary search for a key on a particular page.
*
* The scankey we get has the compare function stored in the procedure
* entry of each data struct. We invoke this regproc to do the
* comparison for every key in the scankey.
*
* On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
* key >= given scankey. (NOTE: in particular, this means it is possible
* to return a value 1 greater than the number of keys on the page,
* if the scankey is > all keys on the page.)
*
* On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
* of the last key < given scankey. (Since _bt_compare treats the first
* data key of such a page as minus infinity, there will be at least one
* key < scankey, so the result always points at one of the keys on the
* page.) This key indicates the right place to descend to be sure we
* find all leaf keys >= given scankey.
*
* This procedure is not responsible for walking right, it just examines
* the given page. _bt_binsrch() has no lock or refcount side effects
* on the buffer.
*/
OffsetNumber
_bt_binsrch(Relation rel,
Buffer buf,
int keysz,
ScanKey scankey)
{
TupleDesc itupdesc;
Page page;
BTPageOpaque opaque;
OffsetNumber low,
high;
int32 result;
1998-09-01 05:29:17 +02:00
itupdesc = RelationGetDescr(rel);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
low = P_FIRSTDATAKEY(opaque);
high = PageGetMaxOffsetNumber(page);
/*
* If there are no keys on the page, return the first available slot.
* Note this covers two cases: the page is really empty (no keys), or
* it contains only a high key. The latter case is possible after
* vacuuming. This can never happen on an internal page, however,
* since they are never empty (an internal page must have children).
*/
if (high < low)
1998-09-01 05:29:17 +02:00
return low;
/*
* Binary search to find the first key on the page >= scan key. Loop
* invariant: all slots before 'low' are < scan key, all slots at or
* after 'high' are >= scan key. We can fall out when high == low.
*/
high++; /* establish the loop invariant for high */
while (high > low)
{
OffsetNumber mid = low + ((high - low) / 2);
/* We have low <= mid < high, so mid points at a real slot */
result = _bt_compare(rel, keysz, scankey, page, mid);
if (result > 0)
low = mid + 1;
else
high = mid;
}
/*--------------------
* At this point we have high == low, but be careful: they could point
* past the last slot on the page.
*
* On a leaf page, we always return the first key >= scan key
* (which could be the last slot + 1).
*--------------------
*/
if (P_ISLEAF(opaque))
return low;
/*--------------------
* On a non-leaf page, return the last key < scan key.
* There must be one if _bt_compare() is playing by the rules.
*--------------------
*/
Assert(low > P_FIRSTDATAKEY(opaque));
return OffsetNumberPrev(low);
}
/*----------
* _bt_compare() -- Compare scankey to a particular tuple on the page.
*
* keysz: number of key conditions to be checked (might be less than the
* total length of the scan key!)
* page/offnum: location of btree item to be compared to.
*
* This routine returns:
* <0 if scankey < tuple at offnum;
* 0 if scankey == tuple at offnum;
* >0 if scankey > tuple at offnum.
* NULLs in the keys are treated as sortable values. Therefore
* "equality" does not necessarily mean that the item should be
* returned to the caller as a matching key!
*
* CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
* "minus infinity": this routine will always claim it is less than the
* scankey. The actual key value stored (if any, which there probably isn't)
* does not matter. This convention allows us to implement the Lehman and
* Yao convention that the first down-link pointer is before the first key.
* See backend/access/nbtree/README for details.
*----------
*/
int32
_bt_compare(Relation rel,
int keysz,
ScanKey scankey,
Page page,
OffsetNumber offnum)
{
TupleDesc itupdesc = RelationGetDescr(rel);
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
BTItem btitem;
IndexTuple itup;
int i;
/*
* Force result ">" if target item is first data item on an internal
* page --- see NOTE above.
*/
if (! P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
1998-09-01 05:29:17 +02:00
return 1;
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
itup = &(btitem->bti_itup);
/*
* The scan key is set up with the attribute number associated with
* each term in the key. It is important that, if the index is
* multi-key, the scan contain the first k key attributes, and that
* they be in order. If you think about how multi-key ordering works,
* you'll understand why this is.
*
* We don't test for violation of this condition here, however. The
* initial setup for the index scan had better have gotten it right
* (see _bt_first).
*/
for (i = 0; i < keysz; i++)
{
ScanKey entry = &scankey[i];
Datum datum;
bool isNull;
int32 result;
datum = index_getattr(itup, entry->sk_attno, itupdesc, &isNull);
/* see comments about NULLs handling in btbuild */
if (entry->sk_flags & SK_ISNULL) /* key is NULL */
{
if (isNull)
result = 0; /* NULL "=" NULL */
else
result = 1; /* NULL ">" NOT_NULL */
}
else if (isNull) /* key is NOT_NULL and item is NULL */
{
result = -1; /* NOT_NULL "<" NULL */
}
else
{
result = DatumGetInt32(FunctionCall2(&entry->sk_func,
entry->sk_argument,
datum));
}
/* if the keys are unequal, return the difference */
if (result != 0)
1998-09-01 05:29:17 +02:00
return result;
}
/* if we get here, the keys are equal */
1998-09-01 05:29:17 +02:00
return 0;
}
/*
* _bt_next() -- Get the next item in a scan.
*
* On entry, we have a valid currentItemData in the scan, and a
* read lock and pin count on the page that contains that item.
* We return the next item in the scan, or NULL if no more.
* On successful exit, the page containing the new item is locked
* and pinned; on NULL exit, no lock or pin is held.
*/
RetrieveIndexResult
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
Relation rel;
Buffer buf;
Page page;
OffsetNumber offnum;
ItemPointer current;
BTItem btitem;
IndexTuple itup;
BTScanOpaque so;
Size keysok;
rel = scan->relation;
so = (BTScanOpaque) scan->opaque;
current = &(scan->currentItemData);
/* we still have the buffer pinned and locked */
buf = so->btso_curbuf;
Assert(BufferIsValid(buf));
do
{
/* step one tuple in the appropriate direction */
if (!_bt_step(scan, &buf, dir))
1998-09-01 05:29:17 +02:00
return (RetrieveIndexResult) NULL;
/* current is the next candidate tuple to return */
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
itup = &btitem->bti_itup;
if (_bt_checkkeys(scan, itup, &keysok))
{
/* tuple passes all scan key conditions, so return it */
Assert(keysok == so->numberOfKeys);
return FormRetrieveIndexResult(current, &(itup->t_tid));
}
/* This tuple doesn't pass, but there might be more that do */
1999-05-25 18:15:34 +02:00
} while (keysok >= so->numberOfFirstKeys ||
(keysok == ((Size) -1) && ScanDirectionIsBackward(dir)));
/* No more items, so close down the current-item info */
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
_bt_relbuf(rel, buf, BT_READ);
1998-09-01 05:29:17 +02:00
return (RetrieveIndexResult) NULL;
}
/*
* _bt_first() -- Find the first item in a scan.
*
* We need to be clever about the type of scan, the operation it's
* performing, and the tree ordering. We return the RetrieveIndexResult
* of the first item in the tree that satisfies the qualification
* associated with the scan descriptor. On exit, the page containing
* the current index tuple is read locked and pinned, and the scan's
* opaque data entry is updated to include the buffer.
*/
RetrieveIndexResult
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
Relation rel;
Buffer buf;
Page page;
BTStack stack;
OffsetNumber offnum;
BTItem btitem;
IndexTuple itup;
ItemPointer current;
BlockNumber blkno;
StrategyNumber strat;
RetrieveIndexResult res;
RegProcedure proc;
int32 result;
BTScanOpaque so;
Size keysok;
bool strategyCheck;
ScanKey scankeys = 0;
int keysCount = 0;
int *nKeyIs = 0;
int i,
j;
StrategyNumber strat_total;
rel = scan->relation;
so = (BTScanOpaque) scan->opaque;
/*
* Order the keys in the qualification and be sure that the scan
* exploits the tree order.
*/
so->numberOfFirstKeys = 0; /* may be changed by _bt_orderkeys */
so->qual_ok = 1; /* may be changed by _bt_orderkeys */
scan->scanFromEnd = false;
strategyCheck = false;
if (so->numberOfKeys > 0)
{
_bt_orderkeys(rel, so);
if (so->qual_ok)
strategyCheck = true;
}
strat_total = BTEqualStrategyNumber;
if (strategyCheck)
{
AttrNumber attno;
nKeyIs = (int *) palloc(so->numberOfKeys * sizeof(int));
for (i = 0; i < so->numberOfKeys; i++)
{
attno = so->keyData[i].sk_attno;
if (attno == keysCount)
continue;
if (attno > keysCount + 1)
break;
strat = _bt_getstrat(rel, attno,
so->keyData[i].sk_procedure);
if (strat == strat_total ||
strat == BTEqualStrategyNumber)
{
nKeyIs[keysCount++] = i;
continue;
}
if (ScanDirectionIsBackward(dir) &&
(strat == BTLessStrategyNumber ||
strat == BTLessEqualStrategyNumber))
{
nKeyIs[keysCount++] = i;
strat_total = strat;
if (strat == BTLessStrategyNumber)
break;
continue;
}
if (ScanDirectionIsForward(dir) &&
(strat == BTGreaterStrategyNumber ||
strat == BTGreaterEqualStrategyNumber))
{
nKeyIs[keysCount++] = i;
strat_total = strat;
if (strat == BTGreaterStrategyNumber)
break;
continue;
}
}
if (!keysCount)
scan->scanFromEnd = true;
}
else
scan->scanFromEnd = true;
if (so->qual_ok == 0)
1998-09-01 05:29:17 +02:00
return (RetrieveIndexResult) NULL;
/* if we just need to walk down one edge of the tree, do that */
if (scan->scanFromEnd)
{
if (nKeyIs)
pfree(nKeyIs);
1998-09-01 05:29:17 +02:00
return _bt_endpoint(scan, dir);
}
/*
* Okay, we want something more complicated. What we'll do is use the
* first item in the scan key passed in (which has been correctly
* ordered to take advantage of index ordering) to position ourselves
* at the right place in the scan.
*/
scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData));
for (i = 0; i < keysCount; i++)
{
j = nKeyIs[i];
/* _bt_orderkeys disallows it, but it's place to add some code latter */
if (so->keyData[j].sk_flags & SK_ISNULL)
{
pfree(nKeyIs);
pfree(scankeys);
elog(ERROR, "_bt_first: btree doesn't support is(not)null, yet");
return ((RetrieveIndexResult) NULL);
}
proc = index_getprocid(rel, i + 1, BTORDER_PROC);
ScanKeyEntryInitialize(scankeys + i, so->keyData[j].sk_flags,
i + 1, proc, so->keyData[j].sk_argument);
}
if (nKeyIs)
pfree(nKeyIs);
current = &(scan->currentItemData);
/*
* Use the manufactured scan key to descend the tree and position
* ourselves on the target leaf page.
*/
stack = _bt_search(rel, keysCount, scankeys, &buf, BT_READ);
/* don't need to keep the stack around... */
_bt_freestack(stack);
if (! BufferIsValid(buf))
{
/* Only get here if index is completely empty */
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
pfree(scankeys);
1998-09-01 05:29:17 +02:00
return (RetrieveIndexResult) NULL;
1997-05-30 20:35:40 +02:00
}
/* remember which buffer we have pinned */
so->btso_curbuf = buf;
blkno = BufferGetBlockNumber(buf);
page = BufferGetPage(buf);
offnum = _bt_binsrch(rel, buf, keysCount, scankeys);
ItemPointerSet(current, blkno, offnum);
/*----------
* At this point we are positioned at the first item >= scan key,
* or possibly at the end of a page on which all the existing items
* are < scan key and we know that everything on later pages is
* >= scan key. We could step forward in the latter case, but that'd
* be a waste of time if we want to scan backwards. So, it's now time to
* examine the scan strategy to find the exact place to start the scan.
*
* Note: if _bt_step fails (meaning we fell off the end of the index
* in one direction or the other), we either return NULL (no matches) or
* call _bt_endpoint() to set up a scan starting at that index endpoint,
* as appropriate for the desired scan type.
*
* it's yet other place to add some code latter for is(not)null ...
*----------
*/
switch (strat_total)
1997-05-30 20:35:40 +02:00
{
case BTLessStrategyNumber:
/*
* Back up one to arrive at last item < scankey
*/
if (!_bt_step(scan, &buf, BackwardScanDirection))
{
pfree(scankeys);
return (RetrieveIndexResult) NULL;
}
break;
case BTLessEqualStrategyNumber:
/*
* We need to find the last item <= scankey, so step forward
* till we find one > scankey, then step back one.
*/
if (offnum > PageGetMaxOffsetNumber(page))
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return _bt_endpoint(scan, dir);
}
}
for (;;)
{
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
if (result < 0)
break;
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return _bt_endpoint(scan, dir);
}
}
if (!_bt_step(scan, &buf, BackwardScanDirection))
{
pfree(scankeys);
return (RetrieveIndexResult) NULL;
}
break;
case BTEqualStrategyNumber:
/*
* Make sure we are on the first equal item; might have to step
* forward if currently at end of page.
*/
if (offnum > PageGetMaxOffsetNumber(page))
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return (RetrieveIndexResult) NULL;
}
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
}
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
if (result != 0)
goto nomatches; /* no equal items! */
/*
* If a backward scan was specified, need to start with last
* equal item not first one.
*/
if (ScanDirectionIsBackward(dir))
{
do
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return _bt_endpoint(scan, dir);
}
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
} while (result == 0);
if (!_bt_step(scan, &buf, BackwardScanDirection))
elog(ERROR, "_bt_first: equal items disappeared?");
}
break;
case BTGreaterEqualStrategyNumber:
/*
* We want the first item >= scankey, which is where we are...
* unless we're not anywhere at all...
*/
if (offnum > PageGetMaxOffsetNumber(page))
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return (RetrieveIndexResult) NULL;
}
}
break;
case BTGreaterStrategyNumber:
/*
* We want the first item > scankey, so make sure we are on
* an item and then step over any equal items.
*/
if (offnum > PageGetMaxOffsetNumber(page))
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return (RetrieveIndexResult) NULL;
}
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
}
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
while (result == 0)
{
if (!_bt_step(scan, &buf, ForwardScanDirection))
{
pfree(scankeys);
return (RetrieveIndexResult) NULL;
}
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
}
break;
1997-05-30 20:35:40 +02:00
}
/* okay, current item pointer for the scan is right */
offnum = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
itup = &btitem->bti_itup;
/* is the first item actually acceptable? */
if (_bt_checkkeys(scan, itup, &keysok))
1997-05-30 20:35:40 +02:00
{
/* yes, return it */
res = FormRetrieveIndexResult(current, &(itup->t_tid));
}
else if (keysok >= so->numberOfFirstKeys ||
(keysok == ((Size) -1) && ScanDirectionIsBackward(dir)))
{
/* no, but there might be another one that is */
res = _bt_next(scan, dir);
}
else
{
/* no tuples in the index match this scan key */
nomatches:
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
_bt_relbuf(rel, buf, BT_READ);
res = (RetrieveIndexResult) NULL;
}
pfree(scankeys);
1998-09-01 05:29:17 +02:00
return res;
}
/*
* _bt_step() -- Step one item in the requested direction in a scan on
* the tree.
*
* *bufP is the current buffer (read-locked and pinned). If we change
* pages, it's updated appropriately.
*
* If successful, update scan's currentItemData and return true.
* If no adjacent record exists in the requested direction,
* release buffer pin/locks and return false.
*/
bool
1997-09-08 22:59:27 +02:00
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
Relation rel = scan->relation;
ItemPointer current = &(scan->currentItemData);
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Page page;
BTPageOpaque opaque;
OffsetNumber offnum,
maxoff;
BlockNumber blkno;
BlockNumber obknum;
1999-05-25 18:15:34 +02:00
/*
1999-05-25 18:15:34 +02:00
* Don't use ItemPointerGetOffsetNumber or you risk to get assertion
* due to ability of ip_posid to be equal 0.
*/
offnum = current->ip_posid;
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
if (ScanDirectionIsForward(dir))
{
if (!PageIsEmpty(page) && offnum < maxoff)
offnum = OffsetNumberNext(offnum);
else
{
/* walk right to the next page with data */
for (;;)
{
/* if we're at end of scan, release the buffer and return */
if (P_RIGHTMOST(opaque))
{
_bt_relbuf(rel, *bufP, BT_READ);
ItemPointerSetInvalid(current);
*bufP = so->btso_curbuf = InvalidBuffer;
return false;
}
/* step right one page */
blkno = opaque->btpo_next;
_bt_relbuf(rel, *bufP, BT_READ);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
/* done if it's not empty */
offnum = P_FIRSTDATAKEY(opaque);
if (!PageIsEmpty(page) && offnum <= maxoff)
break;
}
}
}
else
{
if (offnum > P_FIRSTDATAKEY(opaque))
offnum = OffsetNumberPrev(offnum);
else
{
/* walk left to the next page with data */
for (;;)
{
/* if we're at end of scan, release the buffer and return */
if (P_LEFTMOST(opaque))
{
_bt_relbuf(rel, *bufP, BT_READ);
ItemPointerSetInvalid(current);
*bufP = so->btso_curbuf = InvalidBuffer;
return false;
}
/* step left */
obknum = BufferGetBlockNumber(*bufP);
blkno = opaque->btpo_prev;
_bt_relbuf(rel, *bufP, BT_READ);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/*
* If the adjacent page just split, then we have to walk
* right to find the block that's now adjacent to where
* we were. Because pages only split right, we don't have
* to worry about this failing to terminate.
*/
while (opaque->btpo_next != obknum)
{
blkno = opaque->btpo_next;
_bt_relbuf(rel, *bufP, BT_READ);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
}
/* done if it's not empty */
maxoff = PageGetMaxOffsetNumber(page);
offnum = maxoff;
if (!PageIsEmpty(page) && maxoff >= P_FIRSTDATAKEY(opaque))
break;
}
}
}
/* Update scan state */
so->btso_curbuf = *bufP;
blkno = BufferGetBlockNumber(*bufP);
ItemPointerSet(current, blkno, offnum);
1998-09-01 05:29:17 +02:00
return true;
}
/*
* _bt_endpoint() -- Find the first or last key in the index.
*
* This is used by _bt_first() to set up a scan when we've determined
* that the scan must start at the beginning or end of the index (for
* a forward or backward scan respectively).
*/
static RetrieveIndexResult
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
Relation rel;
Buffer buf;
Page page;
BTPageOpaque opaque;
ItemPointer current;
OffsetNumber offnum,
maxoff;
OffsetNumber start;
BlockNumber blkno;
BTItem btitem;
IndexTuple itup;
BTScanOpaque so;
RetrieveIndexResult res;
Size keysok;
rel = scan->relation;
current = &(scan->currentItemData);
so = (BTScanOpaque) scan->opaque;
/*
* Scan down to the leftmost or rightmost leaf page. This is a
* simplified version of _bt_search(). We don't maintain a stack
* since we know we won't need it.
*/
buf = _bt_getroot(rel, BT_READ);
if (! BufferIsValid(buf))
{
/* empty index... */
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
return (RetrieveIndexResult) NULL;
}
blkno = BufferGetBlockNumber(buf);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
for (;;)
{
if (P_ISLEAF(opaque))
break;
if (ScanDirectionIsForward(dir))
offnum = P_FIRSTDATAKEY(opaque);
else
offnum = PageGetMaxOffsetNumber(page);
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
itup = &(btitem->bti_itup);
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
_bt_relbuf(rel, buf, BT_READ);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/*
* Race condition: If the child page we just stepped onto was just
* split, we need to make sure we're all the way at the right edge
* of the tree. See the paper by Lehman and Yao.
*/
if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque))
{
do
{
blkno = opaque->btpo_next;
_bt_relbuf(rel, buf, BT_READ);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
} while (!P_RIGHTMOST(opaque));
}
}
/* okay, we've got the {left,right}-most page in the tree */
maxoff = PageGetMaxOffsetNumber(page);
if (ScanDirectionIsForward(dir))
{
Assert(P_LEFTMOST(opaque));
start = P_FIRSTDATAKEY(opaque);
}
else if (ScanDirectionIsBackward(dir))
{
Assert(P_RIGHTMOST(opaque));
start = PageGetMaxOffsetNumber(page);
if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty page */
start = P_FIRSTDATAKEY(opaque);
}
else
{
elog(ERROR, "Illegal scan direction %d", dir);
start = 0; /* keep compiler quiet */
}
ItemPointerSet(current, blkno, start);
/* remember which buffer we have pinned */
so->btso_curbuf = buf;
/*
* Left/rightmost page could be empty due to deletions,
* if so step till we find a nonempty page.
*/
if (start > maxoff)
{
if (!_bt_step(scan, &buf, dir))
return (RetrieveIndexResult) NULL;
start = ItemPointerGetOffsetNumber(current);
page = BufferGetPage(buf);
}
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
itup = &(btitem->bti_itup);
/* see if we picked a winner */
if (_bt_checkkeys(scan, itup, &keysok))
{
/* yes, return it */
res = FormRetrieveIndexResult(current, &(itup->t_tid));
}
else if (keysok >= so->numberOfFirstKeys ||
(keysok == ((Size) -1) && ScanDirectionIsBackward(dir)))
{
/* no, but there might be another one that is */
res = _bt_next(scan, dir);
}
else
{
/* no tuples in the index match this scan key */
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
_bt_relbuf(rel, buf, BT_READ);
res = (RetrieveIndexResult) NULL;
}
1998-09-01 05:29:17 +02:00
return res;
}