1050 lines
28 KiB
C
1050 lines
28 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* nbtsearch.c
|
|
* search code for postgres btrees.
|
|
*
|
|
*
|
|
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
* IDENTIFICATION
|
|
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.72 2002/06/20 20:29:25 momjian Exp $
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "access/genam.h"
|
|
#include "access/nbtree.h"
|
|
|
|
|
|
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
|
|
|
|
|
|
/*
|
|
* _bt_search() -- Search the tree for a particular scankey,
|
|
* or more precisely for the first leaf page it could be on.
|
|
*
|
|
* Return value is a stack of parent-page pointers. *bufP is set to the
|
|
* address of the leaf-page buffer, which is read-locked and pinned.
|
|
* No locks are held on the parent pages, however!
|
|
*
|
|
* NOTE that the returned buffer is read-locked regardless of the access
|
|
* parameter. However, access = BT_WRITE will allow an empty root page
|
|
* to be created and returned. When access = BT_READ, an empty index
|
|
* will result in *bufP being set to InvalidBuffer.
|
|
*/
|
|
BTStack
|
|
_bt_search(Relation rel, int keysz, ScanKey scankey,
|
|
Buffer *bufP, int access)
|
|
{
|
|
BTStack stack_in = NULL;
|
|
|
|
/* Get the root page to start with */
|
|
*bufP = _bt_getroot(rel, access);
|
|
|
|
/* If index is empty and access = BT_READ, no root page is created. */
|
|
if (!BufferIsValid(*bufP))
|
|
return (BTStack) NULL;
|
|
|
|
/* Loop iterates once per level descended in the tree */
|
|
for (;;)
|
|
{
|
|
Page page;
|
|
BTPageOpaque opaque;
|
|
OffsetNumber offnum;
|
|
ItemId itemid;
|
|
BTItem btitem;
|
|
IndexTuple itup;
|
|
BlockNumber blkno;
|
|
BlockNumber par_blkno;
|
|
BTStack new_stack;
|
|
|
|
/* if this is a leaf page, we're done */
|
|
page = BufferGetPage(*bufP);
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
if (P_ISLEAF(opaque))
|
|
break;
|
|
|
|
/*
|
|
* Find the appropriate item on the internal page, and get the
|
|
* child page that it points to.
|
|
*/
|
|
offnum = _bt_binsrch(rel, *bufP, keysz, scankey);
|
|
itemid = PageGetItemId(page, offnum);
|
|
btitem = (BTItem) PageGetItem(page, itemid);
|
|
itup = &(btitem->bti_itup);
|
|
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
|
|
par_blkno = BufferGetBlockNumber(*bufP);
|
|
|
|
/*
|
|
* We need to save the bit image of the index entry we chose in
|
|
* the parent page on a stack. In case we split the tree, we'll
|
|
* use this bit image to figure out what our real parent page is,
|
|
* in case the parent splits while we're working lower in the
|
|
* tree. See the paper by Lehman and Yao for how this is detected
|
|
* and handled. (We use the child link to disambiguate duplicate
|
|
* keys in the index -- Lehman and Yao disallow duplicate keys.)
|
|
*/
|
|
new_stack = (BTStack) palloc(sizeof(BTStackData));
|
|
new_stack->bts_blkno = par_blkno;
|
|
new_stack->bts_offset = offnum;
|
|
memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));
|
|
new_stack->bts_parent = stack_in;
|
|
|
|
/* drop the read lock on the parent page, acquire one on the child */
|
|
_bt_relbuf(rel, *bufP);
|
|
*bufP = _bt_getbuf(rel, blkno, BT_READ);
|
|
|
|
/*
|
|
* Race -- the page we just grabbed may have split since we read
|
|
* its pointer in the parent. If it has, we may need to move
|
|
* right to its new sibling. Do that.
|
|
*/
|
|
*bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
|
|
|
|
/* okay, all set to move down a level */
|
|
stack_in = new_stack;
|
|
}
|
|
|
|
return stack_in;
|
|
}
|
|
|
|
/*
|
|
* _bt_moveright() -- move right in the btree if necessary.
|
|
*
|
|
* When we drop and reacquire a pointer to a page, it is possible that
|
|
* the page has changed in the meanwhile. If this happens, we're
|
|
* guaranteed that the page has "split right" -- that is, that any
|
|
* data that appeared on the page originally is either on the page
|
|
* or strictly to the right of it.
|
|
*
|
|
* This routine decides whether or not we need to move right in the
|
|
* tree by examining the high key entry on the page. If that entry
|
|
* is strictly less than one we expect to be on the page, then our
|
|
* picture of the page is incorrect and we need to move right.
|
|
*
|
|
* On entry, we have the buffer pinned and a lock of the proper type.
|
|
* If we move right, we release the buffer and lock and acquire the
|
|
* same on the right sibling. Return value is the buffer we stop at.
|
|
*/
|
|
Buffer
|
|
_bt_moveright(Relation rel,
|
|
Buffer buf,
|
|
int keysz,
|
|
ScanKey scankey,
|
|
int access)
|
|
{
|
|
Page page;
|
|
BTPageOpaque opaque;
|
|
|
|
page = BufferGetPage(buf);
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
/*
|
|
* If the scan key that brought us to this page is > the high key
|
|
* stored on the page, then the page has split and we need to move
|
|
* right. (If the scan key is equal to the high key, we might or
|
|
* might not need to move right; have to scan the page first anyway.)
|
|
* It could even have split more than once, so scan as far as needed.
|
|
*/
|
|
while (!P_RIGHTMOST(opaque) &&
|
|
_bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)
|
|
{
|
|
/* step right one page */
|
|
BlockNumber rblkno = opaque->btpo_next;
|
|
|
|
_bt_relbuf(rel, buf);
|
|
buf = _bt_getbuf(rel, rblkno, access);
|
|
page = BufferGetPage(buf);
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
}
|
|
|
|
return buf;
|
|
}
|
|
|
|
/*
|
|
* _bt_binsrch() -- Do a binary search for a key on a particular page.
|
|
*
|
|
* The scankey we get has the compare function stored in the procedure
|
|
* entry of each data struct. We invoke this regproc to do the
|
|
* comparison for every key in the scankey.
|
|
*
|
|
* On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
|
|
* key >= given scankey. (NOTE: in particular, this means it is possible
|
|
* to return a value 1 greater than the number of keys on the page,
|
|
* if the scankey is > all keys on the page.)
|
|
*
|
|
* On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
|
|
* of the last key < given scankey. (Since _bt_compare treats the first
|
|
* data key of such a page as minus infinity, there will be at least one
|
|
* key < scankey, so the result always points at one of the keys on the
|
|
* page.) This key indicates the right place to descend to be sure we
|
|
* find all leaf keys >= given scankey.
|
|
*
|
|
* This procedure is not responsible for walking right, it just examines
|
|
* the given page. _bt_binsrch() has no lock or refcount side effects
|
|
* on the buffer.
|
|
*/
|
|
OffsetNumber
|
|
_bt_binsrch(Relation rel,
|
|
Buffer buf,
|
|
int keysz,
|
|
ScanKey scankey)
|
|
{
|
|
TupleDesc itupdesc;
|
|
Page page;
|
|
BTPageOpaque opaque;
|
|
OffsetNumber low,
|
|
high;
|
|
int32 result;
|
|
|
|
itupdesc = RelationGetDescr(rel);
|
|
page = BufferGetPage(buf);
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
low = P_FIRSTDATAKEY(opaque);
|
|
high = PageGetMaxOffsetNumber(page);
|
|
|
|
/*
|
|
* If there are no keys on the page, return the first available slot.
|
|
* Note this covers two cases: the page is really empty (no keys), or
|
|
* it contains only a high key. The latter case is possible after
|
|
* vacuuming. This can never happen on an internal page, however,
|
|
* since they are never empty (an internal page must have children).
|
|
*/
|
|
if (high < low)
|
|
return low;
|
|
|
|
/*
|
|
* Binary search to find the first key on the page >= scan key. Loop
|
|
* invariant: all slots before 'low' are < scan key, all slots at or
|
|
* after 'high' are >= scan key. We can fall out when high == low.
|
|
*/
|
|
high++; /* establish the loop invariant for high */
|
|
|
|
while (high > low)
|
|
{
|
|
OffsetNumber mid = low + ((high - low) / 2);
|
|
|
|
/* We have low <= mid < high, so mid points at a real slot */
|
|
|
|
result = _bt_compare(rel, keysz, scankey, page, mid);
|
|
|
|
if (result > 0)
|
|
low = mid + 1;
|
|
else
|
|
high = mid;
|
|
}
|
|
|
|
/*
|
|
* At this point we have high == low, but be careful: they could point
|
|
* past the last slot on the page.
|
|
*
|
|
* On a leaf page, we always return the first key >= scan key (which
|
|
* could be the last slot + 1).
|
|
*/
|
|
if (P_ISLEAF(opaque))
|
|
return low;
|
|
|
|
/*
|
|
* On a non-leaf page, return the last key < scan key. There must be
|
|
* one if _bt_compare() is playing by the rules.
|
|
*/
|
|
Assert(low > P_FIRSTDATAKEY(opaque));
|
|
|
|
return OffsetNumberPrev(low);
|
|
}
|
|
|
|
/*----------
|
|
* _bt_compare() -- Compare scankey to a particular tuple on the page.
|
|
*
|
|
* keysz: number of key conditions to be checked (might be less than the
|
|
* total length of the scan key!)
|
|
* page/offnum: location of btree item to be compared to.
|
|
*
|
|
* This routine returns:
|
|
* <0 if scankey < tuple at offnum;
|
|
* 0 if scankey == tuple at offnum;
|
|
* >0 if scankey > tuple at offnum.
|
|
* NULLs in the keys are treated as sortable values. Therefore
|
|
* "equality" does not necessarily mean that the item should be
|
|
* returned to the caller as a matching key!
|
|
*
|
|
* CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
|
|
* "minus infinity": this routine will always claim it is less than the
|
|
* scankey. The actual key value stored (if any, which there probably isn't)
|
|
* does not matter. This convention allows us to implement the Lehman and
|
|
* Yao convention that the first down-link pointer is before the first key.
|
|
* See backend/access/nbtree/README for details.
|
|
*----------
|
|
*/
|
|
int32
|
|
_bt_compare(Relation rel,
|
|
int keysz,
|
|
ScanKey scankey,
|
|
Page page,
|
|
OffsetNumber offnum)
|
|
{
|
|
TupleDesc itupdesc = RelationGetDescr(rel);
|
|
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
BTItem btitem;
|
|
IndexTuple itup;
|
|
int i;
|
|
|
|
/*
|
|
* Force result ">" if target item is first data item on an internal
|
|
* page --- see NOTE above.
|
|
*/
|
|
if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
|
|
return 1;
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
itup = &(btitem->bti_itup);
|
|
|
|
/*
|
|
* The scan key is set up with the attribute number associated with
|
|
* each term in the key. It is important that, if the index is
|
|
* multi-key, the scan contain the first k key attributes, and that
|
|
* they be in order. If you think about how multi-key ordering works,
|
|
* you'll understand why this is.
|
|
*
|
|
* We don't test for violation of this condition here, however. The
|
|
* initial setup for the index scan had better have gotten it right
|
|
* (see _bt_first).
|
|
*/
|
|
|
|
for (i = 0; i < keysz; i++)
|
|
{
|
|
ScanKey entry = &scankey[i];
|
|
Datum datum;
|
|
bool isNull;
|
|
int32 result;
|
|
|
|
datum = index_getattr(itup, entry->sk_attno, itupdesc, &isNull);
|
|
|
|
/* see comments about NULLs handling in btbuild */
|
|
if (entry->sk_flags & SK_ISNULL) /* key is NULL */
|
|
{
|
|
if (isNull)
|
|
result = 0; /* NULL "=" NULL */
|
|
else
|
|
result = 1; /* NULL ">" NOT_NULL */
|
|
}
|
|
else if (isNull) /* key is NOT_NULL and item is NULL */
|
|
{
|
|
result = -1; /* NOT_NULL "<" NULL */
|
|
}
|
|
else
|
|
{
|
|
result = DatumGetInt32(FunctionCall2(&entry->sk_func,
|
|
entry->sk_argument,
|
|
datum));
|
|
}
|
|
|
|
/* if the keys are unequal, return the difference */
|
|
if (result != 0)
|
|
return result;
|
|
}
|
|
|
|
/* if we get here, the keys are equal */
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* _bt_next() -- Get the next item in a scan.
|
|
*
|
|
* On entry, we have a valid currentItemData in the scan, and a
|
|
* read lock and pin count on the page that contains that item.
|
|
* We return the next item in the scan, or false if no more.
|
|
* On successful exit, the page containing the new item is locked
|
|
* and pinned; on failure exit, no lock or pin is held.
|
|
*/
|
|
bool
|
|
_bt_next(IndexScanDesc scan, ScanDirection dir)
|
|
{
|
|
Relation rel;
|
|
Buffer buf;
|
|
Page page;
|
|
OffsetNumber offnum;
|
|
ItemPointer current;
|
|
BTItem btitem;
|
|
IndexTuple itup;
|
|
BTScanOpaque so;
|
|
bool continuescan;
|
|
|
|
rel = scan->indexRelation;
|
|
so = (BTScanOpaque) scan->opaque;
|
|
current = &(scan->currentItemData);
|
|
|
|
/* we still have the buffer pinned and locked */
|
|
buf = so->btso_curbuf;
|
|
Assert(BufferIsValid(buf));
|
|
|
|
do
|
|
{
|
|
/* step one tuple in the appropriate direction */
|
|
if (!_bt_step(scan, &buf, dir))
|
|
return false;
|
|
|
|
/* current is the next candidate tuple to return */
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
page = BufferGetPage(buf);
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
itup = &btitem->bti_itup;
|
|
|
|
if (_bt_checkkeys(scan, itup, dir, &continuescan))
|
|
{
|
|
/* tuple passes all scan key conditions, so return it */
|
|
scan->xs_ctup.t_self = itup->t_tid;
|
|
return true;
|
|
}
|
|
|
|
/* This tuple doesn't pass, but there might be more that do */
|
|
} while (continuescan);
|
|
|
|
/* No more items, so close down the current-item info */
|
|
ItemPointerSetInvalid(current);
|
|
so->btso_curbuf = InvalidBuffer;
|
|
_bt_relbuf(rel, buf);
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* _bt_first() -- Find the first item in a scan.
|
|
*
|
|
* We need to be clever about the type of scan, the operation it's
|
|
* performing, and the tree ordering. We find the
|
|
* first item in the tree that satisfies the qualification
|
|
* associated with the scan descriptor. On exit, the page containing
|
|
* the current index tuple is read locked and pinned, and the scan's
|
|
* opaque data entry is updated to include the buffer.
|
|
*/
|
|
bool
|
|
_bt_first(IndexScanDesc scan, ScanDirection dir)
|
|
{
|
|
Relation rel = scan->indexRelation;
|
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
|
Buffer buf;
|
|
Page page;
|
|
BTStack stack;
|
|
OffsetNumber offnum;
|
|
BTItem btitem;
|
|
IndexTuple itup;
|
|
ItemPointer current;
|
|
BlockNumber blkno;
|
|
StrategyNumber strat;
|
|
bool res;
|
|
int32 result;
|
|
bool scanFromEnd;
|
|
bool continuescan;
|
|
ScanKey scankeys = NULL;
|
|
int keysCount = 0;
|
|
int *nKeyIs = NULL;
|
|
int i,
|
|
j;
|
|
StrategyNumber strat_total;
|
|
|
|
/*
|
|
* Order the scan keys in our canonical fashion and eliminate any
|
|
* redundant keys.
|
|
*/
|
|
_bt_orderkeys(scan);
|
|
|
|
/*
|
|
* Quit now if _bt_orderkeys() discovered that the scan keys can never
|
|
* be satisfied (eg, x == 1 AND x > 2).
|
|
*/
|
|
if (!so->qual_ok)
|
|
return false;
|
|
|
|
/*
|
|
* Examine the scan keys to discover where we need to start the scan.
|
|
*/
|
|
scanFromEnd = false;
|
|
strat_total = BTEqualStrategyNumber;
|
|
if (so->numberOfKeys > 0)
|
|
{
|
|
nKeyIs = (int *) palloc(so->numberOfKeys * sizeof(int));
|
|
for (i = 0; i < so->numberOfKeys; i++)
|
|
{
|
|
AttrNumber attno = so->keyData[i].sk_attno;
|
|
|
|
/* ignore keys for already-determined attrs */
|
|
if (attno <= keysCount)
|
|
continue;
|
|
/* if we didn't find a boundary for the preceding attr, quit */
|
|
if (attno > keysCount + 1)
|
|
break;
|
|
strat = _bt_getstrat(rel, attno,
|
|
so->keyData[i].sk_procedure);
|
|
|
|
/*
|
|
* Can we use this key as a starting boundary for this attr?
|
|
*
|
|
* We can use multiple keys if they look like, say, = >= = but we
|
|
* have to stop after accepting a > or < boundary.
|
|
*/
|
|
if (strat == strat_total ||
|
|
strat == BTEqualStrategyNumber)
|
|
nKeyIs[keysCount++] = i;
|
|
else if (ScanDirectionIsBackward(dir) &&
|
|
(strat == BTLessStrategyNumber ||
|
|
strat == BTLessEqualStrategyNumber))
|
|
{
|
|
nKeyIs[keysCount++] = i;
|
|
strat_total = strat;
|
|
if (strat == BTLessStrategyNumber)
|
|
break;
|
|
}
|
|
else if (ScanDirectionIsForward(dir) &&
|
|
(strat == BTGreaterStrategyNumber ||
|
|
strat == BTGreaterEqualStrategyNumber))
|
|
{
|
|
nKeyIs[keysCount++] = i;
|
|
strat_total = strat;
|
|
if (strat == BTGreaterStrategyNumber)
|
|
break;
|
|
}
|
|
}
|
|
if (keysCount == 0)
|
|
scanFromEnd = true;
|
|
}
|
|
else
|
|
scanFromEnd = true;
|
|
|
|
/* if we just need to walk down one edge of the tree, do that */
|
|
if (scanFromEnd)
|
|
{
|
|
if (nKeyIs)
|
|
pfree(nKeyIs);
|
|
return _bt_endpoint(scan, dir);
|
|
}
|
|
|
|
/*
|
|
* We want to start the scan somewhere within the index. Set up a
|
|
* scankey we can use to search for the correct starting point.
|
|
*/
|
|
scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData));
|
|
for (i = 0; i < keysCount; i++)
|
|
{
|
|
FmgrInfo *procinfo;
|
|
|
|
j = nKeyIs[i];
|
|
|
|
/*
|
|
* _bt_orderkeys disallows it, but it's place to add some code
|
|
* later
|
|
*/
|
|
if (so->keyData[j].sk_flags & SK_ISNULL)
|
|
{
|
|
pfree(nKeyIs);
|
|
pfree(scankeys);
|
|
elog(ERROR, "_bt_first: btree doesn't support is(not)null, yet");
|
|
return false;
|
|
}
|
|
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
|
|
ScanKeyEntryInitializeWithInfo(scankeys + i,
|
|
so->keyData[j].sk_flags,
|
|
i + 1,
|
|
procinfo,
|
|
CurrentMemoryContext,
|
|
so->keyData[j].sk_argument);
|
|
}
|
|
if (nKeyIs)
|
|
pfree(nKeyIs);
|
|
|
|
current = &(scan->currentItemData);
|
|
|
|
/*
|
|
* Use the manufactured scan key to descend the tree and position
|
|
* ourselves on the target leaf page.
|
|
*/
|
|
stack = _bt_search(rel, keysCount, scankeys, &buf, BT_READ);
|
|
|
|
/* don't need to keep the stack around... */
|
|
_bt_freestack(stack);
|
|
|
|
if (!BufferIsValid(buf))
|
|
{
|
|
/* Only get here if index is completely empty */
|
|
ItemPointerSetInvalid(current);
|
|
so->btso_curbuf = InvalidBuffer;
|
|
pfree(scankeys);
|
|
return false;
|
|
}
|
|
|
|
/* remember which buffer we have pinned */
|
|
so->btso_curbuf = buf;
|
|
blkno = BufferGetBlockNumber(buf);
|
|
page = BufferGetPage(buf);
|
|
|
|
/* position to the precise item on the page */
|
|
offnum = _bt_binsrch(rel, buf, keysCount, scankeys);
|
|
|
|
ItemPointerSet(current, blkno, offnum);
|
|
|
|
/*
|
|
* At this point we are positioned at the first item >= scan key, or
|
|
* possibly at the end of a page on which all the existing items are
|
|
* greater than the scan key and we know that everything on later
|
|
* pages is less than or equal to scan key.
|
|
*
|
|
* We could step forward in the latter case, but that'd be a waste of
|
|
* time if we want to scan backwards. So, it's now time to examine
|
|
* the scan strategy to find the exact place to start the scan.
|
|
*
|
|
* Note: if _bt_step fails (meaning we fell off the end of the index in
|
|
* one direction or the other), we either return false (no matches) or
|
|
* call _bt_endpoint() to set up a scan starting at that index
|
|
* endpoint, as appropriate for the desired scan type.
|
|
*
|
|
* it's yet other place to add some code later for is(not)null ...
|
|
*/
|
|
|
|
switch (strat_total)
|
|
{
|
|
case BTLessStrategyNumber:
|
|
|
|
/*
|
|
* Back up one to arrive at last item < scankey
|
|
*/
|
|
if (!_bt_step(scan, &buf, BackwardScanDirection))
|
|
{
|
|
pfree(scankeys);
|
|
return false;
|
|
}
|
|
break;
|
|
|
|
case BTLessEqualStrategyNumber:
|
|
|
|
/*
|
|
* We need to find the last item <= scankey, so step forward
|
|
* till we find one > scankey, then step back one.
|
|
*/
|
|
if (offnum > PageGetMaxOffsetNumber(page))
|
|
{
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
{
|
|
pfree(scankeys);
|
|
return _bt_endpoint(scan, dir);
|
|
}
|
|
}
|
|
for (;;)
|
|
{
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
page = BufferGetPage(buf);
|
|
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
|
|
if (result < 0)
|
|
break;
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
{
|
|
pfree(scankeys);
|
|
return _bt_endpoint(scan, dir);
|
|
}
|
|
}
|
|
if (!_bt_step(scan, &buf, BackwardScanDirection))
|
|
{
|
|
pfree(scankeys);
|
|
return false;
|
|
}
|
|
break;
|
|
|
|
case BTEqualStrategyNumber:
|
|
|
|
/*
|
|
* Make sure we are on the first equal item; might have to
|
|
* step forward if currently at end of page.
|
|
*/
|
|
if (offnum > PageGetMaxOffsetNumber(page))
|
|
{
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
{
|
|
pfree(scankeys);
|
|
return false;
|
|
}
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
page = BufferGetPage(buf);
|
|
}
|
|
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
|
|
if (result != 0)
|
|
goto nomatches; /* no equal items! */
|
|
|
|
/*
|
|
* If a backward scan was specified, need to start with last
|
|
* equal item not first one.
|
|
*/
|
|
if (ScanDirectionIsBackward(dir))
|
|
{
|
|
do
|
|
{
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
{
|
|
pfree(scankeys);
|
|
return _bt_endpoint(scan, dir);
|
|
}
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
page = BufferGetPage(buf);
|
|
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
|
|
} while (result == 0);
|
|
if (!_bt_step(scan, &buf, BackwardScanDirection))
|
|
elog(ERROR, "_bt_first: equal items disappeared?");
|
|
}
|
|
break;
|
|
|
|
case BTGreaterEqualStrategyNumber:
|
|
|
|
/*
|
|
* We want the first item >= scankey, which is where we are...
|
|
* unless we're not anywhere at all...
|
|
*/
|
|
if (offnum > PageGetMaxOffsetNumber(page))
|
|
{
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
{
|
|
pfree(scankeys);
|
|
return false;
|
|
}
|
|
}
|
|
break;
|
|
|
|
case BTGreaterStrategyNumber:
|
|
|
|
/*
|
|
* We want the first item > scankey, so make sure we are on an
|
|
* item and then step over any equal items.
|
|
*/
|
|
if (offnum > PageGetMaxOffsetNumber(page))
|
|
{
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
{
|
|
pfree(scankeys);
|
|
return false;
|
|
}
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
page = BufferGetPage(buf);
|
|
}
|
|
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
|
|
while (result == 0)
|
|
{
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
{
|
|
pfree(scankeys);
|
|
return false;
|
|
}
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
page = BufferGetPage(buf);
|
|
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
|
|
}
|
|
break;
|
|
}
|
|
|
|
/* okay, current item pointer for the scan is right */
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
page = BufferGetPage(buf);
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
itup = &btitem->bti_itup;
|
|
|
|
/* is the first item actually acceptable? */
|
|
if (_bt_checkkeys(scan, itup, dir, &continuescan))
|
|
{
|
|
/* yes, return it */
|
|
scan->xs_ctup.t_self = itup->t_tid;
|
|
res = true;
|
|
}
|
|
else if (continuescan)
|
|
{
|
|
/* no, but there might be another one that is */
|
|
res = _bt_next(scan, dir);
|
|
}
|
|
else
|
|
{
|
|
/* no tuples in the index match this scan key */
|
|
nomatches:
|
|
ItemPointerSetInvalid(current);
|
|
so->btso_curbuf = InvalidBuffer;
|
|
_bt_relbuf(rel, buf);
|
|
res = false;
|
|
}
|
|
|
|
pfree(scankeys);
|
|
|
|
return res;
|
|
}
|
|
|
|
/*
|
|
* _bt_step() -- Step one item in the requested direction in a scan on
|
|
* the tree.
|
|
*
|
|
* *bufP is the current buffer (read-locked and pinned). If we change
|
|
* pages, it's updated appropriately.
|
|
*
|
|
* If successful, update scan's currentItemData and return true.
|
|
* If no adjacent record exists in the requested direction,
|
|
* release buffer pin/locks and return false.
|
|
*/
|
|
bool
|
|
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
|
|
{
|
|
Relation rel = scan->indexRelation;
|
|
ItemPointer current = &(scan->currentItemData);
|
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
|
Page page;
|
|
BTPageOpaque opaque;
|
|
OffsetNumber offnum,
|
|
maxoff;
|
|
BlockNumber blkno;
|
|
BlockNumber obknum;
|
|
|
|
/*
|
|
* Don't use ItemPointerGetOffsetNumber or you risk to get assertion
|
|
* due to ability of ip_posid to be equal 0.
|
|
*/
|
|
offnum = current->ip_posid;
|
|
|
|
page = BufferGetPage(*bufP);
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
if (ScanDirectionIsForward(dir))
|
|
{
|
|
if (!PageIsEmpty(page) && offnum < maxoff)
|
|
offnum = OffsetNumberNext(offnum);
|
|
else
|
|
{
|
|
/* walk right to the next page with data */
|
|
for (;;)
|
|
{
|
|
/* if we're at end of scan, release the buffer and return */
|
|
if (P_RIGHTMOST(opaque))
|
|
{
|
|
_bt_relbuf(rel, *bufP);
|
|
ItemPointerSetInvalid(current);
|
|
*bufP = so->btso_curbuf = InvalidBuffer;
|
|
return false;
|
|
}
|
|
/* step right one page */
|
|
blkno = opaque->btpo_next;
|
|
_bt_relbuf(rel, *bufP);
|
|
*bufP = _bt_getbuf(rel, blkno, BT_READ);
|
|
page = BufferGetPage(*bufP);
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
/* done if it's not empty */
|
|
offnum = P_FIRSTDATAKEY(opaque);
|
|
if (!PageIsEmpty(page) && offnum <= maxoff)
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (offnum > P_FIRSTDATAKEY(opaque))
|
|
offnum = OffsetNumberPrev(offnum);
|
|
else
|
|
{
|
|
/* walk left to the next page with data */
|
|
for (;;)
|
|
{
|
|
/* if we're at end of scan, release the buffer and return */
|
|
if (P_LEFTMOST(opaque))
|
|
{
|
|
_bt_relbuf(rel, *bufP);
|
|
ItemPointerSetInvalid(current);
|
|
*bufP = so->btso_curbuf = InvalidBuffer;
|
|
return false;
|
|
}
|
|
/* step left */
|
|
obknum = BufferGetBlockNumber(*bufP);
|
|
blkno = opaque->btpo_prev;
|
|
_bt_relbuf(rel, *bufP);
|
|
*bufP = _bt_getbuf(rel, blkno, BT_READ);
|
|
page = BufferGetPage(*bufP);
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
/*
|
|
* If the adjacent page just split, then we have to walk
|
|
* right to find the block that's now adjacent to where we
|
|
* were. Because pages only split right, we don't have to
|
|
* worry about this failing to terminate.
|
|
*/
|
|
while (opaque->btpo_next != obknum)
|
|
{
|
|
blkno = opaque->btpo_next;
|
|
_bt_relbuf(rel, *bufP);
|
|
*bufP = _bt_getbuf(rel, blkno, BT_READ);
|
|
page = BufferGetPage(*bufP);
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
}
|
|
/* done if it's not empty */
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
offnum = maxoff;
|
|
if (!PageIsEmpty(page) && maxoff >= P_FIRSTDATAKEY(opaque))
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Update scan state */
|
|
so->btso_curbuf = *bufP;
|
|
blkno = BufferGetBlockNumber(*bufP);
|
|
ItemPointerSet(current, blkno, offnum);
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* _bt_endpoint() -- Find the first or last key in the index.
|
|
*
|
|
* This is used by _bt_first() to set up a scan when we've determined
|
|
* that the scan must start at the beginning or end of the index (for
|
|
* a forward or backward scan respectively).
|
|
*/
|
|
static bool
|
|
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
|
|
{
|
|
Relation rel;
|
|
Buffer buf;
|
|
Page page;
|
|
BTPageOpaque opaque;
|
|
ItemPointer current;
|
|
OffsetNumber offnum,
|
|
maxoff;
|
|
OffsetNumber start;
|
|
BlockNumber blkno;
|
|
BTItem btitem;
|
|
IndexTuple itup;
|
|
BTScanOpaque so;
|
|
bool res;
|
|
bool continuescan;
|
|
|
|
rel = scan->indexRelation;
|
|
current = &(scan->currentItemData);
|
|
so = (BTScanOpaque) scan->opaque;
|
|
|
|
/*
|
|
* Scan down to the leftmost or rightmost leaf page. This is a
|
|
* simplified version of _bt_search(). We don't maintain a stack
|
|
* since we know we won't need it.
|
|
*/
|
|
buf = _bt_getroot(rel, BT_READ);
|
|
|
|
if (!BufferIsValid(buf))
|
|
{
|
|
/* empty index... */
|
|
ItemPointerSetInvalid(current);
|
|
so->btso_curbuf = InvalidBuffer;
|
|
return false;
|
|
}
|
|
|
|
blkno = BufferGetBlockNumber(buf);
|
|
page = BufferGetPage(buf);
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
for (;;)
|
|
{
|
|
if (P_ISLEAF(opaque))
|
|
break;
|
|
|
|
if (ScanDirectionIsForward(dir))
|
|
offnum = P_FIRSTDATAKEY(opaque);
|
|
else
|
|
offnum = PageGetMaxOffsetNumber(page);
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
itup = &(btitem->bti_itup);
|
|
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
|
|
|
|
_bt_relbuf(rel, buf);
|
|
buf = _bt_getbuf(rel, blkno, BT_READ);
|
|
|
|
page = BufferGetPage(buf);
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
/*
|
|
* Race condition: If the child page we just stepped onto was just
|
|
* split, we need to make sure we're all the way at the right edge
|
|
* of the tree. See the paper by Lehman and Yao.
|
|
*/
|
|
if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque))
|
|
{
|
|
do
|
|
{
|
|
blkno = opaque->btpo_next;
|
|
_bt_relbuf(rel, buf);
|
|
buf = _bt_getbuf(rel, blkno, BT_READ);
|
|
page = BufferGetPage(buf);
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
} while (!P_RIGHTMOST(opaque));
|
|
}
|
|
}
|
|
|
|
/* okay, we've got the {left,right}-most page in the tree */
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
if (ScanDirectionIsForward(dir))
|
|
{
|
|
Assert(P_LEFTMOST(opaque));
|
|
|
|
start = P_FIRSTDATAKEY(opaque);
|
|
}
|
|
else if (ScanDirectionIsBackward(dir))
|
|
{
|
|
Assert(P_RIGHTMOST(opaque));
|
|
|
|
start = PageGetMaxOffsetNumber(page);
|
|
if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty
|
|
* page */
|
|
start = P_FIRSTDATAKEY(opaque);
|
|
}
|
|
else
|
|
{
|
|
elog(ERROR, "Illegal scan direction %d", dir);
|
|
start = 0; /* keep compiler quiet */
|
|
}
|
|
|
|
ItemPointerSet(current, blkno, start);
|
|
/* remember which buffer we have pinned */
|
|
so->btso_curbuf = buf;
|
|
|
|
/*
|
|
* Left/rightmost page could be empty due to deletions, if so step
|
|
* till we find a nonempty page.
|
|
*/
|
|
if (start > maxoff)
|
|
{
|
|
if (!_bt_step(scan, &buf, dir))
|
|
return false;
|
|
start = ItemPointerGetOffsetNumber(current);
|
|
page = BufferGetPage(buf);
|
|
}
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
|
|
itup = &(btitem->bti_itup);
|
|
|
|
/* see if we picked a winner */
|
|
if (_bt_checkkeys(scan, itup, dir, &continuescan))
|
|
{
|
|
/* yes, return it */
|
|
scan->xs_ctup.t_self = itup->t_tid;
|
|
res = true;
|
|
}
|
|
else if (continuescan)
|
|
{
|
|
/* no, but there might be another one that is */
|
|
res = _bt_next(scan, dir);
|
|
}
|
|
else
|
|
{
|
|
/* no tuples in the index match this scan key */
|
|
ItemPointerSetInvalid(current);
|
|
so->btso_curbuf = InvalidBuffer;
|
|
_bt_relbuf(rel, buf);
|
|
res = false;
|
|
}
|
|
|
|
return res;
|
|
}
|