postgresql/src/backend/access/nbtree/nbtsearch.c

1522 lines
41 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* nbtsearch.c
* search code for postgres btrees.
*
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.59 2000/04/12 17:14:49 momjian Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/nbtree.h"
static BTStack _bt_searchr(Relation rel, int keysz, ScanKey scankey,
1997-09-08 22:59:27 +02:00
Buffer *bufP, BTStack stack_in);
static int _bt_compare(Relation rel, TupleDesc itupdesc, Page page,
int keysz, ScanKey scankey, OffsetNumber offnum);
static bool
1997-09-08 22:59:27 +02:00
_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
static RetrieveIndexResult
_bt_endpoint(IndexScanDesc scan, ScanDirection dir);
/*
* _bt_search() -- Search for a scan key in the index.
*
* This routine is actually just a helper that sets things up and
* calls a recursive-descent search routine on the tree.
*/
BTStack
1997-09-08 22:59:27 +02:00
_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP)
{
*bufP = _bt_getroot(rel, BT_READ);
1998-09-01 05:29:17 +02:00
return _bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL);
}
/*
* _bt_searchr() -- Search the tree recursively for a particular scankey.
*/
static BTStack
_bt_searchr(Relation rel,
int keysz,
ScanKey scankey,
1997-09-08 22:59:27 +02:00
Buffer *bufP,
BTStack stack_in)
{
BTStack stack;
OffsetNumber offnum;
Page page;
BTPageOpaque opaque;
BlockNumber par_blkno;
BlockNumber blkno;
ItemId itemid;
BTItem btitem;
BTItem item_save;
int item_nbytes;
IndexTuple itup;
/* if this is a leaf page, we're done */
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (opaque->btpo_flags & BTP_LEAF)
1998-09-01 05:29:17 +02:00
return stack_in;
/*
* Find the appropriate item on the internal page, and get the child
* page that it points to.
*/
par_blkno = BufferGetBlockNumber(*bufP);
offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT);
itemid = PageGetItemId(page, offnum);
btitem = (BTItem) PageGetItem(page, itemid);
itup = &(btitem->bti_itup);
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
/*
* We need to save the bit image of the index entry we chose in the
* parent page on a stack. In case we split the tree, we'll use this
* bit image to figure out what our real parent page is, in case the
* parent splits while we're working lower in the tree. See the paper
* by Lehman and Yao for how this is detected and handled. (We use
* unique OIDs to disambiguate duplicate keys in the index -- Lehman
* and Yao disallow duplicate keys).
*/
item_nbytes = ItemIdGetLength(itemid);
item_save = (BTItem) palloc(item_nbytes);
memmove((char *) item_save, (char *) btitem, item_nbytes);
stack = (BTStack) palloc(sizeof(BTStackData));
stack->bts_blkno = par_blkno;
stack->bts_offset = offnum;
stack->bts_btitem = item_save;
stack->bts_parent = stack_in;
/* drop the read lock on the parent page and acquire one on the child */
_bt_relbuf(rel, *bufP, BT_READ);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
/*
* Race -- the page we just grabbed may have split since we read its
* pointer in the parent. If it has, we may need to move right to its
* new sibling. Do that.
*/
*bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
/* okay, all set to move down a level */
1998-09-01 05:29:17 +02:00
return _bt_searchr(rel, keysz, scankey, bufP, stack);
}
/*
 *	_bt_moveright() -- move right in the btree if necessary.
 *
 *		When we drop and reacquire a pointer to a page, it is possible that
 *		the page has changed in the meanwhile.  If this happens, we're
 *		guaranteed that the page has "split right" -- that is, that any
 *		data that appeared on the page originally is either on the page
 *		or strictly to the right of it.
 *
 *		This routine decides whether or not we need to move right in the
 *		tree by examining the high key entry on the page.  If the scan key
 *		is >= that high key, our picture of the page is out of date and we
 *		step right, repeating until we reach a page whose high key is
 *		greater than the scan key, or the rightmost page.
 *
 *		rel		- the index relation
 *		buf		- buffer of the page we arrived at
 *		keysz	- number of entries in scankey
 *		scankey - the key that led us to this page
 *		access	- lock mode used to release and reacquire buffers
 *
 *		On entry, we have the buffer pinned and a lock of the proper type.
 *		If we move right, we release the buffer and lock and acquire the
 *		same on the right sibling.  Returns the buffer we end up on.
 */
Buffer
_bt_moveright(Relation rel,
			  Buffer buf,
			  int keysz,
			  ScanKey scankey,
			  int access)
{
	Page		page;
	BTPageOpaque opaque;
	ItemId		hikey;
	BlockNumber rblkno;
	int			natts = rel->rd_rel->relnatts;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* if we're on a rightmost page, we don't need to move right */
	if (P_RIGHTMOST(opaque))
		return buf;

	/* by convention, item P_HIKEY on non-rightmost pages is the high key */
	hikey = PageGetItemId(page, P_HIKEY);

	/*
	 * If the scan key that brought us to this page is >= the high key
	 * stored on the page, then the page has split and we need to move
	 * right.
	 */
	if (_bt_skeycmp(rel, keysz, scankey, page, hikey,
					BTGreaterEqualStrategyNumber))
	{
		/* move right as long as we need to */
		do
		{
			OffsetNumber offmax = PageGetMaxOffsetNumber(page);

			/*
			 * If this page consists of all duplicate keys (hikey and
			 * first key on the page have the same value), then we don't
			 * need to step right.
			 *
			 * NOTE for multi-column indices: we may scan using keys that
			 * do not cover all attrs, but duplicates are handled using all
			 * attrs in the _bt_insert/_bt_spool code.  So we have to
			 * compare the scankey with the _last_ item on this page, so as
			 * not to lose "good" tuples when number of attrs > keysize.
			 * Example: (2,0) is the last item on this page, (2,1) is the
			 * first item on the next page (hikey), and our scankey is
			 * x = 2.  Scankey == (2,1) because we compare first attrs
			 * only, but we must not move right from here.
			 * - vadim 04/15/97
			 *
			 * Also, if this page is not a LEAF one (and # of attrs >
			 * keysize) then we can't move either. - vadim 10/22/97
			 */
			if (_bt_skeycmp(rel, keysz, scankey, page, hikey,
							BTEqualStrategyNumber))
			{
				if (opaque->btpo_flags & BTP_CHAIN)
				{
					Assert((opaque->btpo_flags & BTP_LEAF) || offmax > P_HIKEY);
					break;
				}
				if (offmax > P_HIKEY)
				{
					if (natts == keysz) /* sanity checks */
					{
						/*
						 * NOTE(review): the message below treats any
						 * non-zero 'access' as a write; confirm that
						 * matches the actual BT_READ/BT_WRITE values.
						 */
						if (_bt_skeycmp(rel, keysz, scankey, page,
										PageGetItemId(page, P_FIRSTKEY),
										BTEqualStrategyNumber))
							elog(FATAL, "btree: BTP_CHAIN flag was expected in %s (access = %s)",
								 RelationGetRelationName(rel), access ? "bt_write" : "bt_read");
						if (_bt_skeycmp(rel, keysz, scankey, page,
										PageGetItemId(page, offmax),
										BTEqualStrategyNumber))
							elog(FATAL, "btree: unexpected equal last item");
						if (_bt_skeycmp(rel, keysz, scankey, page,
										PageGetItemId(page, offmax),
										BTLessStrategyNumber))
							elog(FATAL, "btree: unexpected greater last item");
						/* move right */
					}
					else if (!(opaque->btpo_flags & BTP_LEAF))
						break;
					else if (_bt_skeycmp(rel, keysz, scankey, page,
										 PageGetItemId(page, offmax),
										 BTLessEqualStrategyNumber))
						break;
				}
			}

			/* step right one page */
			rblkno = opaque->btpo_next;
			_bt_relbuf(rel, buf, access);
			buf = _bt_getbuf(rel, rblkno, access);
			page = BufferGetPage(buf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			hikey = PageGetItemId(page, P_HIKEY);

			/* hikey is only consulted below when the page is not rightmost */
		} while (!P_RIGHTMOST(opaque)
				 && _bt_skeycmp(rel, keysz, scankey, page, hikey,
								BTGreaterEqualStrategyNumber));
	}

	return buf;
}
/*
 *	_bt_skeycmp() -- compare a scan key to a particular item on a page using
 *				a requested strategy (<, <=, =, >=, >).
 *
 *		We ignore the unique OIDs stored in the btree item here.  Those
 *		numbers are intended for use internally only, in repositioning a
 *		scan after a page split.  They do not impose any meaningful ordering.
 *
 *		The comparison is A <op> B, where A is the scan key and B is the
 *		tuple pointed at by itemid on page.
 *
 *		rel		- the index relation (supplies the tuple descriptor)
 *		keysz	- number of entries in scankey; entry i must refer to
 *				  attribute i (asserted)
 *		scankey - the key, carrying a comparison function per attribute
 *		page	- page holding the item
 *		itemid	- line pointer of the item to compare against
 *		strat	- which relational test to apply to the comparison result
 *
 *		Attributes are compared left to right and the first non-zero
 *		result decides.  A NULL key equals a NULL datum and sorts after any
 *		non-NULL datum.  An unrecognized strategy raises an ERROR.
 */
bool
_bt_skeycmp(Relation rel,
			Size keysz,
			ScanKey scankey,
			Page page,
			ItemId itemid,
			StrategyNumber strat)
{
	BTItem		item;
	IndexTuple	indexTuple;
	TupleDesc	tupDes;
	int			i;
	int32		compare = 0;

	item = (BTItem) PageGetItem(page, itemid);
	indexTuple = &(item->bti_itup);

	tupDes = RelationGetDescr(rel);

	for (i = 1; i <= (int) keysz; i++)
	{
		ScanKey		entry = &scankey[i - 1];
		Datum		attrDatum;
		bool		isNull;
		Datum		keyDatum;

		Assert(entry->sk_attno == i);
		attrDatum = index_getattr(indexTuple,
								  entry->sk_attno,
								  tupDes,
								  &isNull);
		keyDatum = entry->sk_argument;

		/* see comments about NULLs handling in btbuild */
		if (entry->sk_flags & SK_ISNULL)	/* key is NULL */
		{
			if (isNull)
				compare = 0;	/* NULL key "=" NULL datum */
			else
				compare = 1;	/* NULL key ">" not-NULL datum */
		}
		else if (isNull)		/* key is NOT_NULL and item is NULL */
		{
			compare = -1;		/* not-NULL key "<" NULL datum */
		}
		else
			compare = (int32) FMGR_PTR2(&entry->sk_func, keyDatum, attrDatum);

		if (compare != 0)
			break;				/* done when we find unequal attributes */
	}

	/* translate the three-way result into the requested boolean test */
	switch (strat)
	{
		case BTLessStrategyNumber:
			return (bool) (compare < 0);
		case BTLessEqualStrategyNumber:
			return (bool) (compare <= 0);
		case BTEqualStrategyNumber:
			return (bool) (compare == 0);
		case BTGreaterEqualStrategyNumber:
			return (bool) (compare >= 0);
		case BTGreaterStrategyNumber:
			return (bool) (compare > 0);
	}

	elog(ERROR, "_bt_skeycmp: bogus strategy %d", (int) strat);
	return false;
}
/*
 *	_bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 *		The scankey we get has the compare function stored in the procedure
 *		entry of each data struct.  We invoke this regproc to do the
 *		comparison for every key in the scankey.  _bt_binsrch() returns
 *		the OffsetNumber of the first matching key on the page, or the
 *		OffsetNumber at which the matching key would appear if it were
 *		on this page.  (NOTE: in particular, this means it is possible to
 *		return a value 1 greater than the number of keys on the page, if
 *		the scankey is > all keys on the page.)
 *
 *		rel		 - the index relation
 *		buf		 - buffer holding the page to search
 *		keysz	 - number of entries in scankey
 *		scankey  - the key to look for
 *		srchtype - BT_DESCENT for a search descent; any other value is
 *				   treated as an insertion-style lookup
 *
 *		By the time this procedure is called, we're sure we're looking
 *		at the right page -- don't need to walk right.  _bt_binsrch() has
 *		no lock or refcount side effects on the buffer.
 */
OffsetNumber
_bt_binsrch(Relation rel,
			Buffer buf,
			int keysz,
			ScanKey scankey,
			int srchtype)
{
	TupleDesc	itupdesc;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high;
	bool		haveEq;
	int			natts = rel->rd_rel->relnatts;
	int			result;

	itupdesc = RelationGetDescr(rel);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* by convention, item 1 on any non-rightmost page is the high key */
	low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;

	high = PageGetMaxOffsetNumber(page);

	/*
	 * If there are no keys on the page, return the first available slot.
	 * Note this covers two cases: the page is really empty (no keys), or
	 * it contains only a high key.  The latter case is possible after
	 * vacuuming.
	 */
	if (high < low)
		return low;

	/*
	 * Binary search to find the first key on the page >= scan key. Loop
	 * invariant: all slots before 'low' are < scan key, all slots at or
	 * after 'high' are >= scan key.  Also, haveEq is true if the tuple at
	 * 'high' is == scan key.  We can fall out when high == low.
	 */
	high++;						/* establish the loop invariant for high */
	haveEq = false;

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid);

		if (result > 0)
			low = mid + 1;
		else
		{
			high = mid;
			haveEq = (result == 0);
		}
	}

	/*--------------------
	 * At this point we have high == low, but be careful: they could point
	 * past the last slot on the page.  We also know that haveEq is true
	 * if and only if there is an equal key (in which case high&low point
	 * at the first equal key).
	 *
	 * On a leaf page, we always return the first key >= scan key
	 * (which could be the last slot + 1).
	 *--------------------
	 */

	if (opaque->btpo_flags & BTP_LEAF)
		return low;

	/*--------------------
	 * On a non-leaf page, there are special cases:
	 *
	 * For an insertion (srchtype != BT_DESCENT and natts == keysz)
	 * always return first key >= scan key (which could be off the end).
	 *
	 * For a standard search (srchtype == BT_DESCENT and natts == keysz)
	 * return the first equal key if one exists, else the last lesser key
	 * if one exists, else the first slot on the page.
	 *
	 * For a partial-match search (srchtype == BT_DESCENT and natts > keysz)
	 * return the last lesser key if one exists, else the first slot.
	 *
	 * Old comments:
	 *		For multi-column indices, we may scan using keys
	 *		not for all attrs. But we handle duplicates using all attrs
	 *		in _bt_insert/_bt_spool code. And so while searching on
	 *		internal pages having number of attrs > keysize we want to
	 *		point at the last item < the scankey, not at the first item
	 *		= the scankey (!!!), and let _bt_moveright decide later
	 *		whether to move right or not (see comments and example
	 *		there). Note also that INSERTions are not affected by this
	 *		code (since natts == keysz for inserts). - vadim 04/15/97
	 *--------------------
	 */

	if (haveEq)
	{
		/*
		 * There is an equal key.  We return either the first equal key
		 * (which we just found), or the last lesser key.
		 *
		 * We need not check srchtype != BT_DESCENT here, since if that is
		 * true then natts == keysz by assumption.
		 */
		if (natts == keysz)
			return low;			/* return first equal key */
	}
	else
	{
		/*
		 * There is no equal key.  We return either the first greater key
		 * (which we just found), or the last lesser key.
		 */
		if (srchtype != BT_DESCENT)
			return low;			/* return first greater key */
	}

	/* fall back to the last lesser key, if the page has one */
	if (low == (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY))
		return low;				/* there is no prior item */

	return OffsetNumberPrev(low);
}
/*
 *	_bt_compare() -- Compare scankey to a particular tuple on the page.
 *
 *		This routine returns:
 *			a negative value if scankey < tuple at offnum;
 *			0 if scankey == tuple at offnum;
 *			a positive value if scankey > tuple at offnum.
 *		(The hard-wired NULL cases yield exactly -1/0/+1; otherwise the
 *		value comes from the attribute's comparison function.)
 *
 *		rel		 - the index relation
 *		itupdesc - tuple descriptor for index tuples
 *		page	 - page holding the tuple
 *		keysz	 - number of entries in scankey
 *		scankey  - the key to compare
 *		offnum	 - offset of the tuple on the page
 *
 *		-- Old comments:
 *		In order to avoid having to propagate changes up the tree any time
 *		a new minimal key is inserted, the leftmost entry on the leftmost
 *		page is less than all possible keys, by definition.
 *
 *		-- New ones:
 *		New insertion code (fix against updating _in_place_ if new minimal
 *		key has bigger size than old one) may delete P_HIKEY entry on the
 *		root page in order to insert new minimal key - and so this
 *		definition does not work properly in this case and breaks key
 *		order on the root page.  BTW, this propagation occurs only while
 *		splitting a page, not "any time a new min key is inserted" (see
 *		_bt_insertonpg).
 *				- vadim 12/05/96
 */
static int
_bt_compare(Relation rel,
			TupleDesc itupdesc,
			Page page,
			int keysz,
			ScanKey scankey,
			OffsetNumber offnum)
{
	Datum		datum;
	BTItem		btitem;
	IndexTuple	itup;
	BTPageOpaque opaque;
	ScanKey		entry;
	AttrNumber	attno;
	int			result;
	int			i;
	bool		null;

	/*
	 * If this is a leftmost internal page, and if our comparison is with
	 * the first key on the page, then the item at that position is by
	 * definition less than the scan key.
	 *
	 * - see new comments above...  With NOT_USED undefined, the only
	 * effect of this branch is the sanity check; the real comparison
	 * below is still performed.
	 */
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	if (!(opaque->btpo_flags & BTP_LEAF)
		&& P_LEFTMOST(opaque)
		&& offnum == P_HIKEY)
	{
		/*
		 * we just have to believe that this will only be called with
		 * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the first
		 * actual data key (i.e., this is also a rightmost page).  there
		 * doesn't seem to be any code that implies that the leftmost page
		 * is normally missing a high key as well as the rightmost page.
		 * but that implies that this code path only applies to the root
		 * -- which seems unlikely..
		 *
		 * - see new comments above...
		 */
		if (!P_RIGHTMOST(opaque))
			elog(ERROR, "_bt_compare: invalid comparison to high key");

#ifdef NOT_USED

		/*
		 * We just have to believe that the right answer will not break
		 * anything.  I've checked code and all seems to be ok.  See new
		 * comments above...
		 *
		 * -- Old comments: If the item on the page is equal to the
		 * scankey, that's okay to admit.  We just can't claim that the
		 * first key on the page is greater than anything.
		 */
		if (_bt_skeycmp(rel, keysz, scankey, page, PageGetItemId(page, offnum),
						BTEqualStrategyNumber))
			return 0;
		return 1;
#endif
	}

	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
	itup = &(btitem->bti_itup);

	/*
	 * The scan key is set up with the attribute number associated with
	 * each term in the key.  It is important that, if the index is
	 * multi-key, the scan contain the first k key attributes, and that
	 * they be in order.  If you think about how multi-key ordering works,
	 * you'll understand why this is.
	 *
	 * We don't test for violation of this condition here.
	 */
	for (i = 1; i <= keysz; i++)
	{
		long		tmpres;

		entry = &scankey[i - 1];
		attno = entry->sk_attno;
		datum = index_getattr(itup, attno, itupdesc, &null);

		/* see comments about NULLs handling in btbuild */
		if (entry->sk_flags & SK_ISNULL)	/* key is NULL */
		{
			if (null)
				tmpres = (long) 0;	/* NULL "=" NULL */
			else
				tmpres = (long) 1;	/* NULL ">" NOT_NULL */
		}
		else if (null)			/* key is NOT_NULL and item is NULL */
		{
			tmpres = (long) -1; /* NOT_NULL "<" NULL */
		}
		else
			tmpres = (long) FMGR_PTR2(&entry->sk_func, entry->sk_argument, datum);

		result = tmpres;

		/* if the keys are unequal, return the difference */
		if (result != 0)
			return result;
	}

	/* by here, the keys are equal */
	return 0;
}
/*
 *	_bt_next() -- Get the next item in a scan.
 *
 *		On entry, we have a valid currentItemData in the scan, and
 *		so->btso_curbuf is the buffer containing that item (asserted
 *		valid); the scan still holds it pinned and read-locked.
 *
 *		scan - the index scan descriptor
 *		dir  - direction in which to advance
 *
 *		Steps tuple by tuple with _bt_step() until _bt_checkkeys()
 *		accepts one.  On success the result is returned and the buffer
 *		holding it is remembered in so->btso_curbuf.  Returns NULL when
 *		_bt_step() runs off the end of the index (it has then already
 *		released the buffer), or when a rejected tuple shows that no
 *		further tuples can match, in which case the buffer is released
 *		here and the scan position is invalidated.
 */
RetrieveIndexResult
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel;
	Buffer		buf;
	Page		page;
	OffsetNumber offnum;
	RetrieveIndexResult res;
	ItemPointer current;
	BTItem		btitem;
	IndexTuple	itup;
	BTScanOpaque so;
	Size		keysok;

	rel = scan->relation;
	so = (BTScanOpaque) scan->opaque;
	current = &(scan->currentItemData);

	Assert(BufferIsValid(so->btso_curbuf));

	/* we still have the buffer pinned and locked */
	buf = so->btso_curbuf;

	do
	{
		/* step one tuple in the appropriate direction */
		if (!_bt_step(scan, &buf, dir))
			return (RetrieveIndexResult) NULL;

		/* by here, current is the tuple we want to return */
		offnum = ItemPointerGetOffsetNumber(current);
		page = BufferGetPage(buf);
		btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
		itup = &btitem->bti_itup;

		if (_bt_checkkeys(scan, itup, &keysok))
		{
			Assert(keysok == so->numberOfKeys);
			res = FormRetrieveIndexResult(current, &(itup->t_tid));

			/* remember which buffer we have pinned and locked */
			so->btso_curbuf = buf;
			return res;
		}

		/*
		 * The tuple failed the quals.  Keep going while the leading keys
		 * were still satisfied, or -- for a backward scan -- while
		 * _bt_checkkeys reported keysok == (Size) -1.
		 */
	} while (keysok >= so->numberOfFirstKeys ||
			 (keysok == ((Size) -1) && ScanDirectionIsBackward(dir)));

	/* no more matching tuples: end the scan */
	ItemPointerSetInvalid(current);
	so->btso_curbuf = InvalidBuffer;
	_bt_relbuf(rel, buf, BT_READ);

	return (RetrieveIndexResult) NULL;
}
/*
 *	_bt_first() -- Find the first item in a scan.
 *
 *		We need to be clever about the type of scan, the operation it's
 *		performing, and the tree ordering.  We return the RetrieveIndexResult
 *		of the first item in the tree that satisfies the qualification
 *		associated with the scan descriptor.  On exit, the page containing
 *		the current index tuple is read locked and pinned, and the scan's
 *		opaque data entry is updated to include the buffer.
 *
 *		scan - the index scan descriptor
 *		dir  - scan direction
 *
 *		Outline:
 *		  1. Order the scan keys and pick the usable leading keys, deriving
 *			 one overall strategy (strat_total) for initial positioning.
 *		  2. If no key is usable, start from the end of the index via
 *			 _bt_endpoint().
 *		  3. Otherwise build an insertion-style scankey, descend with
 *			 _bt_search(), and locate the nearest item with _bt_binsrch().
 *		  4. Adjust the position one item at a time according to
 *			 strat_total.
 *		  5. Check the item against the full qualification; hand off to
 *			 _bt_next() if it fails but the scan may still find matches.
 *
 *		Returns NULL if no tuple satisfies the qualification.
 */
RetrieveIndexResult
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel;
	TupleDesc	itupdesc;
	Buffer		buf;
	Page		page;
	BTPageOpaque pop;
	BTStack		stack;
	OffsetNumber offnum,
				maxoff;
	bool		offGmax = false;
	BTItem		btitem;
	IndexTuple	itup;
	ItemPointer current;
	BlockNumber blkno;
	StrategyNumber strat;
	RetrieveIndexResult res;
	RegProcedure proc;
	int			result;
	BTScanOpaque so;
	Size		keysok;
	bool		strategyCheck;
	ScanKey		scankeys = 0;
	int			keysCount = 0;
	int		   *nKeyIs = 0;
	int			i,
				j;
	StrategyNumber strat_total;

	rel = scan->relation;
	so = (BTScanOpaque) scan->opaque;

	/*
	 * Order the keys in the qualification and be sure that the scan
	 * exploits the tree order.
	 */
	so->numberOfFirstKeys = 0;	/* may be changed by _bt_orderkeys */
	so->qual_ok = 1;			/* may be changed by _bt_orderkeys */
	scan->scanFromEnd = false;
	strategyCheck = false;
	if (so->numberOfKeys > 0)
	{
		_bt_orderkeys(rel, so);

		if (so->qual_ok)
			strategyCheck = true;
	}

	/*
	 * Pick the keys usable for initial positioning.  nKeyIs[k] records
	 * the index in so->keyData of the key chosen for attribute k+1.
	 */
	strat_total = BTEqualStrategyNumber;
	if (strategyCheck)
	{
		AttrNumber	attno;

		nKeyIs = (int *) palloc(so->numberOfKeys * sizeof(int));
		for (i = 0; i < so->numberOfKeys; i++)
		{
			attno = so->keyData[i].sk_attno;
			if (attno == keysCount)
				continue;		/* already have a key for this attribute */
			if (attno > keysCount + 1)
				break;			/* gap in the attributes: stop here */
			strat = _bt_getstrat(rel, attno,
								 so->keyData[i].sk_procedure);
			if (strat == strat_total ||
				strat == BTEqualStrategyNumber)
			{
				nKeyIs[keysCount++] = i;
				continue;
			}
			if (ScanDirectionIsBackward(dir) &&
				(strat == BTLessStrategyNumber ||
				 strat == BTLessEqualStrategyNumber))
			{
				nKeyIs[keysCount++] = i;
				strat_total = strat;
				if (strat == BTLessStrategyNumber)
					break;
				continue;
			}
			if (ScanDirectionIsForward(dir) &&
				(strat == BTGreaterStrategyNumber ||
				 strat == BTGreaterEqualStrategyNumber))
			{
				nKeyIs[keysCount++] = i;
				strat_total = strat;
				if (strat == BTGreaterStrategyNumber)
					break;
				continue;
			}
		}
		if (!keysCount)
			scan->scanFromEnd = true;
	}
	else
		scan->scanFromEnd = true;

	/* nKeyIs is never allocated when qual_ok is 0, so nothing to free */
	if (so->qual_ok == 0)
		return (RetrieveIndexResult) NULL;

	/* if we just need to walk down one edge of the tree, do that */
	if (scan->scanFromEnd)
	{
		if (nKeyIs)
			pfree(nKeyIs);
		return _bt_endpoint(scan, dir);
	}

	itupdesc = RelationGetDescr(rel);
	current = &(scan->currentItemData);

	/*
	 * Okay, we want something more complicated.  What we'll do is use the
	 * first item in the scan key passed in (which has been correctly
	 * ordered to take advantage of index ordering) to position ourselves
	 * at the right place in the scan.
	 */
	/* _bt_orderkeys disallows it, but it's a place to add some code later */
	scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData));
	for (i = 0; i < keysCount; i++)
	{
		j = nKeyIs[i];
		if (so->keyData[j].sk_flags & SK_ISNULL)
		{
			pfree(nKeyIs);
			pfree(scankeys);
			elog(ERROR, "_bt_first: btree doesn't support is(not)null, yet");
			return ((RetrieveIndexResult) NULL);
		}
		proc = index_getprocid(rel, i + 1, BTORDER_PROC);
		ScanKeyEntryInitialize(scankeys + i, so->keyData[j].sk_flags,
							   i + 1, proc, so->keyData[j].sk_argument);
	}
	if (nKeyIs)
		pfree(nKeyIs);

	stack = _bt_search(rel, keysCount, scankeys, &buf);
	_bt_freestack(stack);

	blkno = BufferGetBlockNumber(buf);
	page = BufferGetPage(buf);

	/*
	 * This will happen if the tree we're searching is entirely empty, or
	 * if we're doing a search for a key that would appear on an entirely
	 * empty internal page.  In either case, there are no matching tuples
	 * in the index.
	 */
	if (PageIsEmpty(page))
	{
		ItemPointerSetInvalid(current);
		so->btso_curbuf = InvalidBuffer;
		_bt_relbuf(rel, buf, BT_READ);
		pfree(scankeys);
		return (RetrieveIndexResult) NULL;
	}
	maxoff = PageGetMaxOffsetNumber(page);
	pop = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Now _bt_moveright doesn't move from non-rightmost leaf page if
	 * scankey == hikey and there is only hikey there. It's good for
	 * insertion, but we need to do work for scan here. - vadim 05/27/97
	 */
	while (maxoff == P_HIKEY && !P_RIGHTMOST(pop) &&
		   _bt_skeycmp(rel, keysCount, scankeys, page,
					   PageGetItemId(page, P_HIKEY),
					   BTGreaterEqualStrategyNumber))
	{
		/* step right one page */
		blkno = pop->btpo_next;
		_bt_relbuf(rel, buf, BT_READ);
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		if (PageIsEmpty(page))
		{
			ItemPointerSetInvalid(current);
			so->btso_curbuf = InvalidBuffer;
			_bt_relbuf(rel, buf, BT_READ);
			pfree(scankeys);
			return (RetrieveIndexResult) NULL;
		}
		maxoff = PageGetMaxOffsetNumber(page);
		pop = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	/* find the nearest match to the manufactured scan key on the page */
	offnum = _bt_binsrch(rel, buf, keysCount, scankeys, BT_DESCENT);

	/* _bt_binsrch may point one past the last item; clamp and remember */
	if (offnum > maxoff)
	{
		offnum = maxoff;
		offGmax = true;
	}

	ItemPointerSet(current, blkno, offnum);

	/*
	 * Now find the right place to start the scan.  Result is the value
	 * we're looking for minus the value we're looking at in the index.
	 */
	result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);

	/* it's yet another place to add some code later for is(not)null */

	strat = strat_total;
	switch (strat)
	{
		case BTLessStrategyNumber:
			if (result <= 0)
			{
				/* back up until the item is < the scan key */
				do
				{
					if (!_bt_twostep(scan, &buf, BackwardScanDirection))
						break;

					offnum = ItemPointerGetOffsetNumber(current);
					page = BufferGetPage(buf);
					result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
				} while (result <= 0);
			}
			break;

		case BTLessEqualStrategyNumber:
			if (result >= 0)
			{
				/* advance past every item <= the scan key ... */
				do
				{
					if (!_bt_twostep(scan, &buf, ForwardScanDirection))
						break;

					offnum = ItemPointerGetOffsetNumber(current);
					page = BufferGetPage(buf);
					result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
				} while (result >= 0);
			}
			/* ... then step back onto the last qualifying one */
			if (result < 0)
				_bt_twostep(scan, &buf, BackwardScanDirection);
			break;

		case BTEqualStrategyNumber:
			if (result != 0)
			{
				/* no equal item exists: the scan is empty */
				_bt_relbuf(scan->relation, buf, BT_READ);
				so->btso_curbuf = InvalidBuffer;
				ItemPointerSetInvalid(&(scan->currentItemData));
				pfree(scankeys);
				return (RetrieveIndexResult) NULL;
			}
			else if (ScanDirectionIsBackward(dir))
			{
				/* a backward scan starts at the last equal item */
				do
				{
					if (!_bt_twostep(scan, &buf, ForwardScanDirection))
						break;

					offnum = ItemPointerGetOffsetNumber(current);
					page = BufferGetPage(buf);
					result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
				} while (result == 0);

				if (result < 0)
					_bt_twostep(scan, &buf, BackwardScanDirection);
			}
			break;

		case BTGreaterEqualStrategyNumber:
			if (offGmax)
			{
				if (result < 0)
				{
					Assert(!P_RIGHTMOST(pop) && maxoff == P_HIKEY);
					if (!_bt_step(scan, &buf, ForwardScanDirection))
					{
						/*
						 * On failure _bt_step() has already released the
						 * buffer, set buf and so->btso_curbuf to
						 * InvalidBuffer, and invalidated the current item
						 * pointer.  Releasing buf again here would hand an
						 * invalid buffer to _bt_relbuf(), so only our own
						 * allocation remains to be cleaned up.
						 */
						pfree(scankeys);
						return (RetrieveIndexResult) NULL;
					}
				}
				else if (result > 0)
				{
					/*
					 * Just remember: _bt_binsrch() returns the
					 * OffsetNumber of the first matching key on the page,
					 * or the OffsetNumber at which the matching key WOULD
					 * APPEAR IF IT WERE on this page.  No key on this
					 * page, but offnum from _bt_binsrch() greater than
					 * maxoff - have to move right. - vadim 12/06/96
					 */
					_bt_twostep(scan, &buf, ForwardScanDirection);
				}
			}
			else if (result < 0)
			{
				/* back up to before the first item >= the scan key ... */
				do
				{
					if (!_bt_twostep(scan, &buf, BackwardScanDirection))
						break;

					page = BufferGetPage(buf);
					offnum = ItemPointerGetOffsetNumber(current);
					result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
				} while (result < 0);

				/* ... then step forward onto it */
				if (result > 0)
					_bt_twostep(scan, &buf, ForwardScanDirection);
			}
			break;

		case BTGreaterStrategyNumber:
			/* offGmax helps as above */
			if (result >= 0 || offGmax)
			{
				/* advance until the item is > the scan key */
				do
				{
					if (!_bt_twostep(scan, &buf, ForwardScanDirection))
						break;

					offnum = ItemPointerGetOffsetNumber(current);
					page = BufferGetPage(buf);
					result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum);
				} while (result >= 0);
			}
			break;
	}

	pfree(scankeys);

	/* okay, current item pointer for the scan is right */
	offnum = ItemPointerGetOffsetNumber(current);
	page = BufferGetPage(buf);
	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
	itup = &btitem->bti_itup;

	if (_bt_checkkeys(scan, itup, &keysok))
	{
		res = FormRetrieveIndexResult(current, &(itup->t_tid));

		/* remember which buffer we have pinned */
		so->btso_curbuf = buf;
	}
	else if (keysok >= so->numberOfFirstKeys)
	{
		/* leading keys still hold: later tuples may match */
		so->btso_curbuf = buf;
		return _bt_next(scan, dir);
	}
	else if (keysok == ((Size) -1) && ScanDirectionIsBackward(dir))
	{
		so->btso_curbuf = buf;
		return _bt_next(scan, dir);
	}
	else
	{
		/* nothing can match: end the scan */
		ItemPointerSetInvalid(current);
		so->btso_curbuf = InvalidBuffer;
		_bt_relbuf(rel, buf, BT_READ);
		res = (RetrieveIndexResult) NULL;
	}

	return res;
}
/*
* _bt_step() -- Step one item in the requested direction in a scan on
* the tree.
*
* If no adjacent record exists in the requested direction, return
* false. Else, return true and set the currentItemData for the
* scan to the right thing.
*/
bool
1997-09-08 22:59:27 +02:00
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
Page page;
BTPageOpaque opaque;
OffsetNumber offnum,
maxoff;
OffsetNumber start;
BlockNumber blkno;
BlockNumber obknum;
BTScanOpaque so;
ItemPointer current;
Relation rel;
rel = scan->relation;
current = &(scan->currentItemData);
1999-05-25 18:15:34 +02:00
/*
1999-05-25 18:15:34 +02:00
* Don't use ItemPointerGetOffsetNumber or you risk to get assertion
* due to ability of ip_posid to be equal 0.
*/
offnum = current->ip_posid;
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
so = (BTScanOpaque) scan->opaque;
maxoff = PageGetMaxOffsetNumber(page);
/* get the next tuple */
if (ScanDirectionIsForward(dir))
{
if (!PageIsEmpty(page) && offnum < maxoff)
offnum = OffsetNumberNext(offnum);
else
{
/* if we're at end of scan, release the buffer and return */
blkno = opaque->btpo_next;
if (P_RIGHTMOST(opaque))
{
_bt_relbuf(rel, *bufP, BT_READ);
ItemPointerSetInvalid(current);
*bufP = so->btso_curbuf = InvalidBuffer;
1998-09-01 05:29:17 +02:00
return false;
}
else
{
/* walk right to the next page with data */
_bt_relbuf(rel, *bufP, BT_READ);
for (;;)
{
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
if (!PageIsEmpty(page) && start <= maxoff)
break;
else
{
blkno = opaque->btpo_next;
_bt_relbuf(rel, *bufP, BT_READ);
if (blkno == P_NONE)
{
*bufP = so->btso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(current);
1998-09-01 05:29:17 +02:00
return false;
}
}
}
offnum = start;
}
}
}
else if (ScanDirectionIsBackward(dir))
{
/* remember that high key is item zero on non-rightmost pages */
start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
if (offnum > start)
offnum = OffsetNumberPrev(offnum);
else
{
/* if we're at end of scan, release the buffer and return */
blkno = opaque->btpo_prev;
if (P_LEFTMOST(opaque))
{
_bt_relbuf(rel, *bufP, BT_READ);
*bufP = so->btso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(current);
1998-09-01 05:29:17 +02:00
return false;
}
else
{
obknum = BufferGetBlockNumber(*bufP);
/* walk right to the next page with data */
_bt_relbuf(rel, *bufP, BT_READ);
for (;;)
{
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
/*
* If the adjacent page just split, then we may have
* the wrong block. Handle this case. Because pages
* only split right, we don't have to worry about this
* failing to terminate.
*/
while (opaque->btpo_next != obknum)
{
blkno = opaque->btpo_next;
_bt_relbuf(rel, *bufP, BT_READ);
*bufP = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
}
/* don't consider the high key */
start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
/* anything to look at here? */
if (!PageIsEmpty(page) && maxoff >= start)
break;
else
{
blkno = opaque->btpo_prev;
obknum = BufferGetBlockNumber(*bufP);
_bt_relbuf(rel, *bufP, BT_READ);
if (blkno == P_NONE)
{
*bufP = so->btso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(current);
1998-09-01 05:29:17 +02:00
return false;
}
}
}
offnum = maxoff;/* XXX PageIsEmpty? */
}
}
}
blkno = BufferGetBlockNumber(*bufP);
so->btso_curbuf = *bufP;
ItemPointerSet(current, blkno, offnum);
1998-09-01 05:29:17 +02:00
return true;
}
/*
* _bt_twostep() -- Move to an adjacent record in a scan on the tree,
* if an adjacent record exists.
*
* This is like _bt_step, except that if no adjacent record exists
* it restores us to where we were before trying the step. This is
* only hairy when you cross page boundaries, since the page you cross
* from could have records inserted or deleted, or could even split.
* This is unlikely, but we try to handle it correctly here anyway.
*
* This routine contains the only case in which our changes to Lehman
* and Yao's algorithm.
*
* Like step, this routine leaves the scan's currentItemData in the
* proper state and acquires a lock and pin on *bufP. If the twostep
* succeeded, we return true; otherwise, we return false.
*/
static bool
1997-09-08 22:59:27 +02:00
_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
Page page;
BTPageOpaque opaque;
OffsetNumber offnum,
maxoff;
OffsetNumber start;
ItemPointer current;
ItemId itemid;
int itemsz;
BTItem btitem;
BTItem svitem;
BlockNumber blkno;
blkno = BufferGetBlockNumber(*bufP);
page = BufferGetPage(*bufP);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
current = &(scan->currentItemData);
offnum = ItemPointerGetOffsetNumber(current);
start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
/* if we're safe, just do it */
if (ScanDirectionIsForward(dir) && offnum < maxoff)
{ /* XXX PageIsEmpty? */
ItemPointerSet(current, blkno, OffsetNumberNext(offnum));
1998-09-01 05:29:17 +02:00
return true;
}
else if (ScanDirectionIsBackward(dir) && offnum > start)
{
ItemPointerSet(current, blkno, OffsetNumberPrev(offnum));
1998-09-01 05:29:17 +02:00
return true;
}
/* if we've hit end of scan we don't have to do any work */
if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque))
1998-09-01 05:29:17 +02:00
return false;
else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque))
1998-09-01 05:29:17 +02:00
return false;
/*
* Okay, it's off the page; let _bt_step() do the hard work, and we'll
* try to remember where we were. This is not guaranteed to work;
* this is the only place in the code where concurrency can screw us
* up, and it's because we want to be able to move in two directions
* in the scan.
*/
itemid = PageGetItemId(page, offnum);
itemsz = ItemIdGetLength(itemid);
btitem = (BTItem) PageGetItem(page, itemid);
svitem = (BTItem) palloc(itemsz);
memmove((char *) svitem, (char *) btitem, itemsz);
if (_bt_step(scan, bufP, dir))
{
pfree(svitem);
1998-09-01 05:29:17 +02:00
return true;
}
/* try to find our place again */
*bufP = _bt_getbuf(scan->relation, blkno, BT_READ);
page = BufferGetPage(*bufP);
maxoff = PageGetMaxOffsetNumber(page);
while (offnum <= maxoff)
{
itemid = PageGetItemId(page, offnum);
btitem = (BTItem) PageGetItem(page, itemid);
if (BTItemSame(btitem, svitem))
{
pfree(svitem);
ItemPointerSet(current, blkno, offnum);
1998-09-01 05:29:17 +02:00
return false;
}
}
/*
* XXX crash and burn -- can't find our place. We can be a little
* smarter -- walk to the next page to the right, for example, since
* that's the only direction that splits happen in. Deletions screw
* us up less often since they're only done by the vacuum daemon.
*/
elog(ERROR, "btree synchronization error: concurrent update botched scan");
1998-09-01 05:29:17 +02:00
return false;
}
/*
 *	_bt_endpoint() -- Find the first or last key in the index.
 *
 *		Walks down from the root to the leftmost leaf page (forward scan)
 *		or the rightmost leaf page (backward scan), points the scan's
 *		currentItemData at the endpoint item, and tests that item against
 *		the scan keys.
 *
 *		Returns a RetrieveIndexResult for the endpoint item if it passes
 *		_bt_checkkeys(), the result of _bt_next() if the scan has to keep
 *		going, or NULL if there is nothing to return.  On the paths where
 *		this routine itself gives up, it releases the buffer, invalidates
 *		currentItemData and sets so->btso_curbuf to InvalidBuffer.
 */
static RetrieveIndexResult
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->relation;
	ItemPointer current = &(scan->currentItemData);
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber downoff,
				maxoff;
	OffsetNumber firstoff = 0;
	BlockNumber blkno;
	BTItem		btitem;
	IndexTuple	itup;
	RetrieveIndexResult result;
	Size		nkeysok;

	/* begin the descent at the root page */
	buf = _bt_getroot(rel, BT_READ);
	blkno = BufferGetBlockNumber(buf);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* keep following the outermost downlink until we land on a leaf */
	while (!(opaque->btpo_flags & BTP_LEAF))
	{
		/*
		 * Forward scans follow the first item after the high key (if the
		 * page has one); all other directions follow the last item.
		 */
		if (!ScanDirectionIsForward(dir))
			downoff = PageGetMaxOffsetNumber(page);
		else if (P_RIGHTMOST(opaque))
			downoff = P_HIKEY;
		else
			downoff = P_FIRSTKEY;

		btitem = (BTItem) PageGetItem(page, PageGetItemId(page, downoff));
		itup = &(btitem->bti_itup);
		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));

		/* trade the parent's buffer for the child's */
		_bt_relbuf(rel, buf, BT_READ);
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/*
		 * Race condition: If the child page we just stepped onto is in
		 * the process of being split, we need to make sure we're all the
		 * way at the right edge of the tree.  See the paper by Lehman and
		 * Yao.
		 */
		if (ScanDirectionIsBackward(dir))
		{
			while (!P_RIGHTMOST(opaque))
			{
				blkno = opaque->btpo_next;
				_bt_relbuf(rel, buf, BT_READ);
				buf = _bt_getbuf(rel, blkno, BT_READ);
				page = BufferGetPage(buf);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			}
		}
	}

	/* okay, we've got the {left,right}-most page in the tree */
	maxoff = PageGetMaxOffsetNumber(page);

	if (ScanDirectionIsForward(dir))
	{
		if (!P_LEFTMOST(opaque))	/* non-leftmost page ? */
			elog(ERROR, "_bt_endpoint: leftmost page (%u) has not leftmost flag", blkno);

		/* first data item: skip the high key unless this page is rightmost */
		if (P_RIGHTMOST(opaque))
			firstoff = P_HIKEY;
		else
			firstoff = P_FIRSTKEY;

		/*
		 * Older handling, compiled only if NOT_USED is defined.  Per the
		 * note left by vadim (12/06/96) it doesn't work for non-rightmost
		 * pages holding only the high key, which we have after vacuum
		 * deletes index tuples (the firstoff > maxoff case); the code that
		 * follows the #endif replaced it.
		 */
#ifdef NOT_USED
		if (PageIsEmpty(page) || firstoff > maxoff)
		{
			ItemPointerSet(current, blkno, maxoff);
			if (!_bt_step(scan, &buf, BackwardScanDirection))
				return (RetrieveIndexResult) NULL;

			firstoff = ItemPointerGetOffsetNumber(current);
			page = BufferGetPage(buf);
		}
#endif
		if (PageIsEmpty(page))
		{
			if (firstoff != P_HIKEY)	/* non-rightmost page */
				elog(ERROR, "_bt_endpoint: non-rightmost page (%u) is empty", blkno);

			/*
			 * It's left- & right- most page - root page, - and it's
			 * empty...
			 */
			_bt_relbuf(rel, buf, BT_READ);
			ItemPointerSetInvalid(current);
			so->btso_curbuf = InvalidBuffer;
			return (RetrieveIndexResult) NULL;
		}

		if (firstoff <= maxoff)
		{
			/* the usual case: the page has a data item to start on */
			ItemPointerSet(current, blkno, firstoff);
		}
		else
		{
			/*
			 * firstoff == 2 && maxoff == 1: only the high key is here, so
			 * step forward until we find an item.
			 */
			ItemPointerSet(current, blkno, maxoff);
			if (!_bt_step(scan, &buf, ForwardScanDirection))
				return (RetrieveIndexResult) NULL;

			firstoff = ItemPointerGetOffsetNumber(current);
			page = BufferGetPage(buf);
		}
	}
	else if (ScanDirectionIsBackward(dir))
	{
		/*
		 * Older handling, compiled only if NOT_USED is defined.  It scanned
		 * forward from an empty rightmost leaf, which vadim (12/06/96)
		 * questioned; the code that follows the #endif replaced it.
		 */
#ifdef NOT_USED
		if (PageIsEmpty(page))
		{
			ItemPointerSet(current, blkno, FirstOffsetNumber);
			if (!_bt_step(scan, &buf, ForwardScanDirection))
				return (RetrieveIndexResult) NULL;

			firstoff = ItemPointerGetOffsetNumber(current);
			page = BufferGetPage(buf);
		}
#endif
		if (!PageIsEmpty(page))
		{
			/* the usual case: start on the last item of the page */
			firstoff = PageGetMaxOffsetNumber(page);
			ItemPointerSet(current, blkno, firstoff);
		}
		else
		{
			/* If it's leftmost page too - it's empty root page... */
			if (P_LEFTMOST(opaque))
			{
				_bt_relbuf(rel, buf, BT_READ);
				ItemPointerSetInvalid(current);
				so->btso_curbuf = InvalidBuffer;
				return (RetrieveIndexResult) NULL;
			}

			/* Go back ! */
			ItemPointerSet(current, blkno, FirstOffsetNumber);
			if (!_bt_step(scan, &buf, BackwardScanDirection))
				return (RetrieveIndexResult) NULL;

			firstoff = ItemPointerGetOffsetNumber(current);
			page = BufferGetPage(buf);
		}
	}
	else
	{
		elog(ERROR, "Illegal scan direction %d", dir);
	}

	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, firstoff));
	itup = &(btitem->bti_itup);

	/* see if we picked a winner */
	if (_bt_checkkeys(scan, itup, &nkeysok))
	{
		result = FormRetrieveIndexResult(current, &(itup->t_tid));

		/* remember which buffer we have pinned */
		so->btso_curbuf = buf;
		return result;
	}

	/*
	 * The endpoint item failed the keys, but the scan may still have to go
	 * on from here.  NOTE(review): nkeysok is presumably the count of
	 * leading keys the item satisfied, with (Size) -1 as a special marker
	 * -- confirm against _bt_checkkeys().
	 */
	if (nkeysok >= so->numberOfFirstKeys ||
		(nkeysok == ((Size) -1) && ScanDirectionIsBackward(dir)))
	{
		so->btso_curbuf = buf;
		return _bt_next(scan, dir);
	}

	/* nothing to return: forget our position and let go of the page */
	ItemPointerSetInvalid(current);
	so->btso_curbuf = InvalidBuffer;
	_bt_relbuf(rel, buf, BT_READ);
	return (RetrieveIndexResult) NULL;
}