1996-07-09 08:22:35 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* nbtsearch.c
|
2003-02-22 01:45:05 +01:00
|
|
|
* Search code for postgres btrees.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
2004-12-31 23:04:05 +01:00
|
|
|
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
2000-01-26 06:58:53 +01:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
2004-12-31 23:04:05 +01:00
|
|
|
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.90 2004/12/31 21:59:22 pgsql Exp $
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
|
1999-07-16 01:04:24 +02:00
|
|
|
#include "postgres.h"
|
1996-07-09 08:22:35 +02:00
|
|
|
|
1999-07-16 01:04:24 +02:00
|
|
|
#include "access/genam.h"
|
|
|
|
#include "access/nbtree.h"
|
2003-11-12 22:15:59 +01:00
|
|
|
#include "utils/lsyscache.h"
|
1996-10-23 09:42:13 +02:00
|
|
|
|
1996-11-03 13:35:27 +01:00
|
|
|
|
2003-02-22 01:45:05 +01:00
|
|
|
/* Forward declaration of a file-local (static) helper; its definition is not in this part of the file. */
static Buffer _bt_walk_left(Relation rel, Buffer buf);
|
2002-05-21 01:51:44 +02:00
|
|
|
/* Forward declaration of a file-local (static) helper; its definition is not in this part of the file. */
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
|
1996-07-09 08:22:35 +02:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* _bt_search() -- Search the tree for a particular scankey,
|
|
|
|
* or more precisely for the first leaf page it could be on.
|
|
|
|
*
|
2003-12-21 02:23:06 +01:00
|
|
|
* When nextkey is false (the usual case), we are looking for the first
|
|
|
|
* item >= scankey. When nextkey is true, we are looking for the first
|
|
|
|
* item strictly greater than scankey.
|
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Return value is a stack of parent-page pointers. *bufP is set to the
|
|
|
|
* address of the leaf-page buffer, which is read-locked and pinned.
|
|
|
|
* No locks are held on the parent pages, however!
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* NOTE that the returned buffer is read-locked regardless of the access
|
|
|
|
* parameter. However, access = BT_WRITE will allow an empty root page
|
2001-03-22 05:01:46 +01:00
|
|
|
* to be created and returned. When access = BT_READ, an empty index
|
2000-07-21 08:42:39 +02:00
|
|
|
* will result in *bufP being set to InvalidBuffer.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
BTStack
|
2003-12-21 02:23:06 +01:00
|
|
|
_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
|
2000-07-21 08:42:39 +02:00
|
|
|
Buffer *bufP, int access)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
BTStack stack_in = NULL;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Get the root page to start with */
|
|
|
|
*bufP = _bt_getroot(rel, access);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* If index is empty and access = BT_READ, no root page is created. */
|
2001-03-22 05:01:46 +01:00
|
|
|
if (!BufferIsValid(*bufP))
|
2000-07-21 08:42:39 +02:00
|
|
|
return (BTStack) NULL;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Loop iterates once per level descended in the tree */
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
OffsetNumber offnum;
|
|
|
|
ItemId itemid;
|
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
BlockNumber blkno;
|
|
|
|
BlockNumber par_blkno;
|
|
|
|
BTStack new_stack;
|
|
|
|
|
2003-07-30 00:18:38 +02:00
|
|
|
/*
|
|
|
|
* Race -- the page we just grabbed may have split since we read
|
2003-08-04 02:43:34 +02:00
|
|
|
* its pointer in the parent (or metapage). If it has, we may
|
|
|
|
* need to move right to its new sibling. Do that.
|
2003-07-30 00:18:38 +02:00
|
|
|
*/
|
2003-12-21 02:23:06 +01:00
|
|
|
*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
|
2003-07-30 00:18:38 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* if this is a leaf page, we're done */
|
|
|
|
page = BufferGetPage(*bufP);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
if (P_ISLEAF(opaque))
|
|
|
|
break;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Find the appropriate item on the internal page, and get the
|
|
|
|
* child page that it points to.
|
|
|
|
*/
|
2003-12-21 02:23:06 +01:00
|
|
|
offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
|
2000-07-21 08:42:39 +02:00
|
|
|
itemid = PageGetItemId(page, offnum);
|
|
|
|
btitem = (BTItem) PageGetItem(page, itemid);
|
|
|
|
itup = &(btitem->bti_itup);
|
|
|
|
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
|
|
|
|
par_blkno = BufferGetBlockNumber(*bufP);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
2003-08-04 02:43:34 +02:00
|
|
|
* We need to save the location of the index entry we chose in the
|
|
|
|
* parent page on a stack. In case we split the tree, we'll use
|
|
|
|
* the stack to work back up to the parent page. We also save the
|
|
|
|
* actual downlink (TID) to uniquely identify the index entry, in
|
|
|
|
* case it moves right while we're working lower in the tree. See
|
|
|
|
* the paper by Lehman and Yao for how this is detected and
|
|
|
|
* handled. (We use the child link to disambiguate duplicate keys
|
|
|
|
* in the index -- Lehman and Yao disallow duplicate keys.)
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
new_stack = (BTStack) palloc(sizeof(BTStackData));
|
|
|
|
new_stack->bts_blkno = par_blkno;
|
|
|
|
new_stack->bts_offset = offnum;
|
|
|
|
memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));
|
|
|
|
new_stack->bts_parent = stack_in;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* drop the read lock on the parent page, acquire one on the child */
|
2004-04-21 20:24:26 +02:00
|
|
|
*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* okay, all set to move down a level */
|
|
|
|
stack_in = new_stack;
|
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
return stack_in;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_moveright() -- move right in the btree if necessary.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2003-12-21 02:23:06 +01:00
|
|
|
* When we follow a pointer to reach a page, it is possible that
|
|
|
|
* the page has changed in the meanwhile. If this happens, we're
|
|
|
|
* guaranteed that the page has "split right" -- that is, that any
|
|
|
|
* data that appeared on the page originally is either on the page
|
|
|
|
* or strictly to the right of it.
|
|
|
|
*
|
|
|
|
* When nextkey is false (the usual case), we are looking for the first
|
|
|
|
* item >= scankey. When nextkey is true, we are looking for the first
|
|
|
|
* item strictly greater than scankey.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2003-12-21 02:23:06 +01:00
|
|
|
* This routine decides whether or not we need to move right in the
|
|
|
|
* tree by examining the high key entry on the page. If that entry
|
|
|
|
* is strictly less than the scankey, or <= the scankey in the nextkey=true
|
|
|
|
* case, then we followed the wrong link and we need to move right.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2003-12-21 02:23:06 +01:00
|
|
|
* On entry, we have the buffer pinned and a lock of the type specified by
|
|
|
|
* 'access'. If we move right, we release the buffer and lock and acquire
|
|
|
|
* the same on the right sibling. Return value is the buffer we stop at.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
Buffer
|
|
|
|
_bt_moveright(Relation rel,
|
1997-09-07 07:04:48 +02:00
|
|
|
Buffer buf,
|
|
|
|
int keysz,
|
|
|
|
ScanKey scankey,
|
2003-12-21 02:23:06 +01:00
|
|
|
bool nextkey,
|
1997-09-07 07:04:48 +02:00
|
|
|
int access)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
2003-12-21 02:23:06 +01:00
|
|
|
int32 cmpval;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
|
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* When nextkey = false (normal case): if the scan key that brought us
|
|
|
|
* to this page is > the high key stored on the page, then the page
|
|
|
|
* has split and we need to move right. (If the scan key is equal to
|
|
|
|
* the high key, we might or might not need to move right; have to
|
|
|
|
* scan the page first anyway.)
|
2003-12-21 02:23:06 +01:00
|
|
|
*
|
|
|
|
* When nextkey = true: move right if the scan key is >= page's high key.
|
|
|
|
*
|
2004-08-29 07:07:03 +02:00
|
|
|
* The page could even have split more than once, so scan as far as
|
|
|
|
* needed.
|
2003-02-22 01:45:05 +01:00
|
|
|
*
|
2003-08-04 02:43:34 +02:00
|
|
|
* We also have to move right if we followed a link that brought us to a
|
|
|
|
* dead page.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2003-12-21 02:23:06 +01:00
|
|
|
cmpval = nextkey ? 0 : 1;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
while (!P_RIGHTMOST(opaque) &&
|
2003-02-22 01:45:05 +01:00
|
|
|
(P_IGNORE(opaque) ||
|
2003-12-21 02:23:06 +01:00
|
|
|
_bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
|
1997-04-16 03:48:29 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* step right one page */
|
2001-03-22 05:01:46 +01:00
|
|
|
BlockNumber rblkno = opaque->btpo_next;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2004-04-21 20:24:26 +02:00
|
|
|
buf = _bt_relandgetbuf(rel, buf, rblkno, access);
|
2000-07-21 08:42:39 +02:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
|
2003-02-22 01:45:05 +01:00
|
|
|
if (P_IGNORE(opaque))
|
2003-07-21 22:29:40 +02:00
|
|
|
elog(ERROR, "fell off the end of \"%s\"",
|
2003-02-22 01:45:05 +01:00
|
|
|
RelationGetRelationName(rel));
|
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
return buf;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* _bt_binsrch() -- Do a binary search for a key on a particular page.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2003-12-21 02:23:06 +01:00
|
|
|
* When nextkey is false (the usual case), we are looking for the first
|
|
|
|
* item >= scankey. When nextkey is true, we are looking for the first
|
|
|
|
* item strictly greater than scankey.
|
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* The scankey we get has the compare function stored in the procedure
|
|
|
|
* entry of each data struct. We invoke this regproc to do the
|
|
|
|
* comparison for every key in the scankey.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
|
2003-12-21 02:23:06 +01:00
|
|
|
* key >= given scankey, or > scankey if nextkey is true. (NOTE: in
|
|
|
|
* particular, this means it is possible to return a value 1 greater than the
|
|
|
|
* number of keys on the page, if the scankey is > all keys on the page.)
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
|
2003-12-21 02:23:06 +01:00
|
|
|
* of the last key < given scankey, or last key <= given scankey if nextkey
|
|
|
|
* is true. (Since _bt_compare treats the first data key of such a page as
|
|
|
|
* minus infinity, there will be at least one key < scankey, so the result
|
|
|
|
* always points at one of the keys on the page.) This key indicates the
|
|
|
|
* right place to descend to be sure we find all leaf keys >= given scankey
|
|
|
|
* (or leaf keys > given scankey when nextkey is true).
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* This procedure is not responsible for walking right, it just examines
|
2001-03-22 05:01:46 +01:00
|
|
|
* the given page. _bt_binsrch() has no lock or refcount side effects
|
2000-07-21 08:42:39 +02:00
|
|
|
* on the buffer.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
OffsetNumber
|
|
|
|
_bt_binsrch(Relation rel,
|
1997-09-07 07:04:48 +02:00
|
|
|
Buffer buf,
|
|
|
|
int keysz,
|
2003-12-21 02:23:06 +01:00
|
|
|
ScanKey scankey,
|
|
|
|
bool nextkey)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
TupleDesc itupdesc;
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
OffsetNumber low,
|
|
|
|
high;
|
2003-12-21 02:23:06 +01:00
|
|
|
int32 result,
|
|
|
|
cmpval;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
itupdesc = RelationGetDescr(rel);
|
1997-09-07 07:04:48 +02:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
low = P_FIRSTDATAKEY(opaque);
|
1997-09-07 07:04:48 +02:00
|
|
|
high = PageGetMaxOffsetNumber(page);
|
|
|
|
|
|
|
|
/*
|
1999-07-17 00:17:06 +02:00
|
|
|
* If there are no keys on the page, return the first available slot.
|
2000-04-12 19:17:23 +02:00
|
|
|
* Note this covers two cases: the page is really empty (no keys), or
|
|
|
|
* it contains only a high key. The latter case is possible after
|
2000-07-21 08:42:39 +02:00
|
|
|
* vacuuming. This can never happen on an internal page, however,
|
|
|
|
* since they are never empty (an internal page must have children).
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
1999-07-17 00:17:06 +02:00
|
|
|
if (high < low)
|
1998-09-01 05:29:17 +02:00
|
|
|
return low;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1999-07-17 00:17:06 +02:00
|
|
|
/*
|
2003-12-21 02:23:06 +01:00
|
|
|
* Binary search to find the first key on the page >= scan key, or
|
|
|
|
* first key > scankey when nextkey is true.
|
|
|
|
*
|
2004-08-29 07:07:03 +02:00
|
|
|
* For nextkey=false (cmpval=1), the loop invariant is: all slots before
|
|
|
|
* 'low' are < scan key, all slots at or after 'high' are >= scan key.
|
2003-12-21 02:23:06 +01:00
|
|
|
*
|
2004-08-29 07:07:03 +02:00
|
|
|
* For nextkey=true (cmpval=0), the loop invariant is: all slots before
|
|
|
|
* 'low' are <= scan key, all slots at or after 'high' are > scan key.
|
2003-12-21 02:23:06 +01:00
|
|
|
*
|
|
|
|
* We can fall out when high == low.
|
1999-07-17 00:17:06 +02:00
|
|
|
*/
|
|
|
|
high++; /* establish the loop invariant for high */
|
|
|
|
|
2003-12-21 02:23:06 +01:00
|
|
|
cmpval = nextkey ? 0 : 1; /* select comparison value */
|
|
|
|
|
1999-07-17 00:17:06 +02:00
|
|
|
while (high > low)
|
1997-04-16 03:48:29 +02:00
|
|
|
{
|
2000-04-12 19:17:23 +02:00
|
|
|
OffsetNumber mid = low + ((high - low) / 2);
|
|
|
|
|
1999-07-17 00:17:06 +02:00
|
|
|
/* We have low <= mid < high, so mid points at a real slot */
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
result = _bt_compare(rel, keysz, scankey, page, mid);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2003-12-21 02:23:06 +01:00
|
|
|
if (result >= cmpval)
|
1999-07-17 00:17:06 +02:00
|
|
|
low = mid + 1;
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
1999-07-17 00:17:06 +02:00
|
|
|
high = mid;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
1999-07-17 00:17:06 +02:00
|
|
|
* At this point we have high == low, but be careful: they could point
|
2000-07-21 08:42:39 +02:00
|
|
|
* past the last slot on the page.
|
1997-09-07 07:04:48 +02:00
|
|
|
*
|
2004-08-29 07:07:03 +02:00
|
|
|
* On a leaf page, we always return the first key >= scan key (resp. >
|
|
|
|
* scan key), which could be the last slot + 1.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
if (P_ISLEAF(opaque))
|
1999-07-17 00:17:06 +02:00
|
|
|
return low;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* On a non-leaf page, return the last key < scan key (resp. <= scan
|
|
|
|
* key). There must be one if _bt_compare() is playing by the rules.
|
1999-07-17 00:17:06 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
Assert(low > P_FIRSTDATAKEY(opaque));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1999-07-17 00:17:06 +02:00
|
|
|
return OffsetNumberPrev(low);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*----------
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_compare() -- Compare scankey to a particular tuple on the page.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* keysz: number of key conditions to be checked (might be less than the
|
|
|
|
* total length of the scan key!)
|
|
|
|
* page/offnum: location of btree item to be compared to.
|
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* This routine returns:
|
2000-05-30 06:25:00 +02:00
|
|
|
* <0 if scankey < tuple at offnum;
|
1997-09-07 07:04:48 +02:00
|
|
|
* 0 if scankey == tuple at offnum;
|
2000-05-30 06:25:00 +02:00
|
|
|
* >0 if scankey > tuple at offnum.
|
2000-07-21 08:42:39 +02:00
|
|
|
* NULLs in the keys are treated as sortable values. Therefore
|
|
|
|
* "equality" does not necessarily mean that the item should be
|
|
|
|
* returned to the caller as a matching key!
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
|
|
|
|
* "minus infinity": this routine will always claim it is less than the
|
|
|
|
* scankey. The actual key value stored (if any, which there probably isn't)
|
|
|
|
* does not matter. This convention allows us to implement the Lehman and
|
|
|
|
* Yao convention that the first down-link pointer is before the first key.
|
|
|
|
* See backend/access/nbtree/README for details.
|
|
|
|
*----------
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
int32
|
1996-07-09 08:22:35 +02:00
|
|
|
_bt_compare(Relation rel,
|
1997-09-07 07:04:48 +02:00
|
|
|
int keysz,
|
|
|
|
ScanKey scankey,
|
2000-07-21 08:42:39 +02:00
|
|
|
Page page,
|
1997-09-07 07:04:48 +02:00
|
|
|
OffsetNumber offnum)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
TupleDesc itupdesc = RelationGetDescr(rel);
|
|
|
|
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
1997-09-08 04:41:22 +02:00
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
int i;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Force result ">" if target item is first data item on an internal
|
|
|
|
* page --- see NOTE above.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
|
1998-09-01 05:29:17 +02:00
|
|
|
return 1;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
|
|
itup = &(btitem->bti_itup);
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* The scan key is set up with the attribute number associated with
|
|
|
|
* each term in the key. It is important that, if the index is
|
|
|
|
* multi-key, the scan contain the first k key attributes, and that
|
|
|
|
* they be in order. If you think about how multi-key ordering works,
|
|
|
|
* you'll understand why this is.
|
1996-12-06 10:41:45 +01:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* We don't test for violation of this condition here, however. The
|
|
|
|
* initial setup for the index scan had better have gotten it right
|
|
|
|
* (see _bt_first).
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2003-11-12 22:15:59 +01:00
|
|
|
for (i = 1; i <= keysz; i++)
|
1997-03-24 09:48:16 +01:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
Datum datum;
|
|
|
|
bool isNull;
|
|
|
|
int32 result;
|
|
|
|
|
2003-11-12 22:15:59 +01:00
|
|
|
datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/* see comments about NULLs handling in btbuild */
|
2003-11-12 22:15:59 +01:00
|
|
|
if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
if (isNull)
|
2000-05-30 06:25:00 +02:00
|
|
|
result = 0; /* NULL "=" NULL */
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
2000-05-30 06:25:00 +02:00
|
|
|
result = 1; /* NULL ">" NOT_NULL */
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
else if (isNull) /* key is NOT_NULL and item is NULL */
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-05-30 06:25:00 +02:00
|
|
|
result = -1; /* NOT_NULL "<" NULL */
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
else
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
2003-11-12 22:15:59 +01:00
|
|
|
/*
|
|
|
|
* The sk_func needs to be passed the index value as left arg
|
2004-08-29 07:07:03 +02:00
|
|
|
* and the sk_argument as right arg (they might be of
|
|
|
|
* different types). Since it is convenient for callers to
|
|
|
|
* think of _bt_compare as comparing the scankey to the index
|
|
|
|
* item, we have to flip the sign of the comparison result.
|
2003-11-12 22:15:59 +01:00
|
|
|
*
|
|
|
|
* Note: curious-looking coding is to avoid overflow if
|
|
|
|
* comparison function returns INT_MIN. There is no risk of
|
|
|
|
* overflow for positive results.
|
|
|
|
*/
|
|
|
|
result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
|
|
|
|
datum,
|
|
|
|
scankey->sk_argument));
|
|
|
|
result = (result < 0) ? 1 : -result;
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/* if the keys are unequal, return the difference */
|
|
|
|
if (result != 0)
|
1998-09-01 05:29:17 +02:00
|
|
|
return result;
|
2003-11-12 22:15:59 +01:00
|
|
|
|
|
|
|
scankey++;
|
1996-10-30 07:08:10 +01:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* if we get here, the keys are equal */
|
1998-09-01 05:29:17 +02:00
|
|
|
return 0;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_next() -- Get the next item in a scan.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* On entry, we have a valid currentItemData in the scan, and a
|
2000-07-21 08:42:39 +02:00
|
|
|
* read lock and pin count on the page that contains that item.
|
2002-05-21 01:51:44 +02:00
|
|
|
* We return the next item in the scan, or false if no more.
|
2000-07-21 08:42:39 +02:00
|
|
|
* On successful exit, the page containing the new item is locked
|
2002-05-21 01:51:44 +02:00
|
|
|
* and pinned; on failure exit, no lock or pin is held.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2002-05-21 01:51:44 +02:00
|
|
|
bool
|
1996-07-09 08:22:35 +02:00
|
|
|
_bt_next(IndexScanDesc scan, ScanDirection dir)
|
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
Relation rel;
|
|
|
|
Buffer buf;
|
|
|
|
Page page;
|
|
|
|
OffsetNumber offnum;
|
|
|
|
ItemPointer current;
|
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
BTScanOpaque so;
|
2000-07-25 06:47:59 +02:00
|
|
|
bool continuescan;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2002-05-21 01:51:44 +02:00
|
|
|
rel = scan->indexRelation;
|
1997-09-07 07:04:48 +02:00
|
|
|
so = (BTScanOpaque) scan->opaque;
|
|
|
|
current = &(scan->currentItemData);
|
|
|
|
|
|
|
|
/* we still have the buffer pinned and locked */
|
|
|
|
buf = so->btso_curbuf;
|
2000-07-21 08:42:39 +02:00
|
|
|
Assert(BufferIsValid(buf));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
/* step one tuple in the appropriate direction */
|
|
|
|
if (!_bt_step(scan, &buf, dir))
|
2002-05-21 01:51:44 +02:00
|
|
|
return false;
|
1997-03-18 19:41:37 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* current is the next candidate tuple to return */
|
1997-09-07 07:04:48 +02:00
|
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
|
|
itup = &btitem->bti_itup;
|
|
|
|
|
2000-07-25 06:47:59 +02:00
|
|
|
if (_bt_checkkeys(scan, itup, dir, &continuescan))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* tuple passes all scan key conditions, so return it */
|
2002-05-21 01:51:44 +02:00
|
|
|
scan->xs_ctup.t_self = itup->t_tid;
|
|
|
|
return true;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* This tuple doesn't pass, but there might be more that do */
|
2000-07-25 06:47:59 +02:00
|
|
|
} while (continuescan);
|
1997-03-18 19:41:37 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* No more items, so close down the current-item info */
|
1997-09-07 07:04:48 +02:00
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
so->btso_curbuf = InvalidBuffer;
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2002-05-21 01:51:44 +02:00
|
|
|
return false;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_first() -- Find the first item in a scan.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* We need to be clever about the type of scan, the operation it's
|
2002-05-21 01:51:44 +02:00
|
|
|
* performing, and the tree ordering. We find the
|
|
|
|
* first item in the tree that satisfies the qualification
|
1997-09-07 07:04:48 +02:00
|
|
|
* associated with the scan descriptor. On exit, the page containing
|
|
|
|
* the current index tuple is read locked and pinned, and the scan's
|
|
|
|
* opaque data entry is updated to include the buffer.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2002-05-21 01:51:44 +02:00
|
|
|
bool
|
1996-07-09 08:22:35 +02:00
|
|
|
_bt_first(IndexScanDesc scan, ScanDirection dir)
|
|
|
|
{
|
2002-05-24 20:57:57 +02:00
|
|
|
Relation rel = scan->indexRelation;
|
|
|
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
1997-09-08 04:41:22 +02:00
|
|
|
Buffer buf;
|
|
|
|
Page page;
|
|
|
|
BTStack stack;
|
2000-07-21 08:42:39 +02:00
|
|
|
OffsetNumber offnum;
|
1997-09-08 04:41:22 +02:00
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
ItemPointer current;
|
|
|
|
BlockNumber blkno;
|
|
|
|
StrategyNumber strat;
|
2002-05-21 01:51:44 +02:00
|
|
|
bool res;
|
2003-12-21 02:23:06 +01:00
|
|
|
bool nextkey;
|
2003-12-21 18:52:34 +01:00
|
|
|
bool goback;
|
2000-07-25 06:47:59 +02:00
|
|
|
bool continuescan;
|
2003-12-21 04:00:04 +01:00
|
|
|
ScanKey scankeys;
|
2004-08-29 07:07:03 +02:00
|
|
|
ScanKey *startKeys = NULL;
|
2000-04-12 19:17:23 +02:00
|
|
|
int keysCount = 0;
|
2003-11-12 22:15:59 +01:00
|
|
|
int i;
|
2000-04-12 19:17:23 +02:00
|
|
|
StrategyNumber strat_total;
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
/*
|
2003-11-12 22:15:59 +01:00
|
|
|
* Examine the scan keys and eliminate any redundant keys; also
|
|
|
|
* discover how many keys must be matched to continue the scan.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2003-11-12 22:15:59 +01:00
|
|
|
_bt_preprocess_keys(scan);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-25 06:47:59 +02:00
|
|
|
/*
|
2003-11-12 22:15:59 +01:00
|
|
|
* Quit now if _bt_preprocess_keys() discovered that the scan keys can
|
|
|
|
* never be satisfied (eg, x == 1 AND x > 2).
|
2000-07-25 06:47:59 +02:00
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
if (!so->qual_ok)
|
2002-05-21 01:51:44 +02:00
|
|
|
return false;
|
2000-07-25 06:47:59 +02:00
|
|
|
|
2003-11-12 22:15:59 +01:00
|
|
|
/*----------
|
2000-07-25 06:47:59 +02:00
|
|
|
* Examine the scan keys to discover where we need to start the scan.
|
2003-11-12 22:15:59 +01:00
|
|
|
*
|
|
|
|
* We want to identify the keys that can be used as starting boundaries;
|
|
|
|
* these are =, >, or >= keys for a forward scan or =, <, <= keys for
|
|
|
|
* a backwards scan. We can use keys for multiple attributes so long as
|
2004-08-29 07:07:03 +02:00
|
|
|
* the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
|
2003-11-12 22:15:59 +01:00
|
|
|
* a > or < boundary or find an attribute with no boundary (which can be
|
|
|
|
* thought of as the same as "> -infinity"), we can't use keys for any
|
|
|
|
* attributes to its right, because it would break our simplistic notion
|
|
|
|
* of what initial positioning strategy to use.
|
|
|
|
*
|
|
|
|
* When the scan keys include non-default operators, _bt_preprocess_keys
|
|
|
|
* may not be able to eliminate redundant keys; in such cases we will
|
|
|
|
* arbitrarily pick a usable one for each attribute. This is correct
|
|
|
|
* but possibly not optimal behavior. (For example, with keys like
|
|
|
|
* "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
|
|
|
|
* x=5 would be more efficient.) Since the situation only arises in
|
|
|
|
* hokily-worded queries, live with it.
|
|
|
|
*
|
|
|
|
* When both equality and inequality keys appear for a single attribute
|
|
|
|
* (again, only possible when non-default operators appear), we *must*
|
|
|
|
* select one of the equality keys for the starting point, because
|
|
|
|
* _bt_checkkeys() will stop the scan as soon as an equality qual fails.
|
|
|
|
* For example, if we have keys like "x >= 4 AND x = 10" and we elect to
|
|
|
|
* start at x=4, we will fail and stop before reaching x=10. If multiple
|
|
|
|
* equality quals survive preprocessing, however, it doesn't matter which
|
|
|
|
* one we use --- by definition, they are either redundant or
|
|
|
|
* contradictory.
|
|
|
|
*----------
|
2000-07-25 06:47:59 +02:00
|
|
|
*/
|
1999-09-27 20:20:21 +02:00
|
|
|
strat_total = BTEqualStrategyNumber;
|
2000-07-25 06:47:59 +02:00
|
|
|
if (so->numberOfKeys > 0)
|
1999-09-27 20:20:21 +02:00
|
|
|
{
|
2003-11-12 22:15:59 +01:00
|
|
|
AttrNumber curattr;
|
|
|
|
ScanKey chosen;
|
|
|
|
ScanKey cur;
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2003-11-12 22:15:59 +01:00
|
|
|
startKeys = (ScanKey *) palloc(so->numberOfKeys * sizeof(ScanKey));
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2003-11-12 22:15:59 +01:00
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* chosen is the so-far-chosen key for the current attribute, if
|
|
|
|
* any. We don't cast the decision in stone until we reach keys
|
|
|
|
* for the next attribute.
|
2003-11-12 22:15:59 +01:00
|
|
|
*/
|
|
|
|
curattr = 1;
|
|
|
|
chosen = NULL;
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2003-11-12 22:15:59 +01:00
|
|
|
/*
|
|
|
|
* Loop iterates from 0 to numberOfKeys inclusive; we use the last
|
|
|
|
* pass to handle after-last-key processing. Actual exit from the
|
|
|
|
* loop is at one of the "break" statements below.
|
|
|
|
*/
|
|
|
|
for (cur = so->keyData, i = 0;; cur++, i++)
|
|
|
|
{
|
|
|
|
if (i >= so->numberOfKeys || cur->sk_attno != curattr)
|
1999-09-27 20:20:21 +02:00
|
|
|
{
|
2003-11-12 22:15:59 +01:00
|
|
|
/*
|
|
|
|
* Done looking at keys for curattr. If we didn't find a
|
|
|
|
* usable boundary key, quit; else save the boundary key
|
|
|
|
* pointer in startKeys.
|
|
|
|
*/
|
|
|
|
if (chosen == NULL)
|
|
|
|
break;
|
|
|
|
startKeys[keysCount++] = chosen;
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2003-11-12 22:15:59 +01:00
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* Adjust strat_total, and quit if we have stored a > or <
|
|
|
|
* key.
|
2003-11-12 22:15:59 +01:00
|
|
|
*/
|
|
|
|
strat = chosen->sk_strategy;
|
|
|
|
if (strat != BTEqualStrategyNumber)
|
|
|
|
{
|
|
|
|
strat_total = strat;
|
|
|
|
if (strat == BTGreaterStrategyNumber ||
|
|
|
|
strat == BTLessStrategyNumber)
|
|
|
|
break;
|
|
|
|
}
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2003-11-12 22:15:59 +01:00
|
|
|
/*
|
|
|
|
* Done if that was the last attribute.
|
|
|
|
*/
|
|
|
|
if (i >= so->numberOfKeys)
|
1999-04-13 19:18:29 +02:00
|
|
|
break;
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2003-11-12 22:15:59 +01:00
|
|
|
/*
|
|
|
|
* Reset for next attr, which should be in sequence.
|
|
|
|
*/
|
|
|
|
Assert(cur->sk_attno == curattr + 1);
|
|
|
|
curattr = cur->sk_attno;
|
|
|
|
chosen = NULL;
|
1999-09-27 20:20:21 +02:00
|
|
|
}
|
2003-11-12 22:15:59 +01:00
|
|
|
|
|
|
|
/* Can we use this key as a starting boundary for this attr? */
|
|
|
|
switch (cur->sk_strategy)
|
1999-09-27 20:20:21 +02:00
|
|
|
{
|
2003-11-12 22:15:59 +01:00
|
|
|
case BTLessStrategyNumber:
|
|
|
|
case BTLessEqualStrategyNumber:
|
|
|
|
if (chosen == NULL && ScanDirectionIsBackward(dir))
|
|
|
|
chosen = cur;
|
|
|
|
break;
|
|
|
|
case BTEqualStrategyNumber:
|
|
|
|
/* override any non-equality choice */
|
|
|
|
chosen = cur;
|
|
|
|
break;
|
|
|
|
case BTGreaterEqualStrategyNumber:
|
|
|
|
case BTGreaterStrategyNumber:
|
|
|
|
if (chosen == NULL && ScanDirectionIsForward(dir))
|
|
|
|
chosen = cur;
|
1999-04-13 19:18:29 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
|
2003-11-12 22:15:59 +01:00
|
|
|
/*
|
|
|
|
* If we found no usable boundary keys, we have to start from one end
|
|
|
|
* of the tree. Walk down that edge to the first or last key, and
|
|
|
|
* scan from there.
|
|
|
|
*/
|
|
|
|
if (keysCount == 0)
|
1999-09-27 20:20:21 +02:00
|
|
|
{
|
2003-11-12 22:15:59 +01:00
|
|
|
if (startKeys)
|
|
|
|
pfree(startKeys);
|
1998-09-01 05:29:17 +02:00
|
|
|
return _bt_endpoint(scan, dir);
|
1999-09-27 20:20:21 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/*
|
2000-07-25 06:47:59 +02:00
|
|
|
* We want to start the scan somewhere within the index. Set up a
|
2003-11-12 22:15:59 +01:00
|
|
|
* 3-way-comparison scankey we can use to search for the boundary
|
|
|
|
* point we identified above.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-04-12 19:17:23 +02:00
|
|
|
scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData));
|
|
|
|
for (i = 0; i < keysCount; i++)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2003-11-12 22:15:59 +01:00
|
|
|
ScanKey cur = startKeys[i];
|
2001-03-22 05:01:46 +01:00
|
|
|
|
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* _bt_preprocess_keys disallows it, but it's place to add some
|
|
|
|
* code later
|
2001-03-22 05:01:46 +01:00
|
|
|
*/
|
2003-11-12 22:15:59 +01:00
|
|
|
if (cur->sk_flags & SK_ISNULL)
|
1999-09-27 20:20:21 +02:00
|
|
|
{
|
2003-11-12 22:15:59 +01:00
|
|
|
pfree(startKeys);
|
1999-09-27 20:20:21 +02:00
|
|
|
pfree(scankeys);
|
2003-07-21 22:29:40 +02:00
|
|
|
elog(ERROR, "btree doesn't support is(not)null, yet");
|
2002-05-21 01:51:44 +02:00
|
|
|
return false;
|
2000-04-12 19:17:23 +02:00
|
|
|
}
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2003-11-09 22:30:38 +01:00
|
|
|
/*
|
2003-11-12 22:15:59 +01:00
|
|
|
* If scankey operator is of default subtype, we can use the
|
2004-08-29 07:07:03 +02:00
|
|
|
* cached comparison procedure; otherwise gotta look it up in the
|
|
|
|
* catalogs.
|
2003-11-09 22:30:38 +01:00
|
|
|
*/
|
2003-11-12 22:15:59 +01:00
|
|
|
if (cur->sk_subtype == InvalidOid)
|
|
|
|
{
|
|
|
|
FmgrInfo *procinfo;
|
|
|
|
|
|
|
|
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
|
|
|
|
ScanKeyEntryInitializeWithInfo(scankeys + i,
|
|
|
|
cur->sk_flags,
|
|
|
|
i + 1,
|
|
|
|
InvalidStrategy,
|
|
|
|
InvalidOid,
|
|
|
|
procinfo,
|
|
|
|
cur->sk_argument);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
RegProcedure cmp_proc;
|
|
|
|
|
|
|
|
cmp_proc = get_opclass_proc(rel->rd_index->indclass[i],
|
|
|
|
cur->sk_subtype,
|
|
|
|
BTORDER_PROC);
|
|
|
|
ScanKeyEntryInitialize(scankeys + i,
|
|
|
|
cur->sk_flags,
|
|
|
|
i + 1,
|
|
|
|
InvalidStrategy,
|
|
|
|
cur->sk_subtype,
|
|
|
|
cmp_proc,
|
|
|
|
cur->sk_argument);
|
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2003-11-12 22:15:59 +01:00
|
|
|
|
|
|
|
pfree(startKeys);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2003-12-21 02:23:06 +01:00
|
|
|
/*
|
2003-12-21 18:52:34 +01:00
|
|
|
* Examine the selected initial-positioning strategy to determine
|
2004-08-29 07:07:03 +02:00
|
|
|
* exactly where we need to start the scan, and set flag variables to
|
|
|
|
* control the code below.
|
2003-12-21 18:52:34 +01:00
|
|
|
*
|
2004-08-29 07:07:03 +02:00
|
|
|
* If nextkey = false, _bt_search and _bt_binsrch will locate the first
|
|
|
|
* item >= scan key. If nextkey = true, they will locate the first
|
|
|
|
* item > scan key.
|
2003-12-21 18:52:34 +01:00
|
|
|
*
|
2004-08-29 07:07:03 +02:00
|
|
|
* If goback = true, we will then step back one item, while if goback =
|
|
|
|
* false, we will start the scan on the located item.
|
2003-12-21 18:52:34 +01:00
|
|
|
*
|
|
|
|
* it's yet other place to add some code later for is(not)null ...
|
2003-12-21 02:23:06 +01:00
|
|
|
*/
|
|
|
|
switch (strat_total)
|
|
|
|
{
|
|
|
|
case BTLessStrategyNumber:
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2003-12-21 18:52:34 +01:00
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* Find first item >= scankey, then back up one to arrive at
|
|
|
|
* last item < scankey. (Note: this positioning strategy is
|
|
|
|
* only used for a backward scan, so that is always the
|
|
|
|
* correct starting position.)
|
2003-12-21 18:52:34 +01:00
|
|
|
*/
|
2003-12-21 02:23:06 +01:00
|
|
|
nextkey = false;
|
2003-12-21 18:52:34 +01:00
|
|
|
goback = true;
|
2003-12-21 02:23:06 +01:00
|
|
|
break;
|
|
|
|
|
|
|
|
case BTLessEqualStrategyNumber:
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2003-12-21 18:52:34 +01:00
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* Find first item > scankey, then back up one to arrive at
|
|
|
|
* last item <= scankey. (Note: this positioning strategy is
|
|
|
|
* only used for a backward scan, so that is always the
|
|
|
|
* correct starting position.)
|
2003-12-21 18:52:34 +01:00
|
|
|
*/
|
2003-12-21 02:23:06 +01:00
|
|
|
nextkey = true;
|
2003-12-21 18:52:34 +01:00
|
|
|
goback = true;
|
2003-12-21 02:23:06 +01:00
|
|
|
break;
|
|
|
|
|
|
|
|
case BTEqualStrategyNumber:
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2003-12-21 02:23:06 +01:00
|
|
|
/*
|
|
|
|
* If a backward scan was specified, need to start with last
|
|
|
|
* equal item not first one.
|
|
|
|
*/
|
|
|
|
if (ScanDirectionIsBackward(dir))
|
2003-12-21 18:52:34 +01:00
|
|
|
{
|
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* This is the same as the <= strategy. We will check at
|
|
|
|
* the end whether the found item is actually =.
|
2003-12-21 18:52:34 +01:00
|
|
|
*/
|
2003-12-21 02:23:06 +01:00
|
|
|
nextkey = true;
|
2003-12-21 18:52:34 +01:00
|
|
|
goback = true;
|
|
|
|
}
|
2003-12-21 02:23:06 +01:00
|
|
|
else
|
2003-12-21 18:52:34 +01:00
|
|
|
{
|
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* This is the same as the >= strategy. We will check at
|
|
|
|
* the end whether the found item is actually =.
|
2003-12-21 18:52:34 +01:00
|
|
|
*/
|
2003-12-21 02:23:06 +01:00
|
|
|
nextkey = false;
|
2003-12-21 18:52:34 +01:00
|
|
|
goback = false;
|
|
|
|
}
|
2003-12-21 02:23:06 +01:00
|
|
|
break;
|
|
|
|
|
|
|
|
case BTGreaterEqualStrategyNumber:
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2003-12-21 18:52:34 +01:00
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* Find first item >= scankey. (This is only used for forward
|
|
|
|
* scans.)
|
2003-12-21 18:52:34 +01:00
|
|
|
*/
|
2003-12-21 02:23:06 +01:00
|
|
|
nextkey = false;
|
2003-12-21 18:52:34 +01:00
|
|
|
goback = false;
|
2003-12-21 02:23:06 +01:00
|
|
|
break;
|
|
|
|
|
|
|
|
case BTGreaterStrategyNumber:
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2003-12-21 18:52:34 +01:00
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* Find first item > scankey. (This is only used for forward
|
|
|
|
* scans.)
|
2003-12-21 18:52:34 +01:00
|
|
|
*/
|
2003-12-21 02:23:06 +01:00
|
|
|
nextkey = true;
|
2003-12-21 18:52:34 +01:00
|
|
|
goback = false;
|
2003-12-21 02:23:06 +01:00
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
/* can't get here, but keep compiler quiet */
|
|
|
|
elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Use the manufactured scan key to descend the tree and position
|
|
|
|
* ourselves on the target leaf page.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2003-12-21 02:23:06 +01:00
|
|
|
stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* don't need to keep the stack around... */
|
|
|
|
_bt_freestack(stack);
|
|
|
|
|
2003-12-21 04:00:04 +01:00
|
|
|
current = &(scan->currentItemData);
|
|
|
|
|
2001-03-22 05:01:46 +01:00
|
|
|
if (!BufferIsValid(buf))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Only get here if index is completely empty */
|
1997-09-07 07:04:48 +02:00
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
so->btso_curbuf = InvalidBuffer;
|
1999-09-27 20:20:21 +02:00
|
|
|
pfree(scankeys);
|
2002-05-21 01:51:44 +02:00
|
|
|
return false;
|
1997-05-30 20:35:40 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* remember which buffer we have pinned */
|
|
|
|
so->btso_curbuf = buf;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-25 06:47:59 +02:00
|
|
|
/* position to the precise item on the page */
|
2003-12-21 02:23:06 +01:00
|
|
|
offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2003-12-21 18:52:34 +01:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
blkno = BufferGetBlockNumber(buf);
|
1997-09-07 07:04:48 +02:00
|
|
|
ItemPointerSet(current, blkno, offnum);
|
|
|
|
|
2003-12-21 04:00:04 +01:00
|
|
|
/* done with manufactured scankey, now */
|
|
|
|
pfree(scankeys);
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* If nextkey = false, we are positioned at the first item >= scan
|
|
|
|
* key, or possibly at the end of a page on which all the existing
|
|
|
|
* items are less than the scan key and we know that everything on
|
|
|
|
* later pages is greater than or equal to scan key.
|
2001-10-25 07:50:21 +02:00
|
|
|
*
|
2004-08-29 07:07:03 +02:00
|
|
|
* If nextkey = true, we are positioned at the first item > scan key, or
|
|
|
|
* possibly at the end of a page on which all the existing items are
|
2003-12-21 02:23:06 +01:00
|
|
|
* less than or equal to the scan key and we know that everything on
|
|
|
|
* later pages is greater than scan key.
|
|
|
|
*
|
2003-12-21 18:52:34 +01:00
|
|
|
* The actually desired starting point is either this item or the prior
|
2004-08-29 07:07:03 +02:00
|
|
|
* one, or in the end-of-page case it's the first item on the next
|
|
|
|
* page or the last item on this page. We apply _bt_step if needed to
|
|
|
|
* get to the right place.
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
2004-08-29 07:07:03 +02:00
|
|
|
* If _bt_step fails (meaning we fell off the end of the index in one
|
|
|
|
* direction or the other), then there are no matches so we just
|
2003-12-21 02:23:06 +01:00
|
|
|
* return false.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2003-12-21 18:52:34 +01:00
|
|
|
if (goback)
|
1997-05-30 20:35:40 +02:00
|
|
|
{
|
2003-12-21 18:52:34 +01:00
|
|
|
/* _bt_step will do the right thing if we are at end-of-page */
|
|
|
|
if (!_bt_step(scan, &buf, BackwardScanDirection))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* If we're at end-of-page, must step forward to next page */
|
|
|
|
if (offnum > PageGetMaxOffsetNumber(page))
|
|
|
|
{
|
|
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
2002-05-21 01:51:44 +02:00
|
|
|
return false;
|
2003-12-21 18:52:34 +01:00
|
|
|
}
|
1997-05-30 20:35:40 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/* okay, current item pointer for the scan is right */
|
|
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
|
|
itup = &btitem->bti_itup;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* is the first item actually acceptable? */
|
2000-07-25 06:47:59 +02:00
|
|
|
if (_bt_checkkeys(scan, itup, dir, &continuescan))
|
1997-05-30 20:35:40 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* yes, return it */
|
2002-05-21 01:51:44 +02:00
|
|
|
scan->xs_ctup.t_self = itup->t_tid;
|
|
|
|
res = true;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
2000-07-25 06:47:59 +02:00
|
|
|
else if (continuescan)
|
1999-04-13 19:18:29 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* no, but there might be another one that is */
|
|
|
|
res = _bt_next(scan, dir);
|
1999-04-13 19:18:29 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* no tuples in the index match this scan key */
|
1997-09-07 07:04:48 +02:00
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
so->btso_curbuf = InvalidBuffer;
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
2002-05-21 01:51:44 +02:00
|
|
|
res = false;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
return res;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_step() -- Step one item in the requested direction in a scan on
|
|
|
|
* the tree.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* *bufP is the current buffer (read-locked and pinned). If we change
|
|
|
|
* pages, it's updated appropriately.
|
|
|
|
*
|
|
|
|
* If successful, update scan's currentItemData and return true.
|
|
|
|
* If no adjacent record exists in the requested direction,
|
|
|
|
* release buffer pin/locks and return false.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
bool
|
1997-09-08 22:59:27 +02:00
|
|
|
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2002-05-21 01:51:44 +02:00
|
|
|
Relation rel = scan->indexRelation;
|
2000-07-21 08:42:39 +02:00
|
|
|
ItemPointer current = &(scan->currentItemData);
|
|
|
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
1997-09-08 04:41:22 +02:00
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
OffsetNumber offnum,
|
|
|
|
maxoff;
|
|
|
|
BlockNumber blkno;
|
1999-05-25 18:15:34 +02:00
|
|
|
|
1999-03-28 22:32:42 +02:00
|
|
|
/*
|
1999-05-25 18:15:34 +02:00
|
|
|
* Don't use ItemPointerGetOffsetNumber or you risk to get assertion
|
|
|
|
* due to ability of ip_posid to be equal 0.
|
1999-03-28 22:32:42 +02:00
|
|
|
*/
|
|
|
|
offnum = current->ip_posid;
|
2000-07-21 08:42:39 +02:00
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
page = BufferGetPage(*bufP);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
|
|
|
|
if (ScanDirectionIsForward(dir))
|
|
|
|
{
|
|
|
|
if (!PageIsEmpty(page) && offnum < maxoff)
|
|
|
|
offnum = OffsetNumberNext(offnum);
|
|
|
|
else
|
|
|
|
{
|
2003-02-22 01:45:05 +01:00
|
|
|
/* Walk right to the next page with data */
|
2000-07-21 08:42:39 +02:00
|
|
|
for (;;)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* if we're at end of scan, release the buffer and return */
|
|
|
|
if (P_RIGHTMOST(opaque))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, *bufP);
|
2000-07-21 08:42:39 +02:00
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
*bufP = so->btso_curbuf = InvalidBuffer;
|
|
|
|
return false;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
/* step right one page */
|
|
|
|
blkno = opaque->btpo_next;
|
2004-04-21 20:24:26 +02:00
|
|
|
*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
|
2000-07-21 08:42:39 +02:00
|
|
|
page = BufferGetPage(*bufP);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
2003-02-22 01:45:05 +01:00
|
|
|
if (!P_IGNORE(opaque))
|
|
|
|
{
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
/* done if it's not empty */
|
|
|
|
offnum = P_FIRSTDATAKEY(opaque);
|
|
|
|
if (!PageIsEmpty(page) && offnum <= maxoff)
|
|
|
|
break;
|
|
|
|
}
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2003-08-04 02:43:34 +02:00
|
|
|
else
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2003-11-12 22:15:59 +01:00
|
|
|
/* backwards scan */
|
2000-07-21 08:42:39 +02:00
|
|
|
if (offnum > P_FIRSTDATAKEY(opaque))
|
1997-09-07 07:04:48 +02:00
|
|
|
offnum = OffsetNumberPrev(offnum);
|
|
|
|
else
|
|
|
|
{
|
2003-02-22 01:45:05 +01:00
|
|
|
/*
|
|
|
|
* Walk left to the next page with data. This is much more
|
|
|
|
* complex than the walk-right case because of the possibility
|
2003-08-04 02:43:34 +02:00
|
|
|
* that the page to our left splits while we are in flight to
|
|
|
|
* it, plus the possibility that the page we were on gets
|
|
|
|
* deleted after we leave it. See nbtree/README for details.
|
2003-02-22 01:45:05 +01:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
for (;;)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2003-02-22 01:45:05 +01:00
|
|
|
*bufP = _bt_walk_left(rel, *bufP);
|
|
|
|
|
|
|
|
/* if we're at end of scan, return failure */
|
|
|
|
if (*bufP == InvalidBuffer)
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
|
|
|
ItemPointerSetInvalid(current);
|
2003-02-22 01:45:05 +01:00
|
|
|
so->btso_curbuf = InvalidBuffer;
|
2000-07-21 08:42:39 +02:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
page = BufferGetPage(*bufP);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
2003-08-04 02:43:34 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
2003-02-22 01:45:05 +01:00
|
|
|
* Okay, we managed to move left to a non-deleted page.
|
2003-08-04 02:43:34 +02:00
|
|
|
* Done if it's not half-dead and not empty. Else loop
|
|
|
|
* back and do it all again.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
2003-02-22 01:45:05 +01:00
|
|
|
if (!P_IGNORE(opaque))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2003-02-22 01:45:05 +01:00
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
offnum = maxoff;
|
|
|
|
if (!PageIsEmpty(page) &&
|
|
|
|
maxoff >= P_FIRSTDATAKEY(opaque))
|
|
|
|
break;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
|
|
|
|
/* Update scan state */
|
1997-09-07 07:04:48 +02:00
|
|
|
so->btso_curbuf = *bufP;
|
2000-07-21 08:42:39 +02:00
|
|
|
blkno = BufferGetBlockNumber(*bufP);
|
1997-09-07 07:04:48 +02:00
|
|
|
ItemPointerSet(current, blkno, offnum);
|
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
return true;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2003-02-22 01:45:05 +01:00
|
|
|
/*
|
|
|
|
* _bt_walk_left() -- step left one page, if possible
|
|
|
|
*
|
|
|
|
* The given buffer must be pinned and read-locked. This will be dropped
|
|
|
|
* before stepping left. On return, we have pin and read lock on the
|
|
|
|
* returned page, instead.
|
|
|
|
*
|
|
|
|
* Returns InvalidBuffer if there is no page to the left (no lock is held
|
|
|
|
* in that case).
|
|
|
|
*
|
|
|
|
* When working on a non-leaf level, it is possible for the returned page
|
|
|
|
* to be half-dead; the caller should check that condition and step left
|
|
|
|
* again if it's important.
|
|
|
|
*/
|
|
|
|
static Buffer
|
|
|
|
_bt_walk_left(Relation rel, Buffer buf)
|
|
|
|
{
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
BlockNumber obknum;
|
|
|
|
BlockNumber lblkno;
|
|
|
|
BlockNumber blkno;
|
|
|
|
int tries;
|
|
|
|
|
|
|
|
/* if we're at end of tree, release buf and return failure */
|
|
|
|
if (P_LEFTMOST(opaque))
|
|
|
|
{
|
|
|
|
_bt_relbuf(rel, buf);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* remember original page we are stepping left from */
|
|
|
|
obknum = BufferGetBlockNumber(buf);
|
|
|
|
/* step left */
|
|
|
|
blkno = lblkno = opaque->btpo_prev;
|
2004-04-21 20:24:26 +02:00
|
|
|
buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
|
2003-02-22 01:45:05 +01:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
2003-08-04 02:43:34 +02:00
|
|
|
|
2003-02-22 01:45:05 +01:00
|
|
|
/*
|
2003-08-04 02:43:34 +02:00
|
|
|
* If this isn't the page we want, walk right till we find what we
|
2003-08-08 23:42:59 +02:00
|
|
|
* want --- but go no more than four hops (an arbitrary limit). If
|
|
|
|
* we don't find the correct page by then, the most likely bet is
|
|
|
|
* that the original page got deleted and isn't in the sibling
|
2003-08-04 02:43:34 +02:00
|
|
|
* chain at all anymore, not that its left sibling got split more
|
|
|
|
* than four times.
|
2003-02-22 01:45:05 +01:00
|
|
|
*
|
2003-08-04 02:43:34 +02:00
|
|
|
* Note that it is correct to test P_ISDELETED not P_IGNORE here,
|
|
|
|
* because half-dead pages are still in the sibling chain. Caller
|
|
|
|
* must reject half-dead pages if wanted.
|
2003-02-22 01:45:05 +01:00
|
|
|
*/
|
|
|
|
tries = 0;
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
|
|
|
|
{
|
|
|
|
/* Found desired page, return it */
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
if (P_RIGHTMOST(opaque) || ++tries > 4)
|
|
|
|
break;
|
|
|
|
blkno = opaque->btpo_next;
|
2004-04-21 20:24:26 +02:00
|
|
|
buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
|
2003-02-22 01:45:05 +01:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return to the original page to see what's up */
|
2004-04-21 20:24:26 +02:00
|
|
|
buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
|
2003-02-22 01:45:05 +01:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
if (P_ISDELETED(opaque))
|
|
|
|
{
|
|
|
|
/*
|
2003-08-04 02:43:34 +02:00
|
|
|
* It was deleted. Move right to first nondeleted page (there
|
|
|
|
* must be one); that is the page that has acquired the
|
2003-02-22 01:45:05 +01:00
|
|
|
* deleted one's keyspace, so stepping left from it will take
|
|
|
|
* us where we want to be.
|
|
|
|
*/
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
if (P_RIGHTMOST(opaque))
|
2003-07-21 22:29:40 +02:00
|
|
|
elog(ERROR, "fell off the end of \"%s\"",
|
2003-02-22 01:45:05 +01:00
|
|
|
RelationGetRelationName(rel));
|
|
|
|
blkno = opaque->btpo_next;
|
2004-04-21 20:24:26 +02:00
|
|
|
buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
|
2003-02-22 01:45:05 +01:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
if (!P_ISDELETED(opaque))
|
|
|
|
break;
|
|
|
|
}
|
2003-08-04 02:43:34 +02:00
|
|
|
|
2003-02-22 01:45:05 +01:00
|
|
|
/*
|
2003-08-04 02:43:34 +02:00
|
|
|
* Now return to top of loop, resetting obknum to point to
|
|
|
|
* this nondeleted page, and try again.
|
2003-02-22 01:45:05 +01:00
|
|
|
*/
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
2003-08-04 02:43:34 +02:00
|
|
|
* It wasn't deleted; the explanation had better be that the
|
|
|
|
* page to the left got split or deleted. Without this check,
|
|
|
|
* we'd go into an infinite loop if there's anything wrong.
|
2003-02-22 01:45:05 +01:00
|
|
|
*/
|
|
|
|
if (opaque->btpo_prev == lblkno)
|
2003-07-28 02:09:16 +02:00
|
|
|
elog(ERROR, "could not find left sibling in \"%s\"",
|
2003-02-22 01:45:05 +01:00
|
|
|
RelationGetRelationName(rel));
|
|
|
|
/* Okay to try again with new lblkno value */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return InvalidBuffer;
|
|
|
|
}
|
|
|
|
|
2003-02-21 01:06:22 +01:00
|
|
|
/*
|
|
|
|
* _bt_get_endpoint() -- Find the first or last page on a given tree level
|
|
|
|
*
|
|
|
|
* If the index is empty, we will return InvalidBuffer; any other failure
|
2003-08-04 02:43:34 +02:00
|
|
|
* condition causes ereport(). We will not return a dead page.
|
2003-02-21 01:06:22 +01:00
|
|
|
*
|
|
|
|
* The returned buffer is pinned and read-locked.
|
|
|
|
*/
|
|
|
|
Buffer
|
|
|
|
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
|
|
|
|
{
|
|
|
|
Buffer buf;
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
OffsetNumber offnum;
|
|
|
|
BlockNumber blkno;
|
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we are looking for a leaf page, okay to descend from fast root;
|
2003-08-04 02:43:34 +02:00
|
|
|
* otherwise better descend from true root. (There is no point in
|
|
|
|
* being smarter about intermediate levels.)
|
2003-02-21 01:06:22 +01:00
|
|
|
*/
|
|
|
|
if (level == 0)
|
|
|
|
buf = _bt_getroot(rel, BT_READ);
|
|
|
|
else
|
|
|
|
buf = _bt_gettrueroot(rel);
|
|
|
|
|
|
|
|
if (!BufferIsValid(buf))
|
|
|
|
{
|
|
|
|
/* empty index... */
|
|
|
|
return InvalidBuffer;
|
|
|
|
}
|
|
|
|
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* If we landed on a deleted page, step right to find a live page
|
2003-08-04 02:43:34 +02:00
|
|
|
* (there must be one). Also, if we want the rightmost page, step
|
|
|
|
* right if needed to get to it (this could happen if the page
|
|
|
|
* split since we obtained a pointer to it).
|
2003-02-21 01:06:22 +01:00
|
|
|
*/
|
2003-02-22 01:45:05 +01:00
|
|
|
while (P_IGNORE(opaque) ||
|
2003-02-21 01:06:22 +01:00
|
|
|
(rightmost && !P_RIGHTMOST(opaque)))
|
|
|
|
{
|
|
|
|
blkno = opaque->btpo_next;
|
|
|
|
if (blkno == P_NONE)
|
2003-07-21 22:29:40 +02:00
|
|
|
elog(ERROR, "fell off the end of \"%s\"",
|
2003-02-22 01:45:05 +01:00
|
|
|
RelationGetRelationName(rel));
|
2004-04-21 20:24:26 +02:00
|
|
|
buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
|
2003-02-21 01:06:22 +01:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Done? */
|
|
|
|
if (opaque->btpo.level == level)
|
|
|
|
break;
|
|
|
|
if (opaque->btpo.level < level)
|
2003-07-21 22:29:40 +02:00
|
|
|
elog(ERROR, "btree level %u not found", level);
|
2003-02-21 01:06:22 +01:00
|
|
|
|
2003-02-22 01:45:05 +01:00
|
|
|
/* Descend to leftmost or rightmost child page */
|
2003-02-21 01:06:22 +01:00
|
|
|
if (rightmost)
|
|
|
|
offnum = PageGetMaxOffsetNumber(page);
|
|
|
|
else
|
|
|
|
offnum = P_FIRSTDATAKEY(opaque);
|
|
|
|
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
|
|
itup = &(btitem->bti_itup);
|
|
|
|
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
|
|
|
|
|
2004-04-21 20:24:26 +02:00
|
|
|
buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
|
2003-02-21 01:06:22 +01:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
}
|
|
|
|
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
2003-11-12 22:15:59 +01:00
|
|
|
* _bt_endpoint() -- Find the first or last key in the index, and scan
|
|
|
|
* from there to the first key satisfying all the quals.
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
|
|
|
* This is used by _bt_first() to set up a scan when we've determined
|
|
|
|
* that the scan must start at the beginning or end of the index (for
|
|
|
|
* a forward or backward scan respectively).
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2002-05-21 01:51:44 +02:00
|
|
|
static bool
|
1996-07-09 08:22:35 +02:00
|
|
|
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
|
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
Relation rel;
|
|
|
|
Buffer buf;
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
ItemPointer current;
|
2003-02-21 01:06:22 +01:00
|
|
|
OffsetNumber maxoff;
|
2000-07-21 08:42:39 +02:00
|
|
|
OffsetNumber start;
|
1997-09-08 04:41:22 +02:00
|
|
|
BlockNumber blkno;
|
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
BTScanOpaque so;
|
2002-05-21 01:51:44 +02:00
|
|
|
bool res;
|
2000-07-25 06:47:59 +02:00
|
|
|
bool continuescan;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2002-05-21 01:51:44 +02:00
|
|
|
rel = scan->indexRelation;
|
1997-09-07 07:04:48 +02:00
|
|
|
current = &(scan->currentItemData);
|
|
|
|
so = (BTScanOpaque) scan->opaque;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Scan down to the leftmost or rightmost leaf page. This is a
|
2001-03-22 05:01:46 +01:00
|
|
|
* simplified version of _bt_search(). We don't maintain a stack
|
2000-07-21 08:42:39 +02:00
|
|
|
* since we know we won't need it.
|
|
|
|
*/
|
2003-02-21 01:06:22 +01:00
|
|
|
buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
|
2000-07-21 08:42:39 +02:00
|
|
|
|
2001-03-22 05:01:46 +01:00
|
|
|
if (!BufferIsValid(buf))
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
|
|
|
/* empty index... */
|
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
so->btso_curbuf = InvalidBuffer;
|
2002-05-21 01:51:44 +02:00
|
|
|
return false;
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
blkno = BufferGetBlockNumber(buf);
|
1996-07-09 08:22:35 +02:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
2003-02-21 01:06:22 +01:00
|
|
|
Assert(P_ISLEAF(opaque));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
|
|
|
|
if (ScanDirectionIsForward(dir))
|
|
|
|
{
|
2003-02-21 01:06:22 +01:00
|
|
|
/* There could be dead pages to the left, so not this: */
|
|
|
|
/* Assert(P_LEFTMOST(opaque)); */
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
start = P_FIRSTDATAKEY(opaque);
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
else if (ScanDirectionIsBackward(dir))
|
1996-12-06 10:41:45 +01:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
Assert(P_RIGHTMOST(opaque));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
start = PageGetMaxOffsetNumber(page);
|
2001-03-22 05:01:46 +01:00
|
|
|
if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty
|
|
|
|
* page */
|
2000-07-21 08:42:39 +02:00
|
|
|
start = P_FIRSTDATAKEY(opaque);
|
1996-12-06 10:41:45 +01:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
2003-07-21 22:29:40 +02:00
|
|
|
elog(ERROR, "invalid scan direction: %d", (int) dir);
|
2000-07-21 08:42:39 +02:00
|
|
|
start = 0; /* keep compiler quiet */
|
|
|
|
}
|
|
|
|
|
|
|
|
ItemPointerSet(current, blkno, start);
|
|
|
|
/* remember which buffer we have pinned */
|
|
|
|
so->btso_curbuf = buf;
|
|
|
|
|
|
|
|
/*
|
2001-03-22 05:01:46 +01:00
|
|
|
* Left/rightmost page could be empty due to deletions, if so step
|
|
|
|
* till we find a nonempty page.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
if (start > maxoff)
|
|
|
|
{
|
|
|
|
if (!_bt_step(scan, &buf, dir))
|
2002-05-21 01:51:44 +02:00
|
|
|
return false;
|
2000-07-21 08:42:39 +02:00
|
|
|
start = ItemPointerGetOffsetNumber(current);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
|
|
|
|
itup = &(btitem->bti_itup);
|
|
|
|
|
2003-11-12 22:15:59 +01:00
|
|
|
/*
|
2004-08-29 07:07:03 +02:00
|
|
|
* Okay, we are on the first or last tuple. Does it pass all the
|
|
|
|
* quals?
|
2003-11-12 22:15:59 +01:00
|
|
|
*/
|
2000-07-25 06:47:59 +02:00
|
|
|
if (_bt_checkkeys(scan, itup, dir, &continuescan))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* yes, return it */
|
2002-05-21 01:51:44 +02:00
|
|
|
scan->xs_ctup.t_self = itup->t_tid;
|
|
|
|
res = true;
|
1996-12-06 10:41:45 +01:00
|
|
|
}
|
2000-07-25 06:47:59 +02:00
|
|
|
else if (continuescan)
|
1999-04-13 19:18:29 +02:00
|
|
|
{
|
2003-11-12 22:15:59 +01:00
|
|
|
/* no, but there might be another one that does */
|
2000-07-21 08:42:39 +02:00
|
|
|
res = _bt_next(scan, dir);
|
1999-04-13 19:18:29 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
1996-12-06 10:41:45 +01:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* no tuples in the index match this scan key */
|
1997-01-05 11:56:36 +01:00
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
so->btso_curbuf = InvalidBuffer;
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
2002-05-21 01:51:44 +02:00
|
|
|
res = false;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
return res;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|