1996-07-09 08:22:35 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* nbtsearch.c
|
1997-09-07 07:04:48 +02:00
|
|
|
* search code for postgres btrees.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
2001-01-24 20:43:33 +01:00
|
|
|
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
|
2000-01-26 06:58:53 +01:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
2001-10-07 01:21:45 +02:00
|
|
|
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.68 2001/10/06 23:21:43 tgl Exp $
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
|
1999-07-16 01:04:24 +02:00
|
|
|
#include "postgres.h"
|
1996-07-09 08:22:35 +02:00
|
|
|
|
1999-07-16 01:04:24 +02:00
|
|
|
#include "access/genam.h"
|
|
|
|
#include "access/nbtree.h"
|
1996-10-23 09:42:13 +02:00
|
|
|
|
1996-11-03 13:35:27 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
static RetrieveIndexResult _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
|
1996-07-09 08:22:35 +02:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* _bt_search() -- Search the tree for a particular scankey,
|
|
|
|
* or more precisely for the first leaf page it could be on.
|
|
|
|
*
|
|
|
|
* Return value is a stack of parent-page pointers. *bufP is set to the
|
|
|
|
* address of the leaf-page buffer, which is read-locked and pinned.
|
|
|
|
* No locks are held on the parent pages, however!
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* NOTE that the returned buffer is read-locked regardless of the access
|
|
|
|
* parameter. However, access = BT_WRITE will allow an empty root page
|
2001-03-22 05:01:46 +01:00
|
|
|
* to be created and returned. When access = BT_READ, an empty index
|
2000-07-21 08:42:39 +02:00
|
|
|
* will result in *bufP being set to InvalidBuffer.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
BTStack
|
2000-07-21 08:42:39 +02:00
|
|
|
_bt_search(Relation rel, int keysz, ScanKey scankey,
|
|
|
|
Buffer *bufP, int access)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2001-03-22 05:01:46 +01:00
|
|
|
BTStack stack_in = NULL;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Get the root page to start with */
|
|
|
|
*bufP = _bt_getroot(rel, access);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* If index is empty and access = BT_READ, no root page is created. */
|
2001-03-22 05:01:46 +01:00
|
|
|
if (!BufferIsValid(*bufP))
|
2000-07-21 08:42:39 +02:00
|
|
|
return (BTStack) NULL;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Loop iterates once per level descended in the tree */
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
OffsetNumber offnum;
|
|
|
|
ItemId itemid;
|
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
BlockNumber blkno;
|
|
|
|
BlockNumber par_blkno;
|
|
|
|
BTStack new_stack;
|
|
|
|
|
|
|
|
/* if this is a leaf page, we're done */
|
|
|
|
page = BufferGetPage(*bufP);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
if (P_ISLEAF(opaque))
|
|
|
|
break;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Find the appropriate item on the internal page, and get the
|
|
|
|
* child page that it points to.
|
|
|
|
*/
|
|
|
|
offnum = _bt_binsrch(rel, *bufP, keysz, scankey);
|
|
|
|
itemid = PageGetItemId(page, offnum);
|
|
|
|
btitem = (BTItem) PageGetItem(page, itemid);
|
|
|
|
itup = &(btitem->bti_itup);
|
|
|
|
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
|
|
|
|
par_blkno = BufferGetBlockNumber(*bufP);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
2001-03-22 05:01:46 +01:00
|
|
|
* We need to save the bit image of the index entry we chose in
|
|
|
|
* the parent page on a stack. In case we split the tree, we'll
|
|
|
|
* use this bit image to figure out what our real parent page is,
|
|
|
|
* in case the parent splits while we're working lower in the
|
|
|
|
* tree. See the paper by Lehman and Yao for how this is detected
|
|
|
|
* and handled. (We use the child link to disambiguate duplicate
|
|
|
|
* keys in the index -- Lehman and Yao disallow duplicate keys.)
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
new_stack = (BTStack) palloc(sizeof(BTStackData));
|
|
|
|
new_stack->bts_blkno = par_blkno;
|
|
|
|
new_stack->bts_offset = offnum;
|
|
|
|
memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));
|
|
|
|
new_stack->bts_parent = stack_in;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* drop the read lock on the parent page, acquire one on the child */
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, *bufP);
|
2000-07-21 08:42:39 +02:00
|
|
|
*bufP = _bt_getbuf(rel, blkno, BT_READ);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
2001-03-22 05:01:46 +01:00
|
|
|
* Race -- the page we just grabbed may have split since we read
|
|
|
|
* its pointer in the parent. If it has, we may need to move
|
|
|
|
* right to its new sibling. Do that.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
*bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* okay, all set to move down a level */
|
|
|
|
stack_in = new_stack;
|
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
return stack_in;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_moveright() -- move right in the btree if necessary.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* When we drop and reacquire a pointer to a page, it is possible that
|
|
|
|
* the page has changed in the meanwhile. If this happens, we're
|
|
|
|
* guaranteed that the page has "split right" -- that is, that any
|
|
|
|
* data that appeared on the page originally is either on the page
|
|
|
|
* or strictly to the right of it.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* This routine decides whether or not we need to move right in the
|
|
|
|
* tree by examining the high key entry on the page. If that entry
|
|
|
|
* is strictly less than one we expect to be on the page, then our
|
|
|
|
* picture of the page is incorrect and we need to move right.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* On entry, we have the buffer pinned and a lock of the proper type.
|
|
|
|
* If we move right, we release the buffer and lock and acquire the
|
2001-03-22 05:01:46 +01:00
|
|
|
* same on the right sibling. Return value is the buffer we stop at.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
Buffer
|
|
|
|
_bt_moveright(Relation rel,
|
1997-09-07 07:04:48 +02:00
|
|
|
Buffer buf,
|
|
|
|
int keysz,
|
|
|
|
ScanKey scankey,
|
|
|
|
int access)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* If the scan key that brought us to this page is > the high key
|
1997-09-07 07:04:48 +02:00
|
|
|
* stored on the page, then the page has split and we need to move
|
2000-07-21 08:42:39 +02:00
|
|
|
* right. (If the scan key is equal to the high key, we might or
|
|
|
|
* might not need to move right; have to scan the page first anyway.)
|
|
|
|
* It could even have split more than once, so scan as far as needed.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
while (!P_RIGHTMOST(opaque) &&
|
|
|
|
_bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)
|
1997-04-16 03:48:29 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* step right one page */
|
2001-03-22 05:01:46 +01:00
|
|
|
BlockNumber rblkno = opaque->btpo_next;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
2000-07-21 08:42:39 +02:00
|
|
|
buf = _bt_getbuf(rel, rblkno, access);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
return buf;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* _bt_binsrch() -- Do a binary search for a key on a particular page.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* The scankey we get has the compare function stored in the procedure
|
|
|
|
* entry of each data struct. We invoke this regproc to do the
|
|
|
|
* comparison for every key in the scankey.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
|
|
|
|
* key >= given scankey. (NOTE: in particular, this means it is possible
|
|
|
|
* to return a value 1 greater than the number of keys on the page,
|
|
|
|
* if the scankey is > all keys on the page.)
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
|
|
|
|
* of the last key < given scankey. (Since _bt_compare treats the first
|
|
|
|
* data key of such a page as minus infinity, there will be at least one
|
|
|
|
* key < scankey, so the result always points at one of the keys on the
|
|
|
|
* page.) This key indicates the right place to descend to be sure we
|
|
|
|
* find all leaf keys >= given scankey.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* This procedure is not responsible for walking right, it just examines
|
2001-03-22 05:01:46 +01:00
|
|
|
* the given page. _bt_binsrch() has no lock or refcount side effects
|
2000-07-21 08:42:39 +02:00
|
|
|
* on the buffer.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
OffsetNumber
|
|
|
|
_bt_binsrch(Relation rel,
|
1997-09-07 07:04:48 +02:00
|
|
|
Buffer buf,
|
|
|
|
int keysz,
|
2000-07-21 08:42:39 +02:00
|
|
|
ScanKey scankey)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
TupleDesc itupdesc;
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
OffsetNumber low,
|
|
|
|
high;
|
2000-05-30 06:25:00 +02:00
|
|
|
int32 result;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
itupdesc = RelationGetDescr(rel);
|
1997-09-07 07:04:48 +02:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
low = P_FIRSTDATAKEY(opaque);
|
1997-09-07 07:04:48 +02:00
|
|
|
high = PageGetMaxOffsetNumber(page);
|
|
|
|
|
|
|
|
/*
|
1999-07-17 00:17:06 +02:00
|
|
|
* If there are no keys on the page, return the first available slot.
|
2000-04-12 19:17:23 +02:00
|
|
|
* Note this covers two cases: the page is really empty (no keys), or
|
|
|
|
* it contains only a high key. The latter case is possible after
|
2000-07-21 08:42:39 +02:00
|
|
|
* vacuuming. This can never happen on an internal page, however,
|
|
|
|
* since they are never empty (an internal page must have children).
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
1999-07-17 00:17:06 +02:00
|
|
|
if (high < low)
|
1998-09-01 05:29:17 +02:00
|
|
|
return low;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1999-07-17 00:17:06 +02:00
|
|
|
/*
|
2000-04-12 19:17:23 +02:00
|
|
|
* Binary search to find the first key on the page >= scan key. Loop
|
|
|
|
* invariant: all slots before 'low' are < scan key, all slots at or
|
2000-07-21 08:42:39 +02:00
|
|
|
* after 'high' are >= scan key. We can fall out when high == low.
|
1999-07-17 00:17:06 +02:00
|
|
|
*/
|
|
|
|
high++; /* establish the loop invariant for high */
|
|
|
|
|
|
|
|
while (high > low)
|
1997-04-16 03:48:29 +02:00
|
|
|
{
|
2000-04-12 19:17:23 +02:00
|
|
|
OffsetNumber mid = low + ((high - low) / 2);
|
|
|
|
|
1999-07-17 00:17:06 +02:00
|
|
|
/* We have low <= mid < high, so mid points at a real slot */
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
result = _bt_compare(rel, keysz, scankey, page, mid);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
if (result > 0)
|
1999-07-17 00:17:06 +02:00
|
|
|
low = mid + 1;
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
1999-07-17 00:17:06 +02:00
|
|
|
high = mid;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
1999-07-17 00:17:06 +02:00
|
|
|
* At this point we have high == low, but be careful: they could point
|
2000-07-21 08:42:39 +02:00
|
|
|
* past the last slot on the page.
|
1997-09-07 07:04:48 +02:00
|
|
|
*
|
2001-03-22 07:16:21 +01:00
|
|
|
* On a leaf page, we always return the first key >= scan key (which
|
|
|
|
* could be the last slot + 1).
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
if (P_ISLEAF(opaque))
|
1999-07-17 00:17:06 +02:00
|
|
|
return low;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
|
|
|
* On a non-leaf page, return the last key < scan key. There must be
|
|
|
|
* one if _bt_compare() is playing by the rules.
|
1999-07-17 00:17:06 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
Assert(low > P_FIRSTDATAKEY(opaque));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1999-07-17 00:17:06 +02:00
|
|
|
return OffsetNumberPrev(low);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*----------
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_compare() -- Compare scankey to a particular tuple on the page.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* keysz: number of key conditions to be checked (might be less than the
|
|
|
|
* total length of the scan key!)
|
|
|
|
* page/offnum: location of btree item to be compared to.
|
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* This routine returns:
|
2000-05-30 06:25:00 +02:00
|
|
|
* <0 if scankey < tuple at offnum;
|
1997-09-07 07:04:48 +02:00
|
|
|
* 0 if scankey == tuple at offnum;
|
2000-05-30 06:25:00 +02:00
|
|
|
* >0 if scankey > tuple at offnum.
|
2000-07-21 08:42:39 +02:00
|
|
|
* NULLs in the keys are treated as sortable values. Therefore
|
|
|
|
* "equality" does not necessarily mean that the item should be
|
|
|
|
* returned to the caller as a matching key!
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
|
|
|
|
* "minus infinity": this routine will always claim it is less than the
|
|
|
|
* scankey. The actual key value stored (if any, which there probably isn't)
|
|
|
|
* does not matter. This convention allows us to implement the Lehman and
|
|
|
|
* Yao convention that the first down-link pointer is before the first key.
|
|
|
|
* See backend/access/nbtree/README for details.
|
|
|
|
*----------
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
int32
|
1996-07-09 08:22:35 +02:00
|
|
|
_bt_compare(Relation rel,
|
1997-09-07 07:04:48 +02:00
|
|
|
int keysz,
|
|
|
|
ScanKey scankey,
|
2000-07-21 08:42:39 +02:00
|
|
|
Page page,
|
1997-09-07 07:04:48 +02:00
|
|
|
OffsetNumber offnum)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
TupleDesc itupdesc = RelationGetDescr(rel);
|
|
|
|
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
1997-09-08 04:41:22 +02:00
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
int i;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Force result ">" if target item is first data item on an internal
|
|
|
|
* page --- see NOTE above.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
|
1998-09-01 05:29:17 +02:00
|
|
|
return 1;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
|
|
itup = &(btitem->bti_itup);
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* The scan key is set up with the attribute number associated with
|
|
|
|
* each term in the key. It is important that, if the index is
|
|
|
|
* multi-key, the scan contain the first k key attributes, and that
|
|
|
|
* they be in order. If you think about how multi-key ordering works,
|
|
|
|
* you'll understand why this is.
|
1996-12-06 10:41:45 +01:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* We don't test for violation of this condition here, however. The
|
|
|
|
* initial setup for the index scan had better have gotten it right
|
|
|
|
* (see _bt_first).
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
for (i = 0; i < keysz; i++)
|
1997-03-24 09:48:16 +01:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
ScanKey entry = &scankey[i];
|
|
|
|
Datum datum;
|
|
|
|
bool isNull;
|
|
|
|
int32 result;
|
|
|
|
|
|
|
|
datum = index_getattr(itup, entry->sk_attno, itupdesc, &isNull);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/* see comments about NULLs handling in btbuild */
|
2001-03-22 05:01:46 +01:00
|
|
|
if (entry->sk_flags & SK_ISNULL) /* key is NULL */
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
if (isNull)
|
2000-05-30 06:25:00 +02:00
|
|
|
result = 0; /* NULL "=" NULL */
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
2000-05-30 06:25:00 +02:00
|
|
|
result = 1; /* NULL ">" NOT_NULL */
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
else if (isNull) /* key is NOT_NULL and item is NULL */
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-05-30 06:25:00 +02:00
|
|
|
result = -1; /* NOT_NULL "<" NULL */
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
else
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
2000-05-30 06:25:00 +02:00
|
|
|
result = DatumGetInt32(FunctionCall2(&entry->sk_func,
|
2000-07-21 08:42:39 +02:00
|
|
|
entry->sk_argument,
|
|
|
|
datum));
|
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/* if the keys are unequal, return the difference */
|
|
|
|
if (result != 0)
|
1998-09-01 05:29:17 +02:00
|
|
|
return result;
|
1996-10-30 07:08:10 +01:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* if we get here, the keys are equal */
|
1998-09-01 05:29:17 +02:00
|
|
|
return 0;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_next() -- Get the next item in a scan.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* On entry, we have a valid currentItemData in the scan, and a
|
2000-07-21 08:42:39 +02:00
|
|
|
* read lock and pin count on the page that contains that item.
|
|
|
|
* We return the next item in the scan, or NULL if no more.
|
|
|
|
* On successful exit, the page containing the new item is locked
|
|
|
|
* and pinned; on NULL exit, no lock or pin is held.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
RetrieveIndexResult
|
|
|
|
_bt_next(IndexScanDesc scan, ScanDirection dir)
|
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
Relation rel;
|
|
|
|
Buffer buf;
|
|
|
|
Page page;
|
|
|
|
OffsetNumber offnum;
|
|
|
|
ItemPointer current;
|
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
BTScanOpaque so;
|
2000-07-25 06:47:59 +02:00
|
|
|
bool continuescan;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
rel = scan->relation;
|
|
|
|
so = (BTScanOpaque) scan->opaque;
|
|
|
|
current = &(scan->currentItemData);
|
|
|
|
|
|
|
|
/* we still have the buffer pinned and locked */
|
|
|
|
buf = so->btso_curbuf;
|
2000-07-21 08:42:39 +02:00
|
|
|
Assert(BufferIsValid(buf));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
do
|
|
|
|
{
|
|
|
|
/* step one tuple in the appropriate direction */
|
|
|
|
if (!_bt_step(scan, &buf, dir))
|
1998-09-01 05:29:17 +02:00
|
|
|
return (RetrieveIndexResult) NULL;
|
1997-03-18 19:41:37 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* current is the next candidate tuple to return */
|
1997-09-07 07:04:48 +02:00
|
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
|
|
itup = &btitem->bti_itup;
|
|
|
|
|
2000-07-25 06:47:59 +02:00
|
|
|
if (_bt_checkkeys(scan, itup, dir, &continuescan))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* tuple passes all scan key conditions, so return it */
|
|
|
|
return FormRetrieveIndexResult(current, &(itup->t_tid));
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* This tuple doesn't pass, but there might be more that do */
|
2000-07-25 06:47:59 +02:00
|
|
|
} while (continuescan);
|
1997-03-18 19:41:37 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* No more items, so close down the current-item info */
|
1997-09-07 07:04:48 +02:00
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
so->btso_curbuf = InvalidBuffer;
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
return (RetrieveIndexResult) NULL;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_first() -- Find the first item in a scan.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* We need to be clever about the type of scan, the operation it's
|
|
|
|
* performing, and the tree ordering. We return the RetrieveIndexResult
|
|
|
|
* of the first item in the tree that satisfies the qualification
|
|
|
|
* associated with the scan descriptor. On exit, the page containing
|
|
|
|
* the current index tuple is read locked and pinned, and the scan's
|
|
|
|
* opaque data entry is updated to include the buffer.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
RetrieveIndexResult
|
|
|
|
_bt_first(IndexScanDesc scan, ScanDirection dir)
|
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
Relation rel;
|
|
|
|
Buffer buf;
|
|
|
|
Page page;
|
|
|
|
BTStack stack;
|
2000-07-21 08:42:39 +02:00
|
|
|
OffsetNumber offnum;
|
1997-09-08 04:41:22 +02:00
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
ItemPointer current;
|
|
|
|
BlockNumber blkno;
|
|
|
|
StrategyNumber strat;
|
1997-09-07 07:04:48 +02:00
|
|
|
RetrieveIndexResult res;
|
2000-05-30 06:25:00 +02:00
|
|
|
int32 result;
|
1997-09-08 04:41:22 +02:00
|
|
|
BTScanOpaque so;
|
2000-07-25 06:47:59 +02:00
|
|
|
bool continuescan;
|
|
|
|
ScanKey scankeys = NULL;
|
2000-04-12 19:17:23 +02:00
|
|
|
int keysCount = 0;
|
2000-07-25 06:47:59 +02:00
|
|
|
int *nKeyIs = NULL;
|
2000-04-12 19:17:23 +02:00
|
|
|
int i,
|
|
|
|
j;
|
|
|
|
StrategyNumber strat_total;
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
rel = scan->relation;
|
|
|
|
so = (BTScanOpaque) scan->opaque;
|
|
|
|
|
|
|
|
/*
|
2000-07-25 06:47:59 +02:00
|
|
|
* Order the scan keys in our canonical fashion and eliminate any
|
|
|
|
* redundant keys.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-25 06:47:59 +02:00
|
|
|
_bt_orderkeys(rel, so);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-25 06:47:59 +02:00
|
|
|
/*
|
2001-03-22 05:01:46 +01:00
|
|
|
* Quit now if _bt_orderkeys() discovered that the scan keys can never
|
|
|
|
* be satisfied (eg, x == 1 AND x > 2).
|
2000-07-25 06:47:59 +02:00
|
|
|
*/
|
2001-03-22 05:01:46 +01:00
|
|
|
if (!so->qual_ok)
|
2000-07-25 06:47:59 +02:00
|
|
|
return (RetrieveIndexResult) NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Examine the scan keys to discover where we need to start the scan.
|
|
|
|
*/
|
|
|
|
scan->scanFromEnd = false;
|
1999-09-27 20:20:21 +02:00
|
|
|
strat_total = BTEqualStrategyNumber;
|
2000-07-25 06:47:59 +02:00
|
|
|
if (so->numberOfKeys > 0)
|
1999-09-27 20:20:21 +02:00
|
|
|
{
|
2000-04-12 19:17:23 +02:00
|
|
|
nKeyIs = (int *) palloc(so->numberOfKeys * sizeof(int));
|
|
|
|
for (i = 0; i < so->numberOfKeys; i++)
|
1999-04-13 19:18:29 +02:00
|
|
|
{
|
2000-07-25 06:47:59 +02:00
|
|
|
AttrNumber attno = so->keyData[i].sk_attno;
|
|
|
|
|
|
|
|
/* ignore keys for already-determined attrs */
|
|
|
|
if (attno <= keysCount)
|
1999-09-27 20:20:21 +02:00
|
|
|
continue;
|
2000-07-25 06:47:59 +02:00
|
|
|
/* if we didn't find a boundary for the preceding attr, quit */
|
1999-09-27 20:20:21 +02:00
|
|
|
if (attno > keysCount + 1)
|
|
|
|
break;
|
|
|
|
strat = _bt_getstrat(rel, attno,
|
2000-04-12 19:17:23 +02:00
|
|
|
so->keyData[i].sk_procedure);
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2000-07-25 06:47:59 +02:00
|
|
|
/*
|
|
|
|
* Can we use this key as a starting boundary for this attr?
|
|
|
|
*
|
2001-03-22 05:01:46 +01:00
|
|
|
* We can use multiple keys if they look like, say, = >= = but we
|
|
|
|
* have to stop after accepting a > or < boundary.
|
2000-07-25 06:47:59 +02:00
|
|
|
*/
|
1999-09-27 20:20:21 +02:00
|
|
|
if (strat == strat_total ||
|
2000-04-12 19:17:23 +02:00
|
|
|
strat == BTEqualStrategyNumber)
|
1999-09-27 20:20:21 +02:00
|
|
|
nKeyIs[keysCount++] = i;
|
2000-07-25 06:47:59 +02:00
|
|
|
else if (ScanDirectionIsBackward(dir) &&
|
|
|
|
(strat == BTLessStrategyNumber ||
|
|
|
|
strat == BTLessEqualStrategyNumber))
|
1999-09-27 20:20:21 +02:00
|
|
|
{
|
|
|
|
nKeyIs[keysCount++] = i;
|
|
|
|
strat_total = strat;
|
|
|
|
if (strat == BTLessStrategyNumber)
|
1999-04-13 19:18:29 +02:00
|
|
|
break;
|
1999-09-27 20:20:21 +02:00
|
|
|
}
|
2000-07-25 06:47:59 +02:00
|
|
|
else if (ScanDirectionIsForward(dir) &&
|
|
|
|
(strat == BTGreaterStrategyNumber ||
|
|
|
|
strat == BTGreaterEqualStrategyNumber))
|
1999-09-27 20:20:21 +02:00
|
|
|
{
|
|
|
|
nKeyIs[keysCount++] = i;
|
|
|
|
strat_total = strat;
|
|
|
|
if (strat == BTGreaterStrategyNumber)
|
1999-04-13 19:18:29 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2000-07-25 06:47:59 +02:00
|
|
|
if (keysCount == 0)
|
1997-09-07 07:04:48 +02:00
|
|
|
scan->scanFromEnd = true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
scan->scanFromEnd = true;
|
|
|
|
|
|
|
|
/* if we just need to walk down one edge of the tree, do that */
|
|
|
|
if (scan->scanFromEnd)
|
1999-09-27 20:20:21 +02:00
|
|
|
{
|
|
|
|
if (nKeyIs)
|
|
|
|
pfree(nKeyIs);
|
1998-09-01 05:29:17 +02:00
|
|
|
return _bt_endpoint(scan, dir);
|
1999-09-27 20:20:21 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/*
|
2000-07-25 06:47:59 +02:00
|
|
|
* We want to start the scan somewhere within the index. Set up a
|
|
|
|
* scankey we can use to search for the correct starting point.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-04-12 19:17:23 +02:00
|
|
|
scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData));
|
|
|
|
for (i = 0; i < keysCount; i++)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2001-10-07 01:21:45 +02:00
|
|
|
FmgrInfo *procinfo;
|
|
|
|
|
1999-09-27 20:20:21 +02:00
|
|
|
j = nKeyIs[i];
|
2001-03-22 05:01:46 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* _bt_orderkeys disallows it, but this is the place to add some code
|
|
|
|
* later
|
|
|
|
*/
|
1999-09-27 20:20:21 +02:00
|
|
|
if (so->keyData[j].sk_flags & SK_ISNULL)
|
|
|
|
{
|
|
|
|
pfree(nKeyIs);
|
|
|
|
pfree(scankeys);
|
|
|
|
elog(ERROR, "_bt_first: btree doesn't support is(not)null, yet");
|
|
|
|
return ((RetrieveIndexResult) NULL);
|
2000-04-12 19:17:23 +02:00
|
|
|
}
|
2001-10-07 01:21:45 +02:00
|
|
|
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
|
|
|
|
ScanKeyEntryInitializeWithInfo(scankeys + i,
|
|
|
|
so->keyData[j].sk_flags,
|
|
|
|
i + 1,
|
|
|
|
procinfo,
|
|
|
|
CurrentMemoryContext,
|
|
|
|
so->keyData[j].sk_argument);
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-04-12 19:17:23 +02:00
|
|
|
if (nKeyIs)
|
|
|
|
pfree(nKeyIs);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
current = &(scan->currentItemData);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Use the manufactured scan key to descend the tree and position
|
|
|
|
* ourselves on the target leaf page.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
stack = _bt_search(rel, keysCount, scankeys, &buf, BT_READ);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* don't need to keep the stack around... */
|
|
|
|
_bt_freestack(stack);
|
|
|
|
|
2001-03-22 05:01:46 +01:00
|
|
|
if (!BufferIsValid(buf))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Only get here if index is completely empty */
|
1997-09-07 07:04:48 +02:00
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
so->btso_curbuf = InvalidBuffer;
|
1999-09-27 20:20:21 +02:00
|
|
|
pfree(scankeys);
|
1998-09-01 05:29:17 +02:00
|
|
|
return (RetrieveIndexResult) NULL;
|
1997-05-30 20:35:40 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* remember which buffer we have pinned */
|
|
|
|
so->btso_curbuf = buf;
|
|
|
|
blkno = BufferGetBlockNumber(buf);
|
|
|
|
page = BufferGetPage(buf);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-25 06:47:59 +02:00
|
|
|
/* position to the precise item on the page */
|
2000-07-21 08:42:39 +02:00
|
|
|
offnum = _bt_binsrch(rel, buf, keysCount, scankeys);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
ItemPointerSet(current, blkno, offnum);
|
|
|
|
|
2001-03-22 07:16:21 +01:00
|
|
|
/*
|
|
|
|
* At this point we are positioned at the first item >= scan key, or
|
2001-03-23 05:49:58 +01:00
|
|
|
* possibly at the end of a page on which all the existing items are
|
|
|
|
* less than the scan key and we know that everything on later pages
|
|
|
|
* is greater than or equal to scan key.
|
|
|
|
*
|
2001-03-22 07:16:21 +01:00
|
|
|
* We could step forward in the latter case, but that'd be a waste of
|
|
|
|
* time if we want to scan backwards. So, it's now time to examine
|
|
|
|
* the scan strategy to find the exact place to start the scan.
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
2001-03-22 07:16:21 +01:00
|
|
|
* Note: if _bt_step fails (meaning we fell off the end of the index in
|
|
|
|
* one direction or the other), we either return NULL (no matches) or
|
|
|
|
* call _bt_endpoint() to set up a scan starting at that index
|
|
|
|
* endpoint, as appropriate for the desired scan type.
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
2000-07-25 06:47:59 +02:00
|
|
|
* it's yet other place to add some code later for is(not)null ...
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
switch (strat_total)
|
1997-05-30 20:35:40 +02:00
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
case BTLessStrategyNumber:
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Back up one to arrive at last item < scankey
|
|
|
|
*/
|
|
|
|
if (!_bt_step(scan, &buf, BackwardScanDirection))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
pfree(scankeys);
|
|
|
|
return (RetrieveIndexResult) NULL;
|
1997-09-08 04:41:22 +02:00
|
|
|
}
|
|
|
|
break;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1997-09-08 04:41:22 +02:00
|
|
|
case BTLessEqualStrategyNumber:
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* We need to find the last item <= scankey, so step forward
|
|
|
|
* till we find one > scankey, then step back one.
|
|
|
|
*/
|
|
|
|
if (offnum > PageGetMaxOffsetNumber(page))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
1997-09-08 04:41:22 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
pfree(scankeys);
|
|
|
|
return _bt_endpoint(scan, dir);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
|
|
|
|
if (result < 0)
|
|
|
|
break;
|
|
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
|
|
{
|
|
|
|
pfree(scankeys);
|
|
|
|
return _bt_endpoint(scan, dir);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!_bt_step(scan, &buf, BackwardScanDirection))
|
|
|
|
{
|
|
|
|
pfree(scankeys);
|
|
|
|
return (RetrieveIndexResult) NULL;
|
1997-09-08 04:41:22 +02:00
|
|
|
}
|
|
|
|
break;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1997-09-08 04:41:22 +02:00
|
|
|
case BTEqualStrategyNumber:
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
2001-03-22 05:01:46 +01:00
|
|
|
* Make sure we are on the first equal item; might have to
|
|
|
|
* step forward if currently at end of page.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
if (offnum > PageGetMaxOffsetNumber(page))
|
1997-09-08 04:41:22 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
|
|
{
|
|
|
|
pfree(scankeys);
|
|
|
|
return (RetrieveIndexResult) NULL;
|
|
|
|
}
|
|
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
|
|
page = BufferGetPage(buf);
|
1997-09-08 04:41:22 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
|
|
|
|
if (result != 0)
|
2001-03-22 05:01:46 +01:00
|
|
|
goto nomatches; /* no equal items! */
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* If a backward scan was specified, need to start with last
|
|
|
|
* equal item not first one.
|
|
|
|
*/
|
|
|
|
if (ScanDirectionIsBackward(dir))
|
1999-04-13 19:18:29 +02:00
|
|
|
{
|
|
|
|
do
|
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
|
|
{
|
|
|
|
pfree(scankeys);
|
|
|
|
return _bt_endpoint(scan, dir);
|
|
|
|
}
|
1999-04-13 19:18:29 +02:00
|
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
|
|
page = BufferGetPage(buf);
|
2000-07-21 08:42:39 +02:00
|
|
|
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
|
1999-04-13 19:18:29 +02:00
|
|
|
} while (result == 0);
|
2000-07-21 08:42:39 +02:00
|
|
|
if (!_bt_step(scan, &buf, BackwardScanDirection))
|
|
|
|
elog(ERROR, "_bt_first: equal items disappeared?");
|
1999-04-13 19:18:29 +02:00
|
|
|
}
|
1997-09-08 04:41:22 +02:00
|
|
|
break;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1997-09-08 04:41:22 +02:00
|
|
|
case BTGreaterEqualStrategyNumber:
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* We want the first item >= scankey, which is where we are...
|
|
|
|
* unless we're not anywhere at all...
|
|
|
|
*/
|
|
|
|
if (offnum > PageGetMaxOffsetNumber(page))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
pfree(scankeys);
|
|
|
|
return (RetrieveIndexResult) NULL;
|
1997-09-08 04:41:22 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
1997-09-08 04:41:22 +02:00
|
|
|
break;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1997-09-08 04:41:22 +02:00
|
|
|
case BTGreaterStrategyNumber:
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
2001-03-22 05:01:46 +01:00
|
|
|
* We want the first item > scankey, so make sure we are on an
|
|
|
|
* item and then step over any equal items.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
if (offnum > PageGetMaxOffsetNumber(page))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
1997-09-08 04:41:22 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
pfree(scankeys);
|
|
|
|
return (RetrieveIndexResult) NULL;
|
|
|
|
}
|
|
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
}
|
|
|
|
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
|
|
|
|
while (result == 0)
|
|
|
|
{
|
|
|
|
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
|
|
{
|
|
|
|
pfree(scankeys);
|
|
|
|
return (RetrieveIndexResult) NULL;
|
|
|
|
}
|
|
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
result = _bt_compare(rel, keysCount, scankeys, page, offnum);
|
1997-09-08 04:41:22 +02:00
|
|
|
}
|
|
|
|
break;
|
1997-05-30 20:35:40 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/* okay, current item pointer for the scan is right */
|
|
|
|
offnum = ItemPointerGetOffsetNumber(current);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
|
|
itup = &btitem->bti_itup;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* is the first item actually acceptable? */
|
2000-07-25 06:47:59 +02:00
|
|
|
if (_bt_checkkeys(scan, itup, dir, &continuescan))
|
1997-05-30 20:35:40 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* yes, return it */
|
1997-09-07 07:04:48 +02:00
|
|
|
res = FormRetrieveIndexResult(current, &(itup->t_tid));
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
2000-07-25 06:47:59 +02:00
|
|
|
else if (continuescan)
|
1999-04-13 19:18:29 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* no, but there might be another one that is */
|
|
|
|
res = _bt_next(scan, dir);
|
1999-04-13 19:18:29 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* no tuples in the index match this scan key */
|
|
|
|
nomatches:
|
1997-09-07 07:04:48 +02:00
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
so->btso_curbuf = InvalidBuffer;
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
1997-09-07 07:04:48 +02:00
|
|
|
res = (RetrieveIndexResult) NULL;
|
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
pfree(scankeys);
|
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
return res;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_step() -- Step one item in the requested direction in a scan on
|
|
|
|
* the tree.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* *bufP is the current buffer (read-locked and pinned). If we change
|
|
|
|
* pages, it's updated appropriately.
|
|
|
|
*
|
|
|
|
* If successful, update scan's currentItemData and return true.
|
|
|
|
* If no adjacent record exists in the requested direction,
|
|
|
|
* release buffer pin/locks and return false.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
bool
|
1997-09-08 22:59:27 +02:00
|
|
|
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
Relation rel = scan->relation;
|
|
|
|
ItemPointer current = &(scan->currentItemData);
|
|
|
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
1997-09-08 04:41:22 +02:00
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
OffsetNumber offnum,
|
|
|
|
maxoff;
|
|
|
|
BlockNumber blkno;
|
|
|
|
BlockNumber obknum;
|
1999-05-25 18:15:34 +02:00
|
|
|
|
1999-03-28 22:32:42 +02:00
|
|
|
/*
|
1999-05-25 18:15:34 +02:00
|
|
|
* Don't use ItemPointerGetOffsetNumber or you risk to get assertion
|
|
|
|
* due to ability of ip_posid to be equal 0.
|
1999-03-28 22:32:42 +02:00
|
|
|
*/
|
|
|
|
offnum = current->ip_posid;
|
2000-07-21 08:42:39 +02:00
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
page = BufferGetPage(*bufP);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
|
|
|
|
if (ScanDirectionIsForward(dir))
|
|
|
|
{
|
|
|
|
if (!PageIsEmpty(page) && offnum < maxoff)
|
|
|
|
offnum = OffsetNumberNext(offnum);
|
|
|
|
else
|
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* walk right to the next page with data */
|
|
|
|
for (;;)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* if we're at end of scan, release the buffer and return */
|
|
|
|
if (P_RIGHTMOST(opaque))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, *bufP);
|
2000-07-21 08:42:39 +02:00
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
*bufP = so->btso_curbuf = InvalidBuffer;
|
|
|
|
return false;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
/* step right one page */
|
|
|
|
blkno = opaque->btpo_next;
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, *bufP);
|
2000-07-21 08:42:39 +02:00
|
|
|
*bufP = _bt_getbuf(rel, blkno, BT_READ);
|
|
|
|
page = BufferGetPage(*bufP);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
/* done if it's not empty */
|
|
|
|
offnum = P_FIRSTDATAKEY(opaque);
|
|
|
|
if (!PageIsEmpty(page) && offnum <= maxoff)
|
|
|
|
break;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
else
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
if (offnum > P_FIRSTDATAKEY(opaque))
|
1997-09-07 07:04:48 +02:00
|
|
|
offnum = OffsetNumberPrev(offnum);
|
|
|
|
else
|
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* walk left to the next page with data */
|
|
|
|
for (;;)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* if we're at end of scan, release the buffer and return */
|
|
|
|
if (P_LEFTMOST(opaque))
|
|
|
|
{
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, *bufP);
|
2000-07-21 08:42:39 +02:00
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
*bufP = so->btso_curbuf = InvalidBuffer;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
/* step left */
|
1997-09-07 07:04:48 +02:00
|
|
|
obknum = BufferGetBlockNumber(*bufP);
|
2000-07-21 08:42:39 +02:00
|
|
|
blkno = opaque->btpo_prev;
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, *bufP);
|
2000-07-21 08:42:39 +02:00
|
|
|
*bufP = _bt_getbuf(rel, blkno, BT_READ);
|
|
|
|
page = BufferGetPage(*bufP);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
2001-03-22 05:01:46 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* If the adjacent page just split, then we have to walk
|
2001-03-22 05:01:46 +01:00
|
|
|
* right to find the block that's now adjacent to where we
|
|
|
|
* were. Because pages only split right, we don't have to
|
|
|
|
* worry about this failing to terminate.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
while (opaque->btpo_next != obknum)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
blkno = opaque->btpo_next;
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, *bufP);
|
1997-09-07 07:04:48 +02:00
|
|
|
*bufP = _bt_getbuf(rel, blkno, BT_READ);
|
|
|
|
page = BufferGetPage(*bufP);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
/* done if it's not empty */
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
offnum = maxoff;
|
|
|
|
if (!PageIsEmpty(page) && maxoff >= P_FIRSTDATAKEY(opaque))
|
|
|
|
break;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
|
|
|
|
/* Update scan state */
|
1997-09-07 07:04:48 +02:00
|
|
|
so->btso_curbuf = *bufP;
|
2000-07-21 08:42:39 +02:00
|
|
|
blkno = BufferGetBlockNumber(*bufP);
|
1997-09-07 07:04:48 +02:00
|
|
|
ItemPointerSet(current, blkno, offnum);
|
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
return true;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_endpoint() -- Find the first or last key in the index.
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
|
|
|
* This is used by _bt_first() to set up a scan when we've determined
|
|
|
|
* that the scan must start at the beginning or end of the index (for
|
|
|
|
* a forward or backward scan respectively).
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
1997-09-08 04:41:22 +02:00
|
|
|
static RetrieveIndexResult
|
1996-07-09 08:22:35 +02:00
|
|
|
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
|
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
Relation rel;
|
|
|
|
Buffer buf;
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
ItemPointer current;
|
|
|
|
OffsetNumber offnum,
|
|
|
|
maxoff;
|
2000-07-21 08:42:39 +02:00
|
|
|
OffsetNumber start;
|
1997-09-08 04:41:22 +02:00
|
|
|
BlockNumber blkno;
|
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
BTScanOpaque so;
|
1997-09-07 07:04:48 +02:00
|
|
|
RetrieveIndexResult res;
|
2000-07-25 06:47:59 +02:00
|
|
|
bool continuescan;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
rel = scan->relation;
|
|
|
|
current = &(scan->currentItemData);
|
|
|
|
so = (BTScanOpaque) scan->opaque;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Scan down to the leftmost or rightmost leaf page. This is a
|
2001-03-22 05:01:46 +01:00
|
|
|
* simplified version of _bt_search(). We don't maintain a stack
|
2000-07-21 08:42:39 +02:00
|
|
|
* since we know we won't need it.
|
|
|
|
*/
|
1997-09-07 07:04:48 +02:00
|
|
|
buf = _bt_getroot(rel, BT_READ);
|
2000-07-21 08:42:39 +02:00
|
|
|
|
2001-03-22 05:01:46 +01:00
|
|
|
if (!BufferIsValid(buf))
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
|
|
|
/* empty index... */
|
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
so->btso_curbuf = InvalidBuffer;
|
|
|
|
return (RetrieveIndexResult) NULL;
|
|
|
|
}
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
blkno = BufferGetBlockNumber(buf);
|
1996-07-09 08:22:35 +02:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
for (;;)
|
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
if (P_ISLEAF(opaque))
|
1997-09-07 07:04:48 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
if (ScanDirectionIsForward(dir))
|
2000-07-21 08:42:39 +02:00
|
|
|
offnum = P_FIRSTDATAKEY(opaque);
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
|
|
|
offnum = PageGetMaxOffsetNumber(page);
|
|
|
|
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
|
|
itup = &(btitem->bti_itup);
|
|
|
|
blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
|
|
|
|
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
1996-07-09 08:22:35 +02:00
|
|
|
buf = _bt_getbuf(rel, blkno, BT_READ);
|
2000-07-21 08:42:39 +02:00
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Race condition: If the child page we just stepped onto was just
|
|
|
|
* split, we need to make sure we're all the way at the right edge
|
|
|
|
* of the tree. See the paper by Lehman and Yao.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
|
|
|
if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque))
|
|
|
|
{
|
|
|
|
do
|
|
|
|
{
|
|
|
|
blkno = opaque->btpo_next;
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
1997-09-07 07:04:48 +02:00
|
|
|
buf = _bt_getbuf(rel, blkno, BT_READ);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
} while (!P_RIGHTMOST(opaque));
|
|
|
|
}
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/* okay, we've got the {left,right}-most page in the tree */
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
|
|
|
|
if (ScanDirectionIsForward(dir))
|
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
Assert(P_LEFTMOST(opaque));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
start = P_FIRSTDATAKEY(opaque);
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
else if (ScanDirectionIsBackward(dir))
|
1996-12-06 10:41:45 +01:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
Assert(P_RIGHTMOST(opaque));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
start = PageGetMaxOffsetNumber(page);
|
2001-03-22 05:01:46 +01:00
|
|
|
if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty
|
|
|
|
* page */
|
2000-07-21 08:42:39 +02:00
|
|
|
start = P_FIRSTDATAKEY(opaque);
|
1996-12-06 10:41:45 +01:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
1998-01-07 22:07:04 +01:00
|
|
|
elog(ERROR, "Illegal scan direction %d", dir);
|
2000-07-21 08:42:39 +02:00
|
|
|
start = 0; /* keep compiler quiet */
|
|
|
|
}
|
|
|
|
|
|
|
|
ItemPointerSet(current, blkno, start);
|
|
|
|
/* remember which buffer we have pinned */
|
|
|
|
so->btso_curbuf = buf;
|
|
|
|
|
|
|
|
/*
|
2001-03-22 05:01:46 +01:00
|
|
|
* Left/rightmost page could be empty due to deletions, if so step
|
|
|
|
* till we find a nonempty page.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
if (start > maxoff)
|
|
|
|
{
|
|
|
|
if (!_bt_step(scan, &buf, dir))
|
|
|
|
return (RetrieveIndexResult) NULL;
|
|
|
|
start = ItemPointerGetOffsetNumber(current);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
|
|
|
|
itup = &(btitem->bti_itup);
|
|
|
|
|
|
|
|
/* see if we picked a winner */
|
2000-07-25 06:47:59 +02:00
|
|
|
if (_bt_checkkeys(scan, itup, dir, &continuescan))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* yes, return it */
|
1997-09-07 07:04:48 +02:00
|
|
|
res = FormRetrieveIndexResult(current, &(itup->t_tid));
|
1996-12-06 10:41:45 +01:00
|
|
|
}
|
2000-07-25 06:47:59 +02:00
|
|
|
else if (continuescan)
|
1999-04-13 19:18:29 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* no, but there might be another one that is */
|
|
|
|
res = _bt_next(scan, dir);
|
1999-04-13 19:18:29 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
1996-12-06 10:41:45 +01:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* no tuples in the index match this scan key */
|
1997-01-05 11:56:36 +01:00
|
|
|
ItemPointerSetInvalid(current);
|
|
|
|
so->btso_curbuf = InvalidBuffer;
|
Restructure index AM interface for index building and index tuple deletion,
per previous discussion on pghackers. Most of the duplicate code in
different AMs' ambuild routines has been moved out to a common routine
in index.c; this means that all index types now do the right things about
inserting recently-dead tuples, etc. (I also removed support for EXTEND
INDEX in the ambuild routines, since that's about to go away anyway, and
it cluttered the code a lot.) The retail indextuple deletion routines have
been replaced by a "bulk delete" routine in which the indexscan is inside
the access method. I haven't pushed this change as far as it should go yet,
but it should allow considerable simplification of the internal bookkeeping
for deletions. Also, add flag columns to pg_am to eliminate various
hardcoded tests on AM OIDs, and remove unused pg_am columns.
Fix rtree and gist index types to not attempt to store NULLs; before this,
gist usually crashed, while rtree managed not to crash but computed wacko
bounding boxes for NULL entries (which might have had something to do with
the performance problems we've heard about occasionally).
Add AtEOXact routines to hash, rtree, and gist, all of which have static
state that needs to be reset after an error. We discovered this need long
ago for btree, but missed the other guys.
Oh, one more thing: concurrent VACUUM is now the default.
2001-07-16 00:48:19 +02:00
|
|
|
_bt_relbuf(rel, buf);
|
1997-09-07 07:04:48 +02:00
|
|
|
res = (RetrieveIndexResult) NULL;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
return res;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|