Rewrite btree index scans to work a page at a time in all cases (both
btgettuple and btgetmulti). This eliminates the problem of "re-finding" the exact stopping point, since the stopping point is effectively always a page boundary, and index items are never moved across pre-existing page boundaries. A small penalty is that the keys_are_unique optimization is effectively disabled (and, therefore, is removed in this patch), causing us to apply _bt_checkkeys() to at least one more tuple than necessary when looking up a unique key. However, the advantages for non-unique cases seem great enough to accept this tradeoff. Aside from simplifying and (sometimes) speeding up the indexscan code, this will allow us to reimplement btbulkdelete as a largely sequential scan instead of index-order traversal, thereby significantly reducing the cost of VACUUM. Those changes will come in a separate patch. Original patch by Heikki Linnakangas, rework by Tom Lane.
This commit is contained in:
parent
88d94a11bb
commit
09cb5c0e7d
|
@ -8,7 +8,7 @@
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.54 2006/03/05 15:58:21 momjian Exp $
|
* $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.55 2006/05/07 01:21:30 tgl Exp $
|
||||||
*
|
*
|
||||||
* NOTES
|
* NOTES
|
||||||
* many of the old access method routines have been turned into
|
* many of the old access method routines have been turned into
|
||||||
|
@ -90,8 +90,6 @@ RelationGetIndexScan(Relation indexRelation,
|
||||||
scan->have_lock = false; /* ditto */
|
scan->have_lock = false; /* ditto */
|
||||||
scan->kill_prior_tuple = false;
|
scan->kill_prior_tuple = false;
|
||||||
scan->ignore_killed_tuples = true; /* default setting */
|
scan->ignore_killed_tuples = true; /* default setting */
|
||||||
scan->keys_are_unique = false; /* may be set by index AM */
|
|
||||||
scan->got_tuple = false;
|
|
||||||
|
|
||||||
scan->opaque = NULL;
|
scan->opaque = NULL;
|
||||||
|
|
||||||
|
@ -102,9 +100,6 @@ RelationGetIndexScan(Relation indexRelation,
|
||||||
scan->xs_ctup.t_data = NULL;
|
scan->xs_ctup.t_data = NULL;
|
||||||
scan->xs_cbuf = InvalidBuffer;
|
scan->xs_cbuf = InvalidBuffer;
|
||||||
|
|
||||||
scan->unique_tuple_pos = 0;
|
|
||||||
scan->unique_tuple_mark = 0;
|
|
||||||
|
|
||||||
pgstat_initstats(&scan->xs_pgstat_info, indexRelation);
|
pgstat_initstats(&scan->xs_pgstat_info, indexRelation);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.92 2006/05/02 22:25:10 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.93 2006/05/07 01:21:30 tgl Exp $
|
||||||
*
|
*
|
||||||
* INTERFACE ROUTINES
|
* INTERFACE ROUTINES
|
||||||
* index_open - open an index relation by relation OID
|
* index_open - open an index relation by relation OID
|
||||||
|
@ -362,10 +362,6 @@ index_rescan(IndexScanDesc scan, ScanKey key)
|
||||||
}
|
}
|
||||||
|
|
||||||
scan->kill_prior_tuple = false; /* for safety */
|
scan->kill_prior_tuple = false; /* for safety */
|
||||||
scan->keys_are_unique = false; /* may be set by index AM */
|
|
||||||
scan->got_tuple = false;
|
|
||||||
scan->unique_tuple_pos = 0;
|
|
||||||
scan->unique_tuple_mark = 0;
|
|
||||||
|
|
||||||
FunctionCall2(procedure,
|
FunctionCall2(procedure,
|
||||||
PointerGetDatum(scan),
|
PointerGetDatum(scan),
|
||||||
|
@ -417,8 +413,6 @@ index_markpos(IndexScanDesc scan)
|
||||||
SCAN_CHECKS;
|
SCAN_CHECKS;
|
||||||
GET_SCAN_PROCEDURE(ammarkpos);
|
GET_SCAN_PROCEDURE(ammarkpos);
|
||||||
|
|
||||||
scan->unique_tuple_mark = scan->unique_tuple_pos;
|
|
||||||
|
|
||||||
FunctionCall1(procedure, PointerGetDatum(scan));
|
FunctionCall1(procedure, PointerGetDatum(scan));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -440,13 +434,6 @@ index_restrpos(IndexScanDesc scan)
|
||||||
|
|
||||||
scan->kill_prior_tuple = false; /* for safety */
|
scan->kill_prior_tuple = false; /* for safety */
|
||||||
|
|
||||||
/*
|
|
||||||
* We do not reset got_tuple; so if the scan is actually being
|
|
||||||
* short-circuited by index_getnext, the effective position restoration is
|
|
||||||
* done by restoring unique_tuple_pos.
|
|
||||||
*/
|
|
||||||
scan->unique_tuple_pos = scan->unique_tuple_mark;
|
|
||||||
|
|
||||||
FunctionCall1(procedure, PointerGetDatum(scan));
|
FunctionCall1(procedure, PointerGetDatum(scan));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -456,8 +443,7 @@ index_restrpos(IndexScanDesc scan)
|
||||||
* The result is the next heap tuple satisfying the scan keys and the
|
* The result is the next heap tuple satisfying the scan keys and the
|
||||||
* snapshot, or NULL if no more matching tuples exist. On success,
|
* snapshot, or NULL if no more matching tuples exist. On success,
|
||||||
* the buffer containing the heap tuple is pinned (the pin will be dropped
|
* the buffer containing the heap tuple is pinned (the pin will be dropped
|
||||||
* at the next index_getnext or index_endscan). The index TID corresponding
|
* at the next index_getnext or index_endscan).
|
||||||
* to the heap tuple can be obtained if needed from scan->currentItemData.
|
|
||||||
* ----------------
|
* ----------------
|
||||||
*/
|
*/
|
||||||
HeapTuple
|
HeapTuple
|
||||||
|
@ -469,65 +455,6 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
|
||||||
SCAN_CHECKS;
|
SCAN_CHECKS;
|
||||||
GET_SCAN_PROCEDURE(amgettuple);
|
GET_SCAN_PROCEDURE(amgettuple);
|
||||||
|
|
||||||
/*
|
|
||||||
* If we already got a tuple and it must be unique, there's no need to
|
|
||||||
* make the index AM look through any additional tuples. (This can save a
|
|
||||||
* useful amount of work in scenarios where there are many dead tuples due
|
|
||||||
* to heavy update activity.)
|
|
||||||
*
|
|
||||||
* To do this we must keep track of the logical scan position
|
|
||||||
* (before/on/after tuple). Also, we have to be sure to release scan
|
|
||||||
* resources before returning NULL; if we fail to do so then a multi-index
|
|
||||||
* scan can easily run the system out of free buffers. We can release
|
|
||||||
* index-level resources fairly cheaply by calling index_rescan. This
|
|
||||||
* means there are two persistent states as far as the index AM is
|
|
||||||
* concerned: on-tuple and rescanned. If we are actually asked to
|
|
||||||
* re-fetch the single tuple, we have to go through a fresh indexscan
|
|
||||||
* startup, which penalizes that (infrequent) case.
|
|
||||||
*/
|
|
||||||
if (scan->keys_are_unique && scan->got_tuple)
|
|
||||||
{
|
|
||||||
int new_tuple_pos = scan->unique_tuple_pos;
|
|
||||||
|
|
||||||
if (ScanDirectionIsForward(direction))
|
|
||||||
{
|
|
||||||
if (new_tuple_pos <= 0)
|
|
||||||
new_tuple_pos++;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (new_tuple_pos >= 0)
|
|
||||||
new_tuple_pos--;
|
|
||||||
}
|
|
||||||
if (new_tuple_pos == 0)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* We are moving onto the unique tuple from having been off it. We
|
|
||||||
* just fall through and let the index AM do the work. Note we
|
|
||||||
* should get the right answer regardless of scan direction.
|
|
||||||
*/
|
|
||||||
scan->unique_tuple_pos = 0; /* need to update position */
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* Moving off the tuple; must do amrescan to release index-level
|
|
||||||
* pins before we return NULL. Since index_rescan will reset my
|
|
||||||
* state, must save and restore...
|
|
||||||
*/
|
|
||||||
int unique_tuple_mark = scan->unique_tuple_mark;
|
|
||||||
|
|
||||||
index_rescan(scan, NULL /* no change to key */ );
|
|
||||||
|
|
||||||
scan->keys_are_unique = true;
|
|
||||||
scan->got_tuple = true;
|
|
||||||
scan->unique_tuple_pos = new_tuple_pos;
|
|
||||||
scan->unique_tuple_mark = unique_tuple_mark;
|
|
||||||
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* just make sure this is false... */
|
/* just make sure this is false... */
|
||||||
scan->kill_prior_tuple = false;
|
scan->kill_prior_tuple = false;
|
||||||
|
|
||||||
|
@ -588,14 +515,6 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Success exit */
|
/* Success exit */
|
||||||
scan->got_tuple = true;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If we just fetched a known-unique tuple, then subsequent calls will go
|
|
||||||
* through the short-circuit code above. unique_tuple_pos has been
|
|
||||||
* initialized to 0, which is the correct state ("on row").
|
|
||||||
*/
|
|
||||||
|
|
||||||
return heapTuple;
|
return heapTuple;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -608,8 +527,8 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
|
||||||
* (which most callers of this routine will probably want to suppress by
|
* (which most callers of this routine will probably want to suppress by
|
||||||
* setting scan->ignore_killed_tuples = false).
|
* setting scan->ignore_killed_tuples = false).
|
||||||
*
|
*
|
||||||
* On success (TRUE return), the found index TID is in scan->currentItemData,
|
* On success (TRUE return), the heap TID of the found index entry is in
|
||||||
* and its heap TID is in scan->xs_ctup.t_self. scan->xs_cbuf is untouched.
|
* scan->xs_ctup.t_self. scan->xs_cbuf is untouched.
|
||||||
* ----------------
|
* ----------------
|
||||||
*/
|
*/
|
||||||
bool
|
bool
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.10 2006/04/25 22:46:05 tgl Exp $
|
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.11 2006/05/07 01:21:30 tgl Exp $
|
||||||
|
|
||||||
This directory contains a correct implementation of Lehman and Yao's
|
This directory contains a correct implementation of Lehman and Yao's
|
||||||
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
|
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
|
||||||
|
@ -67,13 +67,22 @@ move right until we find a page whose right-link matches the page we
|
||||||
came from. (Actually, it's even harder than that; see deletion discussion
|
came from. (Actually, it's even harder than that; see deletion discussion
|
||||||
below.)
|
below.)
|
||||||
|
|
||||||
Read locks on a page are held for as long as a scan is examining a page.
|
Page read locks are held only for as long as a scan is examining a page.
|
||||||
But nbtree.c arranges to drop the read lock, but not the buffer pin,
|
To minimize lock/unlock traffic, an index scan always searches a leaf page
|
||||||
on the current page of a scan before control leaves nbtree. When we
|
to identify all the matching items at once, copying their heap tuple IDs
|
||||||
come back to resume the scan, we have to re-grab the read lock and
|
into backend-local storage. The heap tuple IDs are then processed while
|
||||||
then move right if the current item moved (see _bt_restscan()). Keeping
|
not holding any page lock within the index. We do continue to hold a pin
|
||||||
the pin ensures that the current item cannot move left or be deleted
|
on the leaf page, to protect against concurrent deletions (see below).
|
||||||
(see btbulkdelete).
|
In this state the scan is effectively stopped "between" pages, either
|
||||||
|
before or after the page it has pinned. This is safe in the presence of
|
||||||
|
concurrent insertions and even page splits, because items are never moved
|
||||||
|
across pre-existing page boundaries --- so the scan cannot miss any items
|
||||||
|
it should have seen, nor accidentally return the same item twice. The scan
|
||||||
|
must remember the page's right-link at the time it was scanned, since that
|
||||||
|
is the page to move right to; if we move right to the current right-link
|
||||||
|
then we'd re-scan any items moved by a page split. We don't similarly
|
||||||
|
remember the left-link, since it's best to use the most up-to-date
|
||||||
|
left-link when trying to move left (see detailed move-left algorithm below).
|
||||||
|
|
||||||
In most cases we release our lock and pin on a page before attempting
|
In most cases we release our lock and pin on a page before attempting
|
||||||
to acquire pin and lock on the page we are moving to. In a few places
|
to acquire pin and lock on the page we are moving to. In a few places
|
||||||
|
@ -119,14 +128,33 @@ item doesn't fit on the split page where it needs to go!
|
||||||
The deletion algorithm
|
The deletion algorithm
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
Deletions of leaf items are handled by getting a super-exclusive lock on
|
Before deleting a leaf item, we get a super-exclusive lock on the target
|
||||||
the target page, so that no other backend has a pin on the page when the
|
page, so that no other backend has a pin on the page when the deletion
|
||||||
deletion starts. This means no scan is pointing at the page, so no other
|
starts. This is not necessary for correctness in terms of the btree index
|
||||||
backend can lose its place due to the item deletion.
|
operations themselves; as explained above, index scans logically stop
|
||||||
|
"between" pages and so can't lose their place. The reason we do it is to
|
||||||
|
provide an interlock between non-full VACUUM and indexscans. Since VACUUM
|
||||||
|
deletes index entries before deleting tuples, the super-exclusive lock
|
||||||
|
guarantees that VACUUM can't delete any heap tuple that an indexscanning
|
||||||
|
process might be about to visit. (This guarantee works only for simple
|
||||||
|
indexscans that visit the heap in sync with the index scan, not for bitmap
|
||||||
|
scans. We only need the guarantee when using non-MVCC snapshot rules such
|
||||||
|
as SnapshotNow, so in practice this is only important for system catalog
|
||||||
|
accesses.)
|
||||||
|
|
||||||
The above does not work for deletion of items in internal pages, since
|
Because a page can be split even while someone holds a pin on it, it is
|
||||||
other backends keep no lock nor pin on a page they have descended past.
|
possible that an indexscan will return items that are no longer stored on
|
||||||
Instead, when a backend is ascending the tree using its stack, it must
|
the page it has a pin on, but rather somewhere to the right of that page.
|
||||||
|
To ensure that VACUUM can't prematurely remove such heap tuples, we require
|
||||||
|
btbulkdelete to obtain super-exclusive lock on every leaf page in the index
|
||||||
|
(even pages that don't contain any deletable tuples). This guarantees that
|
||||||
|
the btbulkdelete call cannot return while any indexscan is still holding
|
||||||
|
a copy of a deleted index tuple. Note that this requirement does not say
|
||||||
|
that btbulkdelete must visit the pages in any particular order.
|
||||||
|
|
||||||
|
There is no such interlocking for deletion of items in internal pages,
|
||||||
|
since backends keep no lock nor pin on a page they have descended past.
|
||||||
|
Hence, when a backend is ascending the tree using its stack, it must
|
||||||
be prepared for the possibility that the item it wants is to the left of
|
be prepared for the possibility that the item it wants is to the left of
|
||||||
the recorded position (but it can't have moved left out of the recorded
|
the recorded position (but it can't have moved left out of the recorded
|
||||||
page). Since we hold a lock on the lower page (per L&Y) until we have
|
page). Since we hold a lock on the lower page (per L&Y) until we have
|
||||||
|
@ -201,7 +229,7 @@ accordingly. Searches and forward scans simply follow the right-link
|
||||||
until they find a non-dead page --- this will be where the deleted page's
|
until they find a non-dead page --- this will be where the deleted page's
|
||||||
key-space moved to.
|
key-space moved to.
|
||||||
|
|
||||||
Stepping left in a backward scan is complicated because we must consider
|
Moving left in a backward scan is complicated because we must consider
|
||||||
the possibility that the left sibling was just split (meaning we must find
|
the possibility that the left sibling was just split (meaning we must find
|
||||||
the rightmost page derived from the left sibling), plus the possibility
|
the rightmost page derived from the left sibling), plus the possibility
|
||||||
that the page we were just on has now been deleted and hence isn't in the
|
that the page we were just on has now been deleted and hence isn't in the
|
||||||
|
|
|
@ -12,7 +12,7 @@
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.146 2006/05/02 22:25:10 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.147 2006/05/07 01:21:30 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
@ -48,7 +48,6 @@ typedef struct
|
||||||
} BTBuildState;
|
} BTBuildState;
|
||||||
|
|
||||||
|
|
||||||
static void _bt_restscan(IndexScanDesc scan);
|
|
||||||
static void btbuildCallback(Relation index,
|
static void btbuildCallback(Relation index,
|
||||||
HeapTuple htup,
|
HeapTuple htup,
|
||||||
Datum *values,
|
Datum *values,
|
||||||
|
@ -218,8 +217,6 @@ btgettuple(PG_FUNCTION_ARGS)
|
||||||
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
||||||
ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
|
ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
|
||||||
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||||
Page page;
|
|
||||||
OffsetNumber offnum;
|
|
||||||
bool res;
|
bool res;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -227,33 +224,26 @@ btgettuple(PG_FUNCTION_ARGS)
|
||||||
* appropriate direction. If we haven't done so yet, we call a routine to
|
* appropriate direction. If we haven't done so yet, we call a routine to
|
||||||
* get the first item in the scan.
|
* get the first item in the scan.
|
||||||
*/
|
*/
|
||||||
if (ItemPointerIsValid(&(scan->currentItemData)))
|
if (BTScanPosIsValid(so->currPos))
|
||||||
{
|
{
|
||||||
/*
|
|
||||||
* Restore scan position using heap TID returned by previous call to
|
|
||||||
* btgettuple(). _bt_restscan() re-grabs the read lock on the buffer,
|
|
||||||
* too.
|
|
||||||
*/
|
|
||||||
_bt_restscan(scan);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check to see if we should kill the previously-fetched tuple.
|
* Check to see if we should kill the previously-fetched tuple.
|
||||||
*/
|
*/
|
||||||
if (scan->kill_prior_tuple)
|
if (scan->kill_prior_tuple)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* Yes, so mark it by setting the LP_DELETE bit in the item flags.
|
* Yes, remember it for later. (We'll deal with all such tuples
|
||||||
|
* at once right before leaving the index page.) The test for
|
||||||
|
* numKilled overrun is not just paranoia: if the caller reverses
|
||||||
|
* direction in the indexscan then the same item might get entered
|
||||||
|
* multiple times. It's not worth trying to optimize that, so we
|
||||||
|
* don't detect it, but instead just forget any excess entries.
|
||||||
*/
|
*/
|
||||||
offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
|
if (so->killedItems == NULL)
|
||||||
page = BufferGetPage(so->btso_curbuf);
|
so->killedItems = (int *)
|
||||||
PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
|
palloc(MaxIndexTuplesPerPage * sizeof(int));
|
||||||
|
if (so->numKilled < MaxIndexTuplesPerPage)
|
||||||
/*
|
so->killedItems[so->numKilled++] = so->currPos.itemIndex;
|
||||||
* Since this can be redone later if needed, it's treated the same
|
|
||||||
* as a commit-hint-bit status update for heap tuples: we mark the
|
|
||||||
* buffer dirty but don't make a WAL log entry.
|
|
||||||
*/
|
|
||||||
SetBufferCommitInfoNeedsSave(so->btso_curbuf);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -264,29 +254,15 @@ btgettuple(PG_FUNCTION_ARGS)
|
||||||
else
|
else
|
||||||
res = _bt_first(scan, dir);
|
res = _bt_first(scan, dir);
|
||||||
|
|
||||||
/*
|
|
||||||
* Save heap TID to use it in _bt_restscan. Then release the read lock on
|
|
||||||
* the buffer so that we aren't blocking other backends.
|
|
||||||
*
|
|
||||||
* NOTE: we do keep the pin on the buffer! This is essential to ensure
|
|
||||||
* that someone else doesn't delete the index entry we are stopped on.
|
|
||||||
*/
|
|
||||||
if (res)
|
|
||||||
{
|
|
||||||
((BTScanOpaque) scan->opaque)->curHeapIptr = scan->xs_ctup.t_self;
|
|
||||||
LockBuffer(((BTScanOpaque) scan->opaque)->btso_curbuf,
|
|
||||||
BUFFER_LOCK_UNLOCK);
|
|
||||||
}
|
|
||||||
|
|
||||||
PG_RETURN_BOOL(res);
|
PG_RETURN_BOOL(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* btgetmulti() -- get multiple tuples at once
|
* btgetmulti() -- get multiple tuples at once
|
||||||
*
|
*
|
||||||
* This is a somewhat generic implementation: it avoids the _bt_restscan
|
* In the current implementation there seems no strong reason to stop at
|
||||||
* overhead, but there's no smarts about picking especially good stopping
|
* index page boundaries; we just press on until we fill the caller's buffer
|
||||||
* points such as index page boundaries.
|
* or run out of matches.
|
||||||
*/
|
*/
|
||||||
Datum
|
Datum
|
||||||
btgetmulti(PG_FUNCTION_ARGS)
|
btgetmulti(PG_FUNCTION_ARGS)
|
||||||
|
@ -299,36 +275,41 @@ btgetmulti(PG_FUNCTION_ARGS)
|
||||||
bool res = true;
|
bool res = true;
|
||||||
int32 ntids = 0;
|
int32 ntids = 0;
|
||||||
|
|
||||||
/*
|
if (max_tids <= 0) /* behave correctly in boundary case */
|
||||||
* Restore prior state if we were already called at least once.
|
PG_RETURN_BOOL(true);
|
||||||
*/
|
|
||||||
if (ItemPointerIsValid(&(scan->currentItemData)))
|
|
||||||
_bt_restscan(scan);
|
|
||||||
|
|
||||||
while (ntids < max_tids)
|
/* If we haven't started the scan yet, fetch the first page & tuple. */
|
||||||
|
if (!BTScanPosIsValid(so->currPos))
|
||||||
{
|
{
|
||||||
/*
|
res = _bt_first(scan, ForwardScanDirection);
|
||||||
* Start scan, or advance to next tuple.
|
|
||||||
*/
|
|
||||||
if (ItemPointerIsValid(&(scan->currentItemData)))
|
|
||||||
res = _bt_next(scan, ForwardScanDirection);
|
|
||||||
else
|
|
||||||
res = _bt_first(scan, ForwardScanDirection);
|
|
||||||
if (!res)
|
if (!res)
|
||||||
break;
|
{
|
||||||
|
/* empty scan */
|
||||||
|
*returned_tids = ntids;
|
||||||
|
PG_RETURN_BOOL(res);
|
||||||
|
}
|
||||||
/* Save tuple ID, and continue scanning */
|
/* Save tuple ID, and continue scanning */
|
||||||
tids[ntids] = scan->xs_ctup.t_self;
|
tids[ntids] = scan->xs_ctup.t_self;
|
||||||
ntids++;
|
ntids++;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
while (ntids < max_tids)
|
||||||
* Save heap TID to use it in _bt_restscan. Then release the read lock on
|
|
||||||
* the buffer so that we aren't blocking other backends.
|
|
||||||
*/
|
|
||||||
if (res)
|
|
||||||
{
|
{
|
||||||
so->curHeapIptr = scan->xs_ctup.t_self;
|
/*
|
||||||
LockBuffer(so->btso_curbuf, BUFFER_LOCK_UNLOCK);
|
* Advance to next tuple within page. This is the same as the
|
||||||
|
* easy case in _bt_next().
|
||||||
|
*/
|
||||||
|
if (++so->currPos.itemIndex > so->currPos.lastItem)
|
||||||
|
{
|
||||||
|
/* let _bt_next do the heavy lifting */
|
||||||
|
res = _bt_next(scan, ForwardScanDirection);
|
||||||
|
if (!res)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Save tuple ID, and continue scanning */
|
||||||
|
tids[ntids] = so->currPos.items[so->currPos.itemIndex].heapTid;
|
||||||
|
ntids++;
|
||||||
}
|
}
|
||||||
|
|
||||||
*returned_tids = ntids;
|
*returned_tids = ntids;
|
||||||
|
@ -360,7 +341,6 @@ btrescan(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
||||||
ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1);
|
ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1);
|
||||||
ItemPointer iptr;
|
|
||||||
BTScanOpaque so;
|
BTScanOpaque so;
|
||||||
|
|
||||||
so = (BTScanOpaque) scan->opaque;
|
so = (BTScanOpaque) scan->opaque;
|
||||||
|
@ -368,31 +348,30 @@ btrescan(PG_FUNCTION_ARGS)
|
||||||
if (so == NULL) /* if called from btbeginscan */
|
if (so == NULL) /* if called from btbeginscan */
|
||||||
{
|
{
|
||||||
so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
|
so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
|
||||||
so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer;
|
so->currPos.buf = so->markPos.buf = InvalidBuffer;
|
||||||
ItemPointerSetInvalid(&(so->curHeapIptr));
|
|
||||||
ItemPointerSetInvalid(&(so->mrkHeapIptr));
|
|
||||||
if (scan->numberOfKeys > 0)
|
if (scan->numberOfKeys > 0)
|
||||||
so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
|
so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
|
||||||
else
|
else
|
||||||
so->keyData = NULL;
|
so->keyData = NULL;
|
||||||
|
so->killedItems = NULL; /* until needed */
|
||||||
|
so->numKilled = 0;
|
||||||
scan->opaque = so;
|
scan->opaque = so;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* we aren't holding any read locks, but gotta drop the pins */
|
/* we aren't holding any read locks, but gotta drop the pins */
|
||||||
if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
|
if (BTScanPosIsValid(so->currPos))
|
||||||
{
|
{
|
||||||
ReleaseBuffer(so->btso_curbuf);
|
/* Before leaving current page, deal with any killed items */
|
||||||
so->btso_curbuf = InvalidBuffer;
|
if (so->numKilled > 0)
|
||||||
ItemPointerSetInvalid(&(so->curHeapIptr));
|
_bt_killitems(scan, false);
|
||||||
ItemPointerSetInvalid(iptr);
|
ReleaseBuffer(so->currPos.buf);
|
||||||
|
so->currPos.buf = InvalidBuffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ItemPointerIsValid(iptr = &(scan->currentMarkData)))
|
if (BTScanPosIsValid(so->markPos))
|
||||||
{
|
{
|
||||||
ReleaseBuffer(so->btso_mrkbuf);
|
ReleaseBuffer(so->markPos.buf);
|
||||||
so->btso_mrkbuf = InvalidBuffer;
|
so->markPos.buf = InvalidBuffer;
|
||||||
ItemPointerSetInvalid(&(so->mrkHeapIptr));
|
|
||||||
ItemPointerSetInvalid(iptr);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -415,28 +394,26 @@ Datum
|
||||||
btendscan(PG_FUNCTION_ARGS)
|
btendscan(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
||||||
ItemPointer iptr;
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||||
BTScanOpaque so;
|
|
||||||
|
|
||||||
so = (BTScanOpaque) scan->opaque;
|
|
||||||
|
|
||||||
/* we aren't holding any read locks, but gotta drop the pins */
|
/* we aren't holding any read locks, but gotta drop the pins */
|
||||||
if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
|
if (BTScanPosIsValid(so->currPos))
|
||||||
{
|
{
|
||||||
if (BufferIsValid(so->btso_curbuf))
|
/* Before leaving current page, deal with any killed items */
|
||||||
ReleaseBuffer(so->btso_curbuf);
|
if (so->numKilled > 0)
|
||||||
so->btso_curbuf = InvalidBuffer;
|
_bt_killitems(scan, false);
|
||||||
ItemPointerSetInvalid(iptr);
|
ReleaseBuffer(so->currPos.buf);
|
||||||
|
so->currPos.buf = InvalidBuffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ItemPointerIsValid(iptr = &(scan->currentMarkData)))
|
if (BTScanPosIsValid(so->markPos))
|
||||||
{
|
{
|
||||||
if (BufferIsValid(so->btso_mrkbuf))
|
ReleaseBuffer(so->markPos.buf);
|
||||||
ReleaseBuffer(so->btso_mrkbuf);
|
so->markPos.buf = InvalidBuffer;
|
||||||
so->btso_mrkbuf = InvalidBuffer;
|
|
||||||
ItemPointerSetInvalid(iptr);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (so->killedItems != NULL)
|
||||||
|
pfree(so->killedItems);
|
||||||
if (so->keyData != NULL)
|
if (so->keyData != NULL)
|
||||||
pfree(so->keyData);
|
pfree(so->keyData);
|
||||||
pfree(so);
|
pfree(so);
|
||||||
|
@ -451,26 +428,22 @@ Datum
|
||||||
btmarkpos(PG_FUNCTION_ARGS)
|
btmarkpos(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
||||||
ItemPointer iptr;
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||||
BTScanOpaque so;
|
|
||||||
|
|
||||||
so = (BTScanOpaque) scan->opaque;
|
|
||||||
|
|
||||||
/* we aren't holding any read locks, but gotta drop the pin */
|
/* we aren't holding any read locks, but gotta drop the pin */
|
||||||
if (ItemPointerIsValid(iptr = &(scan->currentMarkData)))
|
if (BTScanPosIsValid(so->markPos))
|
||||||
{
|
{
|
||||||
ReleaseBuffer(so->btso_mrkbuf);
|
ReleaseBuffer(so->markPos.buf);
|
||||||
so->btso_mrkbuf = InvalidBuffer;
|
so->markPos.buf = InvalidBuffer;
|
||||||
ItemPointerSetInvalid(iptr);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* bump pin on current buffer for assignment to mark buffer */
|
/* bump pin on current buffer for assignment to mark buffer */
|
||||||
if (ItemPointerIsValid(&(scan->currentItemData)))
|
if (BTScanPosIsValid(so->currPos))
|
||||||
{
|
{
|
||||||
IncrBufferRefCount(so->btso_curbuf);
|
IncrBufferRefCount(so->currPos.buf);
|
||||||
so->btso_mrkbuf = so->btso_curbuf;
|
memcpy(&so->markPos, &so->currPos,
|
||||||
scan->currentMarkData = scan->currentItemData;
|
offsetof(BTScanPosData, items[1]) +
|
||||||
so->mrkHeapIptr = so->curHeapIptr;
|
so->currPos.lastItem * sizeof(BTScanPosItem));
|
||||||
}
|
}
|
||||||
|
|
||||||
PG_RETURN_VOID();
|
PG_RETURN_VOID();
|
||||||
|
@ -483,26 +456,26 @@ Datum
|
||||||
btrestrpos(PG_FUNCTION_ARGS)
|
btrestrpos(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
||||||
ItemPointer iptr;
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||||
BTScanOpaque so;
|
|
||||||
|
|
||||||
so = (BTScanOpaque) scan->opaque;
|
|
||||||
|
|
||||||
/* we aren't holding any read locks, but gotta drop the pin */
|
/* we aren't holding any read locks, but gotta drop the pin */
|
||||||
if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
|
if (BTScanPosIsValid(so->currPos))
|
||||||
{
|
{
|
||||||
ReleaseBuffer(so->btso_curbuf);
|
/* Before leaving current page, deal with any killed items */
|
||||||
so->btso_curbuf = InvalidBuffer;
|
if (so->numKilled > 0 &&
|
||||||
ItemPointerSetInvalid(iptr);
|
so->currPos.buf != so->markPos.buf)
|
||||||
|
_bt_killitems(scan, false);
|
||||||
|
ReleaseBuffer(so->currPos.buf);
|
||||||
|
so->currPos.buf = InvalidBuffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* bump pin on marked buffer */
|
/* bump pin on marked buffer */
|
||||||
if (ItemPointerIsValid(&(scan->currentMarkData)))
|
if (BTScanPosIsValid(so->markPos))
|
||||||
{
|
{
|
||||||
IncrBufferRefCount(so->btso_mrkbuf);
|
IncrBufferRefCount(so->markPos.buf);
|
||||||
so->btso_curbuf = so->btso_mrkbuf;
|
memcpy(&so->currPos, &so->markPos,
|
||||||
scan->currentItemData = scan->currentMarkData;
|
offsetof(BTScanPosData, items[1]) +
|
||||||
so->curHeapIptr = so->mrkHeapIptr;
|
so->markPos.lastItem * sizeof(BTScanPosItem));
|
||||||
}
|
}
|
||||||
|
|
||||||
PG_RETURN_VOID();
|
PG_RETURN_VOID();
|
||||||
|
@ -537,12 +510,11 @@ btbulkdelete(PG_FUNCTION_ARGS)
|
||||||
* sequence left to right. It sounds attractive to only exclusive-lock
|
* sequence left to right. It sounds attractive to only exclusive-lock
|
||||||
* those containing items we need to delete, but unfortunately that is not
|
* those containing items we need to delete, but unfortunately that is not
|
||||||
* safe: we could then pass a stopped indexscan, which could in rare cases
|
* safe: we could then pass a stopped indexscan, which could in rare cases
|
||||||
* lead to deleting the item it needs to find when it resumes. (See
|
* lead to deleting items that the indexscan will still return later.
|
||||||
* _bt_restscan --- this could only happen if an indexscan stops on a
|
* (See discussion in nbtree/README.) We can skip obtaining exclusive
|
||||||
* deletable item and then a page split moves that item into a page
|
* lock on empty pages though, since no indexscan could be stopped on
|
||||||
* further to its right, which the indexscan will have no pin on.) We can
|
* those. (Note: this presumes that a split couldn't have left either
|
||||||
* skip obtaining exclusive lock on empty pages though, since no indexscan
|
* page totally empty.)
|
||||||
* could be stopped on those.
|
|
||||||
*/
|
*/
|
||||||
buf = _bt_get_endpoint(rel, 0, false);
|
buf = _bt_get_endpoint(rel, 0, false);
|
||||||
|
|
||||||
|
@ -823,108 +795,3 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
|
||||||
|
|
||||||
PG_RETURN_POINTER(stats);
|
PG_RETURN_POINTER(stats);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Restore scan position when btgettuple is called to continue a scan.
|
|
||||||
*
|
|
||||||
* This is nontrivial because concurrent insertions might have moved the
|
|
||||||
* index tuple we stopped on. We assume the tuple can only have moved to
|
|
||||||
* the right from our stop point, because we kept a pin on the buffer,
|
|
||||||
* and so no deletion can have occurred on that page.
|
|
||||||
*
|
|
||||||
* On entry, we have a pin but no read lock on the buffer that contained
|
|
||||||
* the index tuple we stopped the scan on. On exit, we have pin and read
|
|
||||||
* lock on the buffer that now contains that index tuple, and the scandesc's
|
|
||||||
* current position is updated to point at it.
|
|
||||||
*/
|
|
||||||
static void
|
|
||||||
_bt_restscan(IndexScanDesc scan)
|
|
||||||
{
|
|
||||||
Relation rel = scan->indexRelation;
|
|
||||||
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
|
||||||
Buffer buf = so->btso_curbuf;
|
|
||||||
Page page;
|
|
||||||
ItemPointer current = &(scan->currentItemData);
|
|
||||||
OffsetNumber offnum = ItemPointerGetOffsetNumber(current),
|
|
||||||
maxoff;
|
|
||||||
BTPageOpaque opaque;
|
|
||||||
Buffer nextbuf;
|
|
||||||
ItemPointer target = &(so->curHeapIptr);
|
|
||||||
IndexTuple itup;
|
|
||||||
BlockNumber blkno;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Reacquire read lock on the buffer. (We should still have a
|
|
||||||
* reference-count pin on it, so need not get that.)
|
|
||||||
*/
|
|
||||||
LockBuffer(buf, BT_READ);
|
|
||||||
|
|
||||||
page = BufferGetPage(buf);
|
|
||||||
maxoff = PageGetMaxOffsetNumber(page);
|
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We use this as flag when first index tuple on page is deleted but we do
|
|
||||||
* not move left (this would slowdown vacuum) - so we set
|
|
||||||
* current->ip_posid before first index tuple on the current page
|
|
||||||
* (_bt_step will move it right)... XXX still needed?
|
|
||||||
*/
|
|
||||||
if (!ItemPointerIsValid(target))
|
|
||||||
{
|
|
||||||
ItemPointerSetOffsetNumber(current,
|
|
||||||
OffsetNumberPrev(P_FIRSTDATAKEY(opaque)));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* The item we were on may have moved right due to insertions. Find it
|
|
||||||
* again. We use the heap TID to identify the item uniquely.
|
|
||||||
*/
|
|
||||||
for (;;)
|
|
||||||
{
|
|
||||||
/* Check for item on this page */
|
|
||||||
for (;
|
|
||||||
offnum <= maxoff;
|
|
||||||
offnum = OffsetNumberNext(offnum))
|
|
||||||
{
|
|
||||||
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
|
|
||||||
if (BTTidSame(itup->t_tid, *target))
|
|
||||||
{
|
|
||||||
/* Found it */
|
|
||||||
current->ip_posid = offnum;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* The item we're looking for moved right at least one page, so move
|
|
||||||
* right. We are careful here to pin and read-lock the next non-dead
|
|
||||||
* page before releasing the current one. This ensures that a
|
|
||||||
* concurrent btbulkdelete scan cannot pass our position --- if it
|
|
||||||
* did, it might be able to reach and delete our target item before we
|
|
||||||
* can find it again.
|
|
||||||
*/
|
|
||||||
if (P_RIGHTMOST(opaque))
|
|
||||||
elog(ERROR, "failed to re-find previous key in \"%s\"",
|
|
||||||
RelationGetRelationName(rel));
|
|
||||||
/* Advance to next non-dead page --- there must be one */
|
|
||||||
nextbuf = InvalidBuffer;
|
|
||||||
for (;;)
|
|
||||||
{
|
|
||||||
blkno = opaque->btpo_next;
|
|
||||||
nextbuf = _bt_relandgetbuf(rel, nextbuf, blkno, BT_READ);
|
|
||||||
page = BufferGetPage(nextbuf);
|
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
if (!P_IGNORE(opaque))
|
|
||||||
break;
|
|
||||||
if (P_RIGHTMOST(opaque))
|
|
||||||
elog(ERROR, "fell off the end of \"%s\"",
|
|
||||||
RelationGetRelationName(rel));
|
|
||||||
}
|
|
||||||
_bt_relbuf(rel, buf);
|
|
||||||
so->btso_curbuf = buf = nextbuf;
|
|
||||||
maxoff = PageGetMaxOffsetNumber(page);
|
|
||||||
offnum = P_FIRSTDATAKEY(opaque);
|
|
||||||
ItemPointerSet(current, blkno, offnum);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.104 2006/03/05 15:58:21 momjian Exp $
|
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.105 2006/05/07 01:21:30 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
@ -21,6 +21,9 @@
|
||||||
#include "utils/lsyscache.h"
|
#include "utils/lsyscache.h"
|
||||||
|
|
||||||
|
|
||||||
|
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
|
||||||
|
OffsetNumber offnum);
|
||||||
|
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
|
||||||
static Buffer _bt_walk_left(Relation rel, Buffer buf);
|
static Buffer _bt_walk_left(Relation rel, Buffer buf);
|
||||||
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
|
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
|
||||||
|
|
||||||
|
@ -410,70 +413,19 @@ _bt_compare(Relation rel,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* _bt_next() -- Get the next item in a scan.
|
|
||||||
*
|
|
||||||
* On entry, we have a valid currentItemData in the scan, and a
|
|
||||||
* read lock and pin count on the page that contains that item.
|
|
||||||
* We return the next item in the scan, or false if no more.
|
|
||||||
* On successful exit, the page containing the new item is locked
|
|
||||||
* and pinned; on failure exit, no lock or pin is held.
|
|
||||||
*/
|
|
||||||
bool
|
|
||||||
_bt_next(IndexScanDesc scan, ScanDirection dir)
|
|
||||||
{
|
|
||||||
Relation rel;
|
|
||||||
Buffer buf;
|
|
||||||
Page page;
|
|
||||||
OffsetNumber offnum;
|
|
||||||
ItemPointer current;
|
|
||||||
BTScanOpaque so;
|
|
||||||
bool continuescan;
|
|
||||||
|
|
||||||
rel = scan->indexRelation;
|
|
||||||
so = (BTScanOpaque) scan->opaque;
|
|
||||||
current = &(scan->currentItemData);
|
|
||||||
|
|
||||||
/* we still have the buffer pinned and locked */
|
|
||||||
buf = so->btso_curbuf;
|
|
||||||
Assert(BufferIsValid(buf));
|
|
||||||
|
|
||||||
do
|
|
||||||
{
|
|
||||||
/* step one tuple in the appropriate direction */
|
|
||||||
if (!_bt_step(scan, &buf, dir))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
/* current is the next candidate tuple to return */
|
|
||||||
offnum = ItemPointerGetOffsetNumber(current);
|
|
||||||
page = BufferGetPage(buf);
|
|
||||||
|
|
||||||
if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
|
|
||||||
{
|
|
||||||
/* tuple passes all scan key conditions, so return it */
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* This tuple doesn't pass, but there might be more that do */
|
|
||||||
} while (continuescan);
|
|
||||||
|
|
||||||
/* No more items, so close down the current-item info */
|
|
||||||
ItemPointerSetInvalid(current);
|
|
||||||
so->btso_curbuf = InvalidBuffer;
|
|
||||||
_bt_relbuf(rel, buf);
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* _bt_first() -- Find the first item in a scan.
|
* _bt_first() -- Find the first item in a scan.
|
||||||
*
|
*
|
||||||
* We need to be clever about the direction of scan, the search
|
* We need to be clever about the direction of scan, the search
|
||||||
* conditions, and the tree ordering. We find the first item (or,
|
* conditions, and the tree ordering. We find the first item (or,
|
||||||
* if backwards scan, the last item) in the tree that satisfies the
|
* if backwards scan, the last item) in the tree that satisfies the
|
||||||
* qualifications in the scan key. On exit, the page containing
|
* qualifications in the scan key. On success exit, the page containing
|
||||||
* the current index tuple is read locked and pinned, and the scan's
|
* the current index tuple is pinned but not locked, and data about
|
||||||
* opaque data entry is updated to include the buffer.
|
* the matching tuple(s) on the page has been loaded into so->currPos,
|
||||||
|
* and scan->xs_ctup.t_self is set to the heap TID of the current tuple.
|
||||||
|
*
|
||||||
|
* If there are no matching items in the index, we return FALSE, with no
|
||||||
|
* pins or locks held.
|
||||||
*
|
*
|
||||||
* Note that scan->keyData[], and the so->keyData[] scankey built from it,
|
* Note that scan->keyData[], and the so->keyData[] scankey built from it,
|
||||||
* are both search-type scankeys (see nbtree/README for more about this).
|
* are both search-type scankeys (see nbtree/README for more about this).
|
||||||
|
@ -486,16 +438,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
|
||||||
Relation rel = scan->indexRelation;
|
Relation rel = scan->indexRelation;
|
||||||
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||||
Buffer buf;
|
Buffer buf;
|
||||||
Page page;
|
|
||||||
BTStack stack;
|
BTStack stack;
|
||||||
OffsetNumber offnum;
|
OffsetNumber offnum;
|
||||||
ItemPointer current;
|
|
||||||
BlockNumber blkno;
|
|
||||||
StrategyNumber strat;
|
StrategyNumber strat;
|
||||||
bool res;
|
|
||||||
bool nextkey;
|
bool nextkey;
|
||||||
bool goback;
|
bool goback;
|
||||||
bool continuescan;
|
|
||||||
ScanKey startKeys[INDEX_MAX_KEYS];
|
ScanKey startKeys[INDEX_MAX_KEYS];
|
||||||
ScanKeyData scankeys[INDEX_MAX_KEYS];
|
ScanKeyData scankeys[INDEX_MAX_KEYS];
|
||||||
int keysCount = 0;
|
int keysCount = 0;
|
||||||
|
@ -847,26 +794,31 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
|
||||||
/* don't need to keep the stack around... */
|
/* don't need to keep the stack around... */
|
||||||
_bt_freestack(stack);
|
_bt_freestack(stack);
|
||||||
|
|
||||||
current = &(scan->currentItemData);
|
/* remember which buffer we have pinned, if any */
|
||||||
|
so->currPos.buf = buf;
|
||||||
|
|
||||||
if (!BufferIsValid(buf))
|
if (!BufferIsValid(buf))
|
||||||
{
|
{
|
||||||
/* Only get here if index is completely empty */
|
/* Only get here if index is completely empty */
|
||||||
ItemPointerSetInvalid(current);
|
|
||||||
so->btso_curbuf = InvalidBuffer;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* remember which buffer we have pinned */
|
/* initialize moreLeft/moreRight appropriately for scan direction */
|
||||||
so->btso_curbuf = buf;
|
if (ScanDirectionIsForward(dir))
|
||||||
|
{
|
||||||
|
so->currPos.moreLeft = false;
|
||||||
|
so->currPos.moreRight = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
so->currPos.moreLeft = true;
|
||||||
|
so->currPos.moreRight = false;
|
||||||
|
}
|
||||||
|
so->numKilled = 0; /* just paranoia */
|
||||||
|
|
||||||
/* position to the precise item on the page */
|
/* position to the precise item on the page */
|
||||||
offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
|
offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
|
||||||
|
|
||||||
page = BufferGetPage(buf);
|
|
||||||
blkno = BufferGetBlockNumber(buf);
|
|
||||||
ItemPointerSet(current, blkno, offnum);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If nextkey = false, we are positioned at the first item >= scan key, or
|
* If nextkey = false, we are positioned at the first item >= scan key, or
|
||||||
* possibly at the end of a page on which all the existing items are less
|
* possibly at the end of a page on which all the existing items are less
|
||||||
|
@ -880,174 +832,310 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
|
||||||
*
|
*
|
||||||
* The actually desired starting point is either this item or the prior
|
* The actually desired starting point is either this item or the prior
|
||||||
* one, or in the end-of-page case it's the first item on the next page or
|
* one, or in the end-of-page case it's the first item on the next page or
|
||||||
* the last item on this page. We apply _bt_step if needed to get to the
|
* the last item on this page. Adjust the starting offset if needed.
|
||||||
* right place.
|
* (If this results in an offset before the first item or after the last
|
||||||
*
|
* one, _bt_readpage will report no items found, and then we'll step to
|
||||||
* If _bt_step fails (meaning we fell off the end of the index in one
|
* the next page as needed.)
|
||||||
* direction or the other), then there are no matches so we just return
|
|
||||||
* false.
|
|
||||||
*/
|
*/
|
||||||
if (goback)
|
if (goback)
|
||||||
|
offnum = OffsetNumberPrev(offnum);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Now load data from the first page of the scan.
|
||||||
|
*/
|
||||||
|
if (!_bt_readpage(scan, dir, offnum))
|
||||||
{
|
{
|
||||||
/* _bt_step will do the right thing if we are at end-of-page */
|
/*
|
||||||
if (!_bt_step(scan, &buf, BackwardScanDirection))
|
* There's no actually-matching data on this page. Try to advance to
|
||||||
|
* the next page. Return false if there's no matching data at all.
|
||||||
|
*/
|
||||||
|
if (!_bt_steppage(scan, dir))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
/* If we're at end-of-page, must step forward to next page */
|
|
||||||
if (offnum > PageGetMaxOffsetNumber(page))
|
|
||||||
{
|
|
||||||
if (!_bt_step(scan, &buf, ForwardScanDirection))
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* okay, current item pointer for the scan is right */
|
/* Drop the lock, but not pin, on the current page */
|
||||||
offnum = ItemPointerGetOffsetNumber(current);
|
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
|
||||||
page = BufferGetPage(buf);
|
|
||||||
|
|
||||||
/* is the first item actually acceptable? */
|
/* OK, itemIndex says what to return */
|
||||||
if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
|
scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
|
||||||
{
|
|
||||||
/* yes, return it */
|
|
||||||
res = true;
|
|
||||||
}
|
|
||||||
else if (continuescan)
|
|
||||||
{
|
|
||||||
/* no, but there might be another one that is */
|
|
||||||
res = _bt_next(scan, dir);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* no tuples in the index match this scan key */
|
|
||||||
ItemPointerSetInvalid(current);
|
|
||||||
so->btso_curbuf = InvalidBuffer;
|
|
||||||
_bt_relbuf(rel, buf);
|
|
||||||
res = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return res;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* _bt_step() -- Step one item in the requested direction in a scan on
|
* _bt_next() -- Get the next item in a scan.
|
||||||
* the tree.
|
|
||||||
*
|
*
|
||||||
* *bufP is the current buffer (read-locked and pinned). If we change
|
* On entry, so->currPos describes the current page, which is pinned
|
||||||
* pages, it's updated appropriately.
|
* but not locked, and so->currPos.itemIndex identifies which item was
|
||||||
|
* previously returned.
|
||||||
*
|
*
|
||||||
* If successful, update scan's currentItemData and return true.
|
* On successful exit, scan->xs_ctup.t_self is set to the TID of the
|
||||||
* If no adjacent record exists in the requested direction,
|
* next heap tuple, and so->currPos is updated as needed.
|
||||||
* release buffer pin/locks and return false.
|
*
|
||||||
|
* On failure exit (no more tuples), we release pin and set
|
||||||
|
* so->currPos.buf to InvalidBuffer.
|
||||||
*/
|
*/
|
||||||
bool
|
bool
|
||||||
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
|
_bt_next(IndexScanDesc scan, ScanDirection dir)
|
||||||
|
{
|
||||||
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Advance to next tuple on current page; or if there's no more,
|
||||||
|
* try to step to the next page with data.
|
||||||
|
*/
|
||||||
|
if (ScanDirectionIsForward(dir))
|
||||||
|
{
|
||||||
|
if (++so->currPos.itemIndex > so->currPos.lastItem)
|
||||||
|
{
|
||||||
|
/* We must acquire lock before applying _bt_steppage */
|
||||||
|
Assert(BufferIsValid(so->currPos.buf));
|
||||||
|
LockBuffer(so->currPos.buf, BT_READ);
|
||||||
|
if (!_bt_steppage(scan, dir))
|
||||||
|
return false;
|
||||||
|
/* Drop the lock, but not pin, on the new page */
|
||||||
|
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (--so->currPos.itemIndex < so->currPos.firstItem)
|
||||||
|
{
|
||||||
|
/* We must acquire lock before applying _bt_steppage */
|
||||||
|
Assert(BufferIsValid(so->currPos.buf));
|
||||||
|
LockBuffer(so->currPos.buf, BT_READ);
|
||||||
|
if (!_bt_steppage(scan, dir))
|
||||||
|
return false;
|
||||||
|
/* Drop the lock, but not pin, on the new page */
|
||||||
|
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* OK, itemIndex says what to return */
|
||||||
|
scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* _bt_readpage() -- Load data from current index page into so->currPos
|
||||||
|
*
|
||||||
|
* Caller must have pinned and read-locked so->currPos.buf; the buffer's state
|
||||||
|
* is not changed here. Also, currPos.moreLeft and moreRight must be valid;
|
||||||
|
* they are updated as appropriate. All other fields of so->currPos are
|
||||||
|
* initialized from scratch here.
|
||||||
|
*
|
||||||
|
* We scan the current page starting at offnum and moving in the indicated
|
||||||
|
* direction. All items matching the scan keys are loaded into currPos.items.
|
||||||
|
* moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
|
||||||
|
* that there can be no more matching tuples in the current scan direction.
|
||||||
|
*
|
||||||
|
* Returns true if any matching items found on the page, false if none.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
|
||||||
|
{
|
||||||
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||||
|
Page page;
|
||||||
|
BTPageOpaque opaque;
|
||||||
|
OffsetNumber minoff;
|
||||||
|
OffsetNumber maxoff;
|
||||||
|
int itemIndex;
|
||||||
|
bool continuescan;
|
||||||
|
|
||||||
|
/* we must have the buffer pinned and locked */
|
||||||
|
Assert(BufferIsValid(so->currPos.buf));
|
||||||
|
|
||||||
|
page = BufferGetPage(so->currPos.buf);
|
||||||
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||||
|
minoff = P_FIRSTDATAKEY(opaque);
|
||||||
|
maxoff = PageGetMaxOffsetNumber(page);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* we must save the page's right-link while scanning it; this tells us
|
||||||
|
* where to step right to after we're done with these items. There is
|
||||||
|
* no corresponding need for the left-link, since splits always go right.
|
||||||
|
*/
|
||||||
|
so->currPos.nextPage = opaque->btpo_next;
|
||||||
|
|
||||||
|
if (ScanDirectionIsForward(dir))
|
||||||
|
{
|
||||||
|
/* load items[] in ascending order */
|
||||||
|
itemIndex = 0;
|
||||||
|
|
||||||
|
offnum = Max(offnum, minoff);
|
||||||
|
|
||||||
|
while (offnum <= maxoff)
|
||||||
|
{
|
||||||
|
if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
|
||||||
|
{
|
||||||
|
/* tuple passes all scan key conditions, so remember it */
|
||||||
|
/* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
|
||||||
|
so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
|
||||||
|
so->currPos.items[itemIndex].indexOffset = offnum;
|
||||||
|
itemIndex++;
|
||||||
|
}
|
||||||
|
if (!continuescan)
|
||||||
|
{
|
||||||
|
/* there can't be any more matches, so stop */
|
||||||
|
so->currPos.moreRight = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
offnum = OffsetNumberNext(offnum);
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert(itemIndex <= MaxIndexTuplesPerPage);
|
||||||
|
so->currPos.firstItem = 0;
|
||||||
|
so->currPos.lastItem = itemIndex - 1;
|
||||||
|
so->currPos.itemIndex = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* load items[] in descending order */
|
||||||
|
itemIndex = MaxIndexTuplesPerPage;
|
||||||
|
|
||||||
|
offnum = Min(offnum, maxoff);
|
||||||
|
|
||||||
|
while (offnum >= minoff)
|
||||||
|
{
|
||||||
|
if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
|
||||||
|
{
|
||||||
|
/* tuple passes all scan key conditions, so remember it */
|
||||||
|
/* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
|
||||||
|
itemIndex--;
|
||||||
|
so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
|
||||||
|
so->currPos.items[itemIndex].indexOffset = offnum;
|
||||||
|
}
|
||||||
|
if (!continuescan)
|
||||||
|
{
|
||||||
|
/* there can't be any more matches, so stop */
|
||||||
|
so->currPos.moreLeft = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
offnum = OffsetNumberPrev(offnum);
|
||||||
|
}
|
||||||
|
|
||||||
|
Assert(itemIndex >= 0);
|
||||||
|
so->currPos.firstItem = itemIndex;
|
||||||
|
so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
|
||||||
|
so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (so->currPos.firstItem <= so->currPos.lastItem);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* _bt_steppage() -- Step to next page containing valid data for scan
|
||||||
|
*
|
||||||
|
* On entry, so->currPos.buf must be pinned and read-locked. We'll drop
|
||||||
|
* the lock and pin before moving to next page.
|
||||||
|
*
|
||||||
|
* On success exit, we hold pin and read-lock on the next interesting page,
|
||||||
|
* and so->currPos is updated to contain data from that page.
|
||||||
|
*
|
||||||
|
* If there are no more matching records in the given direction, we drop all
|
||||||
|
* locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
|
||||||
{
|
{
|
||||||
ItemPointer current = &(scan->currentItemData);
|
|
||||||
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||||
Relation rel;
|
Relation rel;
|
||||||
Page page;
|
Page page;
|
||||||
BTPageOpaque opaque;
|
BTPageOpaque opaque;
|
||||||
OffsetNumber offnum,
|
|
||||||
maxoff;
|
|
||||||
BlockNumber blkno;
|
|
||||||
|
|
||||||
/*
|
/* we must have the buffer pinned and locked */
|
||||||
* Don't use ItemPointerGetOffsetNumber or you risk to get assertion due
|
Assert(BufferIsValid(so->currPos.buf));
|
||||||
* to ability of ip_posid to be equal 0.
|
|
||||||
*/
|
|
||||||
offnum = current->ip_posid;
|
|
||||||
|
|
||||||
page = BufferGetPage(*bufP);
|
/* Before leaving current page, deal with any killed items */
|
||||||
maxoff = PageGetMaxOffsetNumber(page);
|
if (so->numKilled > 0)
|
||||||
|
_bt_killitems(scan, true);
|
||||||
|
|
||||||
|
rel = scan->indexRelation;
|
||||||
|
|
||||||
if (ScanDirectionIsForward(dir))
|
if (ScanDirectionIsForward(dir))
|
||||||
{
|
{
|
||||||
if (offnum < maxoff)
|
/* Walk right to the next page with data */
|
||||||
offnum = OffsetNumberNext(offnum);
|
/* We must rely on the previously saved nextPage link! */
|
||||||
else
|
BlockNumber blkno = so->currPos.nextPage;
|
||||||
|
|
||||||
|
/* Remember we left a page with data */
|
||||||
|
so->currPos.moreLeft = true;
|
||||||
|
|
||||||
|
for (;;)
|
||||||
{
|
{
|
||||||
/* Walk right to the next page with data */
|
/* if we're at end of scan, release the buffer and return */
|
||||||
rel = scan->indexRelation;
|
if (blkno == P_NONE || !so->currPos.moreRight)
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
for (;;)
|
|
||||||
{
|
{
|
||||||
/* if we're at end of scan, release the buffer and return */
|
_bt_relbuf(rel, so->currPos.buf);
|
||||||
if (P_RIGHTMOST(opaque))
|
so->currPos.buf = InvalidBuffer;
|
||||||
{
|
return false;
|
||||||
_bt_relbuf(rel, *bufP);
|
|
||||||
ItemPointerSetInvalid(current);
|
|
||||||
*bufP = so->btso_curbuf = InvalidBuffer;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
/* step right one page */
|
|
||||||
blkno = opaque->btpo_next;
|
|
||||||
*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
|
|
||||||
page = BufferGetPage(*bufP);
|
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
if (!P_IGNORE(opaque))
|
|
||||||
{
|
|
||||||
/* done if it's not empty */
|
|
||||||
maxoff = PageGetMaxOffsetNumber(page);
|
|
||||||
offnum = P_FIRSTDATAKEY(opaque);
|
|
||||||
if (offnum <= maxoff)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
/* step right one page */
|
||||||
|
so->currPos.buf = _bt_relandgetbuf(rel, so->currPos.buf,
|
||||||
|
blkno, BT_READ);
|
||||||
|
/* check for deleted page */
|
||||||
|
page = BufferGetPage(so->currPos.buf);
|
||||||
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||||
|
if (!P_IGNORE(opaque))
|
||||||
|
{
|
||||||
|
/* see if there are any matches on this page */
|
||||||
|
/* note that this will clear moreRight if we can stop */
|
||||||
|
if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
/* nope, keep going */
|
||||||
|
blkno = opaque->btpo_next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* backwards scan */
|
/* Remember we left a page with data */
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
so->currPos.moreRight = true;
|
||||||
if (offnum > P_FIRSTDATAKEY(opaque))
|
|
||||||
offnum = OffsetNumberPrev(offnum);
|
/*
|
||||||
else
|
* Walk left to the next page with data. This is much more
|
||||||
|
* complex than the walk-right case because of the possibility
|
||||||
|
* that the page to our left splits while we are in flight to it,
|
||||||
|
* plus the possibility that the page we were on gets deleted
|
||||||
|
* after we leave it. See nbtree/README for details.
|
||||||
|
*/
|
||||||
|
for (;;)
|
||||||
{
|
{
|
||||||
/*
|
/* Done if we know there are no matching keys to the left */
|
||||||
* Walk left to the next page with data. This is much more
|
if (!so->currPos.moreLeft)
|
||||||
* complex than the walk-right case because of the possibility
|
|
||||||
* that the page to our left splits while we are in flight to it,
|
|
||||||
* plus the possibility that the page we were on gets deleted
|
|
||||||
* after we leave it. See nbtree/README for details.
|
|
||||||
*/
|
|
||||||
rel = scan->indexRelation;
|
|
||||||
for (;;)
|
|
||||||
{
|
{
|
||||||
*bufP = _bt_walk_left(rel, *bufP);
|
_bt_relbuf(rel, so->currPos.buf);
|
||||||
|
so->currPos.buf = InvalidBuffer;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/* if we're at end of scan, return failure */
|
/* Step to next physical page */
|
||||||
if (*bufP == InvalidBuffer)
|
so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
|
||||||
{
|
|
||||||
ItemPointerSetInvalid(current);
|
|
||||||
so->btso_curbuf = InvalidBuffer;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
page = BufferGetPage(*bufP);
|
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
||||||
|
|
||||||
/*
|
/* if we're physically at end of index, return failure */
|
||||||
* Okay, we managed to move left to a non-deleted page. Done
|
if (so->currPos.buf == InvalidBuffer)
|
||||||
* if it's not half-dead and not empty. Else loop back and do
|
return false;
|
||||||
* it all again.
|
|
||||||
*/
|
/*
|
||||||
if (!P_IGNORE(opaque))
|
* Okay, we managed to move left to a non-deleted page.
|
||||||
{
|
* Done if it's not half-dead and contains matching tuples.
|
||||||
maxoff = PageGetMaxOffsetNumber(page);
|
* Else loop back and do it all again.
|
||||||
offnum = maxoff;
|
*/
|
||||||
if (maxoff >= P_FIRSTDATAKEY(opaque))
|
page = BufferGetPage(so->currPos.buf);
|
||||||
break;
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||||
}
|
if (!P_IGNORE(opaque))
|
||||||
|
{
|
||||||
|
/* see if there are any matches on this page */
|
||||||
|
/* note that this will clear moreLeft if we can stop */
|
||||||
|
if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Update scan state */
|
|
||||||
so->btso_curbuf = *bufP;
|
|
||||||
blkno = BufferGetBlockNumber(*bufP);
|
|
||||||
ItemPointerSet(current, blkno, offnum);
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1250,31 +1338,23 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* _bt_endpoint() -- Find the first or last key in the index, and scan
|
* _bt_endpoint() -- Find the first or last page in the index, and scan
|
||||||
* from there to the first key satisfying all the quals.
|
* from there to the first key satisfying all the quals.
|
||||||
*
|
*
|
||||||
* This is used by _bt_first() to set up a scan when we've determined
|
* This is used by _bt_first() to set up a scan when we've determined
|
||||||
* that the scan must start at the beginning or end of the index (for
|
* that the scan must start at the beginning or end of the index (for
|
||||||
* a forward or backward scan respectively).
|
* a forward or backward scan respectively). Exit conditions are the
|
||||||
|
* same as for _bt_first().
|
||||||
*/
|
*/
|
||||||
static bool
|
static bool
|
||||||
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
|
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
|
||||||
{
|
{
|
||||||
Relation rel;
|
Relation rel = scan->indexRelation;
|
||||||
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||||
Buffer buf;
|
Buffer buf;
|
||||||
Page page;
|
Page page;
|
||||||
BTPageOpaque opaque;
|
BTPageOpaque opaque;
|
||||||
ItemPointer current;
|
|
||||||
OffsetNumber maxoff;
|
|
||||||
OffsetNumber start;
|
OffsetNumber start;
|
||||||
BlockNumber blkno;
|
|
||||||
BTScanOpaque so;
|
|
||||||
bool res;
|
|
||||||
bool continuescan;
|
|
||||||
|
|
||||||
rel = scan->indexRelation;
|
|
||||||
current = &(scan->currentItemData);
|
|
||||||
so = (BTScanOpaque) scan->opaque;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Scan down to the leftmost or rightmost leaf page. This is a simplified
|
* Scan down to the leftmost or rightmost leaf page. This is a simplified
|
||||||
|
@ -1286,18 +1366,14 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
|
||||||
if (!BufferIsValid(buf))
|
if (!BufferIsValid(buf))
|
||||||
{
|
{
|
||||||
/* empty index... */
|
/* empty index... */
|
||||||
ItemPointerSetInvalid(current);
|
so->currPos.buf = InvalidBuffer;
|
||||||
so->btso_curbuf = InvalidBuffer;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
blkno = BufferGetBlockNumber(buf);
|
|
||||||
page = BufferGetPage(buf);
|
page = BufferGetPage(buf);
|
||||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||||
Assert(P_ISLEAF(opaque));
|
Assert(P_ISLEAF(opaque));
|
||||||
|
|
||||||
maxoff = PageGetMaxOffsetNumber(page);
|
|
||||||
|
|
||||||
if (ScanDirectionIsForward(dir))
|
if (ScanDirectionIsForward(dir))
|
||||||
{
|
{
|
||||||
/* There could be dead pages to the left, so not this: */
|
/* There could be dead pages to the left, so not this: */
|
||||||
|
@ -1310,8 +1386,6 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
|
||||||
Assert(P_RIGHTMOST(opaque));
|
Assert(P_RIGHTMOST(opaque));
|
||||||
|
|
||||||
start = PageGetMaxOffsetNumber(page);
|
start = PageGetMaxOffsetNumber(page);
|
||||||
if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty page */
|
|
||||||
start = P_FIRSTDATAKEY(opaque);
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -1319,43 +1393,40 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
|
||||||
start = 0; /* keep compiler quiet */
|
start = 0; /* keep compiler quiet */
|
||||||
}
|
}
|
||||||
|
|
||||||
ItemPointerSet(current, blkno, start);
|
|
||||||
/* remember which buffer we have pinned */
|
/* remember which buffer we have pinned */
|
||||||
so->btso_curbuf = buf;
|
so->currPos.buf = buf;
|
||||||
|
|
||||||
/*
|
/* initialize moreLeft/moreRight appropriately for scan direction */
|
||||||
* Left/rightmost page could be empty due to deletions, if so step till we
|
if (ScanDirectionIsForward(dir))
|
||||||
* find a nonempty page.
|
|
||||||
*/
|
|
||||||
if (start > maxoff)
|
|
||||||
{
|
{
|
||||||
if (!_bt_step(scan, &buf, dir))
|
so->currPos.moreLeft = false;
|
||||||
return false;
|
so->currPos.moreRight = true;
|
||||||
start = ItemPointerGetOffsetNumber(current);
|
|
||||||
page = BufferGetPage(buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Okay, we are on the first or last tuple. Does it pass all the quals?
|
|
||||||
*/
|
|
||||||
if (_bt_checkkeys(scan, page, start, dir, &continuescan))
|
|
||||||
{
|
|
||||||
/* yes, return it */
|
|
||||||
res = true;
|
|
||||||
}
|
|
||||||
else if (continuescan)
|
|
||||||
{
|
|
||||||
/* no, but there might be another one that does */
|
|
||||||
res = _bt_next(scan, dir);
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* no tuples in the index match this scan key */
|
so->currPos.moreLeft = true;
|
||||||
ItemPointerSetInvalid(current);
|
so->currPos.moreRight = false;
|
||||||
so->btso_curbuf = InvalidBuffer;
|
}
|
||||||
_bt_relbuf(rel, buf);
|
so->numKilled = 0; /* just paranoia */
|
||||||
res = false;
|
|
||||||
|
/*
|
||||||
|
* Now load data from the first page of the scan.
|
||||||
|
*/
|
||||||
|
if (!_bt_readpage(scan, dir, start))
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* There's no actually-matching data on this page. Try to advance to
|
||||||
|
* the next page. Return false if there's no matching data at all.
|
||||||
|
*/
|
||||||
|
if (!_bt_steppage(scan, dir))
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return res;
|
/* Drop the lock, but not pin, on the current page */
|
||||||
|
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
|
||||||
|
|
||||||
|
/* OK, itemIndex says what to return */
|
||||||
|
scan->xs_ctup.t_self = so->currPos.items[so->currPos.itemIndex].heapTid;
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.72 2006/03/05 15:58:21 momjian Exp $
|
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.73 2006/05/07 01:21:30 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
@ -196,11 +196,6 @@ _bt_freestack(BTStack stack)
|
||||||
* Again though, only keys with RHS datatype equal to the index datatype
|
* Again though, only keys with RHS datatype equal to the index datatype
|
||||||
* can be checked for contradictions.
|
* can be checked for contradictions.
|
||||||
*
|
*
|
||||||
* Furthermore, we detect the case where the index is unique and we have
|
|
||||||
* equality quals for all columns. In this case there can be at most one
|
|
||||||
* (visible) matching tuple. index_getnext uses this to avoid uselessly
|
|
||||||
* continuing the scan after finding one match.
|
|
||||||
*
|
|
||||||
* Row comparison keys are treated the same as comparisons to nondefault
|
* Row comparison keys are treated the same as comparisons to nondefault
|
||||||
* datatypes: we just transfer them into the preprocessed array without any
|
* datatypes: we just transfer them into the preprocessed array without any
|
||||||
* editorialization. We can treat them the same as an ordinary inequality
|
* editorialization. We can treat them the same as an ordinary inequality
|
||||||
|
@ -216,7 +211,6 @@ _bt_freestack(BTStack stack)
|
||||||
void
|
void
|
||||||
_bt_preprocess_keys(IndexScanDesc scan)
|
_bt_preprocess_keys(IndexScanDesc scan)
|
||||||
{
|
{
|
||||||
Relation relation = scan->indexRelation;
|
|
||||||
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||||
int numberOfKeys = scan->numberOfKeys;
|
int numberOfKeys = scan->numberOfKeys;
|
||||||
int new_numberOfKeys;
|
int new_numberOfKeys;
|
||||||
|
@ -234,7 +228,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
|
||||||
/* initialize result variables */
|
/* initialize result variables */
|
||||||
so->qual_ok = true;
|
so->qual_ok = true;
|
||||||
so->numberOfKeys = 0;
|
so->numberOfKeys = 0;
|
||||||
scan->keys_are_unique = false;
|
|
||||||
|
|
||||||
if (numberOfKeys < 1)
|
if (numberOfKeys < 1)
|
||||||
return; /* done if qual-less scan */
|
return; /* done if qual-less scan */
|
||||||
|
@ -256,13 +249,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
|
||||||
*/
|
*/
|
||||||
if (cur->sk_flags & SK_ISNULL)
|
if (cur->sk_flags & SK_ISNULL)
|
||||||
so->qual_ok = false;
|
so->qual_ok = false;
|
||||||
else if (relation->rd_index->indisunique &&
|
|
||||||
relation->rd_rel->relnatts == 1)
|
|
||||||
{
|
|
||||||
/* it's a unique index, do we have an equality qual? */
|
|
||||||
if (cur->sk_strategy == BTEqualStrategyNumber)
|
|
||||||
scan->keys_are_unique = true;
|
|
||||||
}
|
|
||||||
memcpy(outkeys, inkeys, sizeof(ScanKeyData));
|
memcpy(outkeys, inkeys, sizeof(ScanKeyData));
|
||||||
so->numberOfKeys = 1;
|
so->numberOfKeys = 1;
|
||||||
/* We can mark the qual as required if it's for first index col */
|
/* We can mark the qual as required if it's for first index col */
|
||||||
|
@ -464,14 +450,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
|
||||||
}
|
}
|
||||||
|
|
||||||
so->numberOfKeys = new_numberOfKeys;
|
so->numberOfKeys = new_numberOfKeys;
|
||||||
|
|
||||||
/*
|
|
||||||
* If unique index and we have equality keys for all columns, set
|
|
||||||
* keys_are_unique flag for higher levels.
|
|
||||||
*/
|
|
||||||
if (relation->rd_index->indisunique &&
|
|
||||||
relation->rd_rel->relnatts == numberOfEqualCols)
|
|
||||||
scan->keys_are_unique = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -826,3 +804,89 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* _bt_killitems - set LP_DELETE bit for items an indexscan caller has
|
||||||
|
* told us were killed
|
||||||
|
*
|
||||||
|
* scan->so contains information about the current page and killed tuples
|
||||||
|
* thereon (generally, this should only be called if so->numKilled > 0).
|
||||||
|
*
|
||||||
|
* The caller must have pin on so->currPos.buf, but may or may not have
|
||||||
|
* read-lock, as indicated by haveLock. Note that we assume read-lock
|
||||||
|
* is sufficient for setting LP_DELETE hint bits.
|
||||||
|
*
|
||||||
|
* We match items by heap TID before assuming they are the right ones to
|
||||||
|
* delete. We cope with cases where items have moved right due to insertions.
|
||||||
|
* If an item has moved off the current page due to a split, we'll fail to
|
||||||
|
* find it and do nothing (this is not an error case --- we assume the item
|
||||||
|
* will eventually get marked in a future indexscan). Likewise, if the item
|
||||||
|
* has moved left due to deletions or disappeared itself, we'll not find it,
|
||||||
|
* but these cases are not worth optimizing. (Since deletions are only done
|
||||||
|
* by VACUUM, any deletion makes it highly likely that the dead item has been
|
||||||
|
* removed itself, and therefore searching left is not worthwhile.)
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
_bt_killitems(IndexScanDesc scan, bool haveLock)
|
||||||
|
{
|
||||||
|
BTScanOpaque so = (BTScanOpaque) scan->opaque;
|
||||||
|
Page page;
|
||||||
|
BTPageOpaque opaque;
|
||||||
|
OffsetNumber minoff;
|
||||||
|
OffsetNumber maxoff;
|
||||||
|
int i;
|
||||||
|
bool killedsomething = false;
|
||||||
|
|
||||||
|
Assert(BufferIsValid(so->currPos.buf));
|
||||||
|
|
||||||
|
if (!haveLock)
|
||||||
|
LockBuffer(so->currPos.buf, BT_READ);
|
||||||
|
|
||||||
|
page = BufferGetPage(so->currPos.buf);
|
||||||
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||||
|
minoff = P_FIRSTDATAKEY(opaque);
|
||||||
|
maxoff = PageGetMaxOffsetNumber(page);
|
||||||
|
|
||||||
|
for (i = 0; i < so->numKilled; i++)
|
||||||
|
{
|
||||||
|
int itemIndex = so->killedItems[i];
|
||||||
|
BTScanPosItem *kitem = &so->currPos.items[itemIndex];
|
||||||
|
OffsetNumber offnum = kitem->indexOffset;
|
||||||
|
|
||||||
|
Assert(itemIndex >= so->currPos.firstItem &&
|
||||||
|
itemIndex <= so->currPos.lastItem);
|
||||||
|
if (offnum < minoff)
|
||||||
|
continue; /* pure paranoia */
|
||||||
|
while (offnum <= maxoff)
|
||||||
|
{
|
||||||
|
ItemId iid = PageGetItemId(page, offnum);
|
||||||
|
IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
|
||||||
|
|
||||||
|
if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
|
||||||
|
{
|
||||||
|
/* found the item */
|
||||||
|
iid->lp_flags |= LP_DELETE;
|
||||||
|
killedsomething = true;
|
||||||
|
break; /* out of inner search loop */
|
||||||
|
}
|
||||||
|
offnum = OffsetNumberNext(offnum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Since this can be redone later if needed, it's treated the same
|
||||||
|
* as a commit-hint-bit status update for heap tuples: we mark the
|
||||||
|
* buffer dirty but don't make a WAL log entry.
|
||||||
|
*/
|
||||||
|
if (killedsomething)
|
||||||
|
SetBufferCommitInfoNeedsSave(so->currPos.buf);
|
||||||
|
|
||||||
|
if (!haveLock)
|
||||||
|
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Always reset the scan state, so we don't look for same items
|
||||||
|
* on other pages.
|
||||||
|
*/
|
||||||
|
so->numKilled = 0;
|
||||||
|
}
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $PostgreSQL: pgsql/src/include/access/itup.h,v 1.45 2006/03/05 15:58:53 momjian Exp $
|
* $PostgreSQL: pgsql/src/include/access/itup.h,v 1.46 2006/05/07 01:21:30 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
@ -126,6 +126,17 @@ typedef IndexAttributeBitMapData *IndexAttributeBitMap;
|
||||||
) \
|
) \
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* MaxIndexTuplesPerPage is an upper bound on the number of tuples that can
|
||||||
|
* fit on one index page. An index tuple must have either data or a null
|
||||||
|
* bitmap, so we can safely assume it's at least 1 byte bigger than a bare
|
||||||
|
* IndexTupleData struct. We arrive at the divisor because each tuple
|
||||||
|
* must be maxaligned, and it must have an associated item pointer.
|
||||||
|
*/
|
||||||
|
#define MaxIndexTuplesPerPage \
|
||||||
|
((int) ((BLCKSZ - offsetof(PageHeaderData, pd_linp)) / \
|
||||||
|
(MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))))
|
||||||
|
|
||||||
|
|
||||||
/* routines in indextuple.c */
|
/* routines in indextuple.c */
|
||||||
extern IndexTuple index_form_tuple(TupleDesc tupleDescriptor,
|
extern IndexTuple index_form_tuple(TupleDesc tupleDescriptor,
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.96 2006/04/13 03:53:05 tgl Exp $
|
* $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.97 2006/05/07 01:21:30 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
@ -336,30 +336,82 @@ typedef struct BTStackData
|
||||||
typedef BTStackData *BTStack;
|
typedef BTStackData *BTStack;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* BTScanOpaqueData is used to remember which buffers we're currently
|
* BTScanOpaqueData is the btree-private state needed for an indexscan.
|
||||||
* examining in an indexscan. Between calls to btgettuple or btgetmulti,
|
* This consists of preprocessed scan keys (see _bt_preprocess_keys() for
|
||||||
* we keep these buffers pinned (but not locked, see nbtree.c) to avoid
|
* details of the preprocessing), information about the current location
|
||||||
* doing a ReadBuffer() for every tuple in the index.
|
* of the scan, and information about the marked location, if any. (We use
|
||||||
|
* BTScanPosData to represent the data needed for each of current and marked
|
||||||
|
* locations.) In addition we can remember some known-killed index entries
|
||||||
|
* that must be marked before we can move off the current page.
|
||||||
*
|
*
|
||||||
* We also store preprocessed versions of the scan keys in this structure.
|
* Index scans work a page at a time: we pin and read-lock the page, identify
|
||||||
* See _bt_preprocess_keys() for details of the preprocessing.
|
* all the matching items on the page and save them in BTScanPosData, then
|
||||||
|
* release the read-lock while returning the items to the caller for
|
||||||
|
* processing. This approach minimizes lock/unlock traffic. Note that we
|
||||||
|
* keep the pin on the index page until the caller is done with all the items
|
||||||
|
* (this is needed for VACUUM synchronization, see nbtree/README). When we
|
||||||
|
* are ready to step to the next page, if the caller has told us any of the
|
||||||
|
* items were killed, we re-lock the page to mark them killed, then unlock.
|
||||||
|
* Finally we drop the pin and step to the next page in the appropriate
|
||||||
|
* direction.
|
||||||
*
|
*
|
||||||
* curHeapIptr & mrkHeapIptr are heap iptr-s from current/marked
|
* NOTE: in this implementation, btree does not use or set the
|
||||||
* index tuples: we don't adjust scans on insertions - instead we
|
* currentItemData and currentMarkData fields of IndexScanDesc at all.
|
||||||
* use these pointers to restore index scan positions...
|
|
||||||
* - vadim 07/29/98
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
typedef struct BTScanPosItem /* what we remember about each match */
|
||||||
|
{
|
||||||
|
ItemPointerData heapTid; /* TID of referenced heap item */
|
||||||
|
OffsetNumber indexOffset; /* index item's location within page */
|
||||||
|
} BTScanPosItem;
|
||||||
|
|
||||||
|
typedef struct BTScanPosData
|
||||||
|
{
|
||||||
|
Buffer buf; /* if valid, the buffer is pinned */
|
||||||
|
|
||||||
|
BlockNumber nextPage; /* page's right link when we scanned it */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* moreLeft and moreRight track whether we think there may be matching
|
||||||
|
* index entries to the left and right of the current page, respectively.
|
||||||
|
* We can clear the appropriate one of these flags when _bt_checkkeys()
|
||||||
|
* returns continuescan = false.
|
||||||
|
*/
|
||||||
|
bool moreLeft;
|
||||||
|
bool moreRight;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The items array is always ordered in index order (ie, increasing
|
||||||
|
* indexoffset). When scanning backwards it is convenient to fill the
|
||||||
|
* array back-to-front, so we start at the last slot and fill downwards.
|
||||||
|
* Hence we need both a first-valid-entry and a last-valid-entry counter.
|
||||||
|
* itemIndex is a cursor showing which entry was last returned to caller.
|
||||||
|
*/
|
||||||
|
int firstItem; /* first valid index in items[] */
|
||||||
|
int lastItem; /* last valid index in items[] */
|
||||||
|
int itemIndex; /* current index in items[] */
|
||||||
|
|
||||||
|
BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
|
||||||
|
} BTScanPosData;
|
||||||
|
|
||||||
|
typedef BTScanPosData *BTScanPos;
|
||||||
|
|
||||||
|
#define BTScanPosIsValid(scanpos) BufferIsValid((scanpos).buf)
|
||||||
|
|
||||||
typedef struct BTScanOpaqueData
|
typedef struct BTScanOpaqueData
|
||||||
{
|
{
|
||||||
Buffer btso_curbuf;
|
|
||||||
Buffer btso_mrkbuf;
|
|
||||||
ItemPointerData curHeapIptr;
|
|
||||||
ItemPointerData mrkHeapIptr;
|
|
||||||
/* these fields are set by _bt_preprocess_keys(): */
|
/* these fields are set by _bt_preprocess_keys(): */
|
||||||
bool qual_ok; /* false if qual can never be satisfied */
|
bool qual_ok; /* false if qual can never be satisfied */
|
||||||
int numberOfKeys; /* number of preprocessed scan keys */
|
int numberOfKeys; /* number of preprocessed scan keys */
|
||||||
ScanKey keyData; /* array of preprocessed scan keys */
|
ScanKey keyData; /* array of preprocessed scan keys */
|
||||||
|
|
||||||
|
/* info about killed items if any (killedItems is NULL if never used) */
|
||||||
|
int *killedItems; /* currPos.items indexes of killed items */
|
||||||
|
int numKilled; /* number of currently stored items */
|
||||||
|
|
||||||
|
/* keep these last in struct for efficiency */
|
||||||
|
BTScanPosData currPos; /* current position data */
|
||||||
|
BTScanPosData markPos; /* marked position, if any */
|
||||||
} BTScanOpaqueData;
|
} BTScanOpaqueData;
|
||||||
|
|
||||||
typedef BTScanOpaqueData *BTScanOpaque;
|
typedef BTScanOpaqueData *BTScanOpaque;
|
||||||
|
@ -424,9 +476,8 @@ extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
|
||||||
ScanKey scankey, bool nextkey);
|
ScanKey scankey, bool nextkey);
|
||||||
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
|
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
|
||||||
Page page, OffsetNumber offnum);
|
Page page, OffsetNumber offnum);
|
||||||
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
|
|
||||||
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
|
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
|
||||||
extern bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
|
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
|
||||||
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
|
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -440,6 +491,7 @@ extern void _bt_preprocess_keys(IndexScanDesc scan);
|
||||||
extern bool _bt_checkkeys(IndexScanDesc scan,
|
extern bool _bt_checkkeys(IndexScanDesc scan,
|
||||||
Page page, OffsetNumber offnum,
|
Page page, OffsetNumber offnum,
|
||||||
ScanDirection dir, bool *continuescan);
|
ScanDirection dir, bool *continuescan);
|
||||||
|
extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* prototypes for functions in nbtsort.c
|
* prototypes for functions in nbtsort.c
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.44 2006/03/05 15:58:53 momjian Exp $
|
* $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.45 2006/05/07 01:21:30 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
@ -67,12 +67,9 @@ typedef struct IndexScanDescData
|
||||||
bool kill_prior_tuple; /* last-returned tuple is dead */
|
bool kill_prior_tuple; /* last-returned tuple is dead */
|
||||||
bool ignore_killed_tuples; /* do not return killed entries */
|
bool ignore_killed_tuples; /* do not return killed entries */
|
||||||
|
|
||||||
/* set by index AM if scan keys satisfy index's uniqueness constraint */
|
/* index access method's private state */
|
||||||
bool keys_are_unique;
|
|
||||||
|
|
||||||
/* scan current state */
|
|
||||||
bool got_tuple; /* true after successful index_getnext */
|
|
||||||
void *opaque; /* access-method-specific info */
|
void *opaque; /* access-method-specific info */
|
||||||
|
/* these fields are used by some but not all AMs: */
|
||||||
ItemPointerData currentItemData; /* current index pointer */
|
ItemPointerData currentItemData; /* current index pointer */
|
||||||
ItemPointerData currentMarkData; /* marked position, if any */
|
ItemPointerData currentMarkData; /* marked position, if any */
|
||||||
|
|
||||||
|
@ -85,15 +82,6 @@ typedef struct IndexScanDescData
|
||||||
Buffer xs_cbuf; /* current heap buffer in scan, if any */
|
Buffer xs_cbuf; /* current heap buffer in scan, if any */
|
||||||
/* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
|
/* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
|
||||||
|
|
||||||
/*
|
|
||||||
* If keys_are_unique and got_tuple are both true, we stop calling the
|
|
||||||
* index AM; it is then necessary for index_getnext to keep track of the
|
|
||||||
* logical scan position for itself. It does that using unique_tuple_pos:
|
|
||||||
* -1 = before row, 0 = on row, +1 = after row.
|
|
||||||
*/
|
|
||||||
int unique_tuple_pos; /* logical position */
|
|
||||||
int unique_tuple_mark; /* logical marked position */
|
|
||||||
|
|
||||||
PgStat_Info xs_pgstat_info; /* statistics collector hook */
|
PgStat_Info xs_pgstat_info; /* statistics collector hook */
|
||||||
} IndexScanDescData;
|
} IndexScanDescData;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue