1996-07-09 08:22:35 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
1999-02-14 00:22:53 +01:00
|
|
|
* nbtinsert.c
|
1997-09-07 07:04:48 +02:00
|
|
|
* Item insertion in Lehman and Yao btrees for Postgres.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2001-01-24 20:43:33 +01:00
|
|
|
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
|
2000-01-26 06:58:53 +01:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
2001-02-08 00:35:33 +01:00
|
|
|
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.81 2001/02/07 23:35:33 vadim Exp $
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
|
1999-07-16 01:04:24 +02:00
|
|
|
#include "postgres.h"
|
1996-10-23 09:42:13 +02:00
|
|
|
|
1999-07-16 01:04:24 +02:00
|
|
|
#include "access/heapam.h"
|
1999-07-16 07:00:38 +02:00
|
|
|
#include "access/nbtree.h"
|
2001-01-14 06:08:17 +01:00
|
|
|
#include "miscadmin.h"
|
1996-10-20 12:53:18 +02:00
|
|
|
|
1996-11-03 13:35:27 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
/* context data for _bt_checksplitloc */
|
|
|
|
Size newitemsz; /* size of new item to be inserted */
|
|
|
|
bool non_leaf; /* T if splitting an internal node */
|
|
|
|
|
|
|
|
bool have_split; /* found a valid split? */
|
|
|
|
|
|
|
|
/* these fields valid only if have_split is true */
|
|
|
|
bool newitemonleft; /* new item on left or right of best split */
|
|
|
|
OffsetNumber firstright; /* best split point */
|
|
|
|
int best_delta; /* best size delta so far */
|
|
|
|
} FindSplitData;
|
|
|
|
|
2001-01-29 08:28:17 +01:00
|
|
|
extern bool FixBTree;
|
|
|
|
|
2001-01-26 02:24:31 +01:00
|
|
|
Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release);
|
2001-01-31 02:08:36 +01:00
|
|
|
static void _bt_fixtree(Relation rel, BlockNumber blkno);
|
2001-02-02 20:49:15 +01:00
|
|
|
static void _bt_fixbranch(Relation rel, BlockNumber lblkno,
|
|
|
|
BlockNumber rblkno, BTStack true_stack);
|
|
|
|
static void _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit);
|
|
|
|
static void _bt_fixup(Relation rel, Buffer buf);
|
2001-01-31 02:08:36 +01:00
|
|
|
static OffsetNumber _bt_getoff(Page page, BlockNumber blkno);
|
2001-01-26 02:24:31 +01:00
|
|
|
|
|
|
|
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
|
2000-07-21 08:42:39 +02:00
|
|
|
|
|
|
|
static TransactionId _bt_check_unique(Relation rel, BTItem btitem,
|
|
|
|
Relation heapRel, Buffer buf,
|
|
|
|
ScanKey itup_scankey);
|
|
|
|
static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf,
|
|
|
|
BTStack stack,
|
|
|
|
int keysz, ScanKey scankey,
|
|
|
|
BTItem btitem,
|
|
|
|
OffsetNumber afteritem);
|
2001-01-26 02:24:31 +01:00
|
|
|
static void _bt_insertuple(Relation rel, Buffer buf,
|
|
|
|
Size itemsz, BTItem btitem, OffsetNumber newitemoff);
|
2000-07-21 08:42:39 +02:00
|
|
|
static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
|
|
|
OffsetNumber newitemoff, Size newitemsz,
|
|
|
|
BTItem newitem, bool newitemonleft,
|
|
|
|
OffsetNumber *itup_off, BlockNumber *itup_blkno);
|
|
|
|
static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
|
|
|
|
OffsetNumber newitemoff,
|
|
|
|
Size newitemsz,
|
|
|
|
bool *newitemonleft);
|
|
|
|
static void _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright,
|
|
|
|
int leftfree, int rightfree,
|
|
|
|
bool newitemonleft, Size firstrightitemsz);
|
2001-02-02 20:49:15 +01:00
|
|
|
static Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
|
2000-07-21 08:42:39 +02:00
|
|
|
static void _bt_pgaddtup(Relation rel, Page page,
|
|
|
|
Size itemsize, BTItem btitem,
|
|
|
|
OffsetNumber itup_off, const char *where);
|
|
|
|
static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
|
|
|
|
int keysz, ScanKey scankey);
|
1996-07-09 08:22:35 +02:00
|
|
|
|
2000-10-13 04:03:02 +02:00
|
|
|
static Relation _xlheapRel; /* temporary hack */
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_doinsert() -- Handle insertion of a single btitem in the tree.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* This routine is called by the public interface routines, btbuild
|
2000-07-21 08:42:39 +02:00
|
|
|
* and btinsert. By here, btitem is filled in, including the TID.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
|
|
|
InsertIndexResult
|
2000-07-21 08:42:39 +02:00
|
|
|
_bt_doinsert(Relation rel, BTItem btitem,
|
|
|
|
bool index_is_unique, Relation heapRel)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
IndexTuple itup = &(btitem->bti_itup);
|
|
|
|
int natts = rel->rd_rel->relnatts;
|
1997-09-08 04:41:22 +02:00
|
|
|
ScanKey itup_scankey;
|
|
|
|
BTStack stack;
|
|
|
|
Buffer buf;
|
1997-09-07 07:04:48 +02:00
|
|
|
InsertIndexResult res;
|
|
|
|
|
|
|
|
/* we need a scan key to do our search, so build one */
|
|
|
|
itup_scankey = _bt_mkscankey(rel, itup);
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
top:
|
1997-09-07 07:04:48 +02:00
|
|
|
/* find the page containing this key */
|
2000-07-21 08:42:39 +02:00
|
|
|
stack = _bt_search(rel, natts, itup_scankey, &buf, BT_WRITE);
|
1997-01-10 11:06:20 +01:00
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
/* trade in our read lock for a write lock */
|
1999-05-25 20:20:31 +02:00
|
|
|
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
|
|
|
LockBuffer(buf, BT_WRITE);
|
1998-12-15 13:47:01 +01:00
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
/*
|
|
|
|
* If the page was split between the time that we surrendered our read
|
|
|
|
* lock and acquired our write lock, then this page may no longer be
|
|
|
|
* the right place for the key we want to insert. In this case, we
|
|
|
|
* need to move right in the tree. See Lehman and Yao for an
|
|
|
|
* excruciatingly precise description.
|
|
|
|
*/
|
|
|
|
buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE);
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* If we're not allowing duplicates, make sure the key isn't
|
|
|
|
* already in the index. XXX this belongs somewhere else, likely
|
|
|
|
*/
|
1997-09-07 07:04:48 +02:00
|
|
|
if (index_is_unique)
|
1997-01-10 11:06:20 +01:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
TransactionId xwait;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
xwait = _bt_check_unique(rel, btitem, heapRel, buf, itup_scankey);
|
|
|
|
|
|
|
|
if (TransactionIdIsValid(xwait))
|
|
|
|
{
|
|
|
|
/* Have to wait for the other guy ... */
|
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
|
|
|
XactLockTableWait(xwait);
|
|
|
|
/* start over... */
|
|
|
|
_bt_freestack(stack);
|
|
|
|
goto top;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-10-13 04:03:02 +02:00
|
|
|
_xlheapRel = heapRel; /* temporary hack */
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* do the insertion */
|
|
|
|
res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, btitem, 0);
|
|
|
|
|
|
|
|
/* be tidy */
|
|
|
|
_bt_freestack(stack);
|
|
|
|
_bt_freeskey(itup_scankey);
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* _bt_check_unique() -- Check for violation of unique index constraint
|
|
|
|
*
|
|
|
|
* Returns NullTransactionId if there is no conflict, else an xact ID we
|
|
|
|
* must wait for to see if it commits a conflicting tuple. If an actual
|
|
|
|
* conflict is detected, no return --- just elog().
|
|
|
|
*/
|
|
|
|
static TransactionId
|
|
|
|
_bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
|
|
|
|
Buffer buf, ScanKey itup_scankey)
|
|
|
|
{
|
|
|
|
TupleDesc itupdesc = RelationGetDescr(rel);
|
|
|
|
int natts = rel->rd_rel->relnatts;
|
|
|
|
OffsetNumber offset,
|
|
|
|
maxoff;
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
Buffer nbuf = InvalidBuffer;
|
|
|
|
bool chtup = true;
|
|
|
|
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find first item >= proposed new item. Note we could also get
|
|
|
|
* a pointer to end-of-page here.
|
|
|
|
*/
|
|
|
|
offset = _bt_binsrch(rel, buf, natts, itup_scankey);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Scan over all equal tuples, looking for live conflicts.
|
|
|
|
*/
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
HeapTupleData htup;
|
|
|
|
Buffer buffer;
|
|
|
|
BTItem cbti;
|
|
|
|
BlockNumber nblkno;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
|
|
|
|
* how we handling NULLs - and so we must not use _bt_compare
|
|
|
|
* in real comparison, but only for ordering/finding items on
|
|
|
|
* pages. - vadim 03/24/97
|
|
|
|
*
|
|
|
|
* make sure the offset points to an actual key
|
|
|
|
* before trying to compare it...
|
|
|
|
*/
|
|
|
|
if (offset <= maxoff)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
if (! _bt_isequal(itupdesc, page, offset, natts, itup_scankey))
|
|
|
|
break; /* we're past all the equal tuples */
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Have to check is inserted heap tuple deleted one (i.e.
|
|
|
|
* just moved to another place by vacuum)! We only need to
|
|
|
|
* do this once, but don't want to do it at all unless
|
|
|
|
* we see equal tuples, so as not to slow down unequal case.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
if (chtup)
|
|
|
|
{
|
|
|
|
htup.t_self = btitem->bti_itup.t_tid;
|
1998-12-15 13:47:01 +01:00
|
|
|
heap_fetch(heapRel, SnapshotDirty, &htup, &buffer);
|
2000-07-21 08:42:39 +02:00
|
|
|
if (htup.t_data == NULL) /* YES! */
|
|
|
|
break;
|
|
|
|
/* Live tuple is being inserted, so continue checking */
|
|
|
|
ReleaseBuffer(buffer);
|
|
|
|
chtup = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
cbti = (BTItem) PageGetItem(page, PageGetItemId(page, offset));
|
|
|
|
htup.t_self = cbti->bti_itup.t_tid;
|
|
|
|
heap_fetch(heapRel, SnapshotDirty, &htup, &buffer);
|
|
|
|
if (htup.t_data != NULL) /* it is a duplicate */
|
|
|
|
{
|
|
|
|
TransactionId xwait =
|
1999-05-25 18:15:34 +02:00
|
|
|
(TransactionIdIsValid(SnapshotDirty->xmin)) ?
|
|
|
|
SnapshotDirty->xmin : SnapshotDirty->xmax;
|
1998-12-15 13:47:01 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* If this tuple is being updated by other transaction
|
|
|
|
* then we have to wait for its commit/abort.
|
|
|
|
*/
|
|
|
|
ReleaseBuffer(buffer);
|
|
|
|
if (TransactionIdIsValid(xwait))
|
|
|
|
{
|
1997-09-07 07:04:48 +02:00
|
|
|
if (nbuf != InvalidBuffer)
|
|
|
|
_bt_relbuf(rel, nbuf, BT_READ);
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Tell _bt_doinsert to wait... */
|
|
|
|
return xwait;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Otherwise we have a definite conflict.
|
|
|
|
*/
|
|
|
|
elog(ERROR, "Cannot insert a duplicate key into unique index %s",
|
|
|
|
RelationGetRelationName(rel));
|
1997-01-10 11:06:20 +01:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
/* htup null so no buffer to release */
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Advance to next tuple to continue checking.
|
|
|
|
*/
|
|
|
|
if (offset < maxoff)
|
|
|
|
offset = OffsetNumberNext(offset);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* If scankey == hikey we gotta check the next page too */
|
|
|
|
if (P_RIGHTMOST(opaque))
|
|
|
|
break;
|
|
|
|
if (!_bt_isequal(itupdesc, page, P_HIKEY,
|
|
|
|
natts, itup_scankey))
|
|
|
|
break;
|
|
|
|
nblkno = opaque->btpo_next;
|
1997-09-07 07:04:48 +02:00
|
|
|
if (nbuf != InvalidBuffer)
|
|
|
|
_bt_relbuf(rel, nbuf, BT_READ);
|
2000-07-21 08:42:39 +02:00
|
|
|
nbuf = _bt_getbuf(rel, nblkno, BT_READ);
|
|
|
|
page = BufferGetPage(nbuf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
offset = P_FIRSTDATAKEY(opaque);
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
1997-01-10 11:06:20 +01:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
if (nbuf != InvalidBuffer)
|
|
|
|
_bt_relbuf(rel, nbuf, BT_READ);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
return NullTransactionId;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*----------
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_insertonpg() -- Insert a tuple on a particular page in the index.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* This recursive procedure does the following things:
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* + finds the right place to insert the tuple.
|
|
|
|
* + if necessary, splits the target page (making sure that the
|
|
|
|
* split is equitable as far as post-insert free space goes).
|
1997-09-07 07:04:48 +02:00
|
|
|
* + inserts the tuple.
|
|
|
|
* + if the page was split, pops the parent stack, and finds the
|
|
|
|
* right place to insert the new child pointer (by walking
|
|
|
|
* right using information stored in the parent stack).
|
2000-07-21 08:42:39 +02:00
|
|
|
* + invokes itself with the appropriate tuple for the right
|
1997-09-07 07:04:48 +02:00
|
|
|
* child page on the parent.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* On entry, we must have the right buffer on which to do the
|
|
|
|
* insertion, and the buffer must be pinned and locked. On return,
|
|
|
|
* we will have dropped both the pin and the write lock on the buffer.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* If 'afteritem' is >0 then the new tuple must be inserted after the
|
|
|
|
* existing item of that number, noplace else. If 'afteritem' is 0
|
|
|
|
* then the procedure finds the exact spot to insert it by searching.
|
|
|
|
* (keysz and scankey parameters are used ONLY if afteritem == 0.)
|
|
|
|
*
|
|
|
|
* NOTE: if the new key is equal to one or more existing keys, we can
|
|
|
|
* legitimately place it anywhere in the series of equal keys --- in fact,
|
|
|
|
* if the new key is equal to the page's "high key" we can place it on
|
|
|
|
* the next page. If it is equal to the high key, and there's not room
|
|
|
|
* to insert the new tuple on the current page without splitting, then
|
2000-08-26 01:13:33 +02:00
|
|
|
* we can move right hoping to find more free space and avoid a split.
|
|
|
|
* (We should not move right indefinitely, however, since that leads to
|
|
|
|
* O(N^2) insertion behavior in the presence of many equal keys.)
|
|
|
|
* Once we have chosen the page to put the key on, we'll insert it before
|
|
|
|
* any existing equal keys because of the way _bt_binsrch() works.
|
2000-07-21 08:42:39 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* The locking interactions in this code are critical. You should
|
|
|
|
* grok Lehman and Yao's paper before making any changes. In addition,
|
|
|
|
* you need to understand how we disambiguate duplicate keys in this
|
|
|
|
* implementation, in order to be able to find our location using
|
|
|
|
* L&Y "move right" operations. Since we may insert duplicate user
|
2000-07-21 08:42:39 +02:00
|
|
|
* keys, and since these dups may propagate up the tree, we use the
|
1997-09-07 07:04:48 +02:00
|
|
|
* 'afteritem' parameter to position ourselves correctly for the
|
|
|
|
* insertion on internal pages.
|
2000-07-21 08:42:39 +02:00
|
|
|
*----------
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
1997-09-08 04:41:22 +02:00
|
|
|
static InsertIndexResult
|
1996-07-09 08:22:35 +02:00
|
|
|
_bt_insertonpg(Relation rel,
|
1997-09-07 07:04:48 +02:00
|
|
|
Buffer buf,
|
|
|
|
BTStack stack,
|
|
|
|
int keysz,
|
|
|
|
ScanKey scankey,
|
|
|
|
BTItem btitem,
|
2000-07-21 08:42:39 +02:00
|
|
|
OffsetNumber afteritem)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
1997-09-07 07:04:48 +02:00
|
|
|
InsertIndexResult res;
|
1997-09-08 04:41:22 +02:00
|
|
|
Page page;
|
|
|
|
BTPageOpaque lpageop;
|
|
|
|
OffsetNumber itup_off;
|
2000-07-21 08:42:39 +02:00
|
|
|
BlockNumber itup_blkno;
|
|
|
|
OffsetNumber newitemoff;
|
1997-09-08 04:41:22 +02:00
|
|
|
OffsetNumber firstright = InvalidOffsetNumber;
|
2000-03-17 03:36:41 +01:00
|
|
|
Size itemsz;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
|
|
|
|
itemsz = IndexTupleDSize(btitem->bti_itup)
|
|
|
|
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
|
|
|
|
|
2000-04-12 19:17:23 +02:00
|
|
|
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but
|
|
|
|
* we need to be consistent */
|
1997-09-07 07:04:48 +02:00
|
|
|
|
1999-12-26 04:48:22 +01:00
|
|
|
/*
|
2000-04-12 19:17:23 +02:00
|
|
|
* Check whether the item can fit on a btree page at all. (Eventually,
|
|
|
|
* we ought to try to apply TOAST methods if not.) We actually need to
|
|
|
|
* be able to fit three items on every page, so restrict any one item
|
|
|
|
* to 1/3 the per-page available space. Note that at this point,
|
|
|
|
* itemsz doesn't include the ItemId.
|
1999-12-26 04:48:22 +01:00
|
|
|
*/
|
2000-04-12 19:17:23 +02:00
|
|
|
if (itemsz > (PageGetPageSize(page) - sizeof(PageHeaderData) - MAXALIGN(sizeof(BTPageOpaqueData))) / 3 - sizeof(ItemIdData))
|
2000-11-16 06:51:07 +01:00
|
|
|
elog(ERROR, "btree: index item size %lu exceeds maximum %lu",
|
|
|
|
(unsigned long)itemsz,
|
2000-04-12 19:17:23 +02:00
|
|
|
(PageGetPageSize(page) - sizeof(PageHeaderData) - MAXALIGN(sizeof(BTPageOpaqueData))) /3 - sizeof(ItemIdData));
|
1999-12-26 04:48:22 +01:00
|
|
|
|
1997-06-10 09:28:50 +02:00
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Determine exactly where new item will go.
|
1997-06-10 09:28:50 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
if (afteritem > 0)
|
1997-06-10 09:28:50 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
newitemoff = afteritem + 1;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
else
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-08-26 01:13:33 +02:00
|
|
|
/*----------
|
2000-07-21 08:42:39 +02:00
|
|
|
* If we will need to split the page to put the item here,
|
|
|
|
* check whether we can put the tuple somewhere to the right,
|
2000-08-26 01:13:33 +02:00
|
|
|
* instead. Keep scanning right until we
|
|
|
|
* (a) find a page with enough free space,
|
|
|
|
* (b) reach the last page where the tuple can legally go, or
|
|
|
|
* (c) get tired of searching.
|
|
|
|
* (c) is not flippant; it is important because if there are many
|
|
|
|
* pages' worth of equal keys, it's better to split one of the early
|
|
|
|
* pages than to scan all the way to the end of the run of equal keys
|
|
|
|
* on every insert. We implement "get tired" as a random choice,
|
|
|
|
* since stopping after scanning a fixed number of pages wouldn't work
|
|
|
|
* well (we'd never reach the right-hand side of previously split
|
|
|
|
* pages). Currently the probability of moving right is set at 0.99,
|
|
|
|
* which may seem too high to change the behavior much, but it does an
|
|
|
|
* excellent job of preventing O(N^2) behavior with many equal keys.
|
|
|
|
*----------
|
1996-12-06 10:45:30 +01:00
|
|
|
*/
|
2000-08-26 01:13:33 +02:00
|
|
|
bool movedright = false;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
while (PageGetFreeSpace(page) < itemsz &&
|
|
|
|
!P_RIGHTMOST(lpageop) &&
|
2000-08-26 01:13:33 +02:00
|
|
|
_bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 &&
|
|
|
|
random() > (MAX_RANDOM_VALUE / 100))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* step right one page */
|
|
|
|
BlockNumber rblkno = lpageop->btpo_next;
|
2000-04-12 19:17:23 +02:00
|
|
|
|
1999-08-09 03:39:19 +02:00
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
2000-07-21 08:42:39 +02:00
|
|
|
buf = _bt_getbuf(rel, rblkno, BT_WRITE);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
|
2000-08-26 01:13:33 +02:00
|
|
|
movedright = true;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
/*
|
2000-08-26 01:13:33 +02:00
|
|
|
* Now we are on the right page, so find the insert position.
|
|
|
|
* If we moved right at all, we know we should insert at the
|
|
|
|
* start of the page, else must find the position by searching.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-08-26 01:13:33 +02:00
|
|
|
if (movedright)
|
|
|
|
newitemoff = P_FIRSTDATAKEY(lpageop);
|
|
|
|
else
|
|
|
|
newitemoff = _bt_binsrch(rel, buf, keysz, scankey);
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Do we need to split the page to fit the item on it?
|
2000-07-21 21:21:00 +02:00
|
|
|
*
|
|
|
|
* Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its
|
|
|
|
* result, so this comparison is correct even though we appear to
|
|
|
|
* be accounting only for the item and not for its line pointer.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
if (PageGetFreeSpace(page) < itemsz)
|
|
|
|
{
|
|
|
|
Buffer rbuf;
|
|
|
|
BlockNumber bknum = BufferGetBlockNumber(buf);
|
|
|
|
BlockNumber rbknum;
|
|
|
|
bool is_root = P_ISROOT(lpageop);
|
|
|
|
bool newitemonleft;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Choose the split point */
|
|
|
|
firstright = _bt_findsplitloc(rel, page,
|
|
|
|
newitemoff, itemsz,
|
|
|
|
&newitemonleft);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* split the buffer into left and right halves */
|
|
|
|
rbuf = _bt_split(rel, buf, firstright,
|
|
|
|
newitemoff, itemsz, btitem, newitemonleft,
|
|
|
|
&itup_off, &itup_blkno);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*----------
|
1997-09-07 07:04:48 +02:00
|
|
|
* By here,
|
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* + our target page has been split;
|
|
|
|
* + the original tuple has been inserted;
|
|
|
|
* + we have write locks on both the old (left half)
|
|
|
|
* and new (right half) buffers, after the split; and
|
|
|
|
* + we know the key we want to insert into the parent
|
|
|
|
* (it's the "high key" on the left child page).
|
|
|
|
*
|
|
|
|
* We're ready to do the parent insertion. We need to hold onto the
|
|
|
|
* locks for the child pages until we locate the parent, but we can
|
|
|
|
* release them before doing the actual insertion (see Lehman and Yao
|
|
|
|
* for the reasoning).
|
1997-09-07 07:04:48 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Here we have to do something Lehman and Yao don't talk about:
|
|
|
|
* deal with a root split and construction of a new root. If our
|
|
|
|
* stack is empty then we have just split a node on what had been
|
|
|
|
* the root level when we descended the tree. If it is still the
|
|
|
|
* root then we perform a new-root construction. If it *wasn't*
|
|
|
|
* the root anymore, use the parent pointer to get up to the root
|
|
|
|
* level that someone constructed meanwhile, and find the right
|
|
|
|
* place to insert as for the normal case.
|
|
|
|
*----------
|
1997-06-10 09:28:50 +02:00
|
|
|
*/
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
if (is_root)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2001-01-26 02:24:31 +01:00
|
|
|
Buffer rootbuf;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
Assert(stack == (BTStack) NULL);
|
1997-09-07 07:04:48 +02:00
|
|
|
/* create a new root node and release the split buffers */
|
2001-01-26 02:24:31 +01:00
|
|
|
rootbuf = _bt_newroot(rel, buf, rbuf);
|
|
|
|
_bt_wrtbuf(rel, rootbuf);
|
|
|
|
_bt_wrtbuf(rel, rbuf);
|
|
|
|
_bt_wrtbuf(rel, buf);
|
1996-12-06 10:45:30 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
1997-09-07 07:04:48 +02:00
|
|
|
InsertIndexResult newres;
|
1997-09-08 04:41:22 +02:00
|
|
|
BTItem new_item;
|
2000-07-21 08:42:39 +02:00
|
|
|
BTStackData fakestack;
|
|
|
|
BTItem ritem;
|
|
|
|
Buffer pbuf;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2001-01-29 08:28:17 +01:00
|
|
|
/* If root page was splitted */
|
2000-07-21 08:42:39 +02:00
|
|
|
if (stack == (BTStack) NULL)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
elog(DEBUG, "btree: concurrent ROOT page split");
|
2001-01-29 08:28:17 +01:00
|
|
|
/*
|
|
|
|
* If root page splitter failed to create new root page
|
|
|
|
* then old root' btpo_parent still points to metapage.
|
|
|
|
* We have to fix root page in this case.
|
|
|
|
*/
|
2001-02-08 00:35:33 +01:00
|
|
|
if (BTreeInvalidParent(lpageop))
|
2001-01-29 08:28:17 +01:00
|
|
|
{
|
|
|
|
if (!FixBTree)
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_insertonpg[%s]: no root page found", RelationGetRelationName(rel));
|
2001-01-29 08:28:17 +01:00
|
|
|
_bt_wrtbuf(rel, rbuf);
|
|
|
|
_bt_wrtnorelbuf(rel, buf);
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(NOTICE, "bt_insertonpg[%s]: root page unfound - fixing upper levels", RelationGetRelationName(rel));
|
2001-02-02 20:49:15 +01:00
|
|
|
_bt_fixup(rel, buf);
|
2001-01-29 08:28:17 +01:00
|
|
|
goto formres;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set up a phony stack entry if we haven't got a real one
|
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
stack = &fakestack;
|
|
|
|
stack->bts_blkno = lpageop->btpo_parent;
|
|
|
|
stack->bts_offset = InvalidOffsetNumber;
|
|
|
|
/* bts_btitem will be initialized below */
|
|
|
|
stack->bts_parent = NULL;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* get high key from left page == lowest key on new right page */
|
|
|
|
ritem = (BTItem) PageGetItem(page,
|
|
|
|
PageGetItemId(page, P_HIKEY));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* form an index tuple that points at the new right page */
|
|
|
|
new_item = _bt_formitem(&(ritem->bti_itup));
|
|
|
|
rbknum = BufferGetBlockNumber(rbuf);
|
1997-09-07 07:04:48 +02:00
|
|
|
ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the parent buffer and get the parent page.
|
|
|
|
*
|
|
|
|
* Oops - if we were moved right then we need to change stack
|
|
|
|
* item! We want to find parent pointing to where we are,
|
|
|
|
* right ? - vadim 05/27/97
|
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Interestingly, this means we didn't *really* need to stack
|
|
|
|
* the parent key at all; all we really care about is the
|
|
|
|
* saved block and offset as a starting point for our search...
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
|
|
|
|
bknum, P_HIKEY);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2001-02-02 20:49:15 +01:00
|
|
|
pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
|
2001-01-31 02:08:36 +01:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Now we can write and unlock the children */
|
|
|
|
_bt_wrtbuf(rel, rbuf);
|
|
|
|
_bt_wrtbuf(rel, buf);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2001-02-02 20:49:15 +01:00
|
|
|
if (pbuf == InvalidBuffer)
|
|
|
|
{
|
|
|
|
if (!FixBTree)
|
|
|
|
elog(ERROR, "_bt_getstackbuf: my bits moved right off the end of the world!"
|
|
|
|
"\n\tRecreate index %s.", RelationGetRelationName(rel));
|
|
|
|
pfree(new_item);
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(NOTICE, "bt_insertonpg[%s]: parent page unfound - fixing branch", RelationGetRelationName(rel));
|
2001-02-02 20:49:15 +01:00
|
|
|
_bt_fixbranch(rel, bknum, rbknum, stack);
|
|
|
|
goto formres;
|
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Recursively update the parent */
|
1997-09-07 07:04:48 +02:00
|
|
|
newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
|
2000-07-21 08:42:39 +02:00
|
|
|
0, NULL, new_item, stack->bts_offset);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/* be tidy */
|
|
|
|
pfree(newres);
|
|
|
|
pfree(new_item);
|
1996-12-06 10:45:30 +01:00
|
|
|
}
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
else
|
|
|
|
{
|
2000-12-29 21:47:17 +01:00
|
|
|
itup_off = newitemoff;
|
|
|
|
itup_blkno = BufferGetBlockNumber(buf);
|
2000-10-04 02:04:43 +02:00
|
|
|
|
2001-01-26 02:24:31 +01:00
|
|
|
_bt_insertuple(rel, buf, itemsz, btitem, newitemoff);
|
2000-10-04 02:04:43 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* Write out the updated page and release pin/lock */
|
|
|
|
_bt_wrtbuf(rel, buf);
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
|
2001-01-29 08:28:17 +01:00
|
|
|
formres:;
|
2000-07-21 08:42:39 +02:00
|
|
|
/* by here, the new tuple is inserted at itup_blkno/itup_off */
|
1997-09-07 07:04:48 +02:00
|
|
|
res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
|
|
|
|
ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
|
|
|
|
|
1998-09-01 05:29:17 +02:00
|
|
|
return res;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2001-01-26 02:24:31 +01:00
|
|
|
static void
|
|
|
|
_bt_insertuple(Relation rel, Buffer buf,
|
|
|
|
Size itemsz, BTItem btitem, OffsetNumber newitemoff)
|
|
|
|
{
|
|
|
|
Page page = BufferGetPage(buf);
|
|
|
|
BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
|
|
|
|
START_CRIT_SECTION();
|
|
|
|
_bt_pgaddtup(rel, page, itemsz, btitem, newitemoff, "page");
|
|
|
|
/* XLOG stuff */
|
|
|
|
{
|
|
|
|
xl_btree_insert xlrec;
|
|
|
|
uint8 flag = XLOG_BTREE_INSERT;
|
|
|
|
XLogRecPtr recptr;
|
|
|
|
XLogRecData rdata[2];
|
|
|
|
BTItemData truncitem;
|
|
|
|
xlrec.target.node = rel->rd_node;
|
|
|
|
ItemPointerSet(&(xlrec.target.tid), BufferGetBlockNumber(buf), newitemoff);
|
|
|
|
rdata[0].buffer = InvalidBuffer;
|
|
|
|
rdata[0].data = (char*)&xlrec;
|
|
|
|
rdata[0].len = SizeOfBtreeInsert;
|
|
|
|
rdata[0].next = &(rdata[1]);
|
|
|
|
|
|
|
|
/* Read comments in _bt_pgaddtup */
|
|
|
|
if (!(P_ISLEAF(pageop)) && newitemoff == P_FIRSTDATAKEY(pageop))
|
|
|
|
{
|
|
|
|
truncitem = *btitem;
|
|
|
|
truncitem.bti_itup.t_info = sizeof(BTItemData);
|
|
|
|
rdata[1].data = (char*)&truncitem;
|
|
|
|
rdata[1].len = sizeof(BTItemData);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
rdata[1].data = (char*)btitem;
|
|
|
|
rdata[1].len = IndexTupleDSize(btitem->bti_itup) +
|
|
|
|
(sizeof(BTItemData) - sizeof(IndexTupleData));
|
|
|
|
}
|
|
|
|
rdata[1].buffer = buf;
|
|
|
|
rdata[1].next = NULL;
|
|
|
|
if (P_ISLEAF(pageop))
|
|
|
|
flag |= XLOG_BTREE_LEAF;
|
|
|
|
|
|
|
|
recptr = XLogInsert(RM_BTREE_ID, flag, rdata);
|
|
|
|
|
|
|
|
PageSetLSN(page, recptr);
|
|
|
|
PageSetSUI(page, ThisStartUpID);
|
|
|
|
}
|
|
|
|
|
|
|
|
END_CRIT_SECTION();
|
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_split() -- split a page in the btree.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* On entry, buf is the page to split, and is write-locked and pinned.
|
2000-07-21 08:42:39 +02:00
|
|
|
* firstright is the item index of the first item to be moved to the
|
|
|
|
* new right page. newitemoff etc. tell us about the new item that
|
|
|
|
* must be inserted along with the data from the old page.
|
|
|
|
*
|
|
|
|
* Returns the new right sibling of buf, pinned and write-locked.
|
|
|
|
* The pin and lock on buf are maintained. *itup_off and *itup_blkno
|
|
|
|
* are set to the exact location where newitem was inserted.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
1997-09-08 04:41:22 +02:00
|
|
|
static Buffer
|
2000-07-21 08:42:39 +02:00
|
|
|
_bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
|
|
|
OffsetNumber newitemoff, Size newitemsz, BTItem newitem,
|
|
|
|
bool newitemonleft,
|
|
|
|
OffsetNumber *itup_off, BlockNumber *itup_blkno)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
Buffer rbuf;
|
|
|
|
Page origpage;
|
|
|
|
Page leftpage,
|
|
|
|
rightpage;
|
|
|
|
BTPageOpaque ropaque,
|
|
|
|
lopaque,
|
|
|
|
oopaque;
|
2000-10-05 22:10:20 +02:00
|
|
|
Buffer sbuf = 0;
|
|
|
|
Page spage = 0;
|
1997-09-08 04:41:22 +02:00
|
|
|
Size itemsz;
|
|
|
|
ItemId itemid;
|
|
|
|
BTItem item;
|
|
|
|
OffsetNumber leftoff,
|
|
|
|
rightoff;
|
|
|
|
OffsetNumber maxoff;
|
|
|
|
OffsetNumber i;
|
2000-10-13 14:05:22 +02:00
|
|
|
BTItem lhikey;
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
|
|
|
|
origpage = BufferGetPage(buf);
|
|
|
|
leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData));
|
|
|
|
rightpage = BufferGetPage(rbuf);
|
|
|
|
|
|
|
|
_bt_pageinit(leftpage, BufferGetPageSize(buf));
|
2000-07-21 08:42:39 +02:00
|
|
|
_bt_pageinit(rightpage, BufferGetPageSize(rbuf));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/* init btree private data */
|
|
|
|
oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
|
|
|
|
lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
|
|
|
|
ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
|
|
|
|
|
|
|
|
/* if we're splitting this page, it won't be the root when we're done */
|
2000-10-04 02:04:43 +02:00
|
|
|
lopaque->btpo_flags = oopaque->btpo_flags;
|
|
|
|
lopaque->btpo_flags &= ~BTP_ROOT;
|
|
|
|
ropaque->btpo_flags = lopaque->btpo_flags;
|
1997-09-07 07:04:48 +02:00
|
|
|
lopaque->btpo_prev = oopaque->btpo_prev;
|
|
|
|
lopaque->btpo_next = BufferGetBlockNumber(rbuf);
|
2000-07-21 08:42:39 +02:00
|
|
|
ropaque->btpo_prev = BufferGetBlockNumber(buf);
|
1997-09-07 07:04:48 +02:00
|
|
|
ropaque->btpo_next = oopaque->btpo_next;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Must copy the original parent link into both new pages, even though
|
|
|
|
* it might be quite obsolete by now. We might need it if this level
|
|
|
|
* is or recently was the root (see README).
|
|
|
|
*/
|
1999-03-28 22:32:42 +02:00
|
|
|
lopaque->btpo_parent = ropaque->btpo_parent = oopaque->btpo_parent;
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
/*
|
|
|
|
* If the page we're splitting is not the rightmost page at its level
|
2000-07-21 08:42:39 +02:00
|
|
|
* in the tree, then the first entry on the page is the high key
|
1997-09-07 07:04:48 +02:00
|
|
|
* for the page. We need to copy that to the right half. Otherwise
|
2000-07-21 08:42:39 +02:00
|
|
|
* (meaning the rightmost page case), all the items on the right half
|
|
|
|
* will be user data.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
rightoff = P_HIKEY;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
if (!P_RIGHTMOST(oopaque))
|
|
|
|
{
|
|
|
|
itemid = PageGetItemId(origpage, P_HIKEY);
|
|
|
|
itemsz = ItemIdGetLength(itemid);
|
|
|
|
item = (BTItem) PageGetItem(origpage, itemid);
|
2000-07-21 08:42:39 +02:00
|
|
|
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
|
|
|
|
LP_USED) == InvalidOffsetNumber)
|
2000-10-04 02:04:43 +02:00
|
|
|
elog(STOP, "btree: failed to add hikey to the right sibling");
|
2000-07-21 08:42:39 +02:00
|
|
|
rightoff = OffsetNumberNext(rightoff);
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* The "high key" for the new left page will be the first key that's
|
|
|
|
* going to go into the new right page. This might be either the
|
|
|
|
* existing data item at position firstright, or the incoming tuple.
|
|
|
|
*/
|
|
|
|
leftoff = P_HIKEY;
|
|
|
|
if (!newitemonleft && newitemoff == firstright)
|
|
|
|
{
|
|
|
|
/* incoming tuple will become first on right page */
|
|
|
|
itemsz = newitemsz;
|
|
|
|
item = newitem;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
else
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
/* existing item at firstright will become first on right page */
|
|
|
|
itemid = PageGetItemId(origpage, firstright);
|
|
|
|
itemsz = ItemIdGetLength(itemid);
|
|
|
|
item = (BTItem) PageGetItem(origpage, itemid);
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-10-13 14:05:22 +02:00
|
|
|
lhikey = item;
|
2000-07-21 08:42:39 +02:00
|
|
|
if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
|
|
|
|
LP_USED) == InvalidOffsetNumber)
|
2000-10-04 02:04:43 +02:00
|
|
|
elog(STOP, "btree: failed to add hikey to the left sibling");
|
2000-07-21 08:42:39 +02:00
|
|
|
leftoff = OffsetNumberNext(leftoff);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Now transfer all the data items to the appropriate page
|
|
|
|
*/
|
|
|
|
maxoff = PageGetMaxOffsetNumber(origpage);
|
|
|
|
|
|
|
|
for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
|
|
|
itemid = PageGetItemId(origpage, i);
|
|
|
|
itemsz = ItemIdGetLength(itemid);
|
|
|
|
item = (BTItem) PageGetItem(origpage, itemid);
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* does new item belong before this one? */
|
|
|
|
if (i == newitemoff)
|
|
|
|
{
|
|
|
|
if (newitemonleft)
|
|
|
|
{
|
|
|
|
_bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
|
|
|
|
"left sibling");
|
|
|
|
*itup_off = leftoff;
|
|
|
|
*itup_blkno = BufferGetBlockNumber(buf);
|
|
|
|
leftoff = OffsetNumberNext(leftoff);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
_bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
|
|
|
|
"right sibling");
|
|
|
|
*itup_off = rightoff;
|
|
|
|
*itup_blkno = BufferGetBlockNumber(rbuf);
|
|
|
|
rightoff = OffsetNumberNext(rightoff);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
/* decide which page to put it on */
|
|
|
|
if (i < firstright)
|
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
_bt_pgaddtup(rel, leftpage, itemsz, item, leftoff,
|
|
|
|
"left sibling");
|
1997-09-07 07:04:48 +02:00
|
|
|
leftoff = OffsetNumberNext(leftoff);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
_bt_pgaddtup(rel, rightpage, itemsz, item, rightoff,
|
|
|
|
"right sibling");
|
1997-09-07 07:04:48 +02:00
|
|
|
rightoff = OffsetNumberNext(rightoff);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* cope with possibility that newitem goes at the end */
|
|
|
|
if (i <= newitemoff)
|
|
|
|
{
|
|
|
|
if (newitemonleft)
|
|
|
|
{
|
|
|
|
_bt_pgaddtup(rel, leftpage, newitemsz, newitem, leftoff,
|
|
|
|
"left sibling");
|
|
|
|
*itup_off = leftoff;
|
|
|
|
*itup_blkno = BufferGetBlockNumber(buf);
|
|
|
|
leftoff = OffsetNumberNext(leftoff);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
_bt_pgaddtup(rel, rightpage, newitemsz, newitem, rightoff,
|
|
|
|
"right sibling");
|
|
|
|
*itup_off = rightoff;
|
|
|
|
*itup_blkno = BufferGetBlockNumber(rbuf);
|
|
|
|
rightoff = OffsetNumberNext(rightoff);
|
|
|
|
}
|
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-10-04 02:04:43 +02:00
|
|
|
/*
|
|
|
|
* We have to grab the right sibling (if any) and fix the prev
|
|
|
|
* pointer there. We are guaranteed that this is deadlock-free
|
|
|
|
* since no other writer will be holding a lock on that page
|
|
|
|
* and trying to move left, and all readers release locks on a page
|
|
|
|
* before trying to fetch its neighbors.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (!P_RIGHTMOST(ropaque))
|
|
|
|
{
|
|
|
|
sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
|
|
|
|
spage = BufferGetPage(sbuf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Right sibling is locked, new siblings are prepared, but original
|
|
|
|
* page is not updated yet. Log changes before continuing.
|
|
|
|
*
|
|
|
|
* NO ELOG(ERROR) till right sibling is updated.
|
|
|
|
*
|
|
|
|
*/
|
2001-01-12 22:54:01 +01:00
|
|
|
START_CRIT_SECTION();
|
2000-10-04 02:04:43 +02:00
|
|
|
{
|
2000-12-28 14:00:29 +01:00
|
|
|
xl_btree_split xlrec;
|
|
|
|
int flag = (newitemonleft) ?
|
|
|
|
XLOG_BTREE_SPLEFT : XLOG_BTREE_SPLIT;
|
|
|
|
BlockNumber blkno;
|
|
|
|
XLogRecPtr recptr;
|
|
|
|
XLogRecData rdata[4];
|
|
|
|
|
|
|
|
xlrec.target.node = rel->rd_node;
|
|
|
|
ItemPointerSet(&(xlrec.target.tid), *itup_blkno, *itup_off);
|
2000-10-04 02:04:43 +02:00
|
|
|
if (newitemonleft)
|
|
|
|
{
|
2000-10-21 17:43:36 +02:00
|
|
|
blkno = BufferGetBlockNumber(rbuf);
|
2000-12-28 14:00:29 +01:00
|
|
|
BlockIdSet(&(xlrec.otherblk), blkno);
|
2000-10-04 02:04:43 +02:00
|
|
|
}
|
|
|
|
else
|
2000-10-21 17:43:36 +02:00
|
|
|
{
|
|
|
|
blkno = BufferGetBlockNumber(buf);
|
2000-12-28 14:00:29 +01:00
|
|
|
BlockIdSet(&(xlrec.otherblk), blkno);
|
2000-10-21 17:43:36 +02:00
|
|
|
}
|
2000-12-28 14:00:29 +01:00
|
|
|
BlockIdSet(&(xlrec.parentblk), lopaque->btpo_parent);
|
|
|
|
BlockIdSet(&(xlrec.leftblk), lopaque->btpo_prev);
|
|
|
|
BlockIdSet(&(xlrec.rightblk), ropaque->btpo_next);
|
2000-10-04 02:04:43 +02:00
|
|
|
/*
|
|
|
|
* Dirrect access to page is not good but faster - we should
|
|
|
|
* implement some new func in page API.
|
|
|
|
*/
|
2000-12-28 14:00:29 +01:00
|
|
|
xlrec.leftlen = ((PageHeader)leftpage)->pd_special -
|
|
|
|
((PageHeader)leftpage)->pd_upper;
|
|
|
|
rdata[0].buffer = InvalidBuffer;
|
|
|
|
rdata[0].data = (char*)&xlrec;
|
|
|
|
rdata[0].len = SizeOfBtreeSplit;
|
|
|
|
rdata[0].next = &(rdata[1]);
|
|
|
|
|
|
|
|
rdata[1].buffer = InvalidBuffer;
|
|
|
|
rdata[1].data = (char*)leftpage + ((PageHeader)leftpage)->pd_upper;
|
|
|
|
rdata[1].len = xlrec.leftlen;
|
|
|
|
rdata[1].next = &(rdata[2]);
|
|
|
|
|
|
|
|
rdata[2].buffer = InvalidBuffer;
|
|
|
|
rdata[2].data = (char*)rightpage + ((PageHeader)rightpage)->pd_upper;
|
|
|
|
rdata[2].len = ((PageHeader)rightpage)->pd_special -
|
|
|
|
((PageHeader)rightpage)->pd_upper;
|
|
|
|
rdata[2].next = NULL;
|
|
|
|
|
|
|
|
if (!P_RIGHTMOST(ropaque))
|
|
|
|
{
|
2000-12-29 21:47:17 +01:00
|
|
|
BTPageOpaque sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
|
|
|
|
sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
|
|
|
|
|
2000-12-28 14:00:29 +01:00
|
|
|
rdata[2].next = &(rdata[3]);
|
|
|
|
rdata[3].buffer = sbuf;
|
|
|
|
rdata[3].data = NULL;
|
|
|
|
rdata[3].len = 0;
|
|
|
|
rdata[3].next = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (P_ISLEAF(lopaque))
|
|
|
|
flag |= XLOG_BTREE_LEAF;
|
|
|
|
|
|
|
|
recptr = XLogInsert(RM_BTREE_ID, flag, rdata);
|
2000-10-04 02:04:43 +02:00
|
|
|
|
|
|
|
PageSetLSN(leftpage, recptr);
|
|
|
|
PageSetSUI(leftpage, ThisStartUpID);
|
|
|
|
PageSetLSN(rightpage, recptr);
|
|
|
|
PageSetSUI(rightpage, ThisStartUpID);
|
|
|
|
if (!P_RIGHTMOST(ropaque))
|
|
|
|
{
|
|
|
|
PageSetLSN(spage, recptr);
|
|
|
|
PageSetSUI(spage, ThisStartUpID);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
/*
|
|
|
|
* By here, the original data page has been split into two new halves,
|
|
|
|
* and these are correct. The algorithm requires that the left page
|
|
|
|
* never move during a split, so we copy the new left page back on top
|
|
|
|
* of the original. Note that this is not a waste of time, since we
|
|
|
|
* also require (in the page management code) that the center of a
|
|
|
|
* page always be clean, and the most efficient way to guarantee this
|
|
|
|
* is just to compact the data by reinserting it into a new left page.
|
|
|
|
*/
|
|
|
|
|
|
|
|
PageRestoreTempPage(leftpage, origpage);
|
|
|
|
|
2001-01-24 00:29:22 +01:00
|
|
|
END_CRIT_SECTION();
|
|
|
|
|
2000-12-29 21:47:17 +01:00
|
|
|
/* write and release the old right sibling */
|
1997-09-07 07:04:48 +02:00
|
|
|
if (!P_RIGHTMOST(ropaque))
|
|
|
|
_bt_wrtbuf(rel, sbuf);
|
|
|
|
|
|
|
|
/* split's done */
|
1998-09-01 05:29:17 +02:00
|
|
|
return rbuf;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* _bt_findsplitloc() -- find an appropriate place to split a page.
|
|
|
|
*
|
|
|
|
* The idea here is to equalize the free space that will be on each split
|
|
|
|
* page, *after accounting for the inserted tuple*. (If we fail to account
|
|
|
|
* for it, we might find ourselves with too little room on the page that
|
|
|
|
* it needs to go into!)
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* We are passed the intended insert position of the new tuple, expressed as
|
|
|
|
* the offsetnumber of the tuple it must go in front of. (This could be
|
|
|
|
* maxoff+1 if the tuple is to go at the end.)
|
|
|
|
*
|
|
|
|
* We return the index of the first existing tuple that should go on the
|
|
|
|
* righthand page, plus a boolean indicating whether the new tuple goes on
|
|
|
|
* the left or right page. The bool is necessary to disambiguate the case
|
|
|
|
* where firstright == newitemoff.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
1997-09-08 04:41:22 +02:00
|
|
|
static OffsetNumber
|
1996-07-09 08:22:35 +02:00
|
|
|
_bt_findsplitloc(Relation rel,
|
1997-09-07 07:04:48 +02:00
|
|
|
Page page,
|
2000-07-21 08:42:39 +02:00
|
|
|
OffsetNumber newitemoff,
|
|
|
|
Size newitemsz,
|
|
|
|
bool *newitemonleft)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
BTPageOpaque opaque;
|
|
|
|
OffsetNumber offnum;
|
|
|
|
OffsetNumber maxoff;
|
|
|
|
ItemId itemid;
|
|
|
|
FindSplitData state;
|
|
|
|
int leftspace,
|
|
|
|
rightspace,
|
2000-07-21 21:21:00 +02:00
|
|
|
goodenough,
|
2000-07-21 08:42:39 +02:00
|
|
|
dataitemtotal,
|
|
|
|
dataitemstoleft;
|
|
|
|
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
|
2000-07-21 21:21:00 +02:00
|
|
|
/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
|
|
|
|
newitemsz += sizeof(ItemIdData);
|
2000-07-21 08:42:39 +02:00
|
|
|
state.newitemsz = newitemsz;
|
|
|
|
state.non_leaf = ! P_ISLEAF(opaque);
|
|
|
|
state.have_split = false;
|
|
|
|
|
|
|
|
/* Total free space available on a btree page, after fixed overhead */
|
|
|
|
leftspace = rightspace =
|
|
|
|
PageGetPageSize(page) - sizeof(PageHeaderData) -
|
|
|
|
MAXALIGN(sizeof(BTPageOpaqueData))
|
|
|
|
+ sizeof(ItemIdData);
|
|
|
|
|
2000-07-21 21:21:00 +02:00
|
|
|
/*
|
|
|
|
* Finding the best possible split would require checking all the possible
|
|
|
|
* split points, because of the high-key and left-key special cases.
|
|
|
|
* That's probably more work than it's worth; instead, stop as soon as
|
|
|
|
* we find a "good-enough" split, where good-enough is defined as an
|
|
|
|
* imbalance in free space of no more than pagesize/16 (arbitrary...)
|
|
|
|
* This should let us stop near the middle on most pages, instead of
|
|
|
|
* plowing to the end.
|
|
|
|
*/
|
|
|
|
goodenough = leftspace / 16;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* The right page will have the same high key as the old page */
|
|
|
|
if (!P_RIGHTMOST(opaque))
|
1997-04-16 03:48:29 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
itemid = PageGetItemId(page, P_HIKEY);
|
2000-07-21 21:21:00 +02:00
|
|
|
rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
|
|
|
|
sizeof(ItemIdData));
|
2000-07-21 08:42:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Count up total space in data items without actually scanning 'em */
|
|
|
|
dataitemtotal = rightspace - (int) PageGetFreeSpace(page);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Scan through the data items and calculate space usage for a split
|
2000-07-21 21:21:00 +02:00
|
|
|
* at each possible position.
|
2000-07-21 08:42:39 +02:00
|
|
|
*/
|
|
|
|
dataitemstoleft = 0;
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
|
|
|
|
for (offnum = P_FIRSTDATAKEY(opaque);
|
|
|
|
offnum <= maxoff;
|
|
|
|
offnum = OffsetNumberNext(offnum))
|
|
|
|
{
|
|
|
|
Size itemsz;
|
|
|
|
int leftfree,
|
|
|
|
rightfree;
|
|
|
|
|
|
|
|
itemid = PageGetItemId(page, offnum);
|
2000-07-21 21:21:00 +02:00
|
|
|
itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* We have to allow for the current item becoming the high key of
|
2000-07-21 21:21:00 +02:00
|
|
|
* the left page; therefore it counts against left space as well
|
|
|
|
* as right space.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
leftfree = leftspace - dataitemstoleft - (int) itemsz;
|
|
|
|
rightfree = rightspace - (dataitemtotal - dataitemstoleft);
|
2000-07-21 21:21:00 +02:00
|
|
|
/*
|
|
|
|
* Will the new item go to left or right of split?
|
|
|
|
*/
|
|
|
|
if (offnum > newitemoff)
|
2000-07-21 08:42:39 +02:00
|
|
|
_bt_checksplitloc(&state, offnum, leftfree, rightfree,
|
|
|
|
true, itemsz);
|
2000-07-21 21:21:00 +02:00
|
|
|
else if (offnum < newitemoff)
|
|
|
|
_bt_checksplitloc(&state, offnum, leftfree, rightfree,
|
|
|
|
false, itemsz);
|
2000-07-21 08:42:39 +02:00
|
|
|
else
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2000-07-21 21:21:00 +02:00
|
|
|
/* need to try it both ways! */
|
2000-07-21 08:42:39 +02:00
|
|
|
_bt_checksplitloc(&state, offnum, leftfree, rightfree,
|
|
|
|
true, itemsz);
|
2000-07-21 21:21:00 +02:00
|
|
|
/* here we are contemplating newitem as first on right */
|
|
|
|
_bt_checksplitloc(&state, offnum, leftfree, rightfree,
|
|
|
|
false, newitemsz);
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
2000-07-21 08:42:39 +02:00
|
|
|
|
2000-07-21 21:21:00 +02:00
|
|
|
/* Abort scan once we find a good-enough choice */
|
|
|
|
if (state.have_split && state.best_delta <= goodenough)
|
|
|
|
break;
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
dataitemstoleft += itemsz;
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 21:21:00 +02:00
|
|
|
/*
|
|
|
|
* I believe it is not possible to fail to find a feasible split,
|
|
|
|
* but just in case ...
|
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
if (! state.have_split)
|
|
|
|
elog(FATAL, "_bt_findsplitloc: can't find a feasible split point for %s",
|
|
|
|
RelationGetRelationName(rel));
|
2000-07-21 21:21:00 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
*newitemonleft = state.newitemonleft;
|
|
|
|
return state.firstright;
|
|
|
|
}
|
|
|
|
|
2000-07-21 21:21:00 +02:00
|
|
|
/*
|
|
|
|
* Subroutine to analyze a particular possible split choice (ie, firstright
|
|
|
|
* and newitemonleft settings), and record the best split so far in *state.
|
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
static void
|
|
|
|
_bt_checksplitloc(FindSplitData *state, OffsetNumber firstright,
|
|
|
|
int leftfree, int rightfree,
|
|
|
|
bool newitemonleft, Size firstrightitemsz)
|
|
|
|
{
|
2000-07-21 21:21:00 +02:00
|
|
|
/*
|
|
|
|
* Account for the new item on whichever side it is to be put.
|
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
if (newitemonleft)
|
|
|
|
leftfree -= (int) state->newitemsz;
|
|
|
|
else
|
|
|
|
rightfree -= (int) state->newitemsz;
|
|
|
|
/*
|
|
|
|
* If we are not on the leaf level, we will be able to discard the
|
|
|
|
* key data from the first item that winds up on the right page.
|
|
|
|
*/
|
|
|
|
if (state->non_leaf)
|
|
|
|
rightfree += (int) firstrightitemsz -
|
2000-07-21 21:21:00 +02:00
|
|
|
(int) (MAXALIGN(sizeof(BTItemData)) + sizeof(ItemIdData));
|
1997-09-07 07:04:48 +02:00
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* If feasible split point, remember best delta.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
if (leftfree >= 0 && rightfree >= 0)
|
|
|
|
{
|
|
|
|
int delta = leftfree - rightfree;
|
|
|
|
|
|
|
|
if (delta < 0)
|
|
|
|
delta = -delta;
|
|
|
|
if (!state->have_split || delta < state->best_delta)
|
|
|
|
{
|
|
|
|
state->have_split = true;
|
|
|
|
state->newitemonleft = newitemonleft;
|
|
|
|
state->firstright = firstright;
|
|
|
|
state->best_delta = delta;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* _bt_getstackbuf() -- Walk back up the tree one step, and find the item
|
|
|
|
* we last looked at in the parent.
|
|
|
|
*
|
|
|
|
* This is possible because we save a bit image of the last item
|
|
|
|
* we looked at in the parent, and the update algorithm guarantees
|
|
|
|
* that if items above us in the tree move, they only move right.
|
|
|
|
*
|
|
|
|
* Also, re-set bts_blkno & bts_offset if changed.
|
|
|
|
*/
|
|
|
|
static Buffer
|
2001-02-02 20:49:15 +01:00
|
|
|
_bt_getstackbuf(Relation rel, BTStack stack, int access)
|
2000-07-21 08:42:39 +02:00
|
|
|
{
|
|
|
|
BlockNumber blkno;
|
2000-10-21 17:43:36 +02:00
|
|
|
Buffer buf;
|
2000-07-21 08:42:39 +02:00
|
|
|
OffsetNumber start,
|
|
|
|
offnum,
|
|
|
|
maxoff;
|
|
|
|
Page page;
|
|
|
|
ItemId itemid;
|
|
|
|
BTItem item;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
|
|
|
|
blkno = stack->bts_blkno;
|
2001-02-02 20:49:15 +01:00
|
|
|
buf = _bt_getbuf(rel, blkno, access);
|
2000-07-21 08:42:39 +02:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
start = stack->bts_offset;
|
|
|
|
/*
|
|
|
|
* _bt_insertonpg set bts_offset to InvalidOffsetNumber in the
|
|
|
|
* case of concurrent ROOT page split. Also, watch out for
|
|
|
|
* possibility that page has a high key now when it didn't before.
|
|
|
|
*/
|
|
|
|
if (start < P_FIRSTDATAKEY(opaque))
|
|
|
|
start = P_FIRSTDATAKEY(opaque);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
/* see if it's on this page */
|
|
|
|
for (offnum = start;
|
|
|
|
offnum <= maxoff;
|
|
|
|
offnum = OffsetNumberNext(offnum))
|
|
|
|
{
|
|
|
|
itemid = PageGetItemId(page, offnum);
|
|
|
|
item = (BTItem) PageGetItem(page, itemid);
|
|
|
|
if (BTItemSame(item, &stack->bts_btitem))
|
|
|
|
{
|
|
|
|
/* Return accurate pointer to where link is now */
|
|
|
|
stack->bts_blkno = blkno;
|
|
|
|
stack->bts_offset = offnum;
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* by here, the item we're looking for moved right at least one page */
|
|
|
|
if (P_RIGHTMOST(opaque))
|
2001-01-31 02:08:36 +01:00
|
|
|
{
|
2001-02-02 20:49:15 +01:00
|
|
|
_bt_relbuf(rel, buf, access);
|
2001-01-31 02:08:36 +01:00
|
|
|
return(InvalidBuffer);
|
|
|
|
}
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
blkno = opaque->btpo_next;
|
2001-02-02 20:49:15 +01:00
|
|
|
_bt_relbuf(rel, buf, access);
|
|
|
|
buf = _bt_getbuf(rel, blkno, access);
|
2000-07-21 08:42:39 +02:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
start = P_FIRSTDATAKEY(opaque);
|
|
|
|
}
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_newroot() -- Create a new root page for the index.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* We've just split the old root page and need to create a new one.
|
|
|
|
* In order to do this, we add a new root page to the file, then lock
|
|
|
|
* the metadata page and update it. This is guaranteed to be deadlock-
|
|
|
|
* free, because all readers release their locks on the metadata page
|
|
|
|
* before trying to lock the root, and all writers lock the root before
|
|
|
|
* trying to lock the metadata page. We have a write lock on the old
|
|
|
|
* root page, so we have not introduced any cycles into the waits-for
|
|
|
|
* graph.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
1997-09-07 07:04:48 +02:00
|
|
|
* On entry, lbuf (the old root) and rbuf (its new peer) are write-
|
2001-01-26 02:24:31 +01:00
|
|
|
* locked. On exit, a new root page exists with entries for the
|
|
|
|
* two new children, metapage is updated and unlocked/unpinned.
|
|
|
|
* The new root buffer is returned to caller which has to unlock/unpin
|
|
|
|
* lbuf, rbuf & rootbuf.
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2001-01-26 02:24:31 +01:00
|
|
|
static Buffer
|
1996-07-09 08:22:35 +02:00
|
|
|
_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
|
|
|
{
|
2000-12-28 14:00:29 +01:00
|
|
|
Buffer rootbuf;
|
|
|
|
Page lpage,
|
|
|
|
rpage,
|
|
|
|
rootpage;
|
|
|
|
BlockNumber lbkno,
|
|
|
|
rbkno;
|
|
|
|
BlockNumber rootblknum;
|
|
|
|
BTPageOpaque rootopaque;
|
|
|
|
ItemId itemid;
|
|
|
|
BTItem item;
|
|
|
|
Size itemsz;
|
|
|
|
BTItem new_item;
|
|
|
|
Buffer metabuf;
|
|
|
|
Page metapg;
|
|
|
|
BTMetaPageData *metad;
|
2000-10-13 04:03:02 +02:00
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
/* get a new root page */
|
|
|
|
rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
|
|
|
|
rootpage = BufferGetPage(rootbuf);
|
2000-10-04 02:04:43 +02:00
|
|
|
rootblknum = BufferGetBlockNumber(rootbuf);
|
2000-12-28 14:00:29 +01:00
|
|
|
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
|
|
|
|
metapg = BufferGetPage(metabuf);
|
|
|
|
metad = BTPageGetMeta(metapg);
|
2000-10-04 02:04:43 +02:00
|
|
|
|
|
|
|
/* NO ELOG(ERROR) from here till newroot op is logged */
|
2001-01-12 22:54:01 +01:00
|
|
|
START_CRIT_SECTION();
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
/* set btree special data */
|
|
|
|
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
|
|
|
|
rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
|
|
|
|
rootopaque->btpo_flags |= BTP_ROOT;
|
2000-10-04 02:04:43 +02:00
|
|
|
rootopaque->btpo_parent = BTREE_METAPAGE;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
lbkno = BufferGetBlockNumber(lbuf);
|
|
|
|
rbkno = BufferGetBlockNumber(rbuf);
|
|
|
|
lpage = BufferGetPage(lbuf);
|
|
|
|
rpage = BufferGetPage(rbuf);
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/*
|
|
|
|
* Make sure pages in old root level have valid parent links --- we will
|
|
|
|
* need this in _bt_insertonpg() if a concurrent root split happens (see
|
|
|
|
* README).
|
|
|
|
*/
|
1999-05-25 18:15:34 +02:00
|
|
|
((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_parent =
|
|
|
|
((BTPageOpaque) PageGetSpecialPointer(rpage))->btpo_parent =
|
2000-10-04 02:04:43 +02:00
|
|
|
rootblknum;
|
1999-03-28 22:32:42 +02:00
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Create downlink item for left page (old root). Since this will be
|
|
|
|
* the first item in a non-leaf page, it implicitly has minus-infinity
|
|
|
|
* key value, so we need not store any actual key in it.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
itemsz = sizeof(BTItemData);
|
|
|
|
new_item = (BTItem) palloc(itemsz);
|
|
|
|
new_item->bti_itup.t_info = itemsz;
|
1997-09-07 07:04:48 +02:00
|
|
|
ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_HIKEY);
|
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Insert the left page pointer into the new root page. The root page
|
|
|
|
* is the rightmost page on its level so there is no "high key" in it;
|
|
|
|
* the two items will go into positions P_HIKEY and P_FIRSTKEY.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
|
|
|
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
|
2000-10-04 02:04:43 +02:00
|
|
|
elog(STOP, "btree: failed to add leftkey to new root page");
|
1997-09-07 07:04:48 +02:00
|
|
|
pfree(new_item);
|
|
|
|
|
|
|
|
/*
|
2000-07-21 08:42:39 +02:00
|
|
|
* Create downlink item for right page. The key for it is obtained from
|
|
|
|
* the "high key" position in the left page.
|
1997-09-07 07:04:48 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
itemid = PageGetItemId(lpage, P_HIKEY);
|
1997-09-07 07:04:48 +02:00
|
|
|
itemsz = ItemIdGetLength(itemid);
|
2000-07-21 08:42:39 +02:00
|
|
|
item = (BTItem) PageGetItem(lpage, itemid);
|
1997-09-07 07:04:48 +02:00
|
|
|
new_item = _bt_formitem(&(item->bti_itup));
|
|
|
|
ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* insert the right page pointer into the new root page.
|
|
|
|
*/
|
|
|
|
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber)
|
2000-10-04 02:04:43 +02:00
|
|
|
elog(STOP, "btree: failed to add rightkey to new root page");
|
1997-09-07 07:04:48 +02:00
|
|
|
pfree(new_item);
|
|
|
|
|
2000-12-28 14:00:29 +01:00
|
|
|
metad->btm_root = rootblknum;
|
|
|
|
(metad->btm_level)++;
|
|
|
|
|
2000-10-04 02:04:43 +02:00
|
|
|
/* XLOG stuff */
|
|
|
|
{
|
2000-10-13 04:03:02 +02:00
|
|
|
xl_btree_newroot xlrec;
|
2000-10-21 17:43:36 +02:00
|
|
|
XLogRecPtr recptr;
|
2000-12-28 14:00:29 +01:00
|
|
|
XLogRecData rdata[2];
|
2000-10-13 04:03:02 +02:00
|
|
|
|
2000-10-04 02:04:43 +02:00
|
|
|
xlrec.node = rel->rd_node;
|
2000-12-28 14:00:29 +01:00
|
|
|
xlrec.level = metad->btm_level;
|
2000-10-13 04:03:02 +02:00
|
|
|
BlockIdSet(&(xlrec.rootblk), rootblknum);
|
2000-12-28 14:00:29 +01:00
|
|
|
rdata[0].buffer = InvalidBuffer;
|
|
|
|
rdata[0].data = (char*)&xlrec;
|
|
|
|
rdata[0].len = SizeOfBtreeNewroot;
|
|
|
|
rdata[0].next = &(rdata[1]);
|
2000-10-04 02:04:43 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Dirrect access to page is not good but faster - we should
|
|
|
|
* implement some new func in page API.
|
|
|
|
*/
|
2000-12-28 14:00:29 +01:00
|
|
|
rdata[1].buffer = InvalidBuffer;
|
|
|
|
rdata[1].data = (char*)rootpage + ((PageHeader) rootpage)->pd_upper;
|
|
|
|
rdata[1].len = ((PageHeader)rootpage)->pd_special -
|
|
|
|
((PageHeader)rootpage)->pd_upper;
|
|
|
|
rdata[1].next = NULL;
|
2000-10-04 02:04:43 +02:00
|
|
|
|
2000-12-28 14:00:29 +01:00
|
|
|
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata);
|
2000-10-13 04:03:02 +02:00
|
|
|
|
2000-10-04 02:04:43 +02:00
|
|
|
PageSetLSN(rootpage, recptr);
|
|
|
|
PageSetSUI(rootpage, ThisStartUpID);
|
2000-10-13 04:03:02 +02:00
|
|
|
PageSetLSN(metapg, recptr);
|
|
|
|
PageSetSUI(metapg, ThisStartUpID);
|
|
|
|
|
2001-02-08 00:35:33 +01:00
|
|
|
/* we changed their btpo_parent */
|
|
|
|
PageSetLSN(lpage, recptr);
|
|
|
|
PageSetSUI(lpage, ThisStartUpID);
|
|
|
|
PageSetLSN(rpage, recptr);
|
|
|
|
PageSetSUI(rpage, ThisStartUpID);
|
2000-10-04 02:04:43 +02:00
|
|
|
}
|
2001-01-12 22:54:01 +01:00
|
|
|
END_CRIT_SECTION();
|
2000-10-04 02:04:43 +02:00
|
|
|
|
2001-01-26 02:24:31 +01:00
|
|
|
/* write and let go of metapage buffer */
|
2000-12-28 14:00:29 +01:00
|
|
|
_bt_wrtbuf(rel, metabuf);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2001-01-26 02:24:31 +01:00
|
|
|
return(rootbuf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In the event old root page was splitted but no new one was created we
|
|
|
|
* build required parent levels keeping write lock on old root page.
|
|
|
|
* Note: it's assumed that old root page' btpo_parent points to meta page,
|
|
|
|
* ie not to parent page. On exit, new root page buffer is write locked.
|
|
|
|
* If "release" is TRUE then oldrootbuf will be released immediately
|
|
|
|
* after upper level is builded.
|
|
|
|
*/
|
|
|
|
Buffer
|
|
|
|
_bt_fixroot(Relation rel, Buffer oldrootbuf, bool release)
|
|
|
|
{
|
|
|
|
Buffer rootbuf;
|
|
|
|
BlockNumber rootblk;
|
|
|
|
Page rootpage;
|
|
|
|
XLogRecPtr rootLSN;
|
|
|
|
Page oldrootpage = BufferGetPage(oldrootbuf);
|
|
|
|
BTPageOpaque oldrootopaque = (BTPageOpaque)
|
|
|
|
PageGetSpecialPointer(oldrootpage);
|
|
|
|
Buffer buf, leftbuf, rightbuf;
|
|
|
|
Page page, leftpage, rightpage;
|
|
|
|
BTPageOpaque opaque, leftopaque, rightopaque;
|
|
|
|
OffsetNumber newitemoff;
|
|
|
|
BTItem btitem, ritem;
|
|
|
|
Size itemsz;
|
|
|
|
|
|
|
|
if (! P_LEFTMOST(oldrootopaque) || P_RIGHTMOST(oldrootopaque))
|
|
|
|
elog(ERROR, "bt_fixroot: not valid old root page");
|
|
|
|
|
|
|
|
/* Read right neighbor and create new root page*/
|
|
|
|
leftbuf = _bt_getbuf(rel, oldrootopaque->btpo_next, BT_WRITE);
|
|
|
|
leftpage = BufferGetPage(leftbuf);
|
|
|
|
leftopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
|
|
|
|
rootbuf = _bt_newroot(rel, oldrootbuf, leftbuf);
|
|
|
|
rootpage = BufferGetPage(rootbuf);
|
|
|
|
rootLSN = PageGetLSN(rootpage);
|
|
|
|
rootblk = BufferGetBlockNumber(rootbuf);
|
|
|
|
|
|
|
|
/* parent page where to insert pointers */
|
|
|
|
buf = rootbuf;
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now read other pages (if any) on level and add them to new root.
|
2001-02-02 20:49:15 +01:00
|
|
|
* Here we break one of our locking rules - never hold lock on parent
|
|
|
|
* page when acquiring lock on its child, - but we free from deadlock:
|
|
|
|
*
|
2001-01-26 02:24:31 +01:00
|
|
|
* If concurrent process will split one of pages on this level then it
|
2001-02-02 20:49:15 +01:00
|
|
|
* will see either btpo_parent == metablock or btpo_parent == rootblk.
|
|
|
|
* In first case it will give up its locks and walk to the leftmost page
|
|
|
|
* (oldrootbuf) in _bt_fixup() - ie it will wait for us and let us
|
2001-01-26 02:24:31 +01:00
|
|
|
* continue. In second case it will try to lock rootbuf keeping its locks
|
|
|
|
* on buffers we already passed, also waiting for us. If we'll have to
|
|
|
|
* unlock rootbuf (split it) and that process will have to split page
|
|
|
|
* of new level we created (level of rootbuf) then it will wait while
|
|
|
|
* we create upper level. Etc.
|
|
|
|
*/
|
|
|
|
while(! P_RIGHTMOST(leftopaque))
|
|
|
|
{
|
|
|
|
rightbuf = _bt_getbuf(rel, leftopaque->btpo_next, BT_WRITE);
|
|
|
|
rightpage = BufferGetPage(rightbuf);
|
|
|
|
rightopaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
|
|
|
|
|
2001-02-08 00:35:33 +01:00
|
|
|
/*
|
|
|
|
* Update LSN & StartUpID of child page buffer to ensure that
|
|
|
|
* it will be written on disk after flushing log record for new
|
|
|
|
* root creation. Unfortunately, for the moment (?) we do not
|
|
|
|
* log this operation and so possibly break our rule to log entire
|
|
|
|
* page content on first after checkpoint modification.
|
|
|
|
*/
|
2001-01-26 02:24:31 +01:00
|
|
|
HOLD_INTERRUPTS();
|
|
|
|
rightopaque->btpo_parent = rootblk;
|
|
|
|
if (XLByteLT(PageGetLSN(rightpage), rootLSN))
|
|
|
|
PageSetLSN(rightpage, rootLSN);
|
|
|
|
PageSetSUI(rightpage, ThisStartUpID);
|
|
|
|
RESUME_INTERRUPTS();
|
|
|
|
|
|
|
|
ritem = (BTItem) PageGetItem(leftpage, PageGetItemId(leftpage, P_HIKEY));
|
|
|
|
btitem = _bt_formitem(&(ritem->bti_itup));
|
|
|
|
ItemPointerSet(&(btitem->bti_itup.t_tid), leftopaque->btpo_next, P_HIKEY);
|
|
|
|
itemsz = IndexTupleDSize(btitem->bti_itup)
|
|
|
|
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
|
|
|
|
itemsz = MAXALIGN(itemsz);
|
|
|
|
|
|
|
|
newitemoff = OffsetNumberNext(PageGetMaxOffsetNumber(page));
|
|
|
|
|
|
|
|
if (PageGetFreeSpace(page) < itemsz)
|
|
|
|
{
|
|
|
|
Buffer newbuf;
|
|
|
|
OffsetNumber firstright;
|
|
|
|
OffsetNumber itup_off;
|
|
|
|
BlockNumber itup_blkno;
|
|
|
|
bool newitemonleft;
|
|
|
|
|
|
|
|
firstright = _bt_findsplitloc(rel, page,
|
|
|
|
newitemoff, itemsz, &newitemonleft);
|
|
|
|
newbuf = _bt_split(rel, buf, firstright,
|
|
|
|
newitemoff, itemsz, btitem, newitemonleft,
|
|
|
|
&itup_off, &itup_blkno);
|
|
|
|
/* Keep lock on new "root" buffer ! */
|
|
|
|
if (buf != rootbuf)
|
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
|
|
|
buf = newbuf;
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
_bt_insertuple(rel, buf, itemsz, btitem, newitemoff);
|
|
|
|
|
|
|
|
/* give up left buffer */
|
2001-02-08 00:35:33 +01:00
|
|
|
_bt_wrtbuf(rel, leftbuf);
|
2001-01-31 02:08:36 +01:00
|
|
|
pfree(btitem);
|
2001-01-26 02:24:31 +01:00
|
|
|
leftbuf = rightbuf;
|
|
|
|
leftpage = rightpage;
|
|
|
|
leftopaque = rightopaque;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* give up rightmost page buffer */
|
2001-02-08 00:35:33 +01:00
|
|
|
_bt_wrtbuf(rel, leftbuf);
|
2001-01-26 02:24:31 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Here we hold locks on old root buffer, new root buffer we've
|
|
|
|
* created with _bt_newroot() - rootbuf, - and buf we've used
|
|
|
|
* for last insert ops - buf. If rootbuf != buf then we have to
|
|
|
|
* create at least one more level. And if "release" is TRUE
|
2001-01-29 08:28:17 +01:00
|
|
|
* then we give up oldrootbuf.
|
2001-01-26 02:24:31 +01:00
|
|
|
*/
|
|
|
|
if (release)
|
2001-02-08 00:35:33 +01:00
|
|
|
_bt_wrtbuf(rel, oldrootbuf);
|
2001-01-26 02:24:31 +01:00
|
|
|
|
|
|
|
if (rootbuf != buf)
|
|
|
|
{
|
2001-02-08 00:35:33 +01:00
|
|
|
_bt_wrtbuf(rel, buf);
|
2001-01-26 02:24:31 +01:00
|
|
|
return(_bt_fixroot(rel, rootbuf, true));
|
|
|
|
}
|
|
|
|
|
|
|
|
return(rootbuf);
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
|
|
|
|
2001-01-31 02:08:36 +01:00
|
|
|
/*
|
|
|
|
* Using blkno of leftmost page on a level inside tree this func
|
|
|
|
* checks/fixes tree from this level up to the root page.
|
|
|
|
*/
|
2001-01-29 08:28:17 +01:00
|
|
|
static void
|
2001-01-31 02:08:36 +01:00
|
|
|
_bt_fixtree(Relation rel, BlockNumber blkno)
|
|
|
|
{
|
|
|
|
Buffer buf;
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
BlockNumber pblkno;
|
|
|
|
|
|
|
|
for ( ; ; )
|
|
|
|
{
|
|
|
|
buf = _bt_getbuf(rel, blkno, BT_READ);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
if (! P_LEFTMOST(opaque) || P_ISLEAF(opaque))
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_fixtree[%s]: invalid start page (need to recreate index)", RelationGetRelationName(rel));
|
2001-01-31 02:08:36 +01:00
|
|
|
pblkno = opaque->btpo_parent;
|
|
|
|
|
|
|
|
/* check/fix entire level */
|
|
|
|
_bt_fixlevel(rel, buf, InvalidBlockNumber);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* No pins/locks are held here. Re-read start page if its
|
|
|
|
* btpo_parent pointed to meta page else go up one level.
|
2001-02-08 00:35:33 +01:00
|
|
|
*
|
|
|
|
* XXX have to catch InvalidBlockNumber at the moment -:(
|
2001-01-31 02:08:36 +01:00
|
|
|
*/
|
2001-02-08 00:35:33 +01:00
|
|
|
if (pblkno == BTREE_METAPAGE || pblkno == InvalidBlockNumber)
|
2001-01-31 02:08:36 +01:00
|
|
|
{
|
|
|
|
buf = _bt_getbuf(rel, blkno, BT_WRITE);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
if (P_ISROOT(opaque))
|
|
|
|
{
|
|
|
|
/* Tree is Ok now */
|
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* Call _bt_fixroot() if there is no upper level */
|
2001-02-08 00:35:33 +01:00
|
|
|
if (BTreeInvalidParent(opaque))
|
2001-01-31 02:08:36 +01:00
|
|
|
{
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(NOTICE, "bt_fixtree[%s]: fixing root page", RelationGetRelationName(rel));
|
2001-01-31 02:08:36 +01:00
|
|
|
buf = _bt_fixroot(rel, buf, true);
|
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* Have to go up one level */
|
2001-02-08 00:35:33 +01:00
|
|
|
pblkno = opaque->btpo_parent;
|
2001-01-31 02:08:36 +01:00
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
|
|
|
}
|
2001-02-08 00:35:33 +01:00
|
|
|
blkno = pblkno;
|
2001-01-31 02:08:36 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check/fix level starting from page in buffer buf up to block
|
|
|
|
* limit on *child* level (or till rightmost child page if limit
|
|
|
|
* is InvalidBlockNumber). Start buffer must be read locked.
|
2001-02-02 20:49:15 +01:00
|
|
|
* No pins/locks are held on exit.
|
2001-01-31 02:08:36 +01:00
|
|
|
*/
|
2001-02-02 20:49:15 +01:00
|
|
|
static void
|
2001-01-31 02:08:36 +01:00
|
|
|
_bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit)
|
|
|
|
{
|
|
|
|
BlockNumber blkno = BufferGetBlockNumber(buf);
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
BlockNumber cblkno[3];
|
|
|
|
OffsetNumber coff[3];
|
|
|
|
Buffer cbuf[3];
|
|
|
|
Page cpage[3];
|
|
|
|
BTPageOpaque copaque[3];
|
|
|
|
BTItem btitem;
|
|
|
|
int cidx, i;
|
|
|
|
bool goodbye = false;
|
|
|
|
char tbuf[BLCKSZ];
|
|
|
|
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
/* copy page to temp storage */
|
|
|
|
memmove(tbuf, page, PageGetPageSize(page));
|
|
|
|
_bt_relbuf(rel, buf, BT_READ);
|
|
|
|
|
|
|
|
page = (Page)tbuf;
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
|
|
|
|
/* Initialize first child data */
|
|
|
|
coff[0] = P_FIRSTDATAKEY(opaque);
|
|
|
|
if (coff[0] > PageGetMaxOffsetNumber(page))
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_fixlevel[%s]: invalid maxoff on start page (need to recreate index)", RelationGetRelationName(rel));
|
2001-01-31 02:08:36 +01:00
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, coff[0]));
|
|
|
|
cblkno[0] = ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid));
|
|
|
|
cbuf[0] = _bt_getbuf(rel, cblkno[0], BT_READ);
|
|
|
|
cpage[0] = BufferGetPage(cbuf[0]);
|
|
|
|
copaque[0] = (BTPageOpaque) PageGetSpecialPointer(cpage[0]);
|
|
|
|
if (P_LEFTMOST(opaque) && ! P_LEFTMOST(copaque[0]))
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_fixtlevel[%s]: non-leftmost child page of leftmost parent (need to recreate index)", RelationGetRelationName(rel));
|
2001-01-31 02:08:36 +01:00
|
|
|
/* caller should take care and avoid this */
|
|
|
|
if (P_RIGHTMOST(copaque[0]))
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_fixtlevel[%s]: invalid start child (need to recreate index)", RelationGetRelationName(rel));
|
2001-01-31 02:08:36 +01:00
|
|
|
|
|
|
|
for ( ; ; )
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Read up to 2 more child pages and look for pointers
|
|
|
|
* to them in *saved* parent page
|
|
|
|
*/
|
|
|
|
coff[1] = coff[2] = InvalidOffsetNumber;
|
|
|
|
for (cidx = 0; cidx < 2; )
|
|
|
|
{
|
|
|
|
cidx++;
|
|
|
|
cblkno[cidx] = (copaque[cidx - 1])->btpo_next;
|
|
|
|
cbuf[cidx] = _bt_getbuf(rel, cblkno[cidx], BT_READ);
|
|
|
|
cpage[cidx] = BufferGetPage(cbuf[cidx]);
|
|
|
|
copaque[cidx] = (BTPageOpaque) PageGetSpecialPointer(cpage[cidx]);
|
|
|
|
coff[cidx] = _bt_getoff(page, cblkno[cidx]);
|
|
|
|
|
|
|
|
/* sanity check */
|
|
|
|
if (coff[cidx] != InvalidOffsetNumber)
|
|
|
|
{
|
|
|
|
for (i = cidx - 1; i >= 0; i--)
|
|
|
|
{
|
|
|
|
if (coff[i] == InvalidOffsetNumber)
|
|
|
|
continue;
|
|
|
|
if (coff[cidx] != coff[i] + 1)
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_fixlevel[%s]: invalid item order(1) (need to recreate index)", RelationGetRelationName(rel));
|
2001-01-31 02:08:36 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (P_RIGHTMOST(copaque[cidx]))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read parent page and insert missed pointers.
|
|
|
|
*/
|
|
|
|
if (coff[1] == InvalidOffsetNumber ||
|
|
|
|
(cidx == 2 && coff[2] == InvalidOffsetNumber))
|
|
|
|
{
|
|
|
|
Buffer newbuf;
|
|
|
|
Page newpage;
|
|
|
|
BTPageOpaque newopaque;
|
|
|
|
BTItem ritem;
|
|
|
|
Size itemsz;
|
|
|
|
OffsetNumber newitemoff;
|
|
|
|
BlockNumber parblk[3];
|
|
|
|
BTStackData stack;
|
|
|
|
|
|
|
|
stack.bts_parent = NULL;
|
2001-02-02 20:49:15 +01:00
|
|
|
stack.bts_blkno = blkno;
|
2001-01-31 02:08:36 +01:00
|
|
|
stack.bts_offset = InvalidOffsetNumber;
|
|
|
|
ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid),
|
|
|
|
cblkno[0], P_HIKEY);
|
|
|
|
|
2001-02-02 20:49:15 +01:00
|
|
|
buf = _bt_getstackbuf(rel, &stack, BT_WRITE);
|
2001-01-31 02:08:36 +01:00
|
|
|
if (buf == InvalidBuffer)
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_fixlevel[%s]: pointer disappeared (need to recreate index)", RelationGetRelationName(rel));
|
2001-01-31 02:08:36 +01:00
|
|
|
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
coff[0] = stack.bts_offset;
|
2001-02-02 20:49:15 +01:00
|
|
|
blkno = BufferGetBlockNumber(buf);
|
|
|
|
parblk[0] = blkno;
|
2001-01-31 02:08:36 +01:00
|
|
|
|
|
|
|
/* Check/insert missed pointers */
|
|
|
|
for (i = 1; i <= cidx; i++)
|
|
|
|
{
|
|
|
|
coff[i] = _bt_getoff(page, cblkno[i]);
|
|
|
|
|
|
|
|
/* sanity check */
|
|
|
|
parblk[i] = BufferGetBlockNumber(buf);
|
|
|
|
if (coff[i] != InvalidOffsetNumber)
|
|
|
|
{
|
|
|
|
if (parblk[i] == parblk[i - 1] &&
|
|
|
|
coff[i] != coff[i - 1] + 1)
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_fixlevel[%s]: invalid item order(2) (need to recreate index)", RelationGetRelationName(rel));
|
2001-01-31 02:08:36 +01:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* Have to check next page ? */
|
|
|
|
if ((! P_RIGHTMOST(opaque)) &&
|
|
|
|
coff[i - 1] == PageGetMaxOffsetNumber(page)) /* yes */
|
|
|
|
{
|
|
|
|
newbuf = _bt_getbuf(rel, opaque->btpo_next, BT_WRITE);
|
|
|
|
newpage = BufferGetPage(newbuf);
|
|
|
|
newopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
|
|
|
|
coff[i] = _bt_getoff(newpage, cblkno[i]);
|
|
|
|
if (coff[i] != InvalidOffsetNumber) /* found ! */
|
|
|
|
{
|
|
|
|
if (coff[i] != P_FIRSTDATAKEY(newopaque))
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_fixlevel[%s]: invalid item order(3) (need to recreate index)", RelationGetRelationName(rel));
|
2001-01-31 02:08:36 +01:00
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
|
|
|
buf = newbuf;
|
|
|
|
page = newpage;
|
|
|
|
opaque = newopaque;
|
2001-02-02 20:49:15 +01:00
|
|
|
blkno = BufferGetBlockNumber(buf);
|
|
|
|
parblk[i] = blkno;
|
2001-01-31 02:08:36 +01:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* unfound - need to insert on current page */
|
|
|
|
_bt_relbuf(rel, newbuf, BT_WRITE);
|
|
|
|
}
|
|
|
|
/* insert pointer */
|
|
|
|
ritem = (BTItem) PageGetItem(cpage[i - 1],
|
|
|
|
PageGetItemId(cpage[i - 1], P_HIKEY));
|
|
|
|
btitem = _bt_formitem(&(ritem->bti_itup));
|
|
|
|
ItemPointerSet(&(btitem->bti_itup.t_tid), cblkno[i], P_HIKEY);
|
|
|
|
itemsz = IndexTupleDSize(btitem->bti_itup)
|
|
|
|
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
|
|
|
|
itemsz = MAXALIGN(itemsz);
|
|
|
|
|
|
|
|
newitemoff = coff[i - 1] + 1;
|
|
|
|
|
|
|
|
if (PageGetFreeSpace(page) < itemsz)
|
|
|
|
{
|
|
|
|
OffsetNumber firstright;
|
|
|
|
OffsetNumber itup_off;
|
|
|
|
BlockNumber itup_blkno;
|
|
|
|
bool newitemonleft;
|
|
|
|
|
|
|
|
firstright = _bt_findsplitloc(rel, page,
|
|
|
|
newitemoff, itemsz, &newitemonleft);
|
|
|
|
newbuf = _bt_split(rel, buf, firstright,
|
|
|
|
newitemoff, itemsz, btitem, newitemonleft,
|
|
|
|
&itup_off, &itup_blkno);
|
|
|
|
/* what buffer we need in ? */
|
|
|
|
if (newitemonleft)
|
|
|
|
_bt_relbuf(rel, newbuf, BT_WRITE);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
|
|
|
buf = newbuf;
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
}
|
2001-02-02 20:49:15 +01:00
|
|
|
blkno = BufferGetBlockNumber(buf);
|
2001-01-31 02:08:36 +01:00
|
|
|
coff[i] = itup_off;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
_bt_insertuple(rel, buf, itemsz, btitem, newitemoff);
|
|
|
|
coff[i] = newitemoff;
|
|
|
|
}
|
|
|
|
|
|
|
|
pfree(btitem);
|
2001-02-02 20:49:15 +01:00
|
|
|
parblk[i] = blkno;
|
2001-01-31 02:08:36 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/* copy page with pointer to cblkno[cidx] to temp storage */
|
|
|
|
memmove(tbuf, page, PageGetPageSize(page));
|
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
|
|
|
page = (Page)tbuf;
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Continue if current check/fix level page is rightmost */
|
|
|
|
if (P_RIGHTMOST(opaque))
|
|
|
|
goodbye = false;
|
|
|
|
|
|
|
|
/* Pointers to child pages are Ok - right end of child level ? */
|
|
|
|
_bt_relbuf(rel, cbuf[0], BT_READ);
|
|
|
|
_bt_relbuf(rel, cbuf[1], BT_READ);
|
|
|
|
if (cidx == 1 ||
|
|
|
|
(cidx == 2 && (P_RIGHTMOST(copaque[2]) || goodbye)))
|
|
|
|
{
|
|
|
|
if (cidx == 2)
|
|
|
|
_bt_relbuf(rel, cbuf[2], BT_READ);
|
2001-02-02 20:49:15 +01:00
|
|
|
return;
|
2001-01-31 02:08:36 +01:00
|
|
|
}
|
|
|
|
if (cblkno[0] == limit || cblkno[1] == limit)
|
|
|
|
goodbye = true;
|
|
|
|
cblkno[0] = cblkno[2];
|
|
|
|
cbuf[0] = cbuf[2];
|
|
|
|
cpage[0] = cpage[2];
|
|
|
|
copaque[0] = copaque[2];
|
|
|
|
coff[0] = coff[2];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2001-02-02 20:49:15 +01:00
|
|
|
/*
|
|
|
|
* Check/fix part of tree - branch - up from parent of level with blocks
|
|
|
|
* lblkno and rblknum. We first ensure that parent level has pointers
|
|
|
|
* to both lblkno & rblknum and if those pointers are on different
|
|
|
|
* parent pages then do the same for parent level, etc. No locks must
|
|
|
|
* be held on target level and upper on entry. No locks will be held
|
|
|
|
* on exit. Stack created when traversing tree down should be provided and
|
|
|
|
* it must points to parent level. rblkno must be on the right from lblkno.
|
|
|
|
* (This function is special edition of more expensive _bt_fixtree(),
|
|
|
|
* but it doesn't guarantee full consistency of tree.)
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
_bt_fixbranch(Relation rel, BlockNumber lblkno,
|
|
|
|
BlockNumber rblkno, BTStack true_stack)
|
|
|
|
{
|
|
|
|
BlockNumber blkno = true_stack->bts_blkno;
|
|
|
|
BTStackData stack;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
Buffer buf, rbuf;
|
|
|
|
Page page;
|
|
|
|
OffsetNumber offnum;
|
|
|
|
|
|
|
|
true_stack = true_stack->bts_parent;
|
|
|
|
for ( ; ; )
|
|
|
|
{
|
|
|
|
buf = _bt_getbuf(rel, blkno, BT_READ);
|
|
|
|
|
|
|
|
/* Check/fix parent level pointed by blkno */
|
|
|
|
_bt_fixlevel(rel, buf, rblkno);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Here parent level should have pointers for both
|
|
|
|
* lblkno and rblkno and we have to find them.
|
|
|
|
*/
|
|
|
|
stack.bts_parent = NULL;
|
|
|
|
stack.bts_blkno = blkno;
|
|
|
|
stack.bts_offset = InvalidOffsetNumber;
|
|
|
|
ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid), lblkno, P_HIKEY);
|
|
|
|
buf = _bt_getstackbuf(rel, &stack, BT_READ);
|
|
|
|
if (buf == InvalidBuffer)
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_fixbranch[%s]: left pointer unfound (need to recreate index)", RelationGetRelationName(rel));
|
2001-02-02 20:49:15 +01:00
|
|
|
page = BufferGetPage(buf);
|
|
|
|
offnum = _bt_getoff(page, rblkno);
|
|
|
|
|
|
|
|
if (offnum != InvalidOffsetNumber) /* right pointer found */
|
|
|
|
{
|
|
|
|
if (offnum <= stack.bts_offset)
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_fixbranch[%s]: invalid item order (need to recreate index)", RelationGetRelationName(rel));
|
2001-02-02 20:49:15 +01:00
|
|
|
_bt_relbuf(rel, buf, BT_READ);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Pointers are on different parent pages - find right one */
|
|
|
|
lblkno = BufferGetBlockNumber(buf);
|
2001-02-08 00:35:33 +01:00
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
if (P_RIGHTMOST(opaque))
|
|
|
|
elog(ERROR, "bt_fixbranch[%s]: right pointer unfound(1) (need to recreate index)", RelationGetRelationName(rel));
|
2001-02-02 20:49:15 +01:00
|
|
|
|
|
|
|
stack.bts_parent = NULL;
|
2001-02-08 00:35:33 +01:00
|
|
|
stack.bts_blkno = opaque->btpo_next;
|
2001-02-02 20:49:15 +01:00
|
|
|
stack.bts_offset = InvalidOffsetNumber;
|
|
|
|
ItemPointerSet(&(stack.bts_btitem.bti_itup.t_tid), rblkno, P_HIKEY);
|
|
|
|
rbuf = _bt_getstackbuf(rel, &stack, BT_READ);
|
|
|
|
if (rbuf == InvalidBuffer)
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(ERROR, "bt_fixbranch[%s]: right pointer unfound(2) (need to recreate index)", RelationGetRelationName(rel));
|
2001-02-02 20:49:15 +01:00
|
|
|
rblkno = BufferGetBlockNumber(rbuf);
|
|
|
|
_bt_relbuf(rel, rbuf, BT_READ);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have parent item in true_stack then go up one level and
|
|
|
|
* ensure that it has pointers to new lblkno & rblkno.
|
|
|
|
*/
|
|
|
|
if (true_stack)
|
|
|
|
{
|
|
|
|
_bt_relbuf(rel, buf, BT_READ);
|
|
|
|
blkno = true_stack->bts_blkno;
|
|
|
|
true_stack = true_stack->bts_parent;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Well, we are on the level that was root or unexistent when
|
|
|
|
* we started traversing tree down. If btpo_parent is updated
|
|
|
|
* then we'll use it to continue, else we'll fix/restore upper
|
|
|
|
* levels entirely.
|
|
|
|
*/
|
2001-02-08 00:35:33 +01:00
|
|
|
if (!BTreeInvalidParent(opaque))
|
2001-02-02 20:49:15 +01:00
|
|
|
{
|
|
|
|
blkno = opaque->btpo_parent;
|
|
|
|
_bt_relbuf(rel, buf, BT_READ);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Have to switch to excl buf lock and re-check btpo_parent */
|
|
|
|
_bt_relbuf(rel, buf, BT_READ);
|
|
|
|
buf = _bt_getbuf(rel, blkno, BT_WRITE);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
2001-02-08 00:35:33 +01:00
|
|
|
if (!BTreeInvalidParent(opaque))
|
2001-02-02 20:49:15 +01:00
|
|
|
{
|
|
|
|
blkno = opaque->btpo_parent;
|
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We hold excl lock on some internal page with unupdated
|
|
|
|
* btpo_parent - time for _bt_fixup.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(NOTICE, "bt_fixbranch[%s]: fixing upper levels", RelationGetRelationName(rel));
|
2001-02-02 20:49:15 +01:00
|
|
|
_bt_fixup(rel, buf);
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Having buf excl locked this routine walks to the left on level and
|
|
|
|
* uses either _bt_fixtree() or _bt_fixroot() to create/check&fix upper
|
|
|
|
* levels. No buffer pins/locks will be held on exit.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
_bt_fixup(Relation rel, Buffer buf)
|
|
|
|
{
|
|
|
|
Page page;
|
|
|
|
BTPageOpaque opaque;
|
|
|
|
BlockNumber blkno;
|
|
|
|
|
|
|
|
for ( ; ; )
|
|
|
|
{
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
/*
|
|
|
|
* If someone else already created parent pages
|
|
|
|
* then it's time for _bt_fixtree() to check upper
|
|
|
|
* levels and fix them, if required.
|
|
|
|
*/
|
2001-02-08 00:35:33 +01:00
|
|
|
if (!BTreeInvalidParent(opaque))
|
2001-02-02 20:49:15 +01:00
|
|
|
{
|
|
|
|
blkno = opaque->btpo_parent;
|
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(NOTICE, "bt_fixup[%s]: checking/fixing upper levels", RelationGetRelationName(rel));
|
2001-02-02 20:49:15 +01:00
|
|
|
_bt_fixtree(rel, blkno);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (P_LEFTMOST(opaque))
|
|
|
|
break;
|
|
|
|
blkno = opaque->btpo_prev;
|
|
|
|
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
|
|
|
ReleaseBuffer(buf);
|
|
|
|
buf = _bt_getbuf(rel, blkno, BT_WRITE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ok, we are on the leftmost page, it's write locked
|
|
|
|
* by us and its btpo_parent points to meta page - time
|
|
|
|
* for _bt_fixroot().
|
|
|
|
*/
|
2001-02-08 00:35:33 +01:00
|
|
|
elog(NOTICE, "bt_fixup[%s]: fixing root page", RelationGetRelationName(rel));
|
2001-02-02 20:49:15 +01:00
|
|
|
buf = _bt_fixroot(rel, buf, true);
|
|
|
|
_bt_relbuf(rel, buf, BT_WRITE);
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2001-01-31 02:08:36 +01:00
|
|
|
static OffsetNumber
|
|
|
|
_bt_getoff(Page page, BlockNumber blkno)
|
2001-01-29 08:28:17 +01:00
|
|
|
{
|
2001-01-31 02:08:36 +01:00
|
|
|
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
OffsetNumber offnum = P_FIRSTDATAKEY(opaque);
|
|
|
|
BlockNumber curblkno;
|
|
|
|
ItemId itemid;
|
|
|
|
BTItem item;
|
|
|
|
|
|
|
|
for ( ; offnum <= maxoff; offnum++)
|
|
|
|
{
|
|
|
|
itemid = PageGetItemId(page, offnum);
|
|
|
|
item = (BTItem) PageGetItem(page, itemid);
|
|
|
|
curblkno = ItemPointerGetBlockNumber(&(item->bti_itup.t_tid));
|
|
|
|
if (curblkno == blkno)
|
|
|
|
return(offnum);
|
|
|
|
}
|
|
|
|
|
|
|
|
return(InvalidOffsetNumber);
|
2001-01-29 08:28:17 +01:00
|
|
|
}
|
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
/*
|
1997-09-07 07:04:48 +02:00
|
|
|
* _bt_pgaddtup() -- add a tuple to a particular page in the index.
|
1996-07-09 08:22:35 +02:00
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* This routine adds the tuple to the page as requested. It does
|
|
|
|
* not affect pin/lock status, but you'd better have a write lock
|
|
|
|
* and pin on the target buffer! Don't forget to write and release
|
|
|
|
* the buffer afterwards, either.
|
|
|
|
*
|
|
|
|
* The main difference between this routine and a bare PageAddItem call
|
|
|
|
* is that this code knows that the leftmost data item on a non-leaf
|
|
|
|
* btree page doesn't need to have a key. Therefore, it strips such
|
|
|
|
* items down to just the item header. CAUTION: this works ONLY if
|
|
|
|
* we insert the items in order, so that the given itup_off does
|
|
|
|
* represent the final position of the item!
|
1996-07-09 08:22:35 +02:00
|
|
|
*/
|
2000-07-21 08:42:39 +02:00
|
|
|
static void
|
1996-07-09 08:22:35 +02:00
|
|
|
_bt_pgaddtup(Relation rel,
|
2000-07-21 08:42:39 +02:00
|
|
|
Page page,
|
1997-09-07 07:04:48 +02:00
|
|
|
Size itemsize,
|
|
|
|
BTItem btitem,
|
2000-07-21 08:42:39 +02:00
|
|
|
OffsetNumber itup_off,
|
|
|
|
const char *where)
|
1996-07-09 08:22:35 +02:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
|
|
|
BTItemData truncitem;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
if (! P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
|
2000-02-18 07:32:39 +01:00
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
memcpy(&truncitem, btitem, sizeof(BTItemData));
|
|
|
|
truncitem.bti_itup.t_info = sizeof(BTItemData);
|
|
|
|
btitem = &truncitem;
|
|
|
|
itemsize = sizeof(BTItemData);
|
2000-02-18 07:32:39 +01:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
if (PageAddItem(page, (Item) btitem, itemsize, itup_off,
|
|
|
|
LP_USED) == InvalidOffsetNumber)
|
2000-10-04 02:04:43 +02:00
|
|
|
elog(STOP, "btree: failed to add item to the %s for %s",
|
2000-07-21 08:42:39 +02:00
|
|
|
where, RelationGetRelationName(rel));
|
1996-07-09 08:22:35 +02:00
|
|
|
}
|
1997-03-24 09:48:16 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* _bt_isequal - used in _bt_doinsert in check for duplicates.
|
|
|
|
*
|
2000-07-21 08:42:39 +02:00
|
|
|
* This is very similar to _bt_compare, except for NULL handling.
|
1997-03-24 09:48:16 +01:00
|
|
|
* Rule is simple: NOT_NULL not equal NULL, NULL not_equal NULL too.
|
|
|
|
*/
|
1997-09-08 04:41:22 +02:00
|
|
|
static bool
|
1997-09-07 07:04:48 +02:00
|
|
|
_bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
|
|
|
|
int keysz, ScanKey scankey)
|
1997-03-24 09:48:16 +01:00
|
|
|
{
|
1997-09-08 04:41:22 +02:00
|
|
|
BTItem btitem;
|
|
|
|
IndexTuple itup;
|
|
|
|
int i;
|
2000-07-21 08:42:39 +02:00
|
|
|
|
|
|
|
/* Better be comparing to a leaf item */
|
|
|
|
Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
|
|
|
btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
|
|
|
itup = &(btitem->bti_itup);
|
|
|
|
|
|
|
|
for (i = 1; i <= keysz; i++)
|
|
|
|
{
|
2000-07-21 08:42:39 +02:00
|
|
|
ScanKey entry = &scankey[i - 1];
|
|
|
|
AttrNumber attno;
|
|
|
|
Datum datum;
|
|
|
|
bool isNull;
|
|
|
|
int32 result;
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
attno = entry->sk_attno;
|
|
|
|
Assert(attno == i);
|
2000-07-21 08:42:39 +02:00
|
|
|
datum = index_getattr(itup, attno, itupdesc, &isNull);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* NULLs are never equal to anything */
|
|
|
|
if (entry->sk_flags & SK_ISNULL || isNull)
|
1998-09-01 05:29:17 +02:00
|
|
|
return false;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2000-05-30 06:25:00 +02:00
|
|
|
result = DatumGetInt32(FunctionCall2(&entry->sk_func,
|
2000-07-21 08:42:39 +02:00
|
|
|
entry->sk_argument,
|
|
|
|
datum));
|
|
|
|
|
1997-09-07 07:04:48 +02:00
|
|
|
if (result != 0)
|
1998-09-01 05:29:17 +02:00
|
|
|
return false;
|
1997-09-07 07:04:48 +02:00
|
|
|
}
|
|
|
|
|
2000-07-21 08:42:39 +02:00
|
|
|
/* if we get here, the keys are equal */
|
1998-09-01 05:29:17 +02:00
|
|
|
return true;
|
1997-03-24 09:48:16 +01:00
|
|
|
}
|