Harmonize nbtree page split point code.

An nbtree split point can be thought of as a point between two adjoining
tuples from an imaginary version of the page being split that includes
the incoming/new item (in addition to the items that really are on the
page).  These adjoining tuples are called the lastleft and firstright
tuples.

The variables that represent split points contained a field called
firstright, which was the offset number of the first data item from the
original page that goes on the new right page.  The corresponding tuple
from origpage was usually the same thing as the actual firstright tuple,
but not always: the firstright tuple is sometimes the new/incoming item
instead.  This situation seems unnecessarily confusing.

Make things clearer by renaming the origpage offset returned by
_bt_findsplitloc() to "firstrightoff".  We now have a firstright tuple
and a firstrightoff offset number which are comparable to the
newitem/lastleft tuples and the newitemoff/lastleftoff offset numbers
respectively.  Also make sure that we are consistent about how we
describe nbtree page split point state.

Push the responsibility for dealing with pg_upgrade'd !heapkeyspace
indexes down to lower level code, relieving _bt_split() from dealing
with it directly.  This means that we always have a palloc'd left page
high key on the leaf level, no matter what.  This enables simplifying
some of the code (and code comments) within _bt_split().

Finally, restructure the page split code to make it clearer why suffix
truncation (which only takes place during leaf page splits) is
completely different from the first data item truncation that takes place
during internal page splits.  Tuples are marked as having fewer
attributes stored in both cases, and the firstright tuple is truncated
in both cases, so it's easy to imagine somebody missing the distinction.
This commit is contained in:
Peter Geoghegan 2020-04-13 16:39:55 -07:00
parent 8f00d84afc
commit bc3087b626
8 changed files with 334 additions and 291 deletions

View File

@ -1121,7 +1121,7 @@ bt_target_page_check(BtreeCheckState *state)
* designated purpose. Enforce the lower limit for pivot tuples when * designated purpose. Enforce the lower limit for pivot tuples when
* an explicit heap TID isn't actually present. (In all other cases * an explicit heap TID isn't actually present. (In all other cases
* suffix truncation is guaranteed to generate a pivot tuple that's no * suffix truncation is guaranteed to generate a pivot tuple that's no
* larger than the first right tuple provided to it by its caller.) * larger than the firstright tuple provided to it by its caller.)
*/ */
lowersizelimit = skey->heapkeyspace && lowersizelimit = skey->heapkeyspace &&
(P_ISLEAF(topaque) || BTreeTupleGetHeapTID(itup) == NULL); (P_ISLEAF(topaque) || BTreeTupleGetHeapTID(itup) == NULL);

View File

@ -56,8 +56,8 @@ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
BTStack stack, bool is_root, bool is_only); BTStack stack, bool is_root, bool is_only);
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
OffsetNumber itup_off); OffsetNumber itup_off, bool newfirstdataitem);
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
/* /*
@ -1452,18 +1452,18 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
BTPageOpaque sopaque = NULL; BTPageOpaque sopaque = NULL;
Size itemsz; Size itemsz;
ItemId itemid; ItemId itemid;
IndexTuple item; IndexTuple firstright,
OffsetNumber leftoff, lefthighkey;
rightoff; OffsetNumber firstrightoff;
OffsetNumber firstright; OffsetNumber afterleftoff,
afterrightoff,
minusinfoff;
OffsetNumber origpagepostingoff; OffsetNumber origpagepostingoff;
OffsetNumber maxoff; OffsetNumber maxoff;
OffsetNumber i; OffsetNumber i;
bool newitemonleft, bool newitemonleft,
isleaf; isleaf,
IndexTuple lefthikey; isrightmost;
int indnatts = IndexRelationGetNumberOfAttributes(rel);
int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
/* /*
* origpage is the original page to be split. leftpage is a temporary * origpage is the original page to be split. leftpage is a temporary
@ -1480,24 +1480,36 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
*/ */
origpage = BufferGetPage(buf); origpage = BufferGetPage(buf);
oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
isleaf = P_ISLEAF(oopaque);
isrightmost = P_RIGHTMOST(oopaque);
maxoff = PageGetMaxOffsetNumber(origpage);
origpagenumber = BufferGetBlockNumber(buf); origpagenumber = BufferGetBlockNumber(buf);
/* /*
* Choose a point to split origpage at. * Choose a point to split origpage at.
* *
* A split point can be thought of as a point _between_ two existing * A split point can be thought of as a point _between_ two existing data
* tuples on origpage (lastleft and firstright tuples), provided you * items on origpage (the lastleft and firstright tuples), provided you
* pretend that the new item that didn't fit is already on origpage. * pretend that the new item that didn't fit is already on origpage.
* *
* Since origpage does not actually contain newitem, the representation of * Since origpage does not actually contain newitem, the representation of
* split points needs to work with two boundary cases: splits where * split points needs to work with two boundary cases: splits where
* newitem is lastleft, and splits where newitem is firstright. * newitem is lastleft, and splits where newitem is firstright.
* newitemonleft resolves the ambiguity that would otherwise exist when * newitemonleft resolves the ambiguity that would otherwise exist when
* newitemoff == firstright. In all other cases it's clear which side of * newitemoff == firstrightoff. In all other cases it's clear which side
* the split every tuple goes on from context. newitemonleft is usually * of the split every tuple goes on from context. newitemonleft is
* (but not always) redundant information. * usually (but not always) redundant information.
*
* firstrightoff is supposed to be an origpage offset number, but it's
* possible that its value will be maxoff+1, which is "past the end" of
* origpage. This happens in the rare case where newitem goes after all
* existing items (i.e. newitemoff is maxoff+1) and we end up splitting
* origpage at the point that leaves newitem alone on new right page. Any
* "!newitemonleft && newitemoff == firstrightoff" split point makes
* newitem the firstright tuple, though, so this case isn't a special
* case.
*/ */
firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
newitem, &newitemonleft); newitem, &newitemonleft);
/* Allocate temp buffer for leftpage */ /* Allocate temp buffer for leftpage */
@ -1524,7 +1536,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* examine the LSN and possibly dump it in a page image. * examine the LSN and possibly dump it in a page image.
*/ */
PageSetLSN(leftpage, PageGetLSN(origpage)); PageSetLSN(leftpage, PageGetLSN(origpage));
isleaf = P_ISLEAF(oopaque);
/* /*
* Determine page offset number of existing overlapped-with-orignewitem * Determine page offset number of existing overlapped-with-orignewitem
@ -1555,74 +1566,57 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
} }
/* /*
* The "high key" for the new left page will be the first key that's going * The high key for the new left page is a possibly-truncated copy of
* to go into the new right page, or a truncated version if this is a leaf * firstright on the leaf level (it's "firstright itself" on internal
* page split. * pages; see !isleaf comments below). This may seem to be contrary to
* Lehman & Yao's approach of using a copy of lastleft as the new high key
* when splitting on the leaf level. It isn't, though.
* *
* The high key for the left page is formed using the first item on the * Suffix truncation will leave the left page's high key fully equal to
* right page, which may seem to be contrary to Lehman & Yao's approach of * lastleft when lastleft and firstright are equal prior to heap TID (that
* using the left page's last item as its new high key when splitting on * is, the tiebreaker TID value comes from lastleft). It isn't actually
* the leaf level. It isn't, though: suffix truncation will leave the * necessary for a new leaf high key to be a copy of lastleft for the L&Y
* left page's high key fully equal to the last item on the left page when * "subtree" invariant to hold. It's sufficient to make sure that the new
* two tuples with equal key values (excluding heap TID) enclose the split * leaf high key is strictly less than firstright, and greater than or
* point. It isn't actually necessary for a new leaf high key to be equal * equal to (not necessarily equal to) lastleft. In other words, when
* to the last item on the left for the L&Y "subtree" invariant to hold. * suffix truncation isn't possible during a leaf page split, we take
* It's sufficient to make sure that the new leaf high key is strictly * L&Y's exact approach to generating a new high key for the left page.
* less than the first item on the right leaf page, and greater than or * (Actually, that is slightly inaccurate. We don't just use a copy of
* equal to (not necessarily equal to) the last item on the left leaf * lastleft. A tuple with all the keys from firstright but the max heap
* page. * TID from lastleft is used, to avoid introducing a special case.)
*
* In other words, when suffix truncation isn't possible, L&Y's exact
* approach to leaf splits is taken. (Actually, even that is slightly
* inaccurate. A tuple with all the keys from firstright but the heap TID
* from lastleft will be used as the new high key, since the last left
* tuple could be physically larger despite being opclass-equal in respect
* of all attributes prior to the heap TID attribute.)
*/ */
if (!newitemonleft && newitemoff == firstright) if (!newitemonleft && newitemoff == firstrightoff)
{ {
/* incoming tuple will become first on right page */ /* incoming tuple becomes firstright */
itemsz = newitemsz; itemsz = newitemsz;
item = newitem; firstright = newitem;
} }
else else
{ {
/* existing item at firstright will become first on right page */ /* existing item at firstrightoff becomes firstright */
itemid = PageGetItemId(origpage, firstright); itemid = PageGetItemId(origpage, firstrightoff);
itemsz = ItemIdGetLength(itemid); itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid); firstright = (IndexTuple) PageGetItem(origpage, itemid);
if (firstright == origpagepostingoff) if (firstrightoff == origpagepostingoff)
item = nposting; firstright = nposting;
} }
/* if (isleaf)
* Truncate unneeded key and non-key attributes of the high key item
* before inserting it on the left page. This can only happen at the leaf
* level, since in general all pivot tuple values originate from leaf
* level high keys. A pivot tuple in a grandparent page must guide a
* search not only to the correct parent page, but also to the correct
* leaf page.
*/
if (isleaf && (itup_key->heapkeyspace || indnatts != indnkeyatts))
{ {
IndexTuple lastleft; IndexTuple lastleft;
/* /* Attempt suffix truncation for leaf page splits */
* Determine which tuple will become the last on the left page. This if (newitemonleft && newitemoff == firstrightoff)
* is needed to decide how many attributes from the first item on the
* right page must remain in new high key for left page.
*/
if (newitemonleft && newitemoff == firstright)
{ {
/* incoming tuple will become last on left page */ /* incoming tuple becomes lastleft */
lastleft = newitem; lastleft = newitem;
} }
else else
{ {
OffsetNumber lastleftoff; OffsetNumber lastleftoff;
/* item just before firstright will become last on left page */ /* existing item before firstrightoff becomes lastleft */
lastleftoff = OffsetNumberPrev(firstright); lastleftoff = OffsetNumberPrev(firstrightoff);
Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
itemid = PageGetItemId(origpage, lastleftoff); itemid = PageGetItemId(origpage, lastleftoff);
lastleft = (IndexTuple) PageGetItem(origpage, itemid); lastleft = (IndexTuple) PageGetItem(origpage, itemid);
@ -1630,30 +1624,55 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
lastleft = nposting; lastleft = nposting;
} }
Assert(lastleft != item); lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
lefthikey = _bt_truncate(rel, lastleft, item, itup_key); itemsz = IndexTupleSize(lefthighkey);
itemsz = IndexTupleSize(lefthikey);
itemsz = MAXALIGN(itemsz);
} }
else else
lefthikey = item; {
/*
* Don't perform suffix truncation on a copy of firstright to make
* left page high key for internal page splits. Must use firstright
* as new high key directly.
*
* Each distinct separator key value originates as a leaf level high
* key; all other separator keys/pivot tuples are copied from one
* level down. A separator key in a grandparent page must be
* identical to high key in rightmost parent page of the subtree to
* its left, which must itself be identical to high key in rightmost
* child page of that same subtree (this even applies to separator
* from grandparent's high key). There must always be an unbroken
* "seam" of identical separator keys that guide index scans at every
* level, starting from the grandparent. That's why suffix truncation
* is unsafe here.
*
* Internal page splits will truncate firstright into a "negative
* infinity" data item when it gets inserted on the new right page
* below, though. This happens during the call to _bt_pgaddtup() for
* the new first data item for right page. Do not confuse this
* mechanism with suffix truncation. It is just a convenient way of
* implementing page splits that split the internal page "inside"
* firstright. The lefthighkey separator key cannot appear a second
* time in the right page (only firstright's downlink goes in right
* page).
*/
lefthighkey = firstright;
}
/* /*
* Add new high key to leftpage * Add new high key to leftpage
*/ */
leftoff = P_HIKEY; afterleftoff = P_HIKEY;
Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0); Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts); Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff, IndexRelationGetNumberOfKeyAttributes(rel));
false, false) == InvalidOffsetNumber) Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
elog(ERROR, "failed to add hikey to the left sibling" if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
false) == InvalidOffsetNumber)
elog(ERROR, "failed to add high key to the left sibling"
" while splitting block %u of index \"%s\"", " while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel)); origpagenumber, RelationGetRelationName(rel));
leftoff = OffsetNumberNext(leftoff); afterleftoff = OffsetNumberNext(afterleftoff);
/* be tidy */
if (lefthikey != item)
pfree(lefthikey);
/* /*
* Acquire a new right page to split into, now that left page has a new * Acquire a new right page to split into, now that left page has a new
@ -1700,26 +1719,37 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* the tree, then the first entry on the page is the high key from * the tree, then the first entry on the page is the high key from
* origpage. * origpage.
*/ */
rightoff = P_HIKEY; afterrightoff = P_HIKEY;
if (!P_RIGHTMOST(oopaque)) if (!isrightmost)
{ {
IndexTuple righthighkey;
itemid = PageGetItemId(origpage, P_HIKEY); itemid = PageGetItemId(origpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid); itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid); righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
Assert(BTreeTupleGetNAtts(item, rel) > 0); Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts); Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, IndexRelationGetNumberOfKeyAttributes(rel));
if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
false, false) == InvalidOffsetNumber) false, false) == InvalidOffsetNumber)
{ {
memset(rightpage, 0, BufferGetPageSize(rbuf)); memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add hikey to the right sibling" elog(ERROR, "failed to add high key to the right sibling"
" while splitting block %u of index \"%s\"", " while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel)); origpagenumber, RelationGetRelationName(rel));
} }
rightoff = OffsetNumberNext(rightoff); afterrightoff = OffsetNumberNext(afterrightoff);
} }
/*
* Internal page splits truncate first data item on right page -- it
* becomes "minus infinity" item for the page. Set this up here.
*/
minusinfoff = InvalidOffsetNumber;
if (!isleaf)
minusinfoff = afterrightoff;
/* /*
* Now transfer all the data items (non-pivot tuples in isleaf case, or * Now transfer all the data items (non-pivot tuples in isleaf case, or
* additional pivot tuples in !isleaf case) to the appropriate page. * additional pivot tuples in !isleaf case) to the appropriate page.
@ -1727,20 +1757,20 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* Note: we *must* insert at least the right page's items in item-number * Note: we *must* insert at least the right page's items in item-number
* order, for the benefit of _bt_restore_page(). * order, for the benefit of _bt_restore_page().
*/ */
maxoff = PageGetMaxOffsetNumber(origpage);
for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i)) for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
{ {
IndexTuple dataitem;
itemid = PageGetItemId(origpage, i); itemid = PageGetItemId(origpage, i);
itemsz = ItemIdGetLength(itemid); itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid); dataitem = (IndexTuple) PageGetItem(origpage, itemid);
/* replace original item with nposting due to posting split? */ /* replace original item with nposting due to posting split? */
if (i == origpagepostingoff) if (i == origpagepostingoff)
{ {
Assert(BTreeTupleIsPosting(item)); Assert(BTreeTupleIsPosting(dataitem));
Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
item = nposting; dataitem = nposting;
} }
/* does new item belong before this one? */ /* does new item belong before this one? */
@ -1748,56 +1778,59 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
{ {
if (newitemonleft) if (newitemonleft)
{ {
Assert(newitemoff <= firstright); Assert(newitemoff <= firstrightoff);
if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff)) if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
false))
{ {
memset(rightpage, 0, BufferGetPageSize(rbuf)); memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the left sibling" elog(ERROR, "failed to add new item to the left sibling"
" while splitting block %u of index \"%s\"", " while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel)); origpagenumber, RelationGetRelationName(rel));
} }
leftoff = OffsetNumberNext(leftoff); afterleftoff = OffsetNumberNext(afterleftoff);
} }
else else
{ {
Assert(newitemoff >= firstright); Assert(newitemoff >= firstrightoff);
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
afterrightoff == minusinfoff))
{ {
memset(rightpage, 0, BufferGetPageSize(rbuf)); memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the right sibling" elog(ERROR, "failed to add new item to the right sibling"
" while splitting block %u of index \"%s\"", " while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel)); origpagenumber, RelationGetRelationName(rel));
} }
rightoff = OffsetNumberNext(rightoff); afterrightoff = OffsetNumberNext(afterrightoff);
} }
} }
/* decide which page to put it on */ /* decide which page to put it on */
if (i < firstright) if (i < firstrightoff)
{ {
if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff)) if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
{ {
memset(rightpage, 0, BufferGetPageSize(rbuf)); memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add old item to the left sibling" elog(ERROR, "failed to add old item to the left sibling"
" while splitting block %u of index \"%s\"", " while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel)); origpagenumber, RelationGetRelationName(rel));
} }
leftoff = OffsetNumberNext(leftoff); afterleftoff = OffsetNumberNext(afterleftoff);
} }
else else
{ {
if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff)) if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
afterrightoff == minusinfoff))
{ {
memset(rightpage, 0, BufferGetPageSize(rbuf)); memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add old item to the right sibling" elog(ERROR, "failed to add old item to the right sibling"
" while splitting block %u of index \"%s\"", " while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel)); origpagenumber, RelationGetRelationName(rel));
} }
rightoff = OffsetNumberNext(rightoff); afterrightoff = OffsetNumberNext(afterrightoff);
} }
} }
/* cope with possibility that newitem goes at the end */ /* Handle case where newitem goes at the end of rightpage */
if (i <= newitemoff) if (i <= newitemoff)
{ {
/* /*
@ -1805,15 +1838,16 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* *everything* on the left page, which cannot fit (if it could, we'd * *everything* on the left page, which cannot fit (if it could, we'd
* not be splitting the page). * not be splitting the page).
*/ */
Assert(!newitemonleft); Assert(!newitemonleft && newitemoff == maxoff + 1);
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
afterrightoff == minusinfoff))
{ {
memset(rightpage, 0, BufferGetPageSize(rbuf)); memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the right sibling" elog(ERROR, "failed to add new item to the right sibling"
" while splitting block %u of index \"%s\"", " while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel)); origpagenumber, RelationGetRelationName(rel));
} }
rightoff = OffsetNumberNext(rightoff); afterrightoff = OffsetNumberNext(afterrightoff);
} }
/* /*
@ -1823,7 +1857,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* all readers release locks on a page before trying to fetch its * all readers release locks on a page before trying to fetch its
* neighbors. * neighbors.
*/ */
if (!P_RIGHTMOST(oopaque)) if (!isrightmost)
{ {
sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE); sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
spage = BufferGetPage(sbuf); spage = BufferGetPage(sbuf);
@ -1886,7 +1920,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
MarkBufferDirty(buf); MarkBufferDirty(buf);
MarkBufferDirty(rbuf); MarkBufferDirty(rbuf);
if (!P_RIGHTMOST(ropaque)) if (!isrightmost)
{ {
sopaque->btpo_prev = rightpagenumber; sopaque->btpo_prev = rightpagenumber;
MarkBufferDirty(sbuf); MarkBufferDirty(sbuf);
@ -1914,10 +1948,10 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
xlrec.level = ropaque->btpo.level; xlrec.level = ropaque->btpo.level;
/* See comments below on newitem, orignewitem, and posting lists */ /* See comments below on newitem, orignewitem, and posting lists */
xlrec.firstright = firstright; xlrec.firstrightoff = firstrightoff;
xlrec.newitemoff = newitemoff; xlrec.newitemoff = newitemoff;
xlrec.postingoff = 0; xlrec.postingoff = 0;
if (postingoff != 0 && origpagepostingoff < firstright) if (postingoff != 0 && origpagepostingoff < firstrightoff)
xlrec.postingoff = postingoff; xlrec.postingoff = postingoff;
XLogBeginInsert(); XLogBeginInsert();
@ -1925,10 +1959,10 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
XLogRegisterBuffer(0, buf, REGBUF_STANDARD); XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT); XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
/* Log the right sibling, because we've changed its prev-pointer. */ /* Log original right sibling, since we've changed its prev-pointer */
if (!P_RIGHTMOST(ropaque)) if (!isrightmost)
XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD); XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
if (BufferIsValid(cbuf)) if (!isleaf)
XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD); XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
/* /*
@ -1959,18 +1993,24 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* newitem-logged case). * newitem-logged case).
*/ */
if (newitemonleft && xlrec.postingoff == 0) if (newitemonleft && xlrec.postingoff == 0)
XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz)); XLogRegisterBufData(0, (char *) newitem, newitemsz);
else if (xlrec.postingoff != 0) else if (xlrec.postingoff != 0)
{ {
Assert(newitemonleft || firstright == newitemoff); Assert(isleaf);
Assert(MAXALIGN(newitemsz) == IndexTupleSize(orignewitem)); Assert(newitemonleft || firstrightoff == newitemoff);
XLogRegisterBufData(0, (char *) orignewitem, MAXALIGN(newitemsz)); Assert(newitemsz == IndexTupleSize(orignewitem));
XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
} }
/* Log the left page's new high key */ /* Log the left page's new high key */
if (!isleaf)
{
/* lefthighkey isn't local copy, get current pointer */
itemid = PageGetItemId(origpage, P_HIKEY); itemid = PageGetItemId(origpage, P_HIKEY);
item = (IndexTuple) PageGetItem(origpage, itemid); lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item))); }
XLogRegisterBufData(0, (char *) lefthighkey,
MAXALIGN(IndexTupleSize(lefthighkey)));
/* /*
* Log the contents of the right page in the format understood by * Log the contents of the right page in the format understood by
@ -1991,26 +2031,26 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
PageSetLSN(origpage, recptr); PageSetLSN(origpage, recptr);
PageSetLSN(rightpage, recptr); PageSetLSN(rightpage, recptr);
if (!P_RIGHTMOST(ropaque)) if (!isrightmost)
{
PageSetLSN(spage, recptr); PageSetLSN(spage, recptr);
}
if (!isleaf) if (!isleaf)
{
PageSetLSN(BufferGetPage(cbuf), recptr); PageSetLSN(BufferGetPage(cbuf), recptr);
} }
}
END_CRIT_SECTION(); END_CRIT_SECTION();
/* release the old right sibling */ /* release the old right sibling */
if (!P_RIGHTMOST(ropaque)) if (!isrightmost)
_bt_relbuf(rel, sbuf); _bt_relbuf(rel, sbuf);
/* release the child */ /* release the child */
if (!isleaf) if (!isleaf)
_bt_relbuf(rel, cbuf); _bt_relbuf(rel, cbuf);
/* be tidy */
if (isleaf)
pfree(lefthighkey);
/* split's done */ /* split's done */
return rbuf; return rbuf;
} }
@ -2405,9 +2445,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
metad = BTPageGetMeta(metapg); metad = BTPageGetMeta(metapg);
/* /*
* Create downlink item for left page (old root). Since this will be the * Create downlink item for left page (old root). The key value used is
* first item in a non-leaf page, it implicitly has minus-infinity key * "minus infinity", a sentinel value that's reliably less than any real
* value, so we need not store any actual key in it. * key value that could appear in the left page.
*/ */
left_item_sz = sizeof(IndexTupleData); left_item_sz = sizeof(IndexTupleData);
left_item = (IndexTuple) palloc(left_item_sz); left_item = (IndexTuple) palloc(left_item_sz);
@ -2541,33 +2581,30 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
* _bt_pgaddtup() -- add a data item to a particular page during split. * _bt_pgaddtup() -- add a data item to a particular page during split.
* *
* The difference between this routine and a bare PageAddItem call is * The difference between this routine and a bare PageAddItem call is
* that this code knows that the leftmost data item on an internal * that this code can deal with the first data item on an internal btree
* btree page has a key that must be treated as minus infinity. * page in passing. This data item (which is called "firstright" within
* Therefore, it truncates away all attributes. This extra step is * _bt_split()) has a key that must be treated as minus infinity after
* only needed during internal page splits. * the split. Therefore, we truncate away all attributes when caller
* specifies it's the first data item on page (downlink is not changed,
* though). This extra step is only needed for the right page of an
* internal page split. There is no need to do this for the first data
* item on the existing/left page, since that will already have been
* truncated during an earlier page split.
* *
* Truncation of an internal page data item can be thought of as one * See _bt_split() for a high level explanation of why we truncate here.
* of the steps used to "move" a boundary separator key during an * Note that this routine has nothing to do with suffix truncation,
* internal page split. Conceptually, _bt_split() caller splits * despite using some of the same infrastructure.
* internal pages "inside" the firstright data item: firstright's
* separator key is used as the high key for the left page, while its
* downlink is used within the first data item (also the negative
* infinity item) for the right page. Each distinct separator key
* should appear no more than once per level of the tree.
*
* CAUTION: this works ONLY if we insert the tuples in order, so that
* the given itup_off does represent the final position of the tuple!
*/ */
static bool static inline bool
_bt_pgaddtup(Page page, _bt_pgaddtup(Page page,
Size itemsize, Size itemsize,
IndexTuple itup, IndexTuple itup,
OffsetNumber itup_off) OffsetNumber itup_off,
bool newfirstdataitem)
{ {
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
IndexTupleData trunctuple; IndexTupleData trunctuple;
if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque)) if (newfirstdataitem)
{ {
trunctuple = *itup; trunctuple = *itup;
trunctuple.t_info = sizeof(IndexTupleData); trunctuple.t_info = sizeof(IndexTupleData);
@ -2576,8 +2613,8 @@ _bt_pgaddtup(Page page,
itemsize = sizeof(IndexTupleData); itemsize = sizeof(IndexTupleData);
} }
if (PageAddItem(page, (Item) itup, itemsize, itup_off, if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
false, false) == InvalidOffsetNumber) false) == InvalidOffsetNumber))
return false; return false;
return true; return true;

View File

@ -269,7 +269,8 @@ static Page _bt_blnewpage(uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level); static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page page); static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize, static void _bt_sortaddtup(Page page, Size itemsize,
IndexTuple itup, OffsetNumber itup_off); IndexTuple itup, OffsetNumber itup_off,
bool newfirstdataitem);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
IndexTuple itup, Size truncextra); IndexTuple itup, Size truncextra);
static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
@ -750,26 +751,24 @@ _bt_slideleft(Page page)
/* /*
* Add an item to a page being built. * Add an item to a page being built.
* *
* The main difference between this routine and a bare PageAddItem call * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
* is that this code knows that the leftmost data item on a non-leaf btree * raises an error directly.
* page has a key that must be treated as minus infinity. Therefore, it
* truncates away all attributes.
* *
* This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use * Note that our nbtsort.c caller does not know yet if the page will be
* that because it assumes that P_RIGHTMOST() will return the correct * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
* answer for the page. Here, we don't know yet if the page will be * caller. Page that turns out to be the rightmost on its level is fixed by
* rightmost. Offset P_FIRSTKEY is always the first data key. * calling _bt_slideleft().
*/ */
static void static void
_bt_sortaddtup(Page page, _bt_sortaddtup(Page page,
Size itemsize, Size itemsize,
IndexTuple itup, IndexTuple itup,
OffsetNumber itup_off) OffsetNumber itup_off,
bool newfirstdataitem)
{ {
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
IndexTupleData trunctuple; IndexTupleData trunctuple;
if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY) if (newfirstdataitem)
{ {
trunctuple = *itup; trunctuple = *itup;
trunctuple.t_info = sizeof(IndexTupleData); trunctuple.t_info = sizeof(IndexTupleData);
@ -867,12 +866,13 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
* Every newly built index will treat heap TID as part of the keyspace, * Every newly built index will treat heap TID as part of the keyspace,
* which imposes the requirement that new high keys must occasionally have * which imposes the requirement that new high keys must occasionally have
* a heap TID appended within _bt_truncate(). That may leave a new pivot * a heap TID appended within _bt_truncate(). That may leave a new pivot
* tuple one or two MAXALIGN() quantums larger than the original first * tuple one or two MAXALIGN() quantums larger than the original
* right tuple it's derived from. v4 deals with the problem by decreasing * firstright tuple it's derived from. v4 deals with the problem by
* the limit on the size of tuples inserted on the leaf level by the same * decreasing the limit on the size of tuples inserted on the leaf level
* small amount. Enforce the new v4+ limit on the leaf level, and the old * by the same small amount. Enforce the new v4+ limit on the leaf level,
* limit on internal levels, since pivot tuples may need to make use of * and the old limit on internal levels, since pivot tuples may need to
* the reserved space. This should never fail on internal pages. * make use of the reserved space. This should never fail on internal
* pages.
*/ */
if (unlikely(itupsz > BTMaxItemSize(npage))) if (unlikely(itupsz > BTMaxItemSize(npage)))
_bt_check_third_page(wstate->index, wstate->heap, isleaf, npage, _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
@ -925,7 +925,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
Assert(last_off > P_FIRSTKEY); Assert(last_off > P_FIRSTKEY);
ii = PageGetItemId(opage, last_off); ii = PageGetItemId(opage, last_off);
oitup = (IndexTuple) PageGetItem(opage, ii); oitup = (IndexTuple) PageGetItem(opage, ii);
_bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY); _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
!isleaf);
/* /*
* Move 'last' into the high key position on opage. _bt_blnewpage() * Move 'last' into the high key position on opage. _bt_blnewpage()
@ -1054,7 +1055,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
* Add the new item into the current page. * Add the new item into the current page.
*/ */
last_off = OffsetNumberNext(last_off); last_off = OffsetNumberNext(last_off);
_bt_sortaddtup(npage, itupsz, itup, last_off); _bt_sortaddtup(npage, itupsz, itup, last_off,
!isleaf && last_off == P_FIRSTKEY);
state->btps_page = npage; state->btps_page = npage;
state->btps_blkno = nblkno; state->btps_blkno = nblkno;

View File

@ -37,7 +37,7 @@ typedef struct
int16 rightfree; /* space left on right page post-split */ int16 rightfree; /* space left on right page post-split */
/* split point identifying fields (returned by _bt_findsplitloc) */ /* split point identifying fields (returned by _bt_findsplitloc) */
OffsetNumber firstoldonright; /* first item on new right page */ OffsetNumber firstrightoff; /* first origpage item on rightpage */
bool newitemonleft; /* new item goes on left, or right? */ bool newitemonleft; /* new item goes on left, or right? */
} SplitPoint; } SplitPoint;
@ -46,7 +46,7 @@ typedef struct
{ {
/* context data for _bt_recsplitloc */ /* context data for _bt_recsplitloc */
Relation rel; /* index relation */ Relation rel; /* index relation */
Page page; /* page undergoing split */ Page origpage; /* page undergoing split */
IndexTuple newitem; /* new item (cause of page split) */ IndexTuple newitem; /* new item (cause of page split) */
Size newitemsz; /* size of newitem (includes line pointer) */ Size newitemsz; /* size of newitem (includes line pointer) */
bool is_leaf; /* T if splitting a leaf page */ bool is_leaf; /* T if splitting a leaf page */
@ -55,7 +55,7 @@ typedef struct
int leftspace; /* space available for items on left page */ int leftspace; /* space available for items on left page */
int rightspace; /* space available for items on right page */ int rightspace; /* space available for items on right page */
int olddataitemstotal; /* space taken by old items */ int olddataitemstotal; /* space taken by old items */
Size minfirstrightsz; /* smallest firstoldonright tuple size */ Size minfirstrightsz; /* smallest firstright size */
/* candidate split point data */ /* candidate split point data */
int maxsplits; /* maximum number of splits */ int maxsplits; /* maximum number of splits */
@ -65,8 +65,9 @@ typedef struct
} FindSplitData; } FindSplitData;
static void _bt_recsplitloc(FindSplitData *state, static void _bt_recsplitloc(FindSplitData *state,
OffsetNumber firstoldonright, bool newitemonleft, OffsetNumber firstrightoff, bool newitemonleft,
int olddataitemstoleft, Size firstoldonrightsz); int olddataitemstoleft,
Size firstrightofforigpagetuplesz);
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
bool usemult); bool usemult);
static int _bt_splitcmp(const void *arg1, const void *arg2); static int _bt_splitcmp(const void *arg1, const void *arg2);
@ -119,13 +120,18 @@ static inline IndexTuple _bt_split_firstright(FindSplitData *state,
* suffix truncation. * suffix truncation.
* *
* We return the index of the first existing tuple that should go on the * We return the index of the first existing tuple that should go on the
* righthand page, plus a boolean indicating whether the new tuple goes on * righthand page (which is called firstrightoff), plus a boolean
* the left or right page. The bool is necessary to disambiguate the case * indicating whether the new tuple goes on the left or right page. You
* where firstright == newitemoff. * can think of the returned state as a point _between_ two adjacent data
* items (lastleft and firstright data items) on an imaginary version of
* origpage that already includes newitem. The bool is necessary to
* disambiguate the case where firstrightoff == newitemoff (i.e. it is
* sometimes needed to determine if the firstright tuple for the split is
* newitem rather than the tuple from origpage at offset firstrightoff).
*/ */
OffsetNumber OffsetNumber
_bt_findsplitloc(Relation rel, _bt_findsplitloc(Relation rel,
Page page, Page origpage,
OffsetNumber newitemoff, OffsetNumber newitemoff,
Size newitemsz, Size newitemsz,
IndexTuple newitem, IndexTuple newitem,
@ -143,36 +149,36 @@ _bt_findsplitloc(Relation rel,
ItemId itemid; ItemId itemid;
OffsetNumber offnum, OffsetNumber offnum,
maxoff, maxoff,
foundfirstright; firstrightoff;
double fillfactormult; double fillfactormult;
bool usemult; bool usemult;
SplitPoint leftpage, SplitPoint leftpage,
rightpage; rightpage;
opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
maxoff = PageGetMaxOffsetNumber(page); maxoff = PageGetMaxOffsetNumber(origpage);
/* Total free space available on a btree page, after fixed overhead */ /* Total free space available on a btree page, after fixed overhead */
leftspace = rightspace = leftspace = rightspace =
PageGetPageSize(page) - SizeOfPageHeaderData - PageGetPageSize(origpage) - SizeOfPageHeaderData -
MAXALIGN(sizeof(BTPageOpaqueData)); MAXALIGN(sizeof(BTPageOpaqueData));
/* The right page will have the same high key as the old page */ /* The right page will have the same high key as the old page */
if (!P_RIGHTMOST(opaque)) if (!P_RIGHTMOST(opaque))
{ {
itemid = PageGetItemId(page, P_HIKEY); itemid = PageGetItemId(origpage, P_HIKEY);
rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) + rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
sizeof(ItemIdData)); sizeof(ItemIdData));
} }
/* Count up total space in data items before actually scanning 'em */ /* Count up total space in data items before actually scanning 'em */
olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page); olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
leaffillfactor = BTGetFillFactor(rel); leaffillfactor = BTGetFillFactor(rel);
/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
newitemsz += sizeof(ItemIdData); newitemsz += sizeof(ItemIdData);
state.rel = rel; state.rel = rel;
state.page = page; state.origpage = origpage;
state.newitem = newitem; state.newitem = newitem;
state.newitemsz = newitemsz; state.newitemsz = newitemsz;
state.is_leaf = P_ISLEAF(opaque); state.is_leaf = P_ISLEAF(opaque);
@ -209,7 +215,7 @@ _bt_findsplitloc(Relation rel,
{ {
Size itemsz; Size itemsz;
itemid = PageGetItemId(page, offnum); itemid = PageGetItemId(origpage, offnum);
itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
/* /*
@ -293,8 +299,7 @@ _bt_findsplitloc(Relation rel,
* New item inserted at rightmost point among a localized grouping on * New item inserted at rightmost point among a localized grouping on
* a leaf page -- apply "split after new item" optimization, either by * a leaf page -- apply "split after new item" optimization, either by
* applying leaf fillfactor multiplier, or by choosing the exact split * applying leaf fillfactor multiplier, or by choosing the exact split
* point that leaves the new item as last on the left. (usemult is set * point that leaves newitem as lastleft. (usemult is set for us.)
* for us.)
*/ */
if (usemult) if (usemult)
{ {
@ -309,7 +314,7 @@ _bt_findsplitloc(Relation rel,
SplitPoint *split = state.splits + i; SplitPoint *split = state.splits + i;
if (split->newitemonleft && if (split->newitemonleft &&
newitemoff == split->firstoldonright) newitemoff == split->firstrightoff)
{ {
pfree(state.splits); pfree(state.splits);
*newitemonleft = true; *newitemonleft = true;
@ -429,24 +434,26 @@ _bt_findsplitloc(Relation rel,
* the entry that has the lowest penalty, and is therefore expected to * the entry that has the lowest penalty, and is therefore expected to
* maximize fan-out. Sets *newitemonleft for us. * maximize fan-out. Sets *newitemonleft for us.
*/ */
foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft, firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
strategy); strategy);
pfree(state.splits); pfree(state.splits);
return foundfirstright; return firstrightoff;
} }
/* /*
* Subroutine to record a particular point between two tuples (possibly the * Subroutine to record a particular point between two tuples (possibly the
* new item) on page (ie, combination of firstright and newitemonleft * new item) on page (ie, combination of firstrightoff and newitemonleft
* settings) in *state for later analysis. This is also a convenient point * settings) in *state for later analysis. This is also a convenient point to
* to check if the split is legal (if it isn't, it won't be recorded). * check if the split is legal (if it isn't, it won't be recorded).
* *
* firstoldonright is the offset of the first item on the original page that * firstrightoff is the offset of the first item on the original page that
* goes to the right page, and firstoldonrightsz is the size of that tuple. * goes to the right page, and firstrightofforigpagetuplesz is the size of
* firstoldonright can be > max offset, which means that all the old items go * that tuple. firstrightoff can be > max offset, which means that all the
* to the left page and only the new item goes to the right page. In that * old items go to the left page and only the new item goes to the right page.
* case, firstoldonrightsz is not used. * We don't actually use firstrightofforigpagetuplesz in that case (actually,
* we don't use it for _any_ split where the firstright tuple happens to be
* newitem).
* *
* olddataitemstoleft is the total size of all old items to the left of the * olddataitemstoleft is the total size of all old items to the left of the
* split point that is recorded here when legal. Should not include * split point that is recorded here when legal. Should not include
@ -454,41 +461,44 @@ _bt_findsplitloc(Relation rel,
*/ */
static void static void
_bt_recsplitloc(FindSplitData *state, _bt_recsplitloc(FindSplitData *state,
OffsetNumber firstoldonright, OffsetNumber firstrightoff,
bool newitemonleft, bool newitemonleft,
int olddataitemstoleft, int olddataitemstoleft,
Size firstoldonrightsz) Size firstrightofforigpagetuplesz)
{ {
int16 leftfree, int16 leftfree,
rightfree; rightfree;
Size firstrightitemsz; Size firstrightsz;
Size postingsz = 0; Size postingsz = 0;
bool newitemisfirstonright; bool newitemisfirstright;
/* Is the new item going to be the first item on the right page? */ /* Is the new item going to be split point's firstright tuple? */
newitemisfirstonright = (firstoldonright == state->newitemoff newitemisfirstright = (firstrightoff == state->newitemoff &&
&& !newitemonleft); !newitemonleft);
if (newitemisfirstonright) if (newitemisfirstright)
firstrightitemsz = state->newitemsz; firstrightsz = state->newitemsz;
else else
{ {
firstrightitemsz = firstoldonrightsz; firstrightsz = firstrightofforigpagetuplesz;
/* /*
* Calculate suffix truncation space saving when firstright is a * Calculate suffix truncation space saving when firstright tuple is a
* posting list tuple, though only when the firstright is over 64 * posting list tuple, though only when the tuple is over 64 bytes
* bytes including line pointer overhead (arbitrary). This avoids * including line pointer overhead (arbitrary). This avoids accessing
* accessing the tuple in cases where its posting list must be very * the tuple in cases where its posting list must be very small (if
* small (if firstright has one at all). * tuple has one at all).
*
* Note: We don't do this in the case where firstright tuple is
* newitem, since newitem cannot have a posting list.
*/ */
if (state->is_leaf && firstrightitemsz > 64) if (state->is_leaf && firstrightsz > 64)
{ {
ItemId itemid; ItemId itemid;
IndexTuple newhighkey; IndexTuple newhighkey;
itemid = PageGetItemId(state->page, firstoldonright); itemid = PageGetItemId(state->origpage, firstrightoff);
newhighkey = (IndexTuple) PageGetItem(state->page, itemid); newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid);
if (BTreeTupleIsPosting(newhighkey)) if (BTreeTupleIsPosting(newhighkey))
postingsz = IndexTupleSize(newhighkey) - postingsz = IndexTupleSize(newhighkey) -
@ -525,11 +535,11 @@ _bt_recsplitloc(FindSplitData *state,
* precise there noticeably improves the balance of free space.) * precise there noticeably improves the balance of free space.)
*/ */
if (state->is_leaf) if (state->is_leaf)
leftfree -= (int16) (firstrightitemsz + leftfree -= (int16) (firstrightsz +
MAXALIGN(sizeof(ItemPointerData)) - MAXALIGN(sizeof(ItemPointerData)) -
postingsz); postingsz);
else else
leftfree -= (int16) firstrightitemsz; leftfree -= (int16) firstrightsz;
/* account for the new item */ /* account for the new item */
if (newitemonleft) if (newitemonleft)
@ -542,7 +552,7 @@ _bt_recsplitloc(FindSplitData *state,
* data from the first item that winds up on the right page. * data from the first item that winds up on the right page.
*/ */
if (!state->is_leaf) if (!state->is_leaf)
rightfree += (int16) firstrightitemsz - rightfree += (int16) firstrightsz -
(int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
/* Record split if legal */ /* Record split if legal */
@ -550,13 +560,13 @@ _bt_recsplitloc(FindSplitData *state,
{ {
Assert(state->nsplits < state->maxsplits); Assert(state->nsplits < state->maxsplits);
/* Determine smallest firstright item size on page */ /* Determine smallest firstright tuple size among legal splits */
state->minfirstrightsz = Min(state->minfirstrightsz, firstrightitemsz); state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz);
state->splits[state->nsplits].curdelta = 0; state->splits[state->nsplits].curdelta = 0;
state->splits[state->nsplits].leftfree = leftfree; state->splits[state->nsplits].leftfree = leftfree;
state->splits[state->nsplits].rightfree = rightfree; state->splits[state->nsplits].rightfree = rightfree;
state->splits[state->nsplits].firstoldonright = firstoldonright; state->splits[state->nsplits].firstrightoff = firstrightoff;
state->splits[state->nsplits].newitemonleft = newitemonleft; state->splits[state->nsplits].newitemonleft = newitemonleft;
state->nsplits++; state->nsplits++;
} }
@ -632,8 +642,8 @@ _bt_splitcmp(const void *arg1, const void *arg2)
* taken by caller varies. Caller uses original leaf page fillfactor in * taken by caller varies. Caller uses original leaf page fillfactor in
* standard way rather than using the new item offset directly when *usemult * standard way rather than using the new item offset directly when *usemult
* was also set to true here. Otherwise, caller applies optimization by * was also set to true here. Otherwise, caller applies optimization by
* locating the legal split point that makes the new tuple the very last tuple * locating the legal split point that makes the new tuple the lastleft tuple
* on the left side of the split. * for the split.
*/ */
static bool static bool
_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
@ -694,8 +704,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
*/ */
if (state->newitemoff > maxoff) if (state->newitemoff > maxoff)
{ {
itemid = PageGetItemId(state->page, maxoff); itemid = PageGetItemId(state->origpage, maxoff);
tup = (IndexTuple) PageGetItem(state->page, itemid); tup = (IndexTuple) PageGetItem(state->origpage, itemid);
keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
if (keepnatts > 1 && keepnatts <= nkeyatts) if (keepnatts > 1 && keepnatts <= nkeyatts)
@ -720,8 +730,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
* optimization. Besides, all inappropriate cases triggered here will * optimization. Besides, all inappropriate cases triggered here will
* still split in the middle of the page on average. * still split in the middle of the page on average.
*/ */
itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff)); itemid = PageGetItemId(state->origpage, OffsetNumberPrev(state->newitemoff));
tup = (IndexTuple) PageGetItem(state->page, itemid); tup = (IndexTuple) PageGetItem(state->origpage, itemid);
/* Do cheaper test first */ /* Do cheaper test first */
if (BTreeTupleIsPosting(tup) || if (BTreeTupleIsPosting(tup) ||
!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
@ -843,8 +853,8 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
* leftmost page. * leftmost page.
*/ */
if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost && if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
!final->newitemonleft && final->firstoldonright >= state->newitemoff && !final->newitemonleft && final->firstrightoff >= state->newitemoff &&
final->firstoldonright < state->newitemoff + MAX_LEAF_INTERVAL) final->firstrightoff < state->newitemoff + MAX_LEAF_INTERVAL)
{ {
/* /*
* Avoid the problem by performing a 50:50 split when the new item is * Avoid the problem by performing a 50:50 split when the new item is
@ -854,7 +864,7 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
} }
*newitemonleft = final->newitemonleft; *newitemonleft = final->newitemonleft;
return final->firstoldonright; return final->firstrightoff;
} }
/* /*
@ -883,10 +893,11 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
*strategy = SPLIT_DEFAULT; *strategy = SPLIT_DEFAULT;
/* /*
* Use smallest observed first right item size for entire page as perfect * Use smallest observed firstright item size for entire page (actually,
* entire imaginary version of page that includes newitem) as perfect
* penalty on internal pages. This can save cycles in the common case * penalty on internal pages. This can save cycles in the common case
* where most or all splits (not just splits within interval) have first * where most or all splits (not just splits within interval) have
* right tuples that are the same size. * firstright tuples that are the same size.
*/ */
if (!state->is_leaf) if (!state->is_leaf)
return state->minfirstrightsz; return state->minfirstrightsz;
@ -961,8 +972,8 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
ItemId itemid; ItemId itemid;
IndexTuple hikey; IndexTuple hikey;
itemid = PageGetItemId(state->page, P_HIKEY); itemid = PageGetItemId(state->origpage, P_HIKEY);
hikey = (IndexTuple) PageGetItem(state->page, itemid); hikey = (IndexTuple) PageGetItem(state->origpage, itemid);
perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
state->newitem); state->newitem);
if (perfectpenalty <= indnkeyatts) if (perfectpenalty <= indnkeyatts)
@ -1005,12 +1016,12 @@ _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
{ {
SplitPoint *distant = state->splits + i; SplitPoint *distant = state->splits + i;
if (distant->firstoldonright < deltaoptimal->firstoldonright) if (distant->firstrightoff < deltaoptimal->firstrightoff)
{ {
if (*leftinterval == NULL) if (*leftinterval == NULL)
*leftinterval = distant; *leftinterval = distant;
} }
else if (distant->firstoldonright > deltaoptimal->firstoldonright) else if (distant->firstrightoff > deltaoptimal->firstrightoff)
{ {
if (*rightinterval == NULL) if (*rightinterval == NULL)
*rightinterval = distant; *rightinterval = distant;
@ -1018,22 +1029,20 @@ _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
else if (!distant->newitemonleft && deltaoptimal->newitemonleft) else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
{ {
/* /*
* "incoming tuple will become first on right page" (distant) is * "incoming tuple will become firstright" (distant) is to the
* to the left of "incoming tuple will become last on left page" * left of "incoming tuple will become lastleft" (delta-optimal)
* (delta-optimal)
*/ */
Assert(distant->firstoldonright == state->newitemoff); Assert(distant->firstrightoff == state->newitemoff);
if (*leftinterval == NULL) if (*leftinterval == NULL)
*leftinterval = distant; *leftinterval = distant;
} }
else if (distant->newitemonleft && !deltaoptimal->newitemonleft) else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
{ {
/* /*
* "incoming tuple will become last on left page" (distant) is to * "incoming tuple will become lastleft" (distant) is to the right
* the right of "incoming tuple will become first on right page" * of "incoming tuple will become firstright" (delta-optimal)
* (delta-optimal)
*/ */
Assert(distant->firstoldonright == state->newitemoff); Assert(distant->firstrightoff == state->newitemoff);
if (*rightinterval == NULL) if (*rightinterval == NULL)
*rightinterval = distant; *rightinterval = distant;
} }
@ -1062,34 +1071,33 @@ _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
* key for left page. It can be greater than the number of key attributes in * key for left page. It can be greater than the number of key attributes in
* cases where a heap TID will need to be appended during truncation. * cases where a heap TID will need to be appended during truncation.
* *
* On internal pages, penalty is simply the size of the first item on the * On internal pages, penalty is simply the size of the firstright tuple for
* right half of the split (including line pointer overhead). This tuple will * the split (including line pointer overhead). This tuple will become the
* become the new high key for the left page. * new high key for the left page.
*/ */
static inline int static inline int
_bt_split_penalty(FindSplitData *state, SplitPoint *split) _bt_split_penalty(FindSplitData *state, SplitPoint *split)
{ {
IndexTuple lastleftuple; IndexTuple lastleft;
IndexTuple firstrighttuple; IndexTuple firstright;
if (!state->is_leaf) if (!state->is_leaf)
{ {
ItemId itemid; ItemId itemid;
if (!split->newitemonleft && if (!split->newitemonleft &&
split->firstoldonright == state->newitemoff) split->firstrightoff == state->newitemoff)
return state->newitemsz; return state->newitemsz;
itemid = PageGetItemId(state->page, split->firstoldonright); itemid = PageGetItemId(state->origpage, split->firstrightoff);
return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
} }
lastleftuple = _bt_split_lastleft(state, split); lastleft = _bt_split_lastleft(state, split);
firstrighttuple = _bt_split_firstright(state, split); firstright = _bt_split_firstright(state, split);
Assert(lastleftuple != firstrighttuple); return _bt_keep_natts_fast(state->rel, lastleft, firstright);
return _bt_keep_natts_fast(state->rel, lastleftuple, firstrighttuple);
} }
/* /*
@ -1100,12 +1108,12 @@ _bt_split_lastleft(FindSplitData *state, SplitPoint *split)
{ {
ItemId itemid; ItemId itemid;
if (split->newitemonleft && split->firstoldonright == state->newitemoff) if (split->newitemonleft && split->firstrightoff == state->newitemoff)
return state->newitem; return state->newitem;
itemid = PageGetItemId(state->page, itemid = PageGetItemId(state->origpage,
OffsetNumberPrev(split->firstoldonright)); OffsetNumberPrev(split->firstrightoff));
return (IndexTuple) PageGetItem(state->page, itemid); return (IndexTuple) PageGetItem(state->origpage, itemid);
} }
/* /*
@ -1116,9 +1124,9 @@ _bt_split_firstright(FindSplitData *state, SplitPoint *split)
{ {
ItemId itemid; ItemId itemid;
if (!split->newitemonleft && split->firstoldonright == state->newitemoff) if (!split->newitemonleft && split->firstrightoff == state->newitemoff)
return state->newitem; return state->newitem;
itemid = PageGetItemId(state->page, split->firstoldonright); itemid = PageGetItemId(state->origpage, split->firstrightoff);
return (IndexTuple) PageGetItem(state->page, itemid); return (IndexTuple) PageGetItem(state->origpage, itemid);
} }

View File

@ -2346,17 +2346,12 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
ScanKey scankey; ScanKey scankey;
/* /*
* Be consistent about the representation of BTREE_VERSION 2/3 tuples * _bt_compare() treats truncated key attributes as having the value minus
* across Postgres versions; don't allow new pivot tuples to have * infinity, which would break searches within !heapkeyspace indexes. We
* truncated key attributes there. _bt_compare() treats truncated key * must still truncate away non-key attribute values, though.
* attributes as having the value minus infinity, which would break
* searches within !heapkeyspace indexes.
*/ */
if (!itup_key->heapkeyspace) if (!itup_key->heapkeyspace)
{
Assert(nkeyatts != IndexRelationGetNumberOfAttributes(rel));
return nkeyatts; return nkeyatts;
}
scankey = itup_key->scankeys; scankey = itup_key->scankeys;
keepnatts = 1; keepnatts = 1;

View File

@ -251,7 +251,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, bool posting,
} }
static void static void
btree_xlog_split(bool onleft, XLogReaderState *record) btree_xlog_split(bool newitemonleft, XLogReaderState *record)
{ {
XLogRecPtr lsn = record->EndRecPtr; XLogRecPtr lsn = record->EndRecPtr;
xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
@ -323,7 +323,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
datapos = XLogRecGetBlockData(record, 0, &datalen); datapos = XLogRecGetBlockData(record, 0, &datalen);
if (onleft || xlrec->postingoff != 0) if (newitemonleft || xlrec->postingoff != 0)
{ {
newitem = (IndexTuple) datapos; newitem = (IndexTuple) datapos;
newitemsz = MAXALIGN(IndexTupleSize(newitem)); newitemsz = MAXALIGN(IndexTupleSize(newitem));
@ -368,7 +368,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
elog(PANIC, "failed to add high key to left page after split"); elog(PANIC, "failed to add high key to left page after split");
leftoff = OffsetNumberNext(leftoff); leftoff = OffsetNumberNext(leftoff);
for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++) for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstrightoff; off++)
{ {
ItemId itemid; ItemId itemid;
Size itemsz; Size itemsz;
@ -377,7 +377,8 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
/* Add replacement posting list when required */ /* Add replacement posting list when required */
if (off == replacepostingoff) if (off == replacepostingoff)
{ {
Assert(onleft || xlrec->firstright == xlrec->newitemoff); Assert(newitemonleft ||
xlrec->firstrightoff == xlrec->newitemoff);
if (PageAddItem(newlpage, (Item) nposting, if (PageAddItem(newlpage, (Item) nposting,
MAXALIGN(IndexTupleSize(nposting)), leftoff, MAXALIGN(IndexTupleSize(nposting)), leftoff,
false, false) == InvalidOffsetNumber) false, false) == InvalidOffsetNumber)
@ -387,7 +388,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
} }
/* add the new item if it was inserted on left page */ /* add the new item if it was inserted on left page */
else if (onleft && off == xlrec->newitemoff) else if (newitemonleft && off == xlrec->newitemoff)
{ {
if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
false, false) == InvalidOffsetNumber) false, false) == InvalidOffsetNumber)
@ -405,7 +406,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
} }
/* cope with possibility that newitem goes at the end */ /* cope with possibility that newitem goes at the end */
if (onleft && off == xlrec->newitemoff) if (newitemonleft && off == xlrec->newitemoff)
{ {
if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
false, false) == InvalidOffsetNumber) false, false) == InvalidOffsetNumber)

View File

@ -39,8 +39,8 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{ {
xl_btree_split *xlrec = (xl_btree_split *) rec; xl_btree_split *xlrec = (xl_btree_split *) rec;
appendStringInfo(buf, "level %u, firstright %d, newitemoff %d, postingoff %d", appendStringInfo(buf, "level %u, firstrightoff %d, newitemoff %d, postingoff %d",
xlrec->level, xlrec->firstright, xlrec->level, xlrec->firstrightoff,
xlrec->newitemoff, xlrec->postingoff); xlrec->newitemoff, xlrec->postingoff);
break; break;
} }

View File

@ -99,9 +99,9 @@ typedef struct xl_btree_insert
* left or right split page (and thus, whether the new item is stored or not). * left or right split page (and thus, whether the new item is stored or not).
* We always log the left page high key because suffix truncation can generate * We always log the left page high key because suffix truncation can generate
* a new leaf high key using user-defined code. This is also necessary on * a new leaf high key using user-defined code. This is also necessary on
* internal pages, since the first right item that the left page's high key * internal pages, since the firstright item that the left page's high key was
* was based on will have been truncated to zero attributes in the right page * based on will have been truncated to zero attributes in the right page (the
* (the original is unavailable from the right page). * separator key is unavailable from the right page).
* *
* Backup Blk 0: original page / new left page * Backup Blk 0: original page / new left page
* *
@ -153,7 +153,7 @@ typedef struct xl_btree_insert
/*
 * xl_btree_split: per-record fields describing a btree page split.
 *
 * btree_desc() prints all four fields.  btree_xlog_split() walks the
 * original page's offsets up to (but not including) firstrightoff when
 * adding items to the new left page, and compares each offset against
 * newitemoff to decide where the new item goes.
 *
 * NOTE(review): every line below shows the pre-commit text followed by the
 * post-commit text (side-by-side diff rendering).  The only difference is
 * the rename of "firstright" to "firstrightoff" and its field comment.
 */
typedef struct xl_btree_split typedef struct xl_btree_split
{ {
uint32 level; /* tree level of page being split */ uint32 level; /* tree level of page being split */
OffsetNumber firstright; /* first item moved to right page */ OffsetNumber firstrightoff; /* first origpage item on rightpage */
OffsetNumber newitemoff; /* new item's offset */ OffsetNumber newitemoff; /* new item's offset */
uint16 postingoff; /* offset inside orig posting tuple */ uint16 postingoff; /* offset inside orig posting tuple */
} xl_btree_split; } xl_btree_split;