Don't leave behind junk nbtree pages during split.

Commit 8fa30f906b reduced the elevel of a number of "can't happen"
_bt_split() errors from PANIC to ERROR.  At the same time, the new right
page buffer for the split could continue to be acquired well before the
critical section.  This was possible because it was relatively
straightforward to make sure that _bt_split() could not throw an error,
with a few specific exceptions.  The exceptional cases were safe because
they involved specific, well understood errors, making it possible to
consistently zero the right page before actually raising an error using
elog().  There was no danger of leaving around a junk page, provided
_bt_split() stuck to this coding rule.

Commit 8224de4f, which introduced INCLUDE indexes, added code to make
_bt_split() truncate away non-key attributes.  This happened at a point
that broke the rule around zeroing the right page in _bt_split().  If
truncation failed (perhaps due to palloc() failure), that would result
in an errant right page buffer with junk contents.  This could confuse
VACUUM when it attempted to delete the page, and should be avoided on
general principle.

To fix, reorganize _bt_split() so that truncation occurs before the new
right page buffer is even acquired.  A junk page/buffer will not be left
behind if _bt_nonkey_truncate()/_bt_truncate() raise an error.

Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com
Backpatch: 11-, where INCLUDE indexes were introduced.
This commit is contained in:
Peter Geoghegan 2019-05-13 10:27:59 -07:00
parent 221b377f09
commit 9b42e71376
1 changed files with 129 additions and 90 deletions

View File

@ -49,8 +49,8 @@ static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
OffsetNumber newitemoff, OffsetNumber newitemoff,
bool split_only_page); bool split_only_page);
static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff, Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
Size newitemsz, IndexTuple newitem, bool newitemonleft); IndexTuple newitem);
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
BTStack stack, bool is_root, bool is_only); BTStack stack, bool is_root, bool is_only);
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
@ -943,7 +943,6 @@ _bt_insertonpg(Relation rel,
{ {
Page page; Page page;
BTPageOpaque lpageop; BTPageOpaque lpageop;
OffsetNumber firstright = InvalidOffsetNumber;
Size itemsz; Size itemsz;
page = BufferGetPage(buf); page = BufferGetPage(buf);
@ -979,7 +978,6 @@ _bt_insertonpg(Relation rel,
{ {
bool is_root = P_ISROOT(lpageop); bool is_root = P_ISROOT(lpageop);
bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop); bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
bool newitemonleft;
Buffer rbuf; Buffer rbuf;
/* /*
@ -1000,14 +998,8 @@ _bt_insertonpg(Relation rel,
Assert(!(P_ISLEAF(lpageop) && Assert(!(P_ISLEAF(lpageop) &&
BlockNumberIsValid(RelationGetTargetBlock(rel)))); BlockNumberIsValid(RelationGetTargetBlock(rel))));
/* Choose the split point */
firstright = _bt_findsplitloc(rel, page,
newitemoff, itemsz, itup,
&newitemonleft);
/* split the buffer into left and right halves */ /* split the buffer into left and right halves */
rbuf = _bt_split(rel, itup_key, buf, cbuf, firstright, newitemoff, rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup);
itemsz, itup, newitemonleft);
PredicateLockPageSplit(rel, PredicateLockPageSplit(rel,
BufferGetBlockNumber(buf), BufferGetBlockNumber(buf),
BufferGetBlockNumber(rbuf)); BufferGetBlockNumber(rbuf));
@ -1211,9 +1203,8 @@ _bt_insertonpg(Relation rel,
* _bt_split() -- split a page in the btree. * _bt_split() -- split a page in the btree.
* *
* On entry, buf is the page to split, and is pinned and write-locked. * On entry, buf is the page to split, and is pinned and write-locked.
* firstright is the item index of the first item to be moved to the * newitemoff etc. tell us about the new item that must be inserted
* new right page. newitemoff etc. tell us about the new item that * along with the data from the original page.
* must be inserted along with the data from the old page.
* *
* itup_key is used for suffix truncation on leaf pages (internal * itup_key is used for suffix truncation on leaf pages (internal
* page callers pass NULL). When splitting a non-leaf page, 'cbuf' * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
@ -1226,8 +1217,7 @@ _bt_insertonpg(Relation rel,
*/ */
static Buffer static Buffer
_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem)
IndexTuple newitem, bool newitemonleft)
{ {
Buffer rbuf; Buffer rbuf;
Page origpage; Page origpage;
@ -1246,36 +1236,67 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
IndexTuple item; IndexTuple item;
OffsetNumber leftoff, OffsetNumber leftoff,
rightoff; rightoff;
OffsetNumber firstright;
OffsetNumber maxoff; OffsetNumber maxoff;
OffsetNumber i; OffsetNumber i;
bool isleaf; bool newitemonleft,
isleaf;
IndexTuple lefthikey; IndexTuple lefthikey;
int indnatts = IndexRelationGetNumberOfAttributes(rel); int indnatts = IndexRelationGetNumberOfAttributes(rel);
int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
/* Acquire a new page to split into */
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
/* /*
* origpage is the original page to be split. leftpage is a temporary * origpage is the original page to be split. leftpage is a temporary
* buffer that receives the left-sibling data, which will be copied back * buffer that receives the left-sibling data, which will be copied back
* into origpage on success. rightpage is the new page that receives the * into origpage on success. rightpage is the new page that will receive
* right-sibling data. If we fail before reaching the critical section, * the right-sibling data.
* origpage hasn't been modified and leftpage is only workspace. In *
* principle we shouldn't need to worry about rightpage either, because it * leftpage is allocated after choosing a split point. rightpage's new
* hasn't been linked into the btree page structure; but to avoid leaving * buffer isn't acquired until after leftpage is initialized and has new
* possibly-confusing junk behind, we are careful to rewrite rightpage as * high key, the last point where splitting the page may fail (barring
* zeroes before throwing any error. * corruption). Failing before acquiring new buffer won't have lasting
* consequences, since origpage won't have been modified and leftpage is
* only workspace.
*/ */
origpage = BufferGetPage(buf); origpage = BufferGetPage(buf);
leftpage = PageGetTempPage(origpage); oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
rightpage = BufferGetPage(rbuf);
origpagenumber = BufferGetBlockNumber(buf); origpagenumber = BufferGetBlockNumber(buf);
rightpagenumber = BufferGetBlockNumber(rbuf);
/*
* Choose a point to split origpage at.
*
* A split point can be thought of as a point _between_ two existing
* tuples on origpage (lastleft and firstright tuples), provided you
* pretend that the new item that didn't fit is already on origpage.
*
* Since origpage does not actually contain newitem, the representation of
* split points needs to work with two boundary cases: splits where
* newitem is lastleft, and splits where newitem is firstright.
* newitemonleft resolves the ambiguity that would otherwise exist when
* newitemoff == firstright. In all other cases it's clear which side of
* the split every tuple goes on from context. newitemonleft is usually
* (but not always) redundant information.
*/
firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
newitem, &newitemonleft);
/* Allocate temp buffer for leftpage */
leftpage = PageGetTempPage(origpage);
_bt_pageinit(leftpage, BufferGetPageSize(buf)); _bt_pageinit(leftpage, BufferGetPageSize(buf));
/* rightpage was already initialized by _bt_getbuf */ lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
/*
* leftpage won't be the root when we're done. Also, clear the SPLIT_END
* and HAS_GARBAGE flags.
*/
lopaque->btpo_flags = oopaque->btpo_flags;
lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
/* set flag in leftpage indicating that rightpage has no downlink yet */
lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
lopaque->btpo_prev = oopaque->btpo_prev;
/* handle btpo_next after rightpage buffer acquired */
lopaque->btpo.level = oopaque->btpo.level;
/* handle btpo_cycleid after rightpage buffer acquired */
/* /*
* Copy the original page's LSN into leftpage, which will become the * Copy the original page's LSN into leftpage, which will become the
@ -1283,62 +1304,12 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* examine the LSN and possibly dump it in a page image. * examine the LSN and possibly dump it in a page image.
*/ */
PageSetLSN(leftpage, PageGetLSN(origpage)); PageSetLSN(leftpage, PageGetLSN(origpage));
/* init btree private data */
oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
isleaf = P_ISLEAF(oopaque); isleaf = P_ISLEAF(oopaque);
/* if we're splitting this page, it won't be the root when we're done */
/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
lopaque->btpo_flags = oopaque->btpo_flags;
lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
ropaque->btpo_flags = lopaque->btpo_flags;
/* set flag in left page indicating that the right page has no downlink */
lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
lopaque->btpo_prev = oopaque->btpo_prev;
lopaque->btpo_next = rightpagenumber;
ropaque->btpo_prev = origpagenumber;
ropaque->btpo_next = oopaque->btpo_next;
lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
/* Since we already have write-lock on both pages, ok to read cycleid */
lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
ropaque->btpo_cycleid = lopaque->btpo_cycleid;
/*
* If the page we're splitting is not the rightmost page at its level in
* the tree, then the first entry on the page is the high key for the
* page. We need to copy that to the right half. Otherwise (meaning the
* rightmost page case), all the items on the right half will be user
* data.
*/
rightoff = P_HIKEY;
if (!P_RIGHTMOST(oopaque))
{
itemid = PageGetItemId(origpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
Assert(BTreeTupleGetNAtts(item, rel) > 0);
Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
false, false) == InvalidOffsetNumber)
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add hikey to the right sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
rightoff = OffsetNumberNext(rightoff);
}
/* /*
* The "high key" for the new left page will be the first key that's going * The "high key" for the new left page will be the first key that's going
* to go into the new right page, or possibly a truncated version if this * to go into the new right page, or a truncated version if this is a leaf
* is a leaf page split. This might be either the existing data item at * page split.
* position firstright, or the incoming tuple.
* *
* The high key for the left page is formed using the first item on the * The high key for the left page is formed using the first item on the
* right page, which may seem to be contrary to Lehman & Yao's approach of * right page, which may seem to be contrary to Lehman & Yao's approach of
@ -1360,7 +1331,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* tuple could be physically larger despite being opclass-equal in respect * tuple could be physically larger despite being opclass-equal in respect
* of all attributes prior to the heap TID attribute.) * of all attributes prior to the heap TID attribute.)
*/ */
leftoff = P_HIKEY;
if (!newitemonleft && newitemoff == firstright) if (!newitemonleft && newitemoff == firstright)
{ {
/* incoming tuple will become first on right page */ /* incoming tuple will become first on right page */
@ -1416,23 +1386,91 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
else else
lefthikey = item; lefthikey = item;
/*
* Add new high key to leftpage
*/
leftoff = P_HIKEY;
Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0); Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts); Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff, if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
false, false) == InvalidOffsetNumber) false, false) == InvalidOffsetNumber)
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add hikey to the left sibling" elog(ERROR, "failed to add hikey to the left sibling"
" while splitting block %u of index \"%s\"", " while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel)); origpagenumber, RelationGetRelationName(rel));
}
leftoff = OffsetNumberNext(leftoff); leftoff = OffsetNumberNext(leftoff);
/* be tidy */ /* be tidy */
if (lefthikey != item) if (lefthikey != item)
pfree(lefthikey); pfree(lefthikey);
/* /*
* Now transfer all the data items to the appropriate page. * Acquire a new right page to split into, now that left page has a new
* high key. From here on, it's not okay to throw an error without
* zeroing rightpage first. This coding rule ensures that we won't
* confuse future VACUUM operations, which might otherwise try to re-find
* a downlink to a leftover junk page as the page undergoes deletion.
*
* It would be reasonable to start the critical section just after the new
* rightpage buffer is acquired instead; that would allow us to avoid
* leftover junk pages without bothering to zero rightpage. We do it this
* way because it avoids an unnecessary PANIC when either origpage or its
* existing sibling page are corrupt.
*/
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
rightpage = BufferGetPage(rbuf);
rightpagenumber = BufferGetBlockNumber(rbuf);
/* rightpage was initialized by _bt_getbuf */
ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
/*
* Finish off remaining leftpage special area fields. They cannot be set
* before both origpage (leftpage) and rightpage buffers are acquired and
* locked.
*/
lopaque->btpo_next = rightpagenumber;
lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
/*
* rightpage won't be the root when we're done. Also, clear the SPLIT_END
* and HAS_GARBAGE flags.
*/
ropaque->btpo_flags = oopaque->btpo_flags;
ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
ropaque->btpo_prev = origpagenumber;
ropaque->btpo_next = oopaque->btpo_next;
ropaque->btpo.level = oopaque->btpo.level;
ropaque->btpo_cycleid = lopaque->btpo_cycleid;
/*
* Add new high key to rightpage where necessary.
*
* If the page we're splitting is not the rightmost page at its level in
* the tree, then the first entry on the page is the high key from
* origpage.
*/
rightoff = P_HIKEY;
if (!P_RIGHTMOST(oopaque))
{
itemid = PageGetItemId(origpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
Assert(BTreeTupleGetNAtts(item, rel) > 0);
Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
false, false) == InvalidOffsetNumber)
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add hikey to the right sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
rightoff = OffsetNumberNext(rightoff);
}
/*
* Now transfer all the data items (non-pivot tuples in isleaf case, or
* additional pivot tuples in !isleaf case) to the appropriate page.
* *
* Note: we *must* insert at least the right page's items in item-number * Note: we *must* insert at least the right page's items in item-number
* order, for the benefit of _bt_restore_page(). * order, for the benefit of _bt_restore_page().
@ -1450,6 +1488,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
{ {
if (newitemonleft) if (newitemonleft)
{ {
Assert(newitemoff <= firstright);
if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff)) if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
{ {
memset(rightpage, 0, BufferGetPageSize(rbuf)); memset(rightpage, 0, BufferGetPageSize(rbuf));
@ -1461,6 +1500,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
} }
else else
{ {
Assert(newitemoff >= firstright);
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff)) if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
{ {
memset(rightpage, 0, BufferGetPageSize(rbuf)); memset(rightpage, 0, BufferGetPageSize(rbuf));
@ -1523,7 +1563,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* all readers release locks on a page before trying to fetch its * all readers release locks on a page before trying to fetch its
* neighbors. * neighbors.
*/ */
if (!P_RIGHTMOST(oopaque)) if (!P_RIGHTMOST(oopaque))
{ {
sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE); sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);