diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index cd110df332..4820f76e3b 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -404,12 +404,41 @@ an additional insertion above that, etc). For a root split, the followon WAL entry is a "new root" entry rather than an "insertion" entry, but details are otherwise much the same. -Because insertion involves multiple atomic actions, the WAL replay logic -has to detect the case where a page split isn't followed by a matching -insertion on the parent level, and then do that insertion on its own (and -recursively for any subsequent parent insertion, of course). This is -feasible because the WAL entry for the split contains enough info to know -what must be inserted in the parent level. +Because splitting involves multiple atomic actions, it's possible that the +system crashes between splitting a page and inserting the downlink for the +new half to the parent. After recovery, the downlink for the new page will +be missing. The search algorithm works correctly, as the page will be found +by following the right-link from its left sibling, although if a lot of +downlinks in the tree are missing, performance will suffer. A more serious +consequence is that if the page without a downlink gets split again, the +insertion algorithm will fail to find the location in the parent level to +insert the downlink. + +Our approach is to create any missing downlinks on-the-fly, when searching +the tree for a new insertion. It could be done during searches, too, but +it seems best not to put any extra updates in what would otherwise be a +read-only operation (updating is not possible in hot standby mode anyway). +It would seem natural to add the missing downlinks in VACUUM, but since +inserting a downlink might require splitting a page, it might fail if you +run out of disk space. 
That would be bad during VACUUM - the reason for +running VACUUM in the first place might be that you run out of disk space, +and now VACUUM won't finish because you're out of disk space. In contrast, +an insertion can require enlarging the physical file anyway. + +To identify missing downlinks, when a page is split, the left page is +flagged to indicate that the split is not yet complete (INCOMPLETE_SPLIT). +When the downlink is inserted to the parent, the flag is cleared atomically +with the insertion. The child page is kept locked until the insertion in +the parent is finished and the flag in the child cleared, but can be +released immediately after that, before recursing up the tree if the parent +also needs to be split. This ensures that incompletely split pages should +not be seen under normal circumstances; only if insertion to the parent +has failed for some reason. + +We flag the left page, even though it's the right page that's missing the +downlink, because it's more convenient to know already when following the +right-link from the left page to the right page that it will need to have +its downlink inserted to the parent. When splitting a non-root page that is alone on its level, the required metapage update (of the "fast root" link) is performed and logged as part @@ -419,8 +448,16 @@ metapage update is handled as part of the "new root" action. Each step in page deletion are logged as separate WAL entries: marking the leaf as half-dead and removing the downlink is one record, and unlinking a page is a second record. If vacuum is interrupted for some reason, or the -system crashes, the tree is consistent for searches and insertions. The next -VACUUM will find the half-dead leaf page and continue the deletion. +system crashes, the tree is consistent for searches and insertions. The +next VACUUM will find the half-dead leaf page and continue the deletion. 
+ +Before 9.4, we used to keep track of incomplete splits and page deletions +during recovery and finish them immediately at end of recovery, instead of +doing it lazily at the next insertion or vacuum. However, that made the +recovery much more complicated, and only fixed the problem when crash +recovery was performed. An incomplete split can also occur if an otherwise +recoverable error, like out-of-memory or out-of-disk-space, happens while +inserting the downlink to the parent. Scans during Recovery --------------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 5f7953f5b3..3cd70ad4cf 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -57,15 +57,18 @@ static void _bt_findinsertloc(Relation rel, int keysz, ScanKey scankey, IndexTuple newtup, + BTStack stack, Relation heapRel); -static void _bt_insertonpg(Relation rel, Buffer buf, +static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page); -static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, - OffsetNumber newitemoff, Size newitemsz, +static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf, + OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool newitemonleft); +static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, + BTStack stack, bool is_root, bool is_only); static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber newitemoff, Size newitemsz, @@ -129,7 +132,8 @@ top: * move right in the tree. See Lehman and Yao for an excruciatingly * precise description. 
*/ - buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE); + buf = _bt_moveright(rel, buf, natts, itup_scankey, false, + true, stack, BT_WRITE); /* * If we're not allowing duplicates, make sure the key isn't already in @@ -182,8 +186,9 @@ top: */ CheckForSerializableConflictIn(rel, NULL, buf); /* do the insertion */ - _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel); - _bt_insertonpg(rel, buf, stack, itup, offset, false); + _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, + stack, heapRel); + _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false); } else { @@ -507,6 +512,7 @@ _bt_findinsertloc(Relation rel, int keysz, ScanKey scankey, IndexTuple newtup, + BTStack stack, Relation heapRel) { Buffer buf = *bufptr; @@ -568,6 +574,7 @@ _bt_findinsertloc(Relation rel, while (PageGetFreeSpace(page) < itemsz) { Buffer rbuf; + BlockNumber rblkno; /* * before considering moving right, see if we can obtain enough space @@ -605,18 +612,33 @@ _bt_findinsertloc(Relation rel, */ rbuf = InvalidBuffer; + rblkno = lpageop->btpo_next; for (;;) { - BlockNumber rblkno = lpageop->btpo_next; - rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); page = BufferGetPage(rbuf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * If this page was incompletely split, finish the split now. + * We do this while holding a lock on the left sibling, which + * is not good because finishing the split could be a fairly + * lengthy operation. But this should happen very seldom. + */ + if (P_INCOMPLETE_SPLIT(lpageop)) + { + _bt_finish_split(rel, rbuf, stack); + rbuf = InvalidBuffer; + continue; + } + if (!P_IGNORE(lpageop)) break; if (P_RIGHTMOST(lpageop)) elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); + + rblkno = lpageop->btpo_next; } _bt_relbuf(rel, buf); buf = rbuf; @@ -658,10 +680,14 @@ _bt_findinsertloc(Relation rel, * child page on the parent. 
* + updates the metapage if a true root or fast root is split. * - * On entry, we must have the right buffer in which to do the + * On entry, we must have the correct buffer in which to do the * insertion, and the buffer must be pinned and write-locked. On return, * we will have dropped both the pin and the lock on the buffer. * + * When inserting to a non-leaf page, 'cbuf' is the left-sibling of the + * page we're inserting the downlink for. This function will clear the + * INCOMPLETE_SPLIT flag on it, and release the buffer. + * * The locking interactions in this code are critical. You should * grok Lehman and Yao's paper before making any changes. In addition, * you need to understand how we disambiguate duplicate keys in this @@ -675,6 +701,7 @@ _bt_findinsertloc(Relation rel, static void _bt_insertonpg(Relation rel, Buffer buf, + Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, @@ -688,6 +715,14 @@ _bt_insertonpg(Relation rel, page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + /* child buffer must be given iff inserting on an internal page */ + Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf)); + + /* The caller should've finished any incomplete splits already. 
*/ + if (P_INCOMPLETE_SPLIT(lpageop)) + elog(ERROR, "cannot insert to incompletely split page %u", + BufferGetBlockNumber(buf)); + itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ @@ -712,7 +747,7 @@ _bt_insertonpg(Relation rel, &newitemonleft); /* split the buffer into left and right halves */ - rbuf = _bt_split(rel, buf, firstright, + rbuf = _bt_split(rel, buf, cbuf, firstright, newitemoff, itemsz, itup, newitemonleft); PredicateLockPageSplit(rel, BufferGetBlockNumber(buf), @@ -786,11 +821,22 @@ _bt_insertonpg(Relation rel, MarkBufferDirty(metabuf); } + /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */ + if (BufferIsValid(cbuf)) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + Assert(P_INCOMPLETE_SPLIT(cpageop)); + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_btree_insert xlrec; - BlockNumber xldownlink; + BlockNumber xlleftchild; xl_btree_metadata xlmeta; uint8 xlinfo; XLogRecPtr recptr; @@ -810,12 +856,15 @@ _bt_insertonpg(Relation rel, xlinfo = XLOG_BTREE_INSERT_LEAF; else { - xldownlink = ItemPointerGetBlockNumber(&(itup->t_tid)); - Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); - - nextrdata->data = (char *) &xldownlink; + /* + * Include the block number of the left child, whose + * INCOMPLETE_SPLIT flag was cleared. 
+ */ + xlleftchild = BufferGetBlockNumber(cbuf); + nextrdata->data = (char *) &xlleftchild; nextrdata->len = sizeof(BlockNumber); - nextrdata->buffer = InvalidBuffer; + nextrdata->buffer = cbuf; + nextrdata->buffer_std = true; nextrdata->next = nextrdata + 1; nextrdata++; @@ -870,7 +919,8 @@ _bt_insertonpg(Relation rel, /* release buffers */ if (BufferIsValid(metabuf)) _bt_relbuf(rel, metabuf); - + if (BufferIsValid(cbuf)) + _bt_relbuf(rel, cbuf); _bt_relbuf(rel, buf); } } @@ -883,11 +933,15 @@ _bt_insertonpg(Relation rel, * new right page. newitemoff etc. tell us about the new item that * must be inserted along with the data from the old page. * + * When splitting a non-leaf page, 'cbuf' is the left-sibling of the + * page we're inserting the downlink for. This function will clear the + * INCOMPLETE_SPLIT flag on it, and release the buffer. + * * Returns the new right sibling of buf, pinned and write-locked. * The pin and lock on buf are maintained. */ static Buffer -_bt_split(Relation rel, Buffer buf, OffsetNumber firstright, +_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool newitemonleft) { @@ -911,6 +965,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, OffsetNumber maxoff; OffsetNumber i; bool isroot; + bool isleaf; /* Acquire a new page to split into */ rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); @@ -949,12 +1004,15 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); isroot = P_ISROOT(oopaque); + isleaf = P_ISLEAF(oopaque); /* if we're splitting this page, it won't be the root when we're done */ /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */ lopaque->btpo_flags = oopaque->btpo_flags; lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); ropaque->btpo_flags = lopaque->btpo_flags; + /* set flag in left page indicating that the right page has no downlink 
*/ + lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; lopaque->btpo_prev = oopaque->btpo_prev; lopaque->btpo_next = rightpagenumber; ropaque->btpo_prev = origpagenumber; @@ -1178,6 +1236,19 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, MarkBufferDirty(sbuf); } + /* + * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes + * a split. + */ + if (!isleaf) + { + Page cpage = BufferGetPage(cbuf); + BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage); + + cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(cbuf); + } + /* XLOG stuff */ if (RelationNeedsWAL(rel)) { @@ -1186,6 +1257,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, XLogRecPtr recptr; XLogRecData rdata[7]; XLogRecData *lastrdata; + BlockNumber cblkno; xlrec.node = rel->rd_node; xlrec.leftsib = origpagenumber; @@ -1200,34 +1272,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, lastrdata = &rdata[0]; - if (ropaque->btpo.level > 0) - { - /* Log downlink on non-leaf pages */ - lastrdata->next = lastrdata + 1; - lastrdata++; - - lastrdata->data = (char *) &newitem->t_tid.ip_blkid; - lastrdata->len = sizeof(BlockIdData); - lastrdata->buffer = InvalidBuffer; - - /* - * We must also log the left page's high key, because the right - * page's leftmost key is suppressed on non-leaf levels. Show it - * as belonging to the left page buffer, so that it is not stored - * if XLogInsert decides it needs a full-page image of the left - * page. - */ - lastrdata->next = lastrdata + 1; - lastrdata++; - - itemid = PageGetItemId(origpage, P_HIKEY); - item = (IndexTuple) PageGetItem(origpage, itemid); - lastrdata->data = (char *) item; - lastrdata->len = MAXALIGN(IndexTupleSize(item)); - lastrdata->buffer = buf; /* backup block 1 */ - lastrdata->buffer_std = true; - } - /* * Log the new item and its offset, if it was inserted on the left * page. 
(If it was put on the right page, we don't need to explicitly @@ -1254,23 +1298,62 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, lastrdata->buffer = buf; /* backup block 1 */ lastrdata->buffer_std = true; } - else if (ropaque->btpo.level == 0) + + /* Log left page */ + if (!isleaf) { - /* - * Although we don't need to WAL-log the new item, we still need - * XLogInsert to consider storing a full-page image of the left - * page, so make an empty entry referencing that buffer. This also - * ensures that the left page is always backup block 1. - */ lastrdata->next = lastrdata + 1; lastrdata++; + /* + * We must also log the left page's high key, because the right + * page's leftmost key is suppressed on non-leaf levels. Show it + * as belonging to the left page buffer, so that it is not stored + * if XLogInsert decides it needs a full-page image of the left + * page. + */ + itemid = PageGetItemId(origpage, P_HIKEY); + item = (IndexTuple) PageGetItem(origpage, itemid); + lastrdata->data = (char *) item; + lastrdata->len = MAXALIGN(IndexTupleSize(item)); + lastrdata->buffer = buf; /* backup block 1 */ + lastrdata->buffer_std = true; + } + + if (isleaf && !newitemonleft) + { + lastrdata->next = lastrdata + 1; + lastrdata++; + + /* + * Although we don't need to WAL-log anything on the left page, + * we still need XLogInsert to consider storing a full-page image + * of the left page, so make an empty entry referencing that + * buffer. This also ensures that the left page is always backup + * block 1. + */ lastrdata->data = NULL; lastrdata->len = 0; lastrdata->buffer = buf; /* backup block 1 */ lastrdata->buffer_std = true; } + /* + * Log block number of left child, whose INCOMPLETE_SPLIT flag this + * insertion clears. 
+ */ + if (!isleaf) + { + lastrdata->next = lastrdata + 1; + lastrdata++; + + cblkno = BufferGetBlockNumber(cbuf); + lastrdata->data = (char *) &cblkno; + lastrdata->len = sizeof(BlockNumber); + lastrdata->buffer = cbuf; /* backup block 2 */ + lastrdata->buffer_std = true; + } + /* * Log the contents of the right page in the format understood by * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer, @@ -1300,7 +1383,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, lastrdata->data = NULL; lastrdata->len = 0; - lastrdata->buffer = sbuf; /* backup block 2 */ + lastrdata->buffer = sbuf; /* bkp block 2 (leaf) or 3 (non-leaf) */ lastrdata->buffer_std = true; } @@ -1327,6 +1410,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, if (!P_RIGHTMOST(ropaque)) _bt_relbuf(rel, sbuf); + /* release the child */ + if (!isleaf) + _bt_relbuf(rel, cbuf); + /* split's done */ return rbuf; } @@ -1597,10 +1684,8 @@ _bt_checksplitloc(FindSplitData *state, * have to be efficient (concurrent ROOT split, WAL recovery) * is_root - we split the true root * is_only - we split a page alone on its level (might have been fast root) - * - * This is exported so it can be called by nbtxlog.c. */ -void +static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, @@ -1619,8 +1704,7 @@ _bt_insert_parent(Relation rel, * * If we have to search for the parent level, we do so by re-descending * from the root. This is not super-efficient, but it's rare enough not - * to matter. (This path is also taken when called from WAL recovery --- - * we have no stack in that case.) + * to matter. */ if (is_root) { @@ -1679,12 +1763,13 @@ _bt_insert_parent(Relation rel, * 05/27/97 */ ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY); - pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); - /* Now we can unlock the children */ + /* + * Now we can unlock the right child. The left child will be unlocked + * by _bt_insertonpg(). 
+ */ _bt_relbuf(rel, rbuf); - _bt_relbuf(rel, buf); /* Check for error only after writing children */ if (pbuf == InvalidBuffer) @@ -1692,7 +1777,7 @@ _bt_insert_parent(Relation rel, RelationGetRelationName(rel), bknum, rbknum); /* Recursively update the parent */ - _bt_insertonpg(rel, pbuf, stack->bts_parent, + _bt_insertonpg(rel, pbuf, buf, stack->bts_parent, new_item, stack->bts_offset + 1, is_only); @@ -1701,6 +1786,62 @@ _bt_insert_parent(Relation rel, } } +/* + * _bt_finish_split() -- Finish an incomplete split + * + * A crash or other failure can leave a split incomplete. The insertion + * routines won't allow to insert on a page that is incompletely split. + * Before inserting on such a page, call _bt_finish_split(). + * + * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked + * and unpinned. + */ +void +_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack) +{ + Page lpage = BufferGetPage(lbuf); + BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage); + Buffer rbuf; + Page rpage; + BTPageOpaque rpageop; + bool was_root; + bool was_only; + + Assert(P_INCOMPLETE_SPLIT(lpageop)); + + /* Lock right sibling, the one missing the downlink */ + rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE); + rpage = BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* Could this be a root split? */ + if (!stack) + { + Buffer metabuf; + Page metapg; + BTMetaPageData *metad; + + /* acquire lock on the metapage */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metad = BTPageGetMeta(metapg); + + was_root = (metad->btm_root == BufferGetBlockNumber(lbuf)); + + _bt_relbuf(rel, metabuf); + } + else + was_root = false; + + /* Was this the only page on the level before split? 
*/ + was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop)); + + elog(DEBUG1, "finishing incomplete split of %u/%u", + BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf)); + + _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only); +} + /* * _bt_getstackbuf() -- Walk back up the tree one step, and find the item * we last looked at in the parent. @@ -1733,6 +1874,12 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access) page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (access == BT_WRITE && P_INCOMPLETE_SPLIT(opaque)) + { + _bt_finish_split(rel, buf, stack->bts_parent); + continue; + } + if (!P_IGNORE(opaque)) { OffsetNumber offnum, @@ -1837,6 +1984,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) rbkno; BlockNumber rootblknum; BTPageOpaque rootopaque; + BTPageOpaque lopaque; ItemId itemid; IndexTuple item; Size itemsz; @@ -1848,6 +1996,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) lbkno = BufferGetBlockNumber(lbuf); rbkno = BufferGetBlockNumber(rbuf); lpage = BufferGetPage(lbuf); + lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); /* get a new root page */ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); @@ -1921,6 +2070,11 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); pfree(new_item); + /* Clear the incomplete-split flag in the left child */ + Assert(P_INCOMPLETE_SPLIT(lopaque)); + lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + MarkBufferDirty(lbuf); + MarkBufferDirty(rootbuf); MarkBufferDirty(metabuf); @@ -1929,7 +2083,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) { xl_btree_newroot xlrec; XLogRecPtr recptr; - XLogRecData rdata[2]; + XLogRecData rdata[3]; xlrec.node = rel->rd_node; xlrec.rootblk = rootblknum; @@ -1948,7 +2102,13 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) rdata[1].len = ((PageHeader) rootpage)->pd_special - ((PageHeader) rootpage)->pd_upper; rdata[1].buffer = InvalidBuffer; - 
rdata[1].next = NULL; + rdata[1].next = &(rdata[2]); + + /* Make a full-page image of the left child if needed */ + rdata[2].data = NULL; + rdata[2].len = 0; + rdata[2].buffer = lbuf; + rdata[2].next = NULL; recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata); diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 14e422a1d3..87ac5f4aaf 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -992,6 +992,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, Buffer pbuf; Page page; BTPageOpaque opaque; + BlockNumber leftsib; /* Locate the parent's downlink (updating the stack entry if needed) */ ItemPointerSet(&(stack->bts_btentry.t_tid), child, P_HIKEY); @@ -1020,7 +1021,8 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, * We have to check the parent itself, and then recurse to test * the conditions at the parent's parent. */ - if (P_RIGHTMOST(opaque) || P_ISROOT(opaque)) + if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || + P_INCOMPLETE_SPLIT(opaque)) { _bt_relbuf(rel, pbuf); return false; @@ -1028,8 +1030,41 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, *target = parent; *rightsib = opaque->btpo_next; + leftsib = opaque->btpo_prev; _bt_relbuf(rel, pbuf); + + /* + * Like in _bt_pagedel, check that the left sibling is not marked + * with INCOMPLETE_SPLIT flag. That would mean that there is no + * downlink to the page to be deleted, and the page deletion + * algorithm isn't prepared to handle that. + */ + if (leftsib != P_NONE) + { + Buffer lbuf; + Page lpage; + BTPageOpaque lopaque; + + lbuf = _bt_getbuf(rel, leftsib, BT_READ); + lpage = BufferGetPage(lbuf); + lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); + /* + * If the left sibling was concurrently split, so that its + * next-pointer doesn't point to the current page anymore, + * the split that created the current page must be completed. 
+ (We don't allow splitting an incompletely split page again + until the previous split has been completed) + */ + if (lopaque->btpo_next == parent && + P_INCOMPLETE_SPLIT(lopaque)) + { + _bt_relbuf(rel, lbuf); + return false; + } + _bt_relbuf(rel, lbuf); + } + return _bt_lock_branch_parent(rel, parent, stack->bts_parent, topparent, topoff, target, rightsib); } @@ -1081,6 +1116,10 @@ _bt_pagedel(Relation rel, Buffer buf) * "stack" is a search stack leading (approximately) to the target page. * It is initially NULL, but when iterating, we keep it to avoid * duplicated search effort. + * + * Also, when "stack" is not NULL, we have already checked that the + * current page is not the right half of an incomplete split, i.e. the + * left sibling does not have its INCOMPLETE_SPLIT flag set. */ BTStack stack = NULL; @@ -1117,11 +1156,25 @@ _bt_pagedel(Relation rel, Buffer buf) } /* - * We can never delete rightmost pages nor root pages. While at it, - * check that page is not already deleted and is empty. + * We can never delete rightmost pages nor root pages. While at + * it, check that page is not already deleted and is empty. + * + * To keep the algorithm simple, we also never delete an incompletely + * split page (they should be rare enough that this doesn't make any + * meaningful difference to disk usage): + * + * The INCOMPLETE_SPLIT flag on the page tells us if the page is the + * left half of an incomplete split, but ensuring that it's not the + * right half is more complicated. For that, we have to check that + * the left sibling doesn't have its INCOMPLETE_SPLIT flag set. On + * the first iteration, we temporarily release the lock on the + * current page, and check the left sibling and also construct a + * search stack. On subsequent iterations, we know we stepped right + * from a page that passed these tests, so it's OK. 
*/ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || - P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) + P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || + P_INCOMPLETE_SPLIT(opaque)) { /* Should never fail to delete a half-dead page */ Assert(!P_ISHALFDEAD(opaque)); @@ -1142,6 +1195,9 @@ _bt_pagedel(Relation rel, Buffer buf) * use the standard search mechanism to search for the page's high * key; this will give us a link to either the current parent or * someplace to its left (if there are multiple equal high keys). + * + * Also check if this is the right-half of an incomplete split + * (see comment above). */ if (!stack) { @@ -1149,16 +1205,43 @@ _bt_pagedel(Relation rel, Buffer buf) ItemId itemid; IndexTuple targetkey; Buffer lbuf; + BlockNumber leftsib; itemid = PageGetItemId(page, P_HIKEY); targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid)); + leftsib = opaque->btpo_prev; + /* * To avoid deadlocks, we'd better drop the leaf page lock * before going further. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); + /* + * Fetch the left sibling, to check that it's not marked + * with INCOMPLETE_SPLIT flag. That would mean that the + * page to-be-deleted doesn't have a downlink, and the page + * deletion algorithm isn't prepared to handle that. 
+ */ + if (!P_LEFTMOST(opaque)) + { + BTPageOpaque lopaque; + Page lpage; + + lbuf = _bt_getbuf(rel, leftsib, BT_READ); + lpage = BufferGetPage(lbuf); + lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); + if (lopaque->btpo_next == BufferGetBlockNumber(buf) && + P_INCOMPLETE_SPLIT(lopaque)) + { + ReleaseBuffer(buf); + _bt_relbuf(rel, lbuf); + return ndeleted; + } + _bt_relbuf(rel, lbuf); + } + /* we need an insertion scan key for the search, so build one */ itup_scankey = _bt_mkscankey(rel, targetkey); /* find the leftmost leaf page containing this key */ diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index bc62fbc02e..0bf12f0e10 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -51,7 +51,8 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); * NOTE that the returned buffer is read-locked regardless of the access * parameter. However, access = BT_WRITE will allow an empty root page * to be created and returned. When access = BT_READ, an empty index - * will result in *bufP being set to InvalidBuffer. + * will result in *bufP being set to InvalidBuffer. Also, in BT_WRITE mode, + * any incomplete splits encountered during the search will be finished. */ BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, @@ -82,8 +83,17 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * Race -- the page we just grabbed may have split since we read its * pointer in the parent (or metapage). If it has, we may need to * move right to its new sibling. Do that. + * + * In write-mode, allow _bt_moveright to finish any incomplete splits + * along the way. Strictly speaking, we'd only need to finish an + * incomplete split on the leaf page we're about to insert to, not on + * any of the upper levels (they are taken care of in _bt_getstackbuf, + * if the leaf page is split and we insert to the parent page). 
But + * this is a good opportunity to finish splits of internal pages too. */ - *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ); + *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, + (access == BT_WRITE), stack_in, + BT_READ); /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); @@ -148,6 +158,11 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, * item >= scankey. When nextkey is true, we are looking for the first * item strictly greater than scankey. * + * If forupdate is true, we will attempt to finish any incomplete splits + * that we encounter. This is required when locking a target page for an + * insertion, because we don't allow inserting on a page before the split + * is completed. 'stack' is only used if forupdate is true. + * * On entry, we have the buffer pinned and a lock of the type specified by * 'access'. If we move right, we release the buffer and lock and acquire * the same on the right sibling. Return value is the buffer we stop at. @@ -158,15 +173,14 @@ _bt_moveright(Relation rel, int keysz, ScanKey scankey, bool nextkey, + bool forupdate, + BTStack stack, int access) { Page page; BTPageOpaque opaque; int32 cmpval; - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - /* * When nextkey = false (normal case): if the scan key that brought us to * this page is > the high key stored on the page, then the page has split @@ -184,16 +198,46 @@ _bt_moveright(Relation rel, */ cmpval = nextkey ? 0 : 1; - while (!P_RIGHTMOST(opaque) && - (P_IGNORE(opaque) || - _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)) + for (;;) { - /* step right one page */ - BlockNumber rblkno = opaque->btpo_next; - - buf = _bt_relandgetbuf(rel, buf, rblkno, access); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_RIGHTMOST(opaque)) + break; + + /* + * Finish any incomplete splits we encounter along the way. 
+ */ + if (forupdate && P_INCOMPLETE_SPLIT(opaque)) + { + BlockNumber blkno = BufferGetBlockNumber(buf); + + /* upgrade our lock if necessary */ + if (access == BT_READ) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf, BT_WRITE); + } + + if (P_INCOMPLETE_SPLIT(opaque)) + _bt_finish_split(rel, buf, stack); + else + _bt_relbuf(rel, buf); + + /* re-acquire the lock in the right mode, and re-check */ + buf = _bt_getbuf(rel, blkno, access); + continue; + } + + if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval) + { + /* step right one page */ + buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); + continue; + } + else + break; } if (P_IGNORE(opaque)) diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index bc376fd409..665e60b7bd 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -20,61 +20,6 @@ #include "storage/procarray.h" #include "miscadmin.h" -/* - * We must keep track of expected insertions due to page splits, and apply - * them manually if they are not seen in the WAL log during replay. This - * makes it safe for page insertion to be a multiple-WAL-action process. - * - * The data structure is a simple linked list --- this should be good enough, - * since we don't expect a page split to remain incomplete for long. In any - * case we need to respect the order of operations. 
- */ -typedef struct bt_incomplete_split -{ - RelFileNode node; /* the index */ - bool is_root; /* we split the root */ - BlockNumber leftblk; /* left half of split */ - BlockNumber rightblk; /* right half of split */ -} bt_incomplete_split; - -static List *incomplete_splits; - - -static void -log_incomplete_split(RelFileNode node, BlockNumber leftblk, - BlockNumber rightblk, bool is_root) -{ - bt_incomplete_split *action = palloc(sizeof(bt_incomplete_split)); - - action->node = node; - action->is_root = is_root; - action->leftblk = leftblk; - action->rightblk = rightblk; - incomplete_splits = lappend(incomplete_splits, action); -} - -static void -forget_matching_split(RelFileNode node, BlockNumber downlink, bool is_root) -{ - ListCell *l; - - foreach(l, incomplete_splits) - { - bt_incomplete_split *action = (bt_incomplete_split *) lfirst(l); - - if (RelFileNodeEquals(node, action->node) && - downlink == action->rightblk) - { - if (is_root != action->is_root) - elog(LOG, "forget_matching_split: fishy is_root data (expected %d, got %d)", - action->is_root, is_root); - incomplete_splits = list_delete_ptr(incomplete_splits, action); - pfree(action); - break; /* need not look further */ - } - } -} - /* * _bt_restore_page -- re-enter all the index tuples on a page * @@ -149,23 +94,60 @@ _bt_restore_meta(RelFileNode rnode, XLogRecPtr lsn, UnlockReleaseBuffer(metabuf); } +/* + * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page + * + * This is a common subroutine of the redo functions of all the WAL record + * types that can insert a downlink: insert, split, and newroot. 
+ */ +static void +_bt_clear_incomplete_split(XLogRecPtr lsn, XLogRecord *record, + RelFileNode rnode, BlockNumber cblock) +{ + Buffer buf; + + buf = XLogReadBuffer(rnode, cblock, false); + if (BufferIsValid(buf)) + { + Page page = (Page) BufferGetPage(buf); + + if (lsn > PageGetLSN(page)) + { + BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); + Assert((pageop->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0); + pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; + + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + } + UnlockReleaseBuffer(buf); + } +} + static void btree_xlog_insert(bool isleaf, bool ismeta, XLogRecPtr lsn, XLogRecord *record) { xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record); Buffer buffer; + Buffer cbuffer = InvalidBuffer; Page page; char *datapos; int datalen; xl_btree_metadata md; - BlockNumber downlink = 0; + BlockNumber cblkno = 0; + int main_blk_index; datapos = (char *) xlrec + SizeOfBtreeInsert; datalen = record->xl_len - SizeOfBtreeInsert; - if (!isleaf) + /* + * if this insert finishes a split at lower level, extract the block + * number of the (left) child. 
+ */ + if (!isleaf && (record->xl_info & XLR_BKP_BLOCK(0)) == 0) { - memcpy(&downlink, datapos, sizeof(BlockNumber)); + memcpy(&cblkno, datapos, sizeof(BlockNumber)); + Assert(cblkno != 0); datapos += sizeof(BlockNumber); datalen -= sizeof(BlockNumber); } @@ -176,8 +158,19 @@ btree_xlog_insert(bool isleaf, bool ismeta, datalen -= sizeof(xl_btree_metadata); } - if (record->xl_info & XLR_BKP_BLOCK(0)) - (void) RestoreBackupBlock(lsn, record, 0, false, false); + if (!isleaf) + { + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else + _bt_clear_incomplete_split(lsn, record, xlrec->target.node, cblkno); + main_blk_index = 1; + } + else + main_blk_index = 0; + + if (record->xl_info & XLR_BKP_BLOCK(main_blk_index)) + (void) RestoreBackupBlock(lsn, record, main_blk_index, false, false); else { buffer = XLogReadBuffer(xlrec->target.node, @@ -187,11 +180,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, { page = (Page) BufferGetPage(buffer); - if (lsn <= PageGetLSN(page)) - { - UnlockReleaseBuffer(buffer); - } - else + if (lsn > PageGetLSN(page)) { if (PageAddItem(page, (Item) datapos, datalen, ItemPointerGetOffsetNumber(&(xlrec->target.tid)), @@ -200,11 +189,14 @@ btree_xlog_insert(bool isleaf, bool ismeta, PageSetLSN(page, lsn); MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); } + UnlockReleaseBuffer(buffer); } } + if (BufferIsValid(cbuffer)) + UnlockReleaseBuffer(cbuffer); + /* * Note: in normal operation, we'd update the metapage while still holding * lock on the page we inserted into. 
But during replay it's not @@ -216,10 +208,6 @@ btree_xlog_insert(bool isleaf, bool ismeta, _bt_restore_meta(xlrec->target.node, lsn, md.root, md.level, md.fastroot, md.fastlevel); - - /* Forget any split this insertion completes */ - if (!isleaf) - forget_matching_split(xlrec->target.node, downlink, false); } static void @@ -227,6 +215,8 @@ btree_xlog_split(bool onleft, bool isroot, XLogRecPtr lsn, XLogRecord *record) { xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); + bool isleaf = (xlrec->level == 0); + Buffer lbuf; Buffer rbuf; Page rpage; BTPageOpaque ropaque; @@ -237,42 +227,18 @@ btree_xlog_split(bool onleft, bool isroot, Size newitemsz = 0; Item left_hikey = NULL; Size left_hikeysz = 0; + BlockNumber cblkno = InvalidBlockNumber; datapos = (char *) xlrec + SizeOfBtreeSplit; datalen = record->xl_len - SizeOfBtreeSplit; - /* Forget any split this insertion completes */ - if (xlrec->level > 0) - { - /* we assume SizeOfBtreeSplit is at least 16-bit aligned */ - BlockNumber downlink = BlockIdGetBlockNumber((BlockId) datapos); - - datapos += sizeof(BlockIdData); - datalen -= sizeof(BlockIdData); - - forget_matching_split(xlrec->node, downlink, false); - - /* Extract left hikey and its size (still assuming 16-bit alignment) */ - if (!(record->xl_info & XLR_BKP_BLOCK(0))) - { - /* We assume 16-bit alignment is enough for IndexTupleSize */ - left_hikey = (Item) datapos; - left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); - - datapos += left_hikeysz; - datalen -= left_hikeysz; - } - } - - /* Extract newitem and newitemoff, if present */ + /* Extract newitemoff and newitem, if present */ if (onleft) { - /* Extract the offset (still assuming 16-bit alignment) */ memcpy(&newitemoff, datapos, sizeof(OffsetNumber)); datapos += sizeof(OffsetNumber); datalen -= sizeof(OffsetNumber); } - if (onleft && !(record->xl_info & XLR_BKP_BLOCK(0))) { /* @@ -286,6 +252,37 @@ btree_xlog_split(bool onleft, bool isroot, datalen -= newitemsz; } + /* Extract left hikey 
and its size (still assuming 16-bit alignment) */ + if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(0))) + { + left_hikey = (Item) datapos; + left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); + datapos += left_hikeysz; + datalen -= left_hikeysz; + } + /* + * If this insertion finishes an incomplete split, get the block number + * of the child. + */ + if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(1))) + { + memcpy(&cblkno, datapos, sizeof(BlockNumber)); + datapos += sizeof(BlockNumber); + datalen -= sizeof(BlockNumber); + } + + /* + * Clear the incomplete split flag on the left sibling of the child page + * this is a downlink for. + */ + if (!isleaf) + { + if (record->xl_info & XLR_BKP_BLOCK(2)) + (void) RestoreBackupBlock(lsn, record, 2, false, false); + else + _bt_clear_incomplete_split(lsn, record, xlrec->node, cblkno); + } + /* Reconstruct right (new) sibling page from scratch */ rbuf = XLogReadBuffer(xlrec->node, xlrec->rightsib, true); Assert(BufferIsValid(rbuf)); @@ -297,7 +294,7 @@ btree_xlog_split(bool onleft, bool isroot, ropaque->btpo_prev = xlrec->leftsib; ropaque->btpo_next = xlrec->rnext; ropaque->btpo.level = xlrec->level; - ropaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0; + ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; ropaque->btpo_cycleid = 0; _bt_restore_page(rpage, datapos, datalen); @@ -306,7 +303,7 @@ btree_xlog_split(bool onleft, bool isroot, * On leaf level, the high key of the left page is equal to the first key * on the right page. 
 */ - if (xlrec->level == 0) + if (isleaf) { ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); @@ -321,10 +318,10 @@ /* Now reconstruct left (original) sibling page */ if (record->xl_info & XLR_BKP_BLOCK(0)) - (void) RestoreBackupBlock(lsn, record, 0, false, false); + lbuf = RestoreBackupBlock(lsn, record, 0, false, true); else { - Buffer lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false); + lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false); if (BufferIsValid(lbuf)) { @@ -381,19 +378,21 @@ elog(PANIC, "failed to add high key to left page after split"); /* Fix opaque fields */ - lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0; + lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT; + if (isleaf) + lopaque->btpo_flags |= BTP_LEAF; lopaque->btpo_next = xlrec->rightsib; lopaque->btpo_cycleid = 0; PageSetLSN(lpage, lsn); MarkBufferDirty(lbuf); } - - UnlockReleaseBuffer(lbuf); } } - /* We no longer need the right buffer */ + /* We no longer need the buffers */ + if (BufferIsValid(lbuf)) + UnlockReleaseBuffer(lbuf); UnlockReleaseBuffer(rbuf); /* @@ -404,32 +403,39 @@ * replay, because no other index update can be in progress, and readers * will cope properly when following an obsolete left-link. */ - if (record->xl_info & XLR_BKP_BLOCK(1)) - (void) RestoreBackupBlock(lsn, record, 1, false, false); - else if (xlrec->rnext != P_NONE) + if (xlrec->rnext != P_NONE) { - Buffer buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false); + /* + * the backup block containing right sibling is 2 or 3, depending + * on whether this was a leaf or internal page. + */ + int rnext_index = isleaf ? 
2 : 3; - if (BufferIsValid(buffer)) + if (record->xl_info & XLR_BKP_BLOCK(rnext_index)) + (void) RestoreBackupBlock(lsn, record, rnext_index, false, false); + else { - Page page = (Page) BufferGetPage(buffer); + Buffer buffer; - if (lsn > PageGetLSN(page)) + buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false); + + if (BufferIsValid(buffer)) { - BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); + Page page = (Page) BufferGetPage(buffer); - pageop->btpo_prev = xlrec->rightsib; + if (lsn > PageGetLSN(page)) + { + BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); + pageop->btpo_prev = xlrec->rightsib; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); } - UnlockReleaseBuffer(buffer); } } - - /* The job ain't done till the parent link is inserted... */ - log_incomplete_split(xlrec->node, - xlrec->leftsib, xlrec->rightsib, isroot); } static void @@ -1003,10 +1009,6 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record) Buffer buffer; Page page; BTPageOpaque pageop; - BlockNumber downlink = 0; - - /* Backup blocks are not used in newroot records */ - Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); buffer = XLogReadBuffer(xlrec->node, xlrec->rootblk, true); Assert(BufferIsValid(buffer)); @@ -1025,14 +1027,21 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record) if (record->xl_len > SizeOfBtreeNewroot) { IndexTuple itup; + BlockNumber cblkno; _bt_restore_page(page, (char *) xlrec + SizeOfBtreeNewroot, record->xl_len - SizeOfBtreeNewroot); - /* extract downlink to the right-hand split page */ - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY)); - downlink = ItemPointerGetBlockNumber(&(itup->t_tid)); + /* extract block number of the left-hand split page */ + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_HIKEY)); + cblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); 
Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY); + + /* Clear the incomplete-split flag in left child */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + (void) RestoreBackupBlock(lsn, record, 0, false, false); + else + _bt_clear_incomplete_split(lsn, record, xlrec->node, cblkno); } PageSetLSN(page, lsn); @@ -1042,10 +1051,6 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record) _bt_restore_meta(xlrec->node, lsn, xlrec->rootblk, xlrec->level, xlrec->rootblk, xlrec->level); - - /* Check to see if this satisfies any incomplete insertions */ - if (record->xl_len > SizeOfBtreeNewroot) - forget_matching_split(xlrec->node, downlink, true); } static void @@ -1125,59 +1130,3 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record) elog(PANIC, "btree_redo: unknown op code %u", info); } } - -void -btree_xlog_startup(void) -{ - incomplete_splits = NIL; -} - -void -btree_xlog_cleanup(void) -{ - ListCell *l; - - foreach(l, incomplete_splits) - { - /* finish an incomplete split */ - bt_incomplete_split *action = (bt_incomplete_split *) lfirst(l); - Buffer lbuf, - rbuf; - Page lpage, - rpage; - BTPageOpaque lpageop, - rpageop; - bool is_only; - Relation reln; - - lbuf = XLogReadBuffer(action->node, action->leftblk, false); - /* failure is impossible because we wrote this page earlier */ - if (!BufferIsValid(lbuf)) - elog(PANIC, "btree_xlog_cleanup: left block unfound"); - lpage = (Page) BufferGetPage(lbuf); - lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage); - rbuf = XLogReadBuffer(action->node, action->rightblk, false); - /* failure is impossible because we wrote this page earlier */ - if (!BufferIsValid(rbuf)) - elog(PANIC, "btree_xlog_cleanup: right block unfound"); - rpage = (Page) BufferGetPage(rbuf); - rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); - - /* if the pages are all of their level, it's a only-page split */ - is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop); - - reln = CreateFakeRelcacheEntry(action->node); - _bt_insert_parent(reln, lbuf, 
rbuf, NULL, - action->is_root, is_only); - FreeFakeRelcacheEntry(reln); - } - incomplete_splits = NIL; -} - -bool -btree_safe_restartpoint(void) -{ - if (incomplete_splits) - return false; - return true; -} diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 779a42229b..64c6982f50 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -73,6 +73,7 @@ typedef BTPageOpaqueData *BTPageOpaque; #define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */ #define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ #define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples */ +#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ /* * The max allowed value of a cycle ID is a bit less than 64K. This is @@ -178,6 +179,7 @@ typedef struct BTMetaPageData #define P_ISHALFDEAD(opaque) ((opaque)->btpo_flags & BTP_HALF_DEAD) #define P_IGNORE(opaque) ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) #define P_HAS_GARBAGE(opaque) ((opaque)->btpo_flags & BTP_HAS_GARBAGE) +#define P_INCOMPLETE_SPLIT(opaque) ((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) /* * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost @@ -253,7 +255,7 @@ typedef struct xl_btree_metadata typedef struct xl_btree_insert { xl_btreetid target; /* inserted tuple id */ - /* BlockNumber downlink field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */ + /* BlockNumber finishes_split field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */ /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */ /* INDEX TUPLE FOLLOWS AT END OF STRUCT */ } xl_btree_insert; @@ -286,19 +288,18 @@ typedef struct xl_btree_split OffsetNumber firstright; /* first item moved to right page */ /* - * If level > 0, BlockIdData downlink follows. (We use BlockIdData rather - * than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit - * aligned.) + * In the _L variants, next are OffsetNumber newitemoff and the new item. 
+ * (In the _R variants, the new item is one of the right page's tuples.) + * The new item, but not newitemoff, is suppressed if XLogInsert chooses + * to store the left page's whole page image. * * If level > 0, an IndexTuple representing the HIKEY of the left page * follows. We don't need this on leaf pages, because it's the same as * the leftmost key in the new right page. Also, it's suppressed if * XLogInsert chooses to store the left page's whole page image. * - * In the _L variants, next are OffsetNumber newitemoff and the new item. - * (In the _R variants, the new item is one of the right page's tuples.) - * The new item, but not newitemoff, is suppressed if XLogInsert chooses - * to store the left page's whole page image. + * If level > 0, BlockNumber of the page whose incomplete-split flag + * this insertion clears. (not aligned) * * Last are the right page's tuples in the form used by _bt_restore_page. */ @@ -642,8 +643,7 @@ extern Datum btoptions(PG_FUNCTION_ARGS); extern bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel); extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access); -extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, - BTStack stack, bool is_root, bool is_only); +extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack); /* * prototypes for functions in nbtpage.c @@ -673,7 +673,8 @@ extern BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access); extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, - ScanKey scankey, bool nextkey, int access); + ScanKey scankey, bool nextkey, bool forupdate, BTStack stack, + int access); extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey); extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, @@ -722,8 +723,5 @@ extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2); */ extern void 
btree_redo(XLogRecPtr lsn, XLogRecord *record); extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec); -extern void btree_xlog_startup(void); -extern void btree_xlog_cleanup(void); -extern bool btree_safe_restartpoint(void); #endif /* NBTREE_H */ diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index c968101e8a..d9ee57d6da 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, NULL, NULL, NULL) PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, NULL, NULL, NULL) PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL) PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, NULL, NULL, NULL) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, NULL, NULL, NULL) PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, NULL, NULL, NULL) PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, NULL) PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, NULL) diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 297e4505db..ec021fe29a 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -55,7 +55,7 @@ typedef struct BkpBlock /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD07C /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD07D /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData {