diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 2d0f8f4b79..216d419841 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -223,9 +223,10 @@ right to make this happen --- a scan moving in the opposite direction might miss the items if so.) Also, we *never* delete the rightmost page on a tree level (this restriction simplifies the traversal algorithms, as explained below). Page deletion always begins from an empty leaf page. An -internal page can only be deleted as part of a branch leading to a leaf -page, where each internal page has only one child and that child is also to -be deleted. +internal page can only be deleted as part of deleting an entire subtree. +This is always a "skinny" subtree consisting of a "chain" of internal pages +plus a single leaf page. There is one page on each level of the subtree, +and each level/page covers the same key space. Deleting a leaf page is a two-stage process. In the first stage, the page is unlinked from its parent, and marked as half-dead. The parent page must @@ -243,7 +244,12 @@ it, but it's still linked to its siblings. (Note: Lanin and Shasha prefer to make the key space move left, but their argument for doing so hinges on not having left-links, which we have -anyway. So we simplify the algorithm by moving the key space right.) +anyway. So we simplify the algorithm by moving the key space right. This +is only possible because we don't match on a separator key when ascending +the tree during a page split, unlike Lehman and Yao/Lanin and Shasha -- it +doesn't matter if the downlink is re-found in a pivot tuple whose separator +key does not match the one encountered when inserter initially descended +the tree.) To preserve consistency on the parent level, we cannot merge the key space of a page into its right sibling unless the right sibling is a child of diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 1c1d4a4bc9..a36b35010e 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -2278,7 +2278,8 @@ _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack) * stack. For example, the checkingunique _bt_doinsert() case may * have to step right when there are many physical duplicates, and its * scantid forces an insertion to the right of the "first page the - * value could be on". + * value could be on". (This is also relied on by all of our callers + * when dealing with !heapkeyspace indexes.) * * Returns write-locked parent page buffer, or InvalidBuffer if pivot * tuple not found (should not happen). Adjusts bts_blkno & diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index e146cc326c..10abd61983 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -39,9 +39,6 @@ static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid); static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page, OffsetNumber *deletable, int ndeletable); -static bool _bt_lock_branch_parent(Relation rel, BlockNumber child, - BTStack stack, Buffer *topparent, OffsetNumber *topoff, - BlockNumber *target, BlockNumber *rightsib); static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack); static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, @@ -49,6 +46,12 @@ static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty, TransactionId *oldestBtpoXact, uint32 *ndeleted); +static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, + BTStack stack, + Buffer *subtreeparent, + OffsetNumber *poffset, + BlockNumber *topparent, + BlockNumber *topparentrightsib); /* * _bt_initmetapage() -- Fill a page buffer with a correct metapage image @@ -1316,13 +1319,16 @@ _bt_xid_horizon(Relation rel, Relation heapRel, Page page, /* * Check that leftsib page (the btpo_prev of target page) is not marked with - * INCOMPLETE_SPLIT flag. + * INCOMPLETE_SPLIT flag. Used during page deletion. * * Returning true indicates that page flag is set in leftsib (which is * definitely still the left sibling of target). When that happens, the * target doesn't have a downlink in parent, and the page deletion algorithm * isn't prepared to handle that. Deletion of the target page (or the whole * subtree that contains the target page) cannot take place. + * + * Caller should not have a lock on the target page itself, since pages on the + * same level must always be locked left to right to avoid deadlocks. */ static bool _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target) @@ -1356,7 +1362,7 @@ _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target) /* * Check that leafrightsib page (the btpo_next of target leaf page) is not - * marked with ISHALFDEAD flag. + * marked with ISHALFDEAD flag. Used during page deletion. * * Returning true indicates that page flag is set in leafrightsib, so page * deletion cannot go ahead. Our caller is not prepared to deal with the case @@ -1402,121 +1408,6 @@ _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib) return result; } -/* - * Subroutine to find the parent of the branch we're deleting. This climbs - * up the tree until it finds a page with more than one child, i.e. a page - * that will not be totally emptied by the deletion. The chain of pages below - * it, with one downlink each, will form the branch that we need to delete. - * - * If we cannot remove the downlink from the parent, because it's the - * rightmost entry, returns false. On success, *topparent and *topoff are set - * to the buffer holding the parent, and the offset of the downlink in it. - * *topparent is write-locked, the caller is responsible for releasing it when - * done. *target is set to the topmost page in the branch to-be-deleted, i.e. - * the page whose downlink *topparent / *topoff point to, and *rightsib to its - * right sibling. - * - * "child" is the leaf page we wish to delete, and "stack" is a search stack - * leading to it (it actually leads to the leftmost leaf page with a high key - * matching that of the page to be deleted in !heapkeyspace indexes). Note - * that we will update the stack entry(s) to reflect current downlink - * positions --- this is essentially the same as the corresponding step of - * splitting, and is not expected to affect caller. The caller should - * initialize *target and *rightsib to the leaf page and its right sibling. - * - * Note: it's OK to release page locks on any internal pages between the leaf - * and *topparent, because a safe deletion can't become unsafe due to - * concurrent activity. An internal page can only acquire an entry if the - * child is split, but that cannot happen as long as we hold a lock on the - * leaf. - */ -static bool -_bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack, - Buffer *topparent, OffsetNumber *topoff, - BlockNumber *target, BlockNumber *rightsib) -{ - BlockNumber parent; - OffsetNumber poffset, - maxoff; - Buffer pbuf; - Page page; - BTPageOpaque opaque; - - /* - * Locate the downlink of "child" in the parent, updating the stack entry - * if needed. This is how !heapkeyspace indexes deal with having - * non-unique high keys in leaf level pages. Even heapkeyspace indexes - * can have a stale stack due to insertions into the parent. - */ - pbuf = _bt_getstackbuf(rel, stack, child); - if (pbuf == InvalidBuffer) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u", - RelationGetRelationName(rel), child))); - parent = stack->bts_blkno; - poffset = stack->bts_offset; - - page = BufferGetPage(pbuf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - - /* - * If the target is the rightmost child of its parent, then we can't - * delete, unless it's also the only child. - */ - Assert(poffset <= maxoff); - if (poffset >= maxoff) - { - /* It's rightmost child... */ - if (poffset == P_FIRSTDATAKEY(opaque)) - { - BlockNumber leftsibparent; - - /* - * It's only child, so safe if parent would itself be removable. - * We have to check the parent itself, and then recurse to test - * the conditions at the parent's parent. - */ - if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || - P_INCOMPLETE_SPLIT(opaque)) - { - _bt_relbuf(rel, pbuf); - return false; - } - - *target = parent; - *rightsib = opaque->btpo_next; - leftsibparent = opaque->btpo_prev; - - _bt_relbuf(rel, pbuf); - - /* - * Check that the left sibling of parent (if any) is not marked - * with INCOMPLETE_SPLIT flag before proceeding - */ - if (_bt_leftsib_splitflag(rel, leftsibparent, parent)) - return false; - - return _bt_lock_branch_parent(rel, parent, stack->bts_parent, - topparent, topoff, target, rightsib); - } - else - { - /* Unsafe to delete */ - _bt_relbuf(rel, pbuf); - return false; - } - } - else - { - /* Not rightmost child, so safe to delete */ - *topparent = pbuf; - *topoff = poffset; - return true; - } -} - /* * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so. * @@ -1582,7 +1473,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) /* * Internal pages are never deleted directly, only as part of deleting - * the whole branch all the way down to leaf level. + * the whole subtree all the way down to leaf level. * * Also check for deleted pages here. Caller never passes us a fully * deleted page. Only VACUUM can delete pages, so there can't have @@ -1655,8 +1546,8 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) /* * First, remove downlink pointing to the page (or a parent of the - * page, if we are going to delete a taller branch), and mark the page - * as half-dead. + * page, if we are going to delete a taller subtree), and mark the + * leafbuf page half-dead */ if (!P_ISHALFDEAD(opaque)) { @@ -1675,14 +1566,14 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) BTScanInsert itup_key; ItemId itemid; IndexTuple targetkey; - BlockNumber leftsib, target; - Buffer lbuf; + BlockNumber leftsib, leafblkno; + Buffer sleafbuf; itemid = PageGetItemId(page, P_HIKEY); targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid)); leftsib = opaque->btpo_prev; - target = BufferGetBlockNumber(leafbuf); + leafblkno = BufferGetBlockNumber(leafbuf); /* * To avoid deadlocks, we'd better drop the leaf page lock @@ -1694,8 +1585,8 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) * Check that the left sibling of leafbuf (if any) is not * marked with INCOMPLETE_SPLIT flag before proceeding */ - Assert(target == scanblkno); - if (_bt_leftsib_splitflag(rel, leftsib, target)) + Assert(leafblkno == scanblkno); + if (_bt_leftsib_splitflag(rel, leftsib, leafblkno)) { ReleaseBuffer(leafbuf); return ndeleted; @@ -1705,9 +1596,11 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) itup_key = _bt_mkscankey(rel, targetkey); /* find the leftmost leaf page with matching pivot/high key */ itup_key->pivotsearch = true; - stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL); + stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL); /* won't need a second lock or pin on leafbuf */ - _bt_relbuf(rel, lbuf); + Assert(leafblkno == BufferGetBlockNumber(sleafbuf) || + !itup_key->heapkeyspace); + _bt_relbuf(rel, sleafbuf); /* * Re-lock the leaf page, and start over to use our stack @@ -1736,7 +1629,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) /* * Then unlink it from its siblings. Each call to - * _bt_unlink_halfdead_page unlinks the topmost page from the branch, + * _bt_unlink_halfdead_page unlinks the topmost page from the subtree, * making it shallower. Iterate until the leafbuf page is deleted. * * _bt_unlink_halfdead_page should never fail, since we established @@ -1795,21 +1688,31 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) } /* - * First stage of page deletion. Remove the downlink to the top of the - * branch being deleted, and mark the leaf page as half-dead. + * First stage of page deletion. + * + * Establish the height of the to-be-deleted subtree with leafbuf at its + * lowest level, remove the downlink to the subtree, and mark leafbuf + * half-dead. The final to-be-deleted subtree is usually just leafbuf itself, + * but may include additional internal pages (at most one per level of the + * tree below the root). + * + * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is + * the rightmost child of its parent (and parent has more than one downlink). + * Returns 'true' when the first stage of page deletion completed + * successfully. */ static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) { BlockNumber leafblkno; BlockNumber leafrightsib; - BlockNumber target; - BlockNumber rightsib; + BlockNumber topparent; + BlockNumber topparentrightsib; ItemId itemid; Page page; BTPageOpaque opaque; - Buffer topparent; - OffsetNumber topoff; + Buffer subtreeparent; + OffsetNumber poffset; OffsetNumber nextoffset; IndexTuple itup; IndexTupleData trunctuple; @@ -1817,8 +1720,8 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) page = BufferGetPage(leafbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && !P_ISDELETED(opaque) && - !P_ISHALFDEAD(opaque) && P_ISLEAF(opaque) && + Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && + P_ISLEAF(opaque) && !P_IGNORE(opaque) && P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); /* @@ -1847,47 +1750,57 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) * parent, unless it is the only child --- in which case the parent has to * be deleted too, and the same condition applies recursively to it. We * have to check this condition all the way up before trying to delete, - * and lock the final parent of the to-be-deleted subtree. + * and lock the parent of the root of the to-be-deleted subtree (the + * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent + * for us. We remove the downlink to the "top parent" page (subtree root + * page) from the subtree parent page below. + * + * Initialize topparent to be leafbuf page now. The final to-be-deleted + * subtree is often a degenerate one page subtree consisting only of the + * leafbuf page. When that happens, the leafbuf page is the final subtree + * root page/top parent page. */ - rightsib = leafrightsib; - target = leafblkno; - if (!_bt_lock_branch_parent(rel, leafblkno, stack, - &topparent, &topoff, &target, &rightsib)) + topparent = leafblkno; + topparentrightsib = leafrightsib; + if (!_bt_lock_subtree_parent(rel, leafblkno, stack, + &subtreeparent, &poffset, + &topparent, &topparentrightsib)) return false; /* * Check that the parent-page index items we're about to delete/overwrite - * contain what we expect. This can fail if the index has become corrupt - * for some reason. We want to throw any error before entering the - * critical section --- otherwise it'd be a PANIC. - * - * The test on the target item is just an Assert because - * _bt_lock_branch_parent should have guaranteed it has the expected - * contents. The test on the next-child downlink is known to sometimes - * fail in the field, though. + * in subtree parent page contain what we expect. This can fail if the + * index has become corrupt for some reason. We want to throw any error + * before entering the critical section --- otherwise it'd be a PANIC. */ - page = BufferGetPage(topparent); + page = BufferGetPage(subtreeparent); opaque = (BTPageOpaque) PageGetSpecialPointer(page); #ifdef USE_ASSERT_CHECKING - itemid = PageGetItemId(page, topoff); + /* + * This is just an assertion because _bt_lock_subtree_parent should have + * guaranteed tuple has the expected contents + */ + itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); - Assert(BTreeTupleGetDownLink(itup) == target); + Assert(BTreeTupleGetDownLink(itup) == topparent); #endif - nextoffset = OffsetNumberNext(topoff); + nextoffset = OffsetNumberNext(poffset); itemid = PageGetItemId(page, nextoffset); itup = (IndexTuple) PageGetItem(page, itemid); - if (BTreeTupleGetDownLink(itup) != rightsib) + if (BTreeTupleGetDownLink(itup) != topparentrightsib) ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"", - rightsib, target, BTreeTupleGetDownLink(itup), - BufferGetBlockNumber(topparent), RelationGetRelationName(rel)))); + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"", + topparentrightsib, topparent, + BTreeTupleGetDownLink(itup), + BufferGetBlockNumber(subtreeparent), + RelationGetRelationName(rel)))); /* * Any insert which would have gone on the leaf block will now go to its - * right sibling. + * right sibling. In other words, the key space moves right. */ PredicateLockPageCombine(rel, leafblkno, leafrightsib); @@ -1895,25 +1808,31 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) START_CRIT_SECTION(); /* - * Update parent. The normal case is a tad tricky because we want to - * delete the target's downlink and the *following* key. Easiest way is - * to copy the right sibling's downlink over the target downlink, and then - * delete the following item. + * Update parent of subtree. We want to delete the downlink to the top + * parent page/root of the subtree, and the *following* key. Easiest way + * is to copy the right sibling's downlink over the downlink that points + * to top parent page, and then delete the right sibling's original pivot + * tuple. + * + * Lanin and Shasha make the key space move left when deleting a page, + * whereas the key space moves right here. That's why we cannot simply + * delete the pivot tuple with the downlink to the top parent page. See + * nbtree/README. */ - page = BufferGetPage(topparent); + page = BufferGetPage(subtreeparent); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - itemid = PageGetItemId(page, topoff); + itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); - BTreeTupleSetDownLink(itup, rightsib); + BTreeTupleSetDownLink(itup, topparentrightsib); - nextoffset = OffsetNumberNext(topoff); + nextoffset = OffsetNumberNext(poffset); PageIndexTupleDelete(page, nextoffset); /* - * Mark the leaf page as half-dead, and stamp it with a pointer to the - * highest internal page in the branch we're deleting. We use the tid of - * the high key to store it. + * Mark the leaf page as half-dead, and stamp it with a link to the top + * parent page. When the leaf page is also the top parent page, the link + * is set to InvalidBlockNumber. */ page = BufferGetPage(leafbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1922,8 +1841,8 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) Assert(PageGetMaxOffsetNumber(page) == P_HIKEY); MemSet(&trunctuple, 0, sizeof(IndexTupleData)); trunctuple.t_info = sizeof(IndexTupleData); - if (target != leafblkno) - BTreeTupleSetTopParent(&trunctuple, target); + if (topparent != leafblkno) + BTreeTupleSetTopParent(&trunctuple, topparent); else BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber); @@ -1932,7 +1851,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) elog(ERROR, "could not overwrite high key in half-dead page"); /* Must mark buffers dirty before XLogInsert */ - MarkBufferDirty(topparent); + MarkBufferDirty(subtreeparent); MarkBufferDirty(leafbuf); /* XLOG stuff */ @@ -1941,16 +1860,16 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) xl_btree_mark_page_halfdead xlrec; XLogRecPtr recptr; - xlrec.poffset = topoff; + xlrec.poffset = poffset; xlrec.leafblk = leafblkno; - if (target != leafblkno) - xlrec.topparent = target; + if (topparent != leafblkno) + xlrec.topparent = topparent; else xlrec.topparent = InvalidBlockNumber; XLogBeginInsert(); XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT); - XLogRegisterBuffer(1, topparent, REGBUF_STANDARD); + XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD); page = BufferGetPage(leafbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1961,7 +1880,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD); - page = BufferGetPage(topparent); + page = BufferGetPage(subtreeparent); PageSetLSN(page, recptr); page = BufferGetPage(leafbuf); PageSetLSN(page, recptr); @@ -1969,17 +1888,21 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) END_CRIT_SECTION(); - _bt_relbuf(rel, topparent); + _bt_relbuf(rel, subtreeparent); return true; } /* - * Unlink a page in a branch of half-dead pages from its siblings. + * Second stage of page deletion. * - * If the leaf page still has a downlink pointing to it, unlinks the highest - * parent in the to-be-deleted branch instead of the leaf page. To get rid - * of the whole branch, including the leaf page itself, iterate until the - * leaf page is deleted. + * Unlinks a single page (in the subtree undergoing deletion) from its + * siblings. Also marks the page deleted. + * + * To get rid of the whole subtree, including the leaf page itself, call here + * until the leaf page is deleted. The original "top parent" established in + * the first stage of deletion is deleted in the first call here, while the + * leaf page is deleted in the last call here. Note that the leaf page itself + * is often the initial top parent page. * * Returns 'false' if the page could not be unlinked (shouldn't happen). If * the right sibling of the current target page is empty, *rightsib_empty is @@ -2028,7 +1951,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, page = BufferGetPage(leafbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)); + Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque)); /* * Remember some information about the leaf page. @@ -2047,14 +1970,10 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, */ CHECK_FOR_INTERRUPTS(); - /* - * If the leaf page still has a parent pointing to it (or a chain of - * parents), we don't unlink the leaf page yet, but the topmost remaining - * parent in the branch (i.e. the "top parent") - */ + /* Unlink the current top parent of the subtree */ if (!BlockNumberIsValid(target)) { - /* No top parent, so target is leaf page */ + /* Target is leaf page (or leaf page is top parent, if you prefer) */ target = leafblkno; buf = leafbuf; @@ -2063,7 +1982,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, } else { - /* Target is the internal page taken from leaf's top parent */ + /* Target is the internal page taken from leaf's top parent link */ Assert(target != leafblkno); /* Fetch the block number of the target's left sibling */ @@ -2155,10 +2074,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, * only one vacuum process running at a time. */ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque)) - { elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); - } + if (opaque->btpo_prev != leftsib) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), @@ -2180,7 +2098,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); - /* remember the next non-leaf child down in the branch. */ + /* Remember the next non-leaf child down in the subtree */ itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque)); nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid)); if (nextchild == leafblkno) @@ -2265,7 +2183,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, /* * If we deleted a parent of the targeted leaf page, instead of the leaf * itself, update the leaf to point to the next remaining child in the - * branch. + * subtree. * * Note: We rely on the fact that a buffer pin on the leaf page has been * held since leafhikey was initialized. This is safe, though only @@ -2406,12 +2324,150 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, if (target <= scanblkno) (*ndeleted)++; - /* - * Release the target, if it was not the leaf block. The leaf is always - * kept locked. - */ + /* If the target is not leafbuf, we're done with it now -- release it */ if (target != leafblkno) _bt_relbuf(rel, buf); return true; } + +/* + * Establish how tall the to-be-deleted subtree will be during the first stage + * of page deletion. + * + * Caller's child argument is the block number of the page caller wants to + * delete (this is leafbuf's block number, except when we're called + * recursively). stack is a search stack leading to it. Note that we will + * update the stack entry(s) to reflect current downlink positions --- this is + * similar to the corresponding point in page split handling. + * + * If "first stage" caller cannot go ahead with deleting _any_ pages, returns + * false. Returns true on success, in which case caller can use certain + * details established here to perform the first stage of deletion. This + * function is the last point at which page deletion may be deemed unsafe + * (barring index corruption, or unexpected concurrent page deletions). + * + * We write lock the parent of the root of the to-be-deleted subtree for + * caller on success (i.e. we leave our lock on the *subtreeparent buffer for + * caller). Caller will have to remove a downlink from *subtreeparent. We + * also set a *subtreeparent offset number in *poffset, to indicate the + * location of the pivot tuple that contains the relevant downlink. + * + * The root of the to-be-deleted subtree is called the "top parent". Note + * that the leafbuf page is often the final "top parent" page (you can think + * of the leafbuf page as a degenerate single page subtree when that happens). + * Caller should initialize *topparent to the target leafbuf page block number + * (while *topparentrightsib should be set to leafbuf's right sibling block + * number). We will update *topparent (and *topparentrightsib) for caller + * here, though only when it turns out that caller will delete at least one + * internal page (i.e. only when caller needs to store a valid link to the top + * parent block in the leafbuf page using BTreeTupleSetTopParent()). + */ +static bool +_bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack, + Buffer *subtreeparent, OffsetNumber *poffset, + BlockNumber *topparent, BlockNumber *topparentrightsib) +{ + BlockNumber parent, leftsibparent; + OffsetNumber parentoffset, + maxoff; + Buffer pbuf; + Page page; + BTPageOpaque opaque; + + /* + * Locate the pivot tuple whose downlink points to "child". Write lock + * the parent page itself. + */ + pbuf = _bt_getstackbuf(rel, stack, child); + if (pbuf == InvalidBuffer) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u", + RelationGetRelationName(rel), child))); + parent = stack->bts_blkno; + parentoffset = stack->bts_offset; + + page = BufferGetPage(pbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + leftsibparent = opaque->btpo_prev; + + /* + * _bt_getstackbuf() completes page splits on returned parent buffer when + * required. + * + * In general it's a bad idea for VACUUM to use up more disk space, which + * is why page deletion does not finish incomplete page splits most of the + * time. We allow this limited exception because the risk is much lower, + * and the potential downside of not proceeding is much higher: A single + * internal page with the INCOMPLETE_SPLIT flag set might otherwise + * prevent us from deleting hundreds of empty leaf pages from one level + * down. + */ + Assert(!P_INCOMPLETE_SPLIT(opaque)); + + if (parentoffset < maxoff) + { + /* + * Child is not the rightmost child in parent, so it's safe to delete + * the subtree whose root/topparent is child page + */ + *subtreeparent = pbuf; + *poffset = parentoffset; + return true; + } + + /* + * Child is the rightmost child of parent. + * + * Since it's the rightmost child of parent, deleting the child (or + * deleting the subtree whose root/topparent is the child page) is only + * safe when it's also possible to delete the parent. + */ + Assert(parentoffset == maxoff); + if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque)) + { + /* + * Child isn't parent's only child, or parent is rightmost on its + * entire level. Definitely cannot delete any pages. + */ + _bt_relbuf(rel, pbuf); + return false; + } + + /* + * Now make sure that the parent deletion is itself safe by examining the + * child's grandparent page. Recurse, passing the parent page as the + * child page (child's grandparent is the parent on the next level up). + * If parent deletion is unsafe, then child deletion must also be unsafe + * (in which case caller cannot delete any pages at all). + */ + *topparent = parent; + *topparentrightsib = opaque->btpo_next; + + /* + * Release lock on parent before recursing. + * + * It's OK to release page locks on parent before recursive call locks + * grandparent. An internal page can only acquire an entry if the child + * is split, but that cannot happen as long as we still hold a lock on the + * leafbuf page. + */ + _bt_relbuf(rel, pbuf); + + /* + * Before recursing, check that the left sibling of parent (if any) is not + * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the + * parent lock). + * + * Note: We deliberately avoid completing incomplete splits here. + */ + if (_bt_leftsib_splitflag(rel, leftsibparent, parent)) + return false; + + /* Recurse to examine child page's grandparent page */ + return _bt_lock_subtree_parent(rel, parent, stack->bts_parent, + subtreeparent, poffset, + topparent, topparentrightsib); +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 493931e1b8..87a8612c28 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -704,7 +704,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) * target page or not (since it's surely empty). */ - /* parent page */ + /* to-be-deleted subtree's parent page */ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { OffsetNumber poffset; @@ -749,8 +749,8 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) pageop->btpo_cycleid = 0; /* - * Construct a dummy hikey item that points to the next parent to be - * deleted (if any). + * Construct a dummy high key item that points to top parent page (value + * is InvalidBlockNumber when the top parent page is the leaf page itself) */ MemSet(&trunctuple, 0, sizeof(IndexTupleData)); trunctuple.t_info = sizeof(IndexTupleData); @@ -837,7 +837,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) /* * If we deleted a parent of the targeted leaf page, instead of the leaf * itself, update the leaf to point to the next remaining child in the - * branch. + * to-be-deleted subtree */ if (XLogRecHasBlockRef(record, 3)) { diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index ec7b2a6c33..5c014bdc66 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -250,7 +250,7 @@ typedef struct xl_btree_vacuum #define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16)) /* - * This is what we need to know about marking an empty branch for deletion. + * This is what we need to know about marking an empty subtree for deletion. * The target identifies the tuple removed from the parent page (note that we * remove this tuple's downlink and the *following* tuple's key). Note that * the leaf page is empty, so we don't need to store its content --- it is @@ -267,7 +267,7 @@ typedef struct xl_btree_mark_page_halfdead BlockNumber leafblk; /* leaf block ultimately being deleted */ BlockNumber leftblk; /* leaf block's left sibling, if any */ BlockNumber rightblk; /* leaf block's right sibling */ - BlockNumber topparent; /* topmost internal page in the branch */ + BlockNumber topparent; /* topmost internal page in the subtree */ } xl_btree_mark_page_halfdead; #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber)) @@ -294,7 +294,7 @@ typedef struct xl_btree_unlink_page */ BlockNumber leafleftsib; BlockNumber leafrightsib; - BlockNumber topparent; /* next child down in the branch */ + BlockNumber topparent; /* next child down in the subtree */ TransactionId btpo_xact; /* value of btpo.xact for use in recovery */ /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */