diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index f5962f6163..e8c1fccc84 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -35,9 +35,11 @@
 #include "utils/snapmgr.h"
 
 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
-static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
+static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
+								   BTStack stack);
 static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
-									 bool *rightsib_empty);
+									 bool *rightsib_empty,
+									 TransactionId *oldestBtpoXact);
 static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
									  OffsetNumber *deletable, int ndeletable);
 static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
@@ -1470,27 +1472,35 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
 }
 
 /*
- * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
+ * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
  *
- * This action unlinks the page from the b-tree structure, removing all
+ * This action unlinks the leaf page from the b-tree structure, removing all
  * pointers leading to it --- but not touching its own left and right links.
  * The page cannot be physically reclaimed right away, since other processes
  * may currently be trying to follow links leading to the page; they have to
  * be allowed to use its right-link to recover.  See nbtree/README.
  *
  * On entry, the target buffer must be pinned and locked (either read or write
- * lock is OK).  This lock and pin will be dropped before exiting.
+ * lock is OK).  The page must be an empty leaf page, which may be half-dead
+ * already (a half-dead page should only be passed to us when an earlier
+ * VACUUM operation was interrupted, though).  Note in particular that caller
+ * should never pass a buffer containing an existing deleted page here.  The
+ * lock and pin on caller's buffer will be dropped before we return.
  *
  * Returns the number of pages successfully deleted (zero if page cannot
- * be deleted now; could be more than one if parent or sibling pages were
- * deleted too).
+ * be deleted now; could be more than one if parent or right sibling pages
+ * were deleted too).
+ *
+ * Maintains *oldestBtpoXact for any pages that get deleted.  Caller is
+ * responsible for maintaining *oldestBtpoXact in the case of pages that were
+ * deleted by a previous VACUUM.
  *
  * NOTE: this leaks memory.  Rather than trying to clean up everything
  * carefully, it's better to run it in a temp context that can be reset
  * frequently.
  */
 int
-_bt_pagedel(Relation rel, Buffer buf)
+_bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 {
 	int			ndeleted = 0;
 	BlockNumber rightsib;
@@ -1511,14 +1521,21 @@ _bt_pagedel(Relation rel, Buffer buf)
 
 	for (;;)
 	{
-		page = BufferGetPage(buf);
+		page = BufferGetPage(leafbuf);
 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
 		/*
 		 * Internal pages are never deleted directly, only as part of deleting
 		 * the whole branch all the way down to leaf level.
+		 *
+		 * Also check for deleted pages here.  Caller never passes us a fully
+		 * deleted page.  Only VACUUM can delete pages, so there can't have
+		 * been a concurrent deletion.  Assume that we reached any deleted
+		 * page encountered here by following a sibling link, and that the
+		 * index is corrupt.
 		 */
-		if (!P_ISLEAF(opaque))
+		Assert(!P_ISDELETED(opaque));
+		if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
 		{
 			/*
 			 * Pre-9.4 page deletion only marked internal pages as half-dead,
@@ -1537,13 +1554,22 @@ _bt_pagedel(Relation rel, Buffer buf)
 						 errmsg("index \"%s\" contains a half-dead internal page",
								RelationGetRelationName(rel)),
 						 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
-			_bt_relbuf(rel, buf);
+
+			if (P_ISDELETED(opaque))
+				ereport(LOG,
+						(errcode(ERRCODE_INDEX_CORRUPTED),
+						 errmsg_internal("found deleted block %u while following right link in index \"%s\"",
+										 BufferGetBlockNumber(leafbuf),
+										 RelationGetRelationName(rel))));
+
+			_bt_relbuf(rel, leafbuf);
 			return ndeleted;
 		}
 
 		/*
 		 * We can never delete rightmost pages nor root pages.  While at it,
-		 * check that page is not already deleted and is empty.
+		 * check that page is empty, since it's possible that the leafbuf page
+		 * was empty a moment ago, but has since had some inserts.
 		 *
 		 * To keep the algorithm simple, we also never delete an incompletely
 		 * split page (they should be rare enough that this doesn't make any
@@ -1558,14 +1584,14 @@ _bt_pagedel(Relation rel, Buffer buf)
 		 * to.  On subsequent iterations, we know we stepped right from a page
 		 * that passed these tests, so it's OK.
 		 */
-		if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
+		if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
 			P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
 			P_INCOMPLETE_SPLIT(opaque))
 		{
 			/* Should never fail to delete a half-dead page */
 			Assert(!P_ISHALFDEAD(opaque));
 
-			_bt_relbuf(rel, buf);
+			_bt_relbuf(rel, leafbuf);
 			return ndeleted;
 		}
 
@@ -1603,7 +1629,7 @@ _bt_pagedel(Relation rel, Buffer buf)
 				 * To avoid deadlocks, we'd better drop the leaf page lock
 				 * before going further.
 				 */
-				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+				LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
 
 				/*
 				 * Fetch the left sibling, to check that it's not marked with
@@ -1627,10 +1653,10 @@ _bt_pagedel(Relation rel, Buffer buf)
 				 * incompletely-split page to be split again.  So we don't
 				 * need to walk right here.
 				 */
-				if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
+				if (lopaque->btpo_next == BufferGetBlockNumber(leafbuf) &&
 					P_INCOMPLETE_SPLIT(lopaque))
 				{
-					ReleaseBuffer(buf);
+					ReleaseBuffer(leafbuf);
 					_bt_relbuf(rel, lbuf);
 					return ndeleted;
 				}
@@ -1646,16 +1672,26 @@ _bt_pagedel(Relation rel, Buffer buf)
 				_bt_relbuf(rel, lbuf);
 
 				/*
-				 * Re-lock the leaf page, and start over, to re-check that the
-				 * page can still be deleted.
+				 * Re-lock the leaf page, and start over to use our stack
+				 * within _bt_mark_page_halfdead.  We must do it that way
+				 * because it's possible that leafbuf can no longer be
+				 * deleted.  We need to recheck.
 				 */
-				LockBuffer(buf, BT_WRITE);
+				LockBuffer(leafbuf, BT_WRITE);
 				continue;
 			}
 
-			if (!_bt_mark_page_halfdead(rel, buf, stack))
+			/*
+			 * See if it's safe to delete the leaf page, and determine how
+			 * many parent/internal pages above the leaf level will be
+			 * deleted.  If it's safe then _bt_mark_page_halfdead will also
+			 * perform the first phase of deletion, which includes marking the
+			 * leafbuf page half-dead.
+			 */
+			Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
+			if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
 			{
-				_bt_relbuf(rel, buf);
+				_bt_relbuf(rel, leafbuf);
 				return ndeleted;
 			}
 		}
 
@@ -1663,23 +1699,32 @@ _bt_pagedel(Relation rel, Buffer buf)
 
 		/*
 		 * Then unlink it from its siblings.  Each call to
 		 * _bt_unlink_halfdead_page unlinks the topmost page from the branch,
-		 * making it shallower.  Iterate until the leaf page is gone.
+		 * making it shallower.  Iterate until the leafbuf page is deleted.
+		 *
+		 * _bt_unlink_halfdead_page should never fail, since we established
+		 * that deletion is generally safe in _bt_mark_page_halfdead.
 		 */
 		rightsib_empty = false;
+		Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
 		while (P_ISHALFDEAD(opaque))
 		{
-			/* will check for interrupts, once lock is released */
-			if (!_bt_unlink_halfdead_page(rel, buf, &rightsib_empty))
+			/* Check for interrupts in _bt_unlink_halfdead_page */
+			if (!_bt_unlink_halfdead_page(rel, leafbuf, &rightsib_empty,
+										  oldestBtpoXact))
 			{
-				/* _bt_unlink_halfdead_page already released buffer */
+				/* _bt_unlink_halfdead_page failed, released buffer */
 				return ndeleted;
 			}
 
 			ndeleted++;
 		}
 
+		Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
+		Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact,
+											*oldestBtpoXact));
+
 		rightsib = opaque->btpo_next;
 
-		_bt_relbuf(rel, buf);
+		_bt_relbuf(rel, leafbuf);
 
 		/*
 		 * Check here, as calling loops will have locks held, preventing
@@ -1705,7 +1750,7 @@ _bt_pagedel(Relation rel, Buffer buf)
 		if (!rightsib_empty)
 			break;
 
-		buf = _bt_getbuf(rel, rightsib, BT_WRITE);
+		leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
 	}
 
 	return ndeleted;
 }
@@ -1909,9 +1954,19 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
  * of the whole branch, including the leaf page itself, iterate until the
  * leaf page is deleted.
  *
- * Returns 'false' if the page could not be unlinked (shouldn't happen).
- * If the (current) right sibling of the page is empty, *rightsib_empty is
- * set to true.
+ * Returns 'false' if the page could not be unlinked (shouldn't happen).  If
+ * the right sibling of the current target page is empty, *rightsib_empty is
+ * set to true, allowing caller to delete the target's right sibling page in
+ * passing.  Note that *rightsib_empty is only actually used by caller when
+ * target page is leafbuf, following last call here for leafbuf/the subtree
+ * containing leafbuf.  (We always set *rightsib_empty for caller, just to be
+ * consistent.)
+ *
+ * We maintain *oldestBtpoXact for pages that are deleted by the current
+ * VACUUM operation here.  This must be handled here because we conservatively
+ * assume that there needs to be a new call to ReadNewTransactionId() each
+ * time a page gets deleted.  See comments about the underlying assumption
+ * below.
  *
  * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
  * On success exit, we'll be holding pin and write lock.  On failure exit,
@@ -1919,7 +1974,8 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
  * to avoid having to reacquire a lock we already released).
  */
 static bool
-_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
+_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty,
+						 TransactionId *oldestBtpoXact)
 {
 	BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
 	BlockNumber leafleftsib;
@@ -2057,9 +2113,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 	lbuf = InvalidBuffer;
 
 	/*
-	 * Next write-lock the target page itself.  It should be okay to take just
-	 * a write lock not a superexclusive lock, since no scans would stop on an
-	 * empty page.
+	 * Next write-lock the target page itself.  It's okay to take a write lock
+	 * rather than a superexclusive lock, since no scan will stop on an empty
+	 * page.
 	 */
 	LockBuffer(buf, BT_WRITE);
 	page = BufferGetPage(buf);
@@ -2204,6 +2260,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 	 */
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
 	opaque->btpo_flags &= ~BTP_HALF_DEAD;
 	opaque->btpo_flags |= BTP_DELETED;
 	opaque->btpo.xact = ReadNewTransactionId();
@@ -2309,6 +2366,10 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
 		_bt_relbuf(rel, lbuf);
 	_bt_relbuf(rel, rbuf);
 
+	if (!TransactionIdIsValid(*oldestBtpoXact) ||
+		TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
+		*oldestBtpoXact = opaque->btpo.xact;
+
 	/*
 	 * Release the target, if it was not the leaf block.  The leaf is always
 	 * kept locked.
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 36294789f3..f68601ff8e 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -92,7 +92,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
 
 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
						  IndexBulkDeleteCallback callback, void *callback_state,
-						 BTCycleId cycleid, TransactionId *oldestBtpoXact);
+						 BTCycleId cycleid);
 static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
						  BlockNumber orig_blkno);
 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
@@ -787,8 +787,14 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan)
 }
 
 /*
- * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
- *			btbulkdelete() wasn't called.
+ * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
+ *
+ * Called by btvacuumcleanup when btbulkdelete was never called because no
+ * tuples need to be deleted.
+ *
+ * When we return false, VACUUM can even skip the cleanup-only call to
+ * btvacuumscan (i.e. there will be no btvacuumscan call for this index at
+ * all).  Otherwise, a cleanup-only btvacuumscan call is required.
  */
 static bool
 _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
@@ -815,8 +821,15 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
							  RecentGlobalXmin))
 	{
 		/*
-		 * If oldest btpo.xact in the deleted pages is older than
-		 * RecentGlobalXmin, then at least one deleted page can be recycled.
+		 * If the oldest btpo.xact of a previously deleted page in the index
+		 * is older than RecentGlobalXmin, then at least one deleted page can
+		 * be recycled -- don't skip cleanup.
+		 *
+		 * Note that btvacuumpage currently doesn't make any effort to
+		 * recognize when a recycled page is already in the FSM (i.e. put
+		 * there by a previous VACUUM operation).  We have to be conservative
+		 * because the FSM isn't crash safe.  Hopefully recycled pages get
+		 * reused before too long.
 		 */
 		result = true;
 	}
@@ -873,20 +886,9 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	/* The ENSURE stuff ensures we clean up shared memory on failure */
 	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 	{
-		TransactionId oldestBtpoXact;
-
 		cycleid = _bt_start_vacuum(rel);
 
-		btvacuumscan(info, stats, callback, callback_state, cycleid,
-					 &oldestBtpoXact);
-
-		/*
-		 * Update cleanup-related information in metapage.  This information is
-		 * used only for cleanup but keeping them up to date can avoid
-		 * unnecessary cleanup even after bulkdelete.
-		 */
-		_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
-									 info->num_heap_tuples);
+		btvacuumscan(info, stats, callback, callback_state, cycleid);
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 	_bt_end_vacuum(rel);
@@ -918,18 +920,12 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	 */
 	if (stats == NULL)
 	{
-		TransactionId oldestBtpoXact;
-
 		/* Check if we need a cleanup */
 		if (!_bt_vacuum_needs_cleanup(info))
 			return NULL;
 
 		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-		btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
-
-		/* Update cleanup-related information in the metapage */
-		_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
-									 info->num_heap_tuples);
+		btvacuumscan(info, stats, NULL, NULL, 0);
 	}
 
 	/*
@@ -954,7 +950,9 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
  * according to the vacuum callback, looking for empty pages that can be
  * deleted, and looking for old deleted pages that can be recycled.  Both
  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
- * btbulkdelete call occurred).
+ * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
+ * Note that this is also where the metadata used by _bt_vacuum_needs_cleanup
+ * is maintained.
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
@@ -962,7 +960,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 static void
 btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			  IndexBulkDeleteCallback callback, void *callback_state,
-			 BTCycleId cycleid, TransactionId *oldestBtpoXact)
+			 BTCycleId cycleid)
 {
 	Relation	rel = info->index;
 	BTVacState	vstate;
@@ -1046,6 +1044,15 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 	MemoryContextDelete(vstate.pagedelcontext);
 
+	/*
+	 * Maintain the oldest btpo.xact and a count of the current number of heap
+	 * tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup).
+	 * The oldest page is typically a page deleted by a previous VACUUM
+	 * operation.
+	 */
+	_bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
+								 info->num_heap_tuples);
+
 	/*
 	 * If we found any recyclable pages (and recorded them in the FSM), then
 	 * forcibly update the upper-level FSM pages to ensure that searchers can
@@ -1064,9 +1071,6 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	/* update statistics */
 	stats->num_pages = num_pages;
 	stats->pages_free = vstate.totFreePages;
-
-	if (oldestBtpoXact)
-		*oldestBtpoXact = vstate.oldestBtpoXact;
 }
 
 /*
@@ -1137,24 +1141,30 @@ restart:
 	/* Page is valid, see what to do with it */
 	if (_bt_page_recyclable(page))
 	{
-		/* Okay to recycle this page */
+		/* Okay to recycle this page (which could be leaf or internal) */
 		RecordFreeIndexPage(rel, blkno);
 		vstate->totFreePages++;
 		stats->pages_deleted++;
 	}
 	else if (P_ISDELETED(opaque))
 	{
-		/* Already deleted, but can't recycle yet */
+		/*
+		 * Already deleted page (which could be leaf or internal).  Can't
+		 * recycle yet.
+		 */
 		stats->pages_deleted++;
 
-		/* Update the oldest btpo.xact */
+		/* Maintain the oldest btpo.xact */
 		if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
 			TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
 			vstate->oldestBtpoXact = opaque->btpo.xact;
 	}
 	else if (P_ISHALFDEAD(opaque))
 	{
-		/* Half-dead, try to delete */
+		/*
+		 * Half-dead leaf page.  Try to delete now.  Might update
Might update + * oldestBtpoXact and pages_deleted below. + */ delete_now = true; } else if (P_ISLEAF(opaque)) @@ -1316,10 +1326,11 @@ restart: else { /* - * If the page has been split during this vacuum cycle, it seems - * worth expending a write to clear btpo_cycleid even if we don't - * have any deletions to do. (If we do, _bt_delitems_vacuum takes - * care of this.) This ensures we won't process the page again. + * If the leaf page has been split during this vacuum cycle, it + * seems worth expending a write to clear btpo_cycleid even if we + * don't have any deletions to do. (If we do, _bt_delitems_vacuum + * takes care of this.) This ensures we won't process the page + * again. * * We treat this like a hint-bit update because there's no need to * WAL-log it. @@ -1334,11 +1345,11 @@ restart: } /* - * If it's now empty, try to delete; else count the live tuples (live - * table TIDs in posting lists are counted as separate live tuples). - * We don't delete when recursing, though, to avoid putting entries - * into freePages out-of-order (doesn't seem worth any extra code to - * handle the case). + * If the leaf page is now empty, try to delete it; else count the + * live tuples (live table TIDs in posting lists are counted as + * separate live tuples). We don't delete when recursing, though, to + * avoid putting entries into freePages out-of-order (doesn't seem + * worth any extra code to handle the case). */ if (minoff > maxoff) delete_now = (blkno == orig_blkno); @@ -1357,16 +1368,11 @@ restart: MemoryContextReset(vstate->pagedelcontext); oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); - ndel = _bt_pagedel(rel, buf); + ndel = _bt_pagedel(rel, buf, &vstate->oldestBtpoXact); /* count only this page, else may double-count parent */ if (ndel) - { stats->pages_deleted++; - if (!TransactionIdIsValid(vstate->oldestBtpoXact) || - TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) - vstate->oldestBtpoXact = opaque->btpo.xact; - } MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9a3acd26b7..7bccd7ecc3 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1080,7 +1080,8 @@ extern void _bt_delitems_vacuum(Relation rel, Buffer buf, extern void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, Relation heapRel); -extern int _bt_pagedel(Relation rel, Buffer buf); +extern int _bt_pagedel(Relation rel, Buffer leafbuf, + TransactionId *oldestBtpoXact); /* * prototypes for functions in nbtsearch.c