diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 5b75c891ef..ef2972d523 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -127,6 +127,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel, bool readonly, bool heapallindexed); static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); +static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state, + BlockNumber start, + BTPageOpaque start_opaque); static void bt_target_page_check(BtreeCheckState *state); static ScanKey bt_right_page_check_scankey(BtreeCheckState *state); static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock, @@ -716,7 +719,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) */ if (state->readonly) { - if (!P_LEFTMOST(opaque)) + if (!bt_leftmost_ignoring_half_dead(state, current, opaque)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block %u is not leftmost in index \"%s\"", @@ -769,10 +772,14 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) } /* - * readonly mode can only ever land on live pages and half-dead pages, - * so sibling pointers should always be in mutual agreement + * Sibling links should be in mutual agreement. There arises + * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling + * of the parent's low-key downlink is half-dead. (A half-dead page + * has no downlink from its parent.) Under heavyweight locking, the + * last bt_leftmost_ignoring_half_dead() validated this btpo_prev. */ - if (state->readonly && opaque->btpo_prev != leftcurrent) + if (state->readonly && + opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("left link/right link pair in index \"%s\" not in agreement", @@ -822,6 +829,67 @@ nextpage: return nextleveldown; } +/* + * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of + * half-dead, sibling-linked pages to the left. If a half-dead page appears + * under state->readonly, the database exited recovery between the first-stage + * and second-stage WAL records of a deletion. + */ +static bool +bt_leftmost_ignoring_half_dead(BtreeCheckState *state, + BlockNumber start, + BTPageOpaque start_opaque) +{ + BlockNumber reached = start_opaque->btpo_prev, + reached_from = start; + bool all_half_dead = true; + + /* + * To handle the !readonly case, we'd need to accept BTP_DELETED pages and + * potentially observe nbtree/README "Page deletion and backwards scans". + */ + Assert(state->readonly); + + while (reached != P_NONE && all_half_dead) + { + Page page = palloc_btree_page(state, reached); + BTPageOpaque reached_opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + CHECK_FOR_INTERRUPTS(); + + /* + * Try to detect btpo_prev circular links. _bt_unlink_halfdead_page() + * writes that side-links will continue to point to the siblings. + * Check btpo_next for that property. + */ + all_half_dead = P_ISHALFDEAD(reached_opaque) && + reached != start && + reached != reached_from && + reached_opaque->btpo_next == reached_from; + if (all_half_dead) + { + XLogRecPtr pagelsn = PageGetLSN(page); + + /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */ + ereport(DEBUG1, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("harmless interrupted page deletion detected in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u right block=%u page lsn=%X/%X.", + reached, reached_from, + (uint32) (pagelsn >> 32), + (uint32) pagelsn))); + + reached_from = reached; + reached = reached_opaque->btpo_prev; + } + + pfree(page); + } + + return all_half_dead; +} + /* * Function performs the following checks on target page, or pages ancillary to * target page: