From 3a205c91665fddf76e333844884354874dc8ae1b Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Mon, 30 Oct 2023 14:46:05 -0700 Subject: [PATCH] amcheck: Distinguish interrupted page deletion from corruption. This prevents false-positive reports about "the first child of leftmost target page is not leftmost of its level", "block %u is not leftmost" and "left link/right link pair". They appeared if amcheck ran before VACUUM cleaned things, after a cluster exited recovery between the first-stage and second-stage WAL records of a deletion. Back-patch to v11 (all supported versions). Reviewed by Peter Geoghegan. Discussion: https://postgr.es/m/20231005025232.c7.nmisch@google.com --- contrib/amcheck/t/005_pitr.pl | 88 +++++++++++++++++++++++++++++++++ contrib/amcheck/verify_nbtree.c | 76 ++++++++++++++++++++++++++-- 2 files changed, 160 insertions(+), 4 deletions(-) create mode 100644 contrib/amcheck/t/005_pitr.pl diff --git a/contrib/amcheck/t/005_pitr.pl b/contrib/amcheck/t/005_pitr.pl new file mode 100644 index 0000000000..205c8fc588 --- /dev/null +++ b/contrib/amcheck/t/005_pitr.pl @@ -0,0 +1,88 @@ +# Copyright (c) 2021-2023, PostgreSQL Global Development Group + +# Test integrity of intermediate states by PITR to those states +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# origin node: generate WAL records of interest. +my $origin = PostgreSQL::Test::Cluster->new('origin'); +$origin->init(has_archiving => 1, allows_streaming => 1); +$origin->append_conf('postgresql.conf', 'autovacuum = off'); +$origin->start; +$origin->backup('my_backup'); +# Create a table with each of 6 PK values spanning 1/4 of a block. Delete the +# first four, so one index leaf is eligible for deletion. Make a replication +# slot just so pg_waldump will always have access to later WAL. +my $setup = <safe_psql('postgres', $setup); +my $before_vacuum_walfile = + $origin->safe_psql('postgres', "SELECT pg_walfile_name(pg_current_wal_lsn())"); +# VACUUM to delete the aforementioned leaf page. Force an XLogFlush() by +# dropping a permanent table. That way, the XLogReader infrastructure can +# always see VACUUM's records, even under synchronous_commit=off. Finally, +# find the LSN of that VACUUM's last UNLINK_PAGE record. +my $vacuum = <safe_psql('postgres', $vacuum); +$origin->stop; +my $unlink_lsn = do { + my $stdout; + run_log(['pg_waldump', '-p', $origin->data_dir . '/pg_wal', + $before_vacuum_walfile, $after_unlink_walfile], + '>', \$stdout); + $stdout =~ m|^rmgr: Btree .*, lsn: ([/0-9A-F]+), .*, desc: UNLINK_PAGE left|m; + $1; +}; +die "did not find UNLINK_PAGE record" unless $unlink_lsn; + +# replica node: amcheck at notable points in the WAL stream +my $replica = PostgreSQL::Test::Cluster->new('replica'); +$replica->init_from_backup($origin, 'my_backup', has_restoring => 1); +$replica->append_conf('postgresql.conf', + "recovery_target_lsn = '$unlink_lsn'"); +$replica->append_conf('postgresql.conf', 'recovery_target_inclusive = off'); +$replica->append_conf('postgresql.conf', 'recovery_target_action = promote'); +$replica->start; +$replica->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';") + or die "Timed out while waiting for PITR promotion"; +# recovery done; run amcheck +my $debug = "SET client_min_messages = 'debug1'"; +my ($rc, $stderr); +$rc = $replica->psql( + 'postgres', + "$debug; SELECT bt_index_parent_check('not_leftmost_pk', true)", + stderr => \$stderr); +print STDERR $stderr, "\n"; +is($rc, 0, "bt_index_parent_check passes"); +like( + $stderr, + qr/interrupted page deletion detected/, + "bt_index_parent_check: interrupted page deletion detected"); +$rc = $replica->psql( + 'postgres', + "$debug; SELECT bt_index_check('not_leftmost_pk', true)", + stderr => \$stderr); +print STDERR $stderr, "\n"; +is($rc, 0, "bt_index_check passes"); + +done_testing(); diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 26272ef866..dcdcb9b074 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -135,6 +135,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel, bool rootdescend); static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); +static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state, + BlockNumber start, + BTPageOpaque start_opaque); static void bt_target_page_check(BtreeCheckState *state); static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, @@ -752,7 +755,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) */ if (state->readonly) { - if (!P_LEFTMOST(opaque)) + if (!bt_leftmost_ignoring_half_dead(state, current, opaque)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("block %u is not leftmost in index \"%s\"", @@ -807,10 +810,14 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) } /* - * readonly mode can only ever land on live pages and half-dead pages, - * so sibling pointers should always be in mutual agreement + * Sibling links should be in mutual agreement. There arises + * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling + * of the parent's low-key downlink is half-dead. (A half-dead page + * has no downlink from its parent.) Under heavyweight locking, the + * last bt_leftmost_ignoring_half_dead() validated this btpo_prev. */ - if (state->readonly && opaque->btpo_prev != leftcurrent) + if (state->readonly && + opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("left link/right link pair in index \"%s\" not in agreement", @@ -860,6 +867,67 @@ nextpage: return nextleveldown; } +/* + * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of + * half-dead, sibling-linked pages to the left. If a half-dead page appears + * under state->readonly, the database exited recovery between the first-stage + * and second-stage WAL records of a deletion. + */ +static bool +bt_leftmost_ignoring_half_dead(BtreeCheckState *state, + BlockNumber start, + BTPageOpaque start_opaque) +{ + BlockNumber reached = start_opaque->btpo_prev, + reached_from = start; + bool all_half_dead = true; + + /* + * To handle the !readonly case, we'd need to accept BTP_DELETED pages and + * potentially observe nbtree/README "Page deletion and backwards scans". + */ + Assert(state->readonly); + + while (reached != P_NONE && all_half_dead) + { + Page page = palloc_btree_page(state, reached); + BTPageOpaque reached_opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + CHECK_FOR_INTERRUPTS(); + + /* + * Try to detect btpo_prev circular links. _bt_unlink_halfdead_page() + * writes that side-links will continue to point to the siblings. + * Check btpo_next for that property. + */ + all_half_dead = P_ISHALFDEAD(reached_opaque) && + reached != start && + reached != reached_from && + reached_opaque->btpo_next == reached_from; + if (all_half_dead) + { + XLogRecPtr pagelsn = PageGetLSN(page); + + /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */ + ereport(DEBUG1, + (errcode(ERRCODE_NO_DATA), + errmsg_internal("harmless interrupted page deletion detected in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u right block=%u page lsn=%X/%X.", + reached, reached_from, + (uint32) (pagelsn >> 32), + (uint32) pagelsn))); + + reached_from = reached; + reached = reached_opaque->btpo_prev; + } + + pfree(page); + } + + return all_half_dead; +} + /* * Function performs the following checks on target page, or pages ancillary to * target page: