amcheck: accept half-dead pages left behind by an interrupted page deletion

What the hunks below do:

  * contrib/amcheck/Makefile, contrib/amcheck/meson.build: register the new
    TAP test t/005_pitr.pl and make contrib/pg_walinspect available to it.
  * contrib/amcheck/t/005_pitr.pl (new): takes a base backup, deletes index
    tuples, runs VACUUM, then does point-in-time recovery to just before the
    last UNLINK_PAGE record and runs bt_index_parent_check()/bt_index_check()
    on the promoted node.  It expects both to succeed and expects the DEBUG1
    message "interrupted page deletion detected".
  * contrib/amcheck/verify_nbtree.c: adds bt_leftmost_ignoring_half_dead(),
    which walks btpo_prev links and accepts a chain of half-dead pages whose
    btpo_next links point back along the chain; uses it in place of
    P_LEFTMOST() in the two readonly "is this page leftmost" checks; skips
    bt_recheck_sibling_links() when leftcurrent is P_NONE and asserts that
    precondition inside bt_recheck_sibling_links().

NOTE(review): in the copy under review, both <<EOSQL heredoc bodies of
t/005_pitr.pl and the "my $unlink_lsn = ..." statement were missing (the text
between "<<" and the next "->" had been stripped).  They are restored here to
match the comments that precede them and the identifiers used later in the
test; the file is again the 82 lines that its hunk header declares.  Please
confirm the SQL against the authoritative copy of the patch before applying.

NOTE(review): line breaks and leading whitespace were also lost and have been
reconstructed.  If a context line does not match your tree byte-for-byte,
apply with whitespace-insensitive matching.

diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile
index 88271687a3..5e9002d250 100644
--- a/contrib/amcheck/Makefile
+++ b/contrib/amcheck/Makefile
@@ -12,6 +12,7 @@ PGFILEDESC = "amcheck - function for verifying relation integrity"
 
 REGRESS = check check_btree check_heap
 
+EXTRA_INSTALL = contrib/pg_walinspect
 TAP_TESTS = 1
 
 ifdef USE_PGXS
diff --git a/contrib/amcheck/meson.build b/contrib/amcheck/meson.build
index 4c8e2e2f13..656a5ed90b 100644
--- a/contrib/amcheck/meson.build
+++ b/contrib/amcheck/meson.build
@@ -44,6 +44,7 @@ tests += {
       't/002_cic.pl',
       't/003_cic_2pc.pl',
       't/004_verify_nbtree_unique.pl',
+      't/005_pitr.pl',
     ],
   },
 }
diff --git a/contrib/amcheck/t/005_pitr.pl b/contrib/amcheck/t/005_pitr.pl
new file mode 100644
index 0000000000..6bcc1596f2
--- /dev/null
+++ b/contrib/amcheck/t/005_pitr.pl
@@ -0,0 +1,82 @@
+# Copyright (c) 2021-2023, PostgreSQL Global Development Group
+
+# Test integrity of intermediate states by PITR to those states
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# origin node: generate WAL records of interest.
+my $origin = PostgreSQL::Test::Cluster->new('origin');
+$origin->init(has_archiving => 1, allows_streaming => 1);
+$origin->append_conf('postgresql.conf', 'autovacuum = off');
+$origin->start;
+$origin->backup('my_backup');
+# Create a table with each of 6 PK values spanning 1/4 of a block.  Delete the
+# first four, so one index leaf is eligible for deletion.  Make a replication
+# slot just so pg_walinspect will always have access to later WAL.
+my $setup = <<EOSQL;
+BEGIN;
+CREATE EXTENSION amcheck;
+CREATE EXTENSION pg_walinspect;
+CREATE TABLE not_leftmost (c text STORAGE PLAIN);
+INSERT INTO not_leftmost
+  SELECT repeat(n::text, database_block_size / 4)
+  FROM generate_series(1,6) t(n), pg_control_init();
+ALTER TABLE not_leftmost ADD CONSTRAINT not_leftmost_pk PRIMARY KEY (c);
+DELETE FROM not_leftmost WHERE c ~ '^[1-4]';
+SELECT pg_create_physical_replication_slot('for_walinspect', true, false);
+COMMIT;
+EOSQL
+$origin->safe_psql('postgres', $setup);
+my $before_vacuum_lsn =
+  $origin->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+# VACUUM to delete the aforementioned leaf page.  Force an XLogFlush() by
+# dropping a permanent table.  That way, the XLogReader infrastructure can
+# always see VACUUM's records, even under synchronous_commit=off.  Finally,
+# find the LSN of that VACUUM's last UNLINK_PAGE record.
+my $vacuum = <<EOSQL;
+SET synchronous_commit = off;
+VACUUM (VERBOSE, INDEX_CLEANUP ON) not_leftmost;
+CREATE TABLE XLogFlush ();
+DROP TABLE XLogFlush;
+SELECT max(start_lsn)
+  FROM pg_get_wal_records_info('$before_vacuum_lsn', 'FFFFFFFF/FFFFFFFF')
+  WHERE resource_manager = 'Btree' AND record_type = 'UNLINK_PAGE';
+EOSQL
+my $unlink_lsn = $origin->safe_psql('postgres', $vacuum);
+$origin->stop;
+die "did not find UNLINK_PAGE record" unless $unlink_lsn;
+
+# replica node: amcheck at notable points in the WAL stream
+my $replica = PostgreSQL::Test::Cluster->new('replica');
+$replica->init_from_backup($origin, 'my_backup', has_restoring => 1);
+$replica->append_conf('postgresql.conf',
+	"recovery_target_lsn = '$unlink_lsn'");
+$replica->append_conf('postgresql.conf', 'recovery_target_inclusive = off');
+$replica->append_conf('postgresql.conf', 'recovery_target_action = promote');
+$replica->start;
+$replica->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
+  or die "Timed out while waiting for PITR promotion";
+# recovery done; run amcheck
+my $debug = "SET client_min_messages = 'debug1'";
+my ($rc, $stderr);
+$rc = $replica->psql(
+	'postgres',
+	"$debug; SELECT bt_index_parent_check('not_leftmost_pk', true)",
+	stderr => \$stderr);
+print STDERR $stderr, "\n";
+is($rc, 0, "bt_index_parent_check passes");
+like(
+	$stderr,
+	qr/interrupted page deletion detected/,
+	"bt_index_parent_check: interrupted page deletion detected");
+$rc = $replica->psql(
+	'postgres',
+	"$debug; SELECT bt_index_check('not_leftmost_pk', true)",
+	stderr => \$stderr);
+print STDERR $stderr, "\n";
+is($rc, 0, "bt_index_check passes");
+
+done_testing();
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 877452f38c..bcff849aa9 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -157,6 +157,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
 								 bool rootdescend, bool checkunique);
 static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
 											   BtreeLevel level);
+static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state,
+										   BlockNumber start,
+										   BTPageOpaque start_opaque);
 static void bt_recheck_sibling_links(BtreeCheckState *state,
 									 BlockNumber btpo_prev_from_target,
 									 BlockNumber leftcurrent);
@@ -826,7 +829,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
 			 */
 			if (state->readonly)
 			{
-				if (!P_LEFTMOST(opaque))
+				if (!bt_leftmost_ignoring_half_dead(state, current, opaque))
 					ereport(ERROR,
 							(errcode(ERRCODE_INDEX_CORRUPTED),
 							 errmsg("block %u is not leftmost in index \"%s\"",
@@ -880,8 +883,16 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
 			 */
 		}
 
-		/* Sibling links should be in mutual agreement */
-		if (opaque->btpo_prev != leftcurrent)
+		/*
+		 * Sibling links should be in mutual agreement.  There arises
+		 * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling
+		 * of the parent's low-key downlink is half-dead.  (A half-dead page
+		 * has no downlink from its parent.)  Under heavyweight locking, the
+		 * last bt_leftmost_ignoring_half_dead() validated this btpo_prev.
+		 * Without heavyweight locking, validation of the P_NONE case remains
+		 * unimplemented.
+		 */
+		if (opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE)
 			bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
 
 		/* Check level */
@@ -1117,6 +1128,66 @@ bt_entry_unique_check(BtreeCheckState *state, IndexTuple itup,
 	}
 }
 
+/*
+ * Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of
+ * half-dead, sibling-linked pages to the left.  If a half-dead page appears
+ * under state->readonly, the database exited recovery between the first-stage
+ * and second-stage WAL records of a deletion.
+ */
+static bool
+bt_leftmost_ignoring_half_dead(BtreeCheckState *state,
+							   BlockNumber start,
+							   BTPageOpaque start_opaque)
+{
+	BlockNumber reached = start_opaque->btpo_prev,
+				reached_from = start;
+	bool		all_half_dead = true;
+
+	/*
+	 * To handle the !readonly case, we'd need to accept BTP_DELETED pages and
+	 * potentially observe nbtree/README "Page deletion and backwards scans".
+	 */
+	Assert(state->readonly);
+
+	while (reached != P_NONE && all_half_dead)
+	{
+		Page		page = palloc_btree_page(state, reached);
+		BTPageOpaque reached_opaque = BTPageGetOpaque(page);
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * Try to detect btpo_prev circular links.  _bt_unlink_halfdead_page()
+		 * writes that side-links will continue to point to the siblings.
+		 * Check btpo_next for that property.
+		 */
+		all_half_dead = P_ISHALFDEAD(reached_opaque) &&
+			reached != start &&
+			reached != reached_from &&
+			reached_opaque->btpo_next == reached_from;
+		if (all_half_dead)
+		{
+			XLogRecPtr	pagelsn = PageGetLSN(page);
+
+			/* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */
+			ereport(DEBUG1,
+					(errcode(ERRCODE_NO_DATA),
+					 errmsg_internal("harmless interrupted page deletion detected in index \"%s\"",
+									 RelationGetRelationName(state->rel)),
+					 errdetail_internal("Block=%u right block=%u page lsn=%X/%X.",
+										reached, reached_from,
+										LSN_FORMAT_ARGS(pagelsn))));
+
+			reached_from = reached;
+			reached = reached_opaque->btpo_prev;
+		}
+
+		pfree(page);
+	}
+
+	return all_half_dead;
+}
+
 /*
  * Raise an error when target page's left link does not point back to the
  * previous target page, called leftcurrent here.  The leftcurrent page's
@@ -1157,6 +1228,9 @@ bt_recheck_sibling_links(BtreeCheckState *state,
 						 BlockNumber btpo_prev_from_target,
 						 BlockNumber leftcurrent)
 {
+	/* passing metapage to BTPageGetOpaque() would give irrelevant findings */
+	Assert(leftcurrent != P_NONE);
+
 	if (!state->readonly)
 	{
 		Buffer		lbuf;
@@ -2235,7 +2309,8 @@ bt_child_highkey_check(BtreeCheckState *state,
 		opaque = BTPageGetOpaque(page);
 
 		/* The first page we visit at the level should be leftmost */
-		if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque))
+		if (first && !BlockNumberIsValid(state->prevrightlink) &&
+			!bt_leftmost_ignoring_half_dead(state, blkno, opaque))
 			ereport(ERROR,
 					(errcode(ERRCODE_INDEX_CORRUPTED),
 					 errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"",