From 0e162810df7657bac24ba4657460a87104523fc6 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Mon, 19 Feb 2024 12:52:28 -0800 Subject: [PATCH] Fix test race between primary XLOG_RUNNING_XACTS and standby logical slot. Before the previous commit, the test could hang until LOG_SNAPSHOT_INTERVAL_MS (15s), until checkpoint_timeout (300s), or indefinitely. An indefinite hang was awfully improbable. It entailed the test reaching checkpoint_timeout before the DecodingContextFindStartpoint() of a CREATE SUBSCRIPTION, yet after the preceding WAL record. Back-patch to v16, which introduced the test. Bertrand Drouvot, reported by Noah Misch. Discussion: https://postgr.es/m/20240211010227.a2.nmisch@google.com --- src/test/perl/PostgreSQL/Test/Cluster.pm | 46 +++++++++++++------ .../t/035_standby_logical_decoding.pl | 4 +- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 07da74cf56..cfaf91ec63 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -3181,6 +3181,36 @@ $SIG{TERM} = $SIG{INT} = sub { =pod +=item $node->log_standby_snapshot(self, standby, slot_name) + +Log a standby snapshot on primary once the slot restart_lsn is determined on +the standby. + +=cut + +sub log_standby_snapshot +{ + my ($self, $standby, $slot_name) = @_; + + # Once the slot's restart_lsn is determined, the standby looks for + # xl_running_xacts WAL record from the restart_lsn onwards. First wait + # until the slot restart_lsn is determined. + + $standby->poll_query_until( + 'postgres', qq[ + SELECT restart_lsn IS NOT NULL + FROM pg_catalog.pg_replication_slots WHERE slot_name = '$slot_name' + ]) + or die + "timed out waiting for logical slot to calculate its restart_lsn"; + + # Then arrange for the xl_running_xacts record for which the standby is + # waiting. + $self->safe_psql('postgres', 'SELECT pg_log_standby_snapshot()'); +} + +=pod + =item $node->create_logical_slot_on_standby(self, primary, slot_name, dbname) Create logical replication slot on given standby @@ -3206,21 +3236,9 @@ sub create_logical_slot_on_standby '2>', \$stderr); - # Once the slot's restart_lsn is determined, the standby looks for - # xl_running_xacts WAL record from the restart_lsn onwards. First wait - # until the slot restart_lsn is determined. - - $self->poll_query_until( - 'postgres', qq[ - SELECT restart_lsn IS NOT NULL - FROM pg_catalog.pg_replication_slots WHERE slot_name = '$slot_name' - ]) - or die - "timed out waiting for logical slot to calculate its restart_lsn"; - - # Then arrange for the xl_running_xacts record for which pg_recvlogical is + # Arrange for the xl_running_xacts record for which pg_recvlogical is # waiting. - $primary->safe_psql('postgres', 'SELECT pg_log_standby_snapshot()'); + $primary->log_standby_snapshot($self, $slot_name); $handle->finish(); diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl index b020361b29..0710bccc17 100644 --- a/src/test/recovery/t/035_standby_logical_decoding.pl +++ b/src/test/recovery/t/035_standby_logical_decoding.pl @@ -465,8 +465,8 @@ $psql_subscriber{subscriber_stdin} .= "\n"; $psql_subscriber{run}->pump_nb(); -# Speed up the subscription creation -$node_primary->safe_psql('postgres', "SELECT pg_log_standby_snapshot()"); +# Log the standby snapshot to speed up the subscription creation +$node_primary->log_standby_snapshot($node_standby, 'tap_sub'); # Explicitly shut down psql instance gracefully - to avoid hangs # or worse on windows