Fix test race between primary XLOG_RUNNING_XACTS and standby logical slot.

Before the previous commit, the test could hang until
LOG_SNAPSHOT_INTERVAL_MS (15s), until checkpoint_timeout (300s), or
indefinitely.  An indefinite hang was awfully improbable.  It entailed
the test reaching checkpoint_timeout before the
DecodingContextFindStartpoint() of a CREATE SUBSCRIPTION, yet after the
preceding WAL record.  Back-patch to v16, which introduced the test.

Bertrand Drouvot, reported by Noah Misch.

Discussion: https://postgr.es/m/20240211010227.a2.nmisch@google.com
This commit is contained in:
Noah Misch 2024-02-19 12:52:28 -08:00
parent 4791f87f34
commit 0e162810df
2 changed files with 34 additions and 16 deletions

View File

@ -3181,6 +3181,36 @@ $SIG{TERM} = $SIG{INT} = sub {
=pod
=item $node->log_standby_snapshot(self, standby, slot_name)
Log a standby snapshot on primary once the slot restart_lsn is determined on
the standby.
=cut
sub log_standby_snapshot
{
my ($self, $standby, $slot_name) = @_;
# Once the slot's restart_lsn is determined, the standby looks for
# xl_running_xacts WAL record from the restart_lsn onwards. First wait
# until the slot restart_lsn is determined.
$standby->poll_query_until(
'postgres', qq[
SELECT restart_lsn IS NOT NULL
FROM pg_catalog.pg_replication_slots WHERE slot_name = '$slot_name'
])
or die
"timed out waiting for logical slot to calculate its restart_lsn";
# Then arrange for the xl_running_xacts record for which the standby is
# waiting.
$self->safe_psql('postgres', 'SELECT pg_log_standby_snapshot()');
}
=pod
=item $node->create_logical_slot_on_standby(self, primary, slot_name, dbname)
Create logical replication slot on given standby
@ -3206,21 +3236,9 @@ sub create_logical_slot_on_standby
'2>',
\$stderr);
# Once the slot's restart_lsn is determined, the standby looks for
# xl_running_xacts WAL record from the restart_lsn onwards. First wait
# until the slot restart_lsn is determined.
$self->poll_query_until(
'postgres', qq[
SELECT restart_lsn IS NOT NULL
FROM pg_catalog.pg_replication_slots WHERE slot_name = '$slot_name'
])
or die
"timed out waiting for logical slot to calculate its restart_lsn";
# Then arrange for the xl_running_xacts record for which pg_recvlogical is
# Arrange for the xl_running_xacts record for which pg_recvlogical is
# waiting.
$primary->safe_psql('postgres', 'SELECT pg_log_standby_snapshot()');
$primary->log_standby_snapshot($self, $slot_name);
$handle->finish();

View File

@ -465,8 +465,8 @@ $psql_subscriber{subscriber_stdin} .= "\n";
$psql_subscriber{run}->pump_nb();
# Speed up the subscription creation
$node_primary->safe_psql('postgres', "SELECT pg_log_standby_snapshot()");
# Log the standby snapshot to speed up the subscription creation
$node_primary->log_standby_snapshot($node_standby, 'tap_sub');
# Explicitly shut down psql instance gracefully - to avoid hangs
# or worse on windows