Add test case for obsoleting slot with active walsender, take 2

The code to signal a running walsender when its reserved WAL size grows
too large is completely uncovered before this commit; this adds coverage
for that case.

This test involves sending SIGSTOP to walsender and walreceiver, then
advancing enough WAL for a checkpoint to trigger, then sending SIGCONT.

There's no precedent for STOP signalling in Perl tests, and my reading
of relevant manpages says it's likely to fail on Windows.  Because of
this, this test is always skipped on that platform.

This version fixes a couple of rarely hit race conditions in the
previous attempt 09126984a263; most notably, both LOG string searches
are loops, not just the second one; we acquire the start-of-log position
before STOP-signalling; and reference the correct process name in the
test description.  All per Tom Lane.

Author: Álvaro Herrera <alvherre@alvh.no-ip.org>
Discussion: https://postgr.es/m/202106102202.mjw4huiix7lo@alvherre.pgsql
This commit is contained in:
Alvaro Herrera 2021-06-23 09:53:18 -04:00
parent 741d7f1047
commit 24043c27b4
No known key found for this signature in database
GPG Key ID: 1C20ACB9D5C564AE
1 changed file with 94 additions and 3 deletions

View File

@@ -11,7 +11,7 @@ use TestLib;
use PostgresNode;
use File::Path qw(rmtree);
use Test::More tests => 14;
use Test::More tests => $TestLib::windows_os ? 14 : 18;
use Time::HiRes qw(usleep);
# Set PGDATABASE to 'postgres' in this script's environment, where any
# program launched by the test will inherit it.
$ENV{PGDATABASE} = 'postgres';
@@ -211,8 +211,8 @@ for (my $i = 0; $i < 10000; $i++)
}
# $failed is presumably set by the retry loop that closes just above
# (the loop body is not visible here); the test passes only if it is true.
ok($failed, 'check that replication has been broken');
# NOTE(review): this view shows a diff hunk without +/- markers.  The two
# stop('immediate') calls look like the removed side and the two plain stop
# calls like the added side of the same change -- confirm against the real
# file before treating all four lines as live code.
$node_primary->stop('immediate');
$node_standby->stop('immediate');
$node_primary->stop;
$node_standby->stop;
# Start a fresh primary, with streaming replication enabled, for the next
# scenario.
my $node_primary2 = get_new_node('primary2');
$node_primary2->init(allows_streaming => 1);
@@ -253,6 +253,97 @@ my @result =
timeout => '60'));
# The second element of @result must be the string 'finished'; per the test
# name, this shows that the checkpoint command did not block.
is($result[1], 'finished', 'check if checkpoint command is not blocked');
# Shut both nodes down before the next scenario creates its own cluster.
$node_primary2->stop;
$node_standby->stop;
# The next test depends on Perl's `kill`, which apparently is not
# portable to Windows. (It would be nice to use Test::More's `subtest`,
# but that's not in the ancient version we require.)
#
# On Windows the script ends here, which is why the plan at the top of the
# file is 14 there and 18 elsewhere: the four checks below never run.
# NOTE(review): a plan is already declared at `use Test::More` time, so
# done_testing() here can only re-confirm it -- verify that the minimum
# supported Test::More actually provides done_testing().
if ($TestLib::windows_os)
{
done_testing();
exit;
}
# Get a slot terminated while the walsender is active.
# We do this by sending SIGSTOP to the walsender.  Skip this on Windows.
#
# Outline:
#   1. Set up a primary with 1MB WAL segments, a 1MB max_slot_wal_keep_size
#      and a physical slot 'rep3', plus a standby streaming through that slot.
#   2. Freeze walsender and walreceiver with SIGSTOP, then advance WAL.
#   3. Wait for the primary's log to report that it is terminating the
#      walsender in order to release the slot.
#   4. Resume only the walsender, wait for the slot to become 'lost' and for
#      the invalidation message to be logged.
#   5. Resume the walreceiver and shut both nodes down.
#
# This section contributes exactly four tests (two like(), two ok()).
my $node_primary3 = get_new_node('primary3');
$node_primary3->init(allows_streaming => 1, extra => ['--wal-segsize=1']);
$node_primary3->append_conf(
	'postgresql.conf', qq(
min_wal_size = 2MB
max_wal_size = 2MB
log_checkpoints = yes
max_slot_wal_keep_size = 1MB
));
$node_primary3->start;
$node_primary3->safe_psql('postgres',
	"SELECT pg_create_physical_replication_slot('rep3')");

# Take backup
$backup_name = 'my_backup';
$node_primary3->backup($backup_name);

# Create standby
my $node_standby3 = get_new_node('standby_3');
$node_standby3->init_from_backup($node_primary3, $backup_name,
	has_streaming => 1);
$node_standby3->append_conf('postgresql.conf', "primary_slot_name = 'rep3'");
$node_standby3->start;
$node_primary3->wait_for_catchup($node_standby3->name, 'replay');

# Look up the PIDs of the two processes we are going to signal.  The like()
# checks make sure each query returned exactly one numeric PID.
my $senderpid = $node_primary3->safe_psql('postgres',
	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
my $receiverpid = $node_standby3->safe_psql('postgres',
	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");

# Remember the current end of the primary's log *before* freezing anything,
# so the searches below cannot miss a message emitted right after the STOP.
$logstart = get_log_size($node_primary3);

# freeze walsender and walreceiver. Slot will still be active, but walreceiver
# won't get anything anymore.
kill 'STOP', $senderpid, $receiverpid;
advance_wal($node_primary3, 2);

# Poll the log, once per second, for the termination message.  The outcome is
# recorded in a flag and reported after the loop so that a timeout shows up
# as an explicit, named test failure instead of a missing test.
my $max_attempts = 180;
my $termination_logged = 0;
while ($max_attempts-- >= 0)
{
	if (find_in_log(
			$node_primary3,
			"terminating process $senderpid to release replication slot \"rep3\"",
			$logstart))
	{
		$termination_logged = 1;
		last;
	}
	sleep 1;
}
ok($termination_logged, "walsender termination logged");

# Now let the walsender continue; slot should be killed now.
# (Must not let walreceiver run yet; otherwise the standby could start another
# one before the slot can be killed)
kill 'CONT', $senderpid;
unless (
	$node_primary3->poll_query_until(
		'postgres',
		"SELECT wal_status FROM pg_replication_slots WHERE slot_name = 'rep3'",
		"lost"))
{
	# Do not bail out with the walreceiver still frozen: resume it first so
	# that no SIGSTOPped process is left behind.
	kill 'CONT', $receiverpid;
	die "timed out waiting for slot to be lost";
}

# Same polling scheme as above, this time for the invalidation message.
$max_attempts = 180;
my $invalidation_logged = 0;
while ($max_attempts-- >= 0)
{
	if (find_in_log(
			$node_primary3,
			'invalidating slot "rep3" because its restart_lsn', $logstart))
	{
		$invalidation_logged = 1;
		last;
	}
	sleep 1;
}
ok($invalidation_logged, "slot invalidation logged");

# Now let the walreceiver continue, so that the node can be stopped cleanly
kill 'CONT', $receiverpid;
$node_primary3->stop;
$node_standby3->stop;
#####################################
# Advance WAL of $node by $n segments
sub advance_wal