From 1816a1c6ffe46782eee9a16a974b4aa3f4b8457b Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Wed, 29 Apr 2020 18:46:42 -0400 Subject: [PATCH] Fix checkpoint signalling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Checkpointer uses its MyLatch to wake up when a checkpoint request is received. But before commit c6550776394e the latch was not used for anything else, so the code could just go to sleep after each loop without rechecking the sleeping condition. That commit added a separate ResetLatch in its code path[1], which can cause a checkpoint to go unnoticed for potentially a long time. Fix by skipping sleep if any checkpoint flags are set. Also add a test to verify this; authored by Kyotaro Horiguchi. [1] CreateCheckPoint -> InvalidateObsoleteReplicationSlots -> ConditionVariableTimeSleep Report and diagnosis by Kyotaro Horiguchi. Co-authored-by: Kyotaro Horiguchi Co-authored-by: Álvaro Herrera Discussion: https://postgr.es/m/20200408.141956.891237856186513376.horikyota.ntt@gmail.com --- src/backend/postmaster/checkpointer.c | 7 ++++ src/test/recovery/t/019_replslot_limit.pl | 44 +++++++++++++++++++++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index e354a78725..2b552a7ff9 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -494,6 +494,13 @@ CheckpointerMain(void) */ pgstat_send_bgwriter(); + /* + * If any checkpoint flags have been set, redo the loop to handle the + * checkpoint without sleeping. + */ + if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) + continue; + /* * Sleep until we are signaled or it's time for another checkpoint or * xlog file switch. diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl index 32dce54522..634f2bec8b 100644 --- a/src/test/recovery/t/019_replslot_limit.pl +++ b/src/test/recovery/t/019_replslot_limit.pl @@ -8,7 +8,7 @@ use TestLib; use PostgresNode; use File::Path qw(rmtree); -use Test::More tests => 13; +use Test::More tests => 14; use Time::HiRes qw(usleep); $ENV{PGDATABASE} = 'postgres'; @@ -179,7 +179,47 @@ for (my $i = 0; $i < 10000; $i++) } ok($failed, 'check that replication has been broken'); -$node_standby->stop; +$node_master->stop('immediate'); +$node_standby->stop('immediate'); + +my $node_master2 = get_new_node('master2'); +$node_master2->init(allows_streaming => 1); +$node_master2->append_conf( + 'postgresql.conf', qq( +min_wal_size = 32MB +max_wal_size = 32MB +log_checkpoints = yes +)); +$node_master2->start; +$node_master2->safe_psql('postgres', + "SELECT pg_create_physical_replication_slot('rep1')"); +$backup_name = 'my_backup2'; +$node_master2->backup($backup_name); + +$node_master2->stop; +$node_master2->append_conf( + 'postgresql.conf', qq( +max_slot_wal_keep_size = 0 +)); +$node_master2->start; + +$node_standby = get_new_node('standby_2'); +$node_standby->init_from_backup($node_master2, $backup_name, + has_streaming => 1); +$node_standby->append_conf('postgresql.conf', "primary_slot_name = 'rep1'"); +$node_standby->start; +my @result = + split( + '\n', + $node_master2->safe_psql( + 'postgres', + "CREATE TABLE tt(); + DROP TABLE tt; + SELECT pg_switch_wal(); + CHECKPOINT; + SELECT 'finished';", + timeout => '60')); +is($result[1], 'finished', 'check if checkpoint command is not blocked'); ##################################### # Advance WAL of $node by $n segments