Fix corner case failure of new standby to follow new primary.

This only happens if (1) the new standby has no WAL available locally, (2) the new standby is starting from the old timeline, (3) the promotion happened in the WAL segment from which the new standby is starting, (4) the timeline history file for the new timeline is available from the archive but the WAL files for it are not (i.e. this is a race), (5) the WAL files for the new timeline are available via streaming, and (6) recovery_target_timeline='latest'. Commit ee994272ca introduced this logic and was an improvement over the previous code, but it mishandled this case. If recovery_target_timeline='latest' and restore_command is set, validateRecoveryParameters() can change recoveryTargetTLI to be different from receiveTLI. If streaming is then tried afterward, expectedTLEs gets initialized with the history of the wrong timeline. It's supposed to be a list of entries explaining how to get to the target timeline, but in this case it ends up with a list of entries explaining how to get to the new standby's original timeline, which isn't right. Dilip Kumar and Robert Haas, reviewed by Kyotaro Horiguchi. Discussion: http://postgr.es/m/CAFiTN-sE-jr=LB8jQuxeqikd-Ux+jHiXyh4YDiZMPedgQKup0g@mail.gmail.com
2021-06-09 16:17:00 -04:00 · 2021-06-09 16:17:00 -04:00 · caba8f0d43
parent 845cad4d51
commit caba8f0d43
3 changed files with 115 additions and 1 deletions
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@ -12658,11 +12658,19 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 						 * pg_wal by now.  Use XLOG_FROM_STREAM so that source
 						 * info is set correctly and XLogReceiptTime isn't
 						 * changed.
+						 *
+						 * NB: We must set readTimeLineHistory based on
+						 * recoveryTargetTLI, not receiveTLI. Normally they'll
+						 * be the same, but if recovery_target_timeline is
+						 * 'latest' and archiving is configured, then it's
+						 * possible that we managed to retrieve one or more
+						 * new timeline history files from the archive,
+						 * updating recoveryTargetTLI.
 						 */
 						if (readFile < 0)
 						{
 							if (!expectedTLEs)
-								expectedTLEs = readTimeLineHistory(receiveTLI);
+								expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
 							readFile = XLogFileRead(readSegNo, PANIC,
 													receiveTLI,
 													XLOG_FROM_STREAM, false);
--- a/src/test/recovery/t/025_stuck_on_old_timeline.pl
+++ b/src/test/recovery/t/025_stuck_on_old_timeline.pl
@ -0,0 +1,96 @@
+
+# Copyright (c) 2021, PostgreSQL Global Development Group
+
+# Testing streaming replication where standby is promoted and a new cascading
+# standby (without WAL) is connected to the promoted standby.  Both archiving
+# and streaming are enabled, but only the history file is available from the
+# archive, so the WAL files all have to be streamed.  Test that the cascading
+# standby can follow the new primary (promoted standby).
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use FindBin;
+use Test::More tests => 1;
+
+# Initialize primary node
+my $node_primary = get_new_node('primary');
+
+# Set up an archive command that will copy the history file but not the WAL
+# files. No real archive command should behave this way; the point is to
+# simulate a race condition where the new cascading standby starts up after
+# the timeline history file reaches the archive but before any of the WAL files
+# get there.
+$node_primary->init(allows_streaming => 1, has_archiving => 1);
+# On Windows, double backslashes in the perl path (presumably for GUC quoting).
+my $perlbin = $^X;
+$perlbin =~ s{\\}{\\\\}g if ($TestLib::windows_os);
+my $archivedir_primary = $node_primary->archive_dir;
+$node_primary->append_conf('postgresql.conf', qq(
+archive_command = '$perlbin "$FindBin::RealBin/cp_history_files" "%p" "$archivedir_primary/%f"'
+));
+$node_primary->start;
+
+# Take backup from primary
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# Create streaming standby linking to primary
+my $node_standby = get_new_node('standby');
+$node_standby->init_from_backup($node_primary, $backup_name,
+	allows_streaming => 1, has_streaming => 1, has_archiving => 1);
+$node_standby->start;
+
+# Take backup of standby, use -Xnone so that pg_wal is empty.
+$node_standby->backup($backup_name, backup_options => ['-Xnone']);
+
+# Create cascading standby but don't start it yet.
+# Must set up both streaming and archiving.
+my $node_cascade = get_new_node('cascade');
+$node_cascade->init_from_backup($node_standby, $backup_name,
+	has_streaming => 1);
+$node_cascade->enable_restoring($node_primary);
+$node_cascade->append_conf('postgresql.conf', qq(
+recovery_target_timeline='latest'
+));
+
+# Promote the standby.
+$node_standby->promote;
+
+# Wait for promotion to complete
+$node_standby->poll_query_until('postgres',
+								"SELECT NOT pg_is_in_recovery();")
+	or die "Timed out while waiting for promotion";
+
+# Find next WAL segment to be archived
+my $walfile_to_be_archived = $node_standby->safe_psql('postgres',
+	"SELECT pg_walfile_name(pg_current_wal_lsn());");
+
+# Make WAL segment eligible for archival
+$node_standby->safe_psql('postgres', 'SELECT pg_switch_wal()');
+
+# Wait until the WAL segment has been archived.
+# Since the history file gets created on promotion and is archived before any
+# WAL segment, this is enough to guarantee that the history file was
+# archived.
+my $archive_wait_query =
+  "SELECT '$walfile_to_be_archived' <= last_archived_wal FROM pg_stat_archiver;";
+$node_standby->poll_query_until('postgres', $archive_wait_query)
+  or die "Timed out while waiting for WAL segment to be archived";
+
+# Start cascade node
+$node_cascade->start;
+
+# Create some content on promoted standby and check its presence on the
+# cascading standby.
+$node_standby->safe_psql('postgres', "CREATE TABLE tab_int AS SELECT 1 AS a");
+
+# Wait for the replication to catch up
+$node_standby->wait_for_catchup($node_cascade, 'replay',
+	$node_standby->lsn('insert'));
+
+# Check that cascading standby has the new content
+my $result =
+  $node_cascade->safe_psql('postgres', "SELECT count(*) FROM tab_int");
+note "cascade: $result";
+is($result, 1, 'check streamed content on cascade standby');
--- a/src/test/recovery/t/cp_history_files
+++ b/src/test/recovery/t/cp_history_files
@ -0,0 +1,10 @@
+#!/usr/bin/perl
+# Fake archive_command: copy timeline history files only; skip WAL segments.
+use strict;
+use warnings;
+use File::Copy;
+
+die "wrong number of arguments" if @ARGV != 2;
+my ($source, $target) = @ARGV;
+exit 0 if $source !~ /\.history\z/;    # report success without archiving
+copy($source, $target) or die "couldn't copy $source to $target: $!";