diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index f0439edce3..eac58f5cf0 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -56,6 +56,9 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader, * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of * the data blocks touched by the WAL records, and return them in a page map. + * + * 'endpoint' is the end of the last record to read. The record starting at + * 'endpoint' is the first one that is not read. */ void extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, @@ -96,7 +99,13 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, startpoint = InvalidXLogRecPtr; /* continue reading at next record */ - } while (xlogreader->ReadRecPtr != endpoint); + } while (xlogreader->EndRecPtr < endpoint); + + /* + * If 'endpoint' didn't point exactly at a record boundary, the caller + * messed up. + */ + Assert(xlogreader->EndRecPtr == endpoint); XLogReaderFree(xlogreader); if (xlogreadfd != -1) diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 3f42ea6627..2a18f675c8 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -108,6 +108,7 @@ main(int argc, char **argv) XLogRecPtr chkptrec; TimeLineID chkpttli; XLogRecPtr chkptredo; + XLogRecPtr target_wal_endrec; size_t size; char *buffer; bool rewind_needed; @@ -255,42 +256,55 @@ main(int argc, char **argv) { pg_log_info("source and target cluster are on the same timeline"); rewind_needed = false; + target_wal_endrec = 0; } else { + XLogRecPtr chkptendrec; + findCommonAncestorTimeline(&divergerec, &lastcommontliIndex); pg_log_info("servers diverged at WAL location %X/%X on timeline %u", (uint32) (divergerec >> 32), (uint32) divergerec, targetHistory[lastcommontliIndex].tli); + /* + * Determine the end-of-WAL on the target. + * + * The WAL ends at the last shutdown checkpoint, or at + * minRecoveryPoint if it was a standby. (If we supported rewinding a + * server that was not shut down cleanly, we would need to replay + * until we reach the first invalid record, like crash recovery does.) + */ + + /* read the checkpoint record on the target to see where it ends. */ + chkptendrec = readOneRecord(datadir_target, + ControlFile_target.checkPoint, + targetNentries - 1); + + if (ControlFile_target.minRecoveryPoint > chkptendrec) + { + target_wal_endrec = ControlFile_target.minRecoveryPoint; + } + else + { + target_wal_endrec = chkptendrec; + } + /* * Check for the possibility that the target is in fact a direct * ancestor of the source. In that case, there is no divergent history * in the target that needs rewinding. */ - if (ControlFile_target.checkPoint >= divergerec) + if (target_wal_endrec > divergerec) { rewind_needed = true; } else { - XLogRecPtr chkptendrec; + /* the last common checkpoint record must be part of target WAL */ + Assert(target_wal_endrec == divergerec); - /* Read the checkpoint record on the target to see where it ends. */ - chkptendrec = readOneRecord(datadir_target, - ControlFile_target.checkPoint, - targetNentries - 1); - - /* - * If the histories diverged exactly at the end of the shutdown - * checkpoint record on the target, there are no WAL records in - * the target that don't belong in the source's history, and no - * rewind is needed. - */ - if (chkptendrec == divergerec) - rewind_needed = false; - else - rewind_needed = true; + rewind_needed = false; } } @@ -321,14 +335,12 @@ main(int argc, char **argv) /* * Read the target WAL from last checkpoint before the point of fork, to * extract all the pages that were modified on the target cluster after - * the fork. We can stop reading after reaching the final shutdown record. - * XXX: If we supported rewinding a server that was not shut down cleanly, - * we would need to replay until the end of WAL here. + * the fork. */ if (showprogress) pg_log_info("reading WAL in target"); extractPageMap(datadir_target, chkptrec, lastcommontliIndex, - ControlFile_target.checkPoint); + target_wal_endrec); filemap_finalize(); if (showprogress) diff --git a/src/bin/pg_rewind/t/008_min_recovery_point.pl b/src/bin/pg_rewind/t/008_min_recovery_point.pl new file mode 100644 index 0000000000..6c010bfc6a --- /dev/null +++ b/src/bin/pg_rewind/t/008_min_recovery_point.pl @@ -0,0 +1,156 @@ +# +# Test situation where a target data directory contains +# WAL records beyond both the last checkpoint and the divergence +# point: +# +# Target WAL (TLI 2): +# +# backup ... Checkpoint A ... INSERT 'rewind this' +# (TLI 1 -> 2) +# +# ^ last common ^ minRecoveryPoint +# checkpoint +# +# Source WAL (TLI 3): +# +# backup ... Checkpoint A ... Checkpoint B ... INSERT 'keep this' +# (TLI 1 -> 2) (TLI 2 -> 3) +# +# +# The last common checkpoint is Checkpoint A. But there is WAL on TLI 2 +# after the last common checkpoint that needs to be rewound. We used to +# have a bug where minRecoveryPoint was ignored, and pg_rewind concluded +# that the target doesn't need rewinding in this scenario, because the +# last checkpoint on the target TLI was an ancestor of the source TLI. +# +# +# This test does not make use of RewindTest as it requires three +# nodes. + +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 3; + +use File::Copy; + +my $tmp_folder = TestLib::tempdir; + +my $node_1 = get_new_node('node_1'); +$node_1->init(allows_streaming => 1); +$node_1->append_conf('postgresql.conf', qq( +wal_keep_segments='5' +)); + +$node_1->start; + +# Create a couple of test tables +$node_1->safe_psql('postgres', 'CREATE TABLE public.foo (t TEXT)'); +$node_1->safe_psql('postgres', 'CREATE TABLE public.bar (t TEXT)'); +$node_1->safe_psql('postgres', "INSERT INTO public.bar VALUES ('in both')"); + + +# Take backup +my $backup_name = 'my_backup'; +$node_1->backup($backup_name); + +# Create streaming standby from backup +my $node_2 = get_new_node('node_2'); +$node_2->init_from_backup($node_1, $backup_name, + has_streaming => 1); +$node_2->start; + +# Create streaming standby from backup +my $node_3 = get_new_node('node_3'); +$node_3->init_from_backup($node_1, $backup_name, + has_streaming => 1); +$node_3->start; + +# Stop node_1 + +$node_1->stop('fast'); + +# Promote node_3 +$node_3->promote; + +# node_1 rejoins node_3 + +my $node_3_connstr = $node_3->connstr; + +$node_1->append_conf('postgresql.conf', qq( +primary_conninfo='$node_3_connstr' +)); +$node_1->set_standby_mode(); +$node_1->start(); + +# node_2 follows node_3 + +$node_2->append_conf('postgresql.conf', qq( +primary_conninfo='$node_3_connstr' +)); +$node_2->restart(); + +# Promote node_1 + +$node_1->promote; + +# We now have a split-brain with two primaries. Insert a row on both to +# demonstratively create a split brain. After the rewind, we should only +# see the insert on 1, as the insert on node 3 is rewound away. +$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('keep this')"); + +# Insert more rows in node 1, to bump up the XID counter. Otherwise, if +# rewind doesn't correctly rewind the changes made on the other node, +# we might fail to notice if the inserts are invisible because the XIDs +# are not marked as committed. +$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this')"); +$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this too')"); + +# Also insert a row in 'bar' on node 3. It is unmodified in node 1, so it won't get +# overwritten by replaying the WAL from node 1. +$node_3->safe_psql('postgres', "INSERT INTO public.bar (t) VALUES ('rewind this')"); + +# Wait for node 2 to catch up +$node_2->poll_query_until('postgres', + q|SELECT COUNT(*) > 1 FROM public.bar|, 't'); + +# At this point node_2 will shut down without a shutdown checkpoint, +# but with WAL entries beyond the preceding shutdown checkpoint. +$node_2->stop('fast'); +$node_3->stop('fast'); + +my $node_2_pgdata = $node_2->data_dir; +my $node_1_connstr = $node_1->connstr; + +# Keep a temporary postgresql.conf or it would be overwritten during the rewind. +copy( + "$node_2_pgdata/postgresql.conf", + "$tmp_folder/node_2-postgresql.conf.tmp"); + +command_ok( + [ + 'pg_rewind', + "--source-server=$node_1_connstr", + "--target-pgdata=$node_2_pgdata" + ], + 'pg_rewind detects rewind needed'); + +# Now move back postgresql.conf with old settings +move( + "$tmp_folder/node_2-postgresql.conf.tmp", + "$node_2_pgdata/postgresql.conf"); + +$node_2->start; + +# Check contents of the test tables after rewind. The rows inserted in node 3 +# before rewind should've been overwritten with the data from node 1. +my $result; +$result = $node_2->safe_psql('postgres', 'checkpoint'); +$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.foo'); +is($result, qq(keep this +and this +and this too), 'table foo after rewind'); + +$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.bar'); +is($result, qq(in both), 'table bar after rewind');