From ab55d742eb7162c22ee60f1e15e07d2a60063c4e Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Fri, 11 Jun 2021 16:12:36 -0400
Subject: [PATCH] Fix multiple crasher bugs in partitioned-table replication
 logic.

apply_handle_tuple_routing(), having detected and reported that the
tuple it needed to update didn't exist, tried to update that tuple
anyway, leading to a null-pointer dereference.

logicalrep_partition_open() failed to ensure that the
LogicalRepPartMapEntry it built for a partition was fully independent
of that for the partition root, leading to trouble if the root entry
was later freed or rebuilt.

Meanwhile, on the publisher's side, pgoutput_change() sometimes
attempted to apply execute_attr_map_tuple() to a NULL tuple.

The first of these was reported by Sergey Bernikov in bug #17055;
I found the other two while developing some test cases for this
sadly under-tested code.

Diagnosis and patch for the first issue by Amit Langote; patches for
the others by me; new test cases by me.  Back-patch to v13 where this
logic came in.

Discussion: https://postgr.es/m/17055-9ba800ec8522668b@postgresql.org
---
 src/backend/replication/logical/relation.c  | 13 +++-
 src/backend/replication/logical/worker.c    | 48 ++++++++------
 src/backend/replication/pgoutput/pgoutput.c |  7 +-
 src/test/subscription/t/001_rep_changes.pl  | 36 ++++++++++-
 src/test/subscription/t/013_partition.pl    | 71 ++++++++++++++++++++-
 5 files changed, 146 insertions(+), 29 deletions(-)

diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c
index e861c0ff80..4930f2ca34 100644
--- a/src/backend/replication/logical/relation.c
+++ b/src/backend/replication/logical/relation.c
@@ -620,7 +620,9 @@ logicalrep_partmap_init(void)
  * logicalrep_partition_open
  *
  * Returned entry reuses most of the values of the root table's entry, save
- * the attribute map, which can be different for the partition.
+ * the attribute map, which can be different for the partition.  However,
+ * we must physically copy all the data, in case the root table's entry
+ * gets freed/rebuilt.
  *
  * Note there's no logicalrep_partition_close, because the caller closes the
  * component relation.
@@ -656,7 +658,7 @@ logicalrep_partition_open(LogicalRepRelMapEntry *root,
 
 	part_entry->partoid = partOid;
 
-	/* Remote relation is used as-is from the root entry. */
+	/* Remote relation is copied as-is from the root entry. */
 	entry = &part_entry->relmapentry;
 	entry->remoterel.remoteid = remoterel->remoteid;
 	entry->remoterel.nspname = pstrdup(remoterel->nspname);
@@ -699,7 +701,12 @@ logicalrep_partition_open(LogicalRepRelMapEntry *root,
 		}
 	}
 	else
-		entry->attrmap = attrmap;
+	{
+		/* Lacking copy_attmap, do this the hard way. */
+		entry->attrmap = make_attrmap(attrmap->maplen);
+		memcpy(entry->attrmap->attnums, attrmap->attnums,
+			   attrmap->maplen * sizeof(AttrNumber));
+	}
 
 	entry->updatable = root->updatable;
 
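A note on the relation.c fix above: the crash scenario was the partition's
map entry aliasing the root entry's attribute map instead of owning a copy,
so rebuilding the root entry left the partition entry pointing at freed
memory.  Below is a minimal standalone sketch of the deep-copy pattern the
fix relies on; the AttrMap struct here is a simplified stand-in and
attrmap_copy() is a hypothetical helper, not the backend's actual API.

#include <stdlib.h>
#include <string.h>

typedef short AttrNumber;	/* stand-in; int16 in PostgreSQL */

typedef struct AttrMap
{
	AttrNumber *attnums;
	int			maplen;
} AttrMap;

/*
 * Deep-copy an attribute map.  Assigning the source pointer directly
 * (as the buggy code effectively did) makes the copy dangle once the
 * original map is freed or rebuilt.
 */
static AttrMap *
attrmap_copy(const AttrMap *src)
{
	AttrMap    *dst = malloc(sizeof(AttrMap));

	dst->maplen = src->maplen;
	dst->attnums = malloc((size_t) src->maplen * sizeof(AttrNumber));
	memcpy(dst->attnums, src->attnums,
		   (size_t) src->maplen * sizeof(AttrNumber));
	return dst;
}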
diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index 98c26002e8..689a66cc72 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -1477,12 +1477,13 @@ apply_handle_update_internal(ApplyExecutionData *edata,
 	else
 	{
 		/*
-		 * The tuple to be updated could not be found.
+		 * The tuple to be updated could not be found.  Do nothing except for
+		 * emitting a log message.
 		 *
-		 * TODO what to do here, change the log level to LOG perhaps?
+		 * XXX should this be promoted to ereport(LOG) perhaps?
 		 */
 		elog(DEBUG1,
-			 "logical replication did not find row for update "
+			 "logical replication did not find row to be updated "
 			 "in replication target relation \"%s\"",
 			 RelationGetRelationName(localrel));
 	}
@@ -1589,9 +1590,14 @@ apply_handle_delete_internal(ApplyExecutionData *edata,
 	}
 	else
 	{
-		/* The tuple to be deleted could not be found. */
+		/*
+		 * The tuple to be deleted could not be found.  Do nothing except for
+		 * emitting a log message.
+		 *
+		 * XXX should this be promoted to ereport(LOG) perhaps?
+		 */
 		elog(DEBUG1,
-			 "logical replication did not find row for delete "
+			 "logical replication did not find row to be deleted "
 			 "in replication target relation \"%s\"",
 			 RelationGetRelationName(localrel));
 	}
@@ -1728,30 +1734,30 @@ apply_handle_tuple_routing(ApplyExecutionData *edata,
 				found = FindReplTupleInLocalRel(estate, partrel,
 												&part_entry->remoterel,
 												remoteslot_part, &localslot);
-
-				oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
-				if (found)
-				{
-					/* Apply the update. */
-					slot_modify_data(remoteslot_part, localslot,
-									 part_entry,
-									 newtup);
-					MemoryContextSwitchTo(oldctx);
-				}
-				else
+				if (!found)
 				{
 					/*
-					 * The tuple to be updated could not be found.
+					 * The tuple to be updated could not be found.  Do nothing
+					 * except for emitting a log message.
 					 *
-					 * TODO what to do here, change the log level to LOG
-					 * perhaps?
+					 * XXX should this be promoted to ereport(LOG) perhaps?
 					 */
 					elog(DEBUG1,
-						 "logical replication did not find row for update "
-						 "in replication target relation \"%s\"",
+						 "logical replication did not find row to be updated "
+						 "in replication target relation's partition \"%s\"",
						 RelationGetRelationName(partrel));
+					return;
 				}
 
+				/*
+				 * Apply the update to the local tuple, putting the result in
+				 * remoteslot_part.
+				 */
+				oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
+				slot_modify_data(remoteslot_part, localslot, part_entry,
+								 newtup);
+				MemoryContextSwitchTo(oldctx);
+
 				/*
 				 * Does the updated tuple still satisfy the current
 				 * partition's constraint?
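A note on the worker.c fix above: the null-pointer dereference was a
fall-through bug; the old code logged that the target row was missing and
then proceeded to use the never-populated slot anyway.  The following is a
self-contained sketch of the guard-and-return shape the fix adopts; the
types and find_local_tuple() helper are illustrative stand-ins, not
worker.c's real API.

#include <stdbool.h>
#include <stdio.h>

typedef struct TupleSlot
{
	int			id;			/* stand-in for a real tuple slot */
} TupleSlot;

/* Illustrative stand-in for FindReplTupleInLocalRel(). */
static bool
find_local_tuple(int key, TupleSlot *localslot)
{
	(void) key;
	(void) localslot;
	return false;			/* pretend the row was deleted locally */
}

static void
handle_update(int key)
{
	TupleSlot	localslot;

	if (!find_local_tuple(key, &localslot))
	{
		/*
		 * Report the miss and bail out.  The buggy version logged this
		 * and then fell through to use the uninitialized slot.
		 */
		fprintf(stderr,
				"did not find row to be updated (key %d)\n", key);
		return;
	}

	/* ... only here is it safe to apply the update using localslot ... */
}

int
main(void)
{
	handle_update(42);		/* logs the miss instead of crashing */
	return 0;
}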
diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c
index fe12d08a94..63f108f960 100644
--- a/src/backend/replication/pgoutput/pgoutput.c
+++ b/src/backend/replication/pgoutput/pgoutput.c
@@ -612,8 +612,11 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
 				/* Convert tuples if needed. */
 				if (relentry->map)
 				{
-					oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
-					newtuple = execute_attr_map_tuple(newtuple, relentry->map);
+					if (oldtuple)
+						oldtuple = execute_attr_map_tuple(oldtuple,
+														  relentry->map);
+					newtuple = execute_attr_map_tuple(newtuple,
+													  relentry->map);
 				}
 			}
 
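A note on the pgoutput.c fix above: an update message carries an old tuple
only when the replica identity columns changed (or under REPLICA IDENTITY
FULL), so oldtuple is legitimately NULL much of the time, and the
conversion has to be guarded.  A minimal sketch of that NULL-guard pattern
follows; execute_map() is a hypothetical stand-in for
execute_attr_map_tuple(), not the real function.

#include <stddef.h>

typedef struct Tuple { int nfields; } Tuple;
typedef struct Map { int dummy; } Map;

/*
 * Hypothetical converter standing in for execute_attr_map_tuple();
 * assume, as in the real bug, that it cannot tolerate a NULL tuple.
 */
static Tuple *
execute_map(Tuple *tup, Map *map)
{
	(void) map;
	return tup;				/* identity conversion for the sketch */
}

static void
convert_tuples(Tuple **oldtuple, Tuple **newtuple, Map *map)
{
	if (map == NULL)
		return;

	/* The old tuple is optional, so guard it before converting. */
	if (*oldtuple)
		*oldtuple = execute_map(*oldtuple, map);

	/* The new tuple always exists for an update. */
	*newtuple = execute_map(*newtuple, map);
}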
diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl
index 1f654ee6af..b2fdb289db 100644
--- a/src/test/subscription/t/001_rep_changes.pl
+++ b/src/test/subscription/t/001_rep_changes.pl
@@ -6,7 +6,7 @@ use strict;
 use warnings;
 use PostgresNode;
 use TestLib;
-use Test::More tests => 28;
+use Test::More tests => 31;
 
 # Initialize publisher node
 my $node_publisher = get_new_node('publisher');
@@ -118,7 +118,7 @@ $node_publisher->safe_psql('postgres',
 	"INSERT INTO tab_mixed VALUES (2, 'bar', 2.2)");
 
 $node_publisher->safe_psql('postgres',
-	"INSERT INTO tab_full_pk VALUES (1, 'foo')");
+	"INSERT INTO tab_full_pk VALUES (1, 'foo'), (2, 'baz')");
 
 $node_publisher->safe_psql('postgres',
 	"INSERT INTO tab_nothing VALUES (generate_series(1,20))");
@@ -297,6 +297,38 @@ is( $result, qq(local|1.1|baz|1
 local|2.2|bar|2),
 	'update works with different column order and subscriber local values');
 
+$result = $node_subscriber->safe_psql('postgres',
+	"SELECT * FROM tab_full_pk ORDER BY a");
+is( $result, qq(1|bar
+2|baz),
+	'update works with REPLICA IDENTITY FULL and a primary key');
+
+# Check that subscriber handles cases where update/delete target tuple
+# is missing.  We have to look for the DEBUG1 log messages about that,
+# so temporarily bump up the log verbosity.
+$node_subscriber->append_conf('postgresql.conf', "log_min_messages = debug1");
+$node_subscriber->reload;
+
+$node_subscriber->safe_psql('postgres', "DELETE FROM tab_full_pk");
+
+$node_publisher->safe_psql('postgres',
+	"UPDATE tab_full_pk SET b = 'quux' WHERE a = 1");
+$node_publisher->safe_psql('postgres', "DELETE FROM tab_full_pk WHERE a = 2");
+
+$node_publisher->wait_for_catchup('tap_sub');
+
+my $logfile = slurp_file($node_subscriber->logfile());
+ok( $logfile =~
+	  qr/logical replication did not find row to be updated in replication target relation "tab_full_pk"/,
+	'update target row is missing');
+ok( $logfile =~
+	  qr/logical replication did not find row to be deleted in replication target relation "tab_full_pk"/,
+	'delete target row is missing');
+
+$node_subscriber->append_conf('postgresql.conf',
+	"log_min_messages = warning");
+$node_subscriber->reload;
+
 # check behavior with toasted values
 $node_publisher->safe_psql('postgres',
diff --git a/src/test/subscription/t/013_partition.pl b/src/test/subscription/t/013_partition.pl
index 9de01017be..d0b5a55d18 100644
--- a/src/test/subscription/t/013_partition.pl
+++ b/src/test/subscription/t/013_partition.pl
@@ -6,7 +6,7 @@ use strict;
 use warnings;
 use PostgresNode;
 use TestLib;
-use Test::More tests => 56;
+use Test::More tests => 62;
 
 # setup
 
@@ -347,6 +347,46 @@ $result = $node_subscriber2->safe_psql('postgres',
 	"SELECT a FROM tab1 ORDER BY 1");
 is($result, qq(), 'truncate of tab1 replicated');
 
+# Check that subscriber handles cases where update/delete target tuple
+# is missing.  We have to look for the DEBUG1 log messages about that,
+# so temporarily bump up the log verbosity.
+$node_subscriber1->append_conf('postgresql.conf',
+	"log_min_messages = debug1");
+$node_subscriber1->reload;
+
+$node_publisher->safe_psql('postgres',
+	"INSERT INTO tab1 VALUES (1, 'foo'), (4, 'bar'), (10, 'baz')");
+
+$node_publisher->wait_for_catchup('sub1');
+$node_publisher->wait_for_catchup('sub2');
+
+$node_subscriber1->safe_psql('postgres', "DELETE FROM tab1");
+
+$node_publisher->safe_psql('postgres',
+	"UPDATE tab1 SET b = 'quux' WHERE a = 4");
+$node_publisher->safe_psql('postgres', "DELETE FROM tab1");
+
+$node_publisher->wait_for_catchup('sub1');
+$node_publisher->wait_for_catchup('sub2');
+
+my $logfile = slurp_file($node_subscriber1->logfile());
+ok( $logfile =~
+	  qr/logical replication did not find row to be updated in replication target relation's partition "tab1_2_2"/,
+	'update target row is missing in tab1_2_2');
+ok( $logfile =~
+	  qr/logical replication did not find row to be deleted in replication target relation "tab1_1"/,
+	'delete target row is missing in tab1_1');
+ok( $logfile =~
+	  qr/logical replication did not find row to be deleted in replication target relation "tab1_2_2"/,
+	'delete target row is missing in tab1_2_2');
+ok( $logfile =~
+	  qr/logical replication did not find row to be deleted in replication target relation "tab1_def"/,
+	'delete target row is missing in tab1_def');
+
+$node_subscriber1->append_conf('postgresql.conf',
+	"log_min_messages = warning");
+$node_subscriber1->reload;
+
 # Tests for replication using root table identity and schema
 
 # publisher
@@ -650,3 +690,32 @@ is( $result, qq(pub_tab2|1|xxx
 pub_tab2|3|yyy
 pub_tab2|5|zzz
 xxx_c|6|aaa),
 	'inserts into tab2 replicated');
+
+# Check that subscriber handles cases where update/delete target tuple
+# is missing.  We have to look for the DEBUG1 log messages about that,
+# so temporarily bump up the log verbosity.
+$node_subscriber1->append_conf('postgresql.conf',
+	"log_min_messages = debug1");
+$node_subscriber1->reload;
+
+$node_subscriber1->safe_psql('postgres', "DELETE FROM tab2");
+
+$node_publisher->safe_psql('postgres',
+	"UPDATE tab2 SET b = 'quux' WHERE a = 5");
+$node_publisher->safe_psql('postgres', "DELETE FROM tab2 WHERE a = 1");
+
+$node_publisher->wait_for_catchup('sub_viaroot');
+$node_publisher->wait_for_catchup('sub2');
+
+$logfile = slurp_file($node_subscriber1->logfile());
+ok( $logfile =~
+	  qr/logical replication did not find row to be updated in replication target relation's partition "tab2_1"/,
+	'update target row is missing in tab2_1');
+ok( $logfile =~
+	  qr/logical replication did not find row to be deleted in replication target relation "tab2_1"/,
+	'delete target row is missing in tab2_1');
+
+# No need for this until more tests are added.
+# $node_subscriber1->append_conf('postgresql.conf',
+#	"log_min_messages = warning");
+# $node_subscriber1->reload;