Patch summary (free text placed before the first "diff --git" header, where
patch tools skip it):

* visibilitymap_truncate() and FreeSpaceMapTruncateRel() now modify the
  truncated map page inside a critical section, mark it dirty with
  MarkBufferDirty(), and, when not in recovery, the relation needs WAL and
  XLogHintBitIsNeeded() is true, log the page with log_newpage_buffer().
* New TAP test src/test/recovery/t/008_fsm_truncation.pl: truncates a table
  with VACUUM on a master, promotes and restarts its streaming standby, and
  checks that an INSERT on the former standby succeeds.

diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 3ad4a9f587..f020737f80 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -508,6 +508,9 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
 
 		LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
 
+		/* NO EREPORT(ERROR) from here till changes are logged */
+		START_CRIT_SECTION();
+
 		/* Clear out the unwanted bytes. */
 		MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));
 
@@ -523,7 +526,20 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
 		 */
 		map[truncByte] &= (1 << truncOffset) - 1;
 
+		/*
+		 * Truncation of a relation is WAL-logged at a higher-level, and we
+		 * will be called at WAL replay. But if checksums are enabled, we need
+		 * to still write a WAL record to protect against a torn page, if the
+		 * page is flushed to disk before the truncation WAL record. We cannot
+		 * use MarkBufferDirtyHint here, because that will not dirty the page
+		 * during recovery.
+		 */
 		MarkBufferDirty(mapBuffer);
+		if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+			log_newpage_buffer(mapBuffer, false);
+
+		END_CRIT_SECTION();
+
 		UnlockReleaseBuffer(mapBuffer);
 	}
 	else
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index bbd90c911a..4138b04839 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -327,8 +327,26 @@ FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
 		if (!BufferIsValid(buf))
 			return;				/* nothing to do; the FSM was already smaller */
 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+		/* NO EREPORT(ERROR) from here till changes are logged */
+		START_CRIT_SECTION();
+
 		fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
-		MarkBufferDirtyHint(buf, false);
+
+		/*
+		 * Truncation of a relation is WAL-logged at a higher-level, and we
+		 * will be called at WAL replay. But if checksums are enabled, we need
+		 * to still write a WAL record to protect against a torn page, if the
+		 * page is flushed to disk before the truncation WAL record. We cannot
+		 * use MarkBufferDirtyHint here, because that will not dirty the page
+		 * during recovery.
+		 */
+		MarkBufferDirty(buf);
+		if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+			log_newpage_buffer(buf, false);
+
+		END_CRIT_SECTION();
+
 		UnlockReleaseBuffer(buf);
 
 		new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
diff --git a/src/test/recovery/t/008_fsm_truncation.pl b/src/test/recovery/t/008_fsm_truncation.pl
new file mode 100644
index 0000000000..9f6bdb0b64
--- /dev/null
+++ b/src/test/recovery/t/008_fsm_truncation.pl
@@ -0,0 +1,93 @@
+# Test WAL replay of FSM changes.
+#
+# FSM changes don't normally need to be WAL-logged, except for truncation.
+# The FSM mustn't return a page that doesn't exist (anymore).
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 1;
+
+my $node_master = get_new_node('master');
+$node_master->init(allows_streaming => 1);
+
+$node_master->append_conf('postgresql.conf', qq{
+fsync = on
+wal_level = replica
+wal_log_hints = on
+max_prepared_transactions = 5
+autovacuum = off
+});
+
+# Create a master node and its standby, initializing both with some data
+# at the same time.
+$node_master->start;
+
+$node_master->backup('master_backup');
+my $node_standby = get_new_node('standby');
+$node_standby->init_from_backup($node_master, 'master_backup',
+	has_streaming => 1);
+$node_standby->start;
+
+$node_master->psql('postgres', qq{
+create table testtab (a int, b char(100));
+insert into testtab select generate_series(1,1000), 'foo';
+insert into testtab select generate_series(1,1000), 'foo';
+delete from testtab where ctid > '(8,0)';
+});
+
+# Take a lock on the table to prevent following vacuum from truncating it
+$node_master->psql('postgres', qq{
+begin;
+lock table testtab in row share mode;
+prepare transaction 'p1';
+});
+
+# Vacuum, update FSM without truncation
+$node_master->psql('postgres', 'vacuum verbose testtab');
+
+# Force a checkpoint
+$node_master->psql('postgres', 'checkpoint');
+
+# Now do some more insert/deletes, another vacuum to ensure full-page writes
+# are done
+$node_master->psql('postgres', qq{
+insert into testtab select generate_series(1,1000), 'foo';
+delete from testtab where ctid > '(8,0)';
+vacuum verbose testtab;
+});
+
+# Ensure all buffers are now clean on the standby
+$node_standby->psql('postgres', 'checkpoint');
+
+# Release the lock, vacuum again which should lead to truncation
+$node_master->psql('postgres', qq{
+rollback prepared 'p1';
+vacuum verbose testtab;
+});
+
+$node_master->psql('postgres', 'checkpoint');
+my $until_lsn =
+	$node_master->safe_psql('postgres', "SELECT pg_current_xlog_location();");
+
+# Wait long enough for standby to receive and apply all WAL
+my $caughtup_query =
+	"SELECT '$until_lsn'::pg_lsn <= pg_last_xlog_replay_location()";
+$node_standby->poll_query_until('postgres', $caughtup_query)
+	or die "Timed out while waiting for standby to catch up";
+
+# Promote the standby
+$node_standby->promote;
+$node_standby->poll_query_until('postgres',
+	"SELECT NOT pg_is_in_recovery()")
+	or die "Timed out while waiting for promotion of standby";
+$node_standby->psql('postgres', 'checkpoint');
+
+# Restart to discard in-memory copy of FSM
+$node_standby->restart;
+
+# Insert should work on standby
+is($node_standby->psql('postgres',
+   qq{insert into testtab select generate_series(1,1000), 'foo';}),
+   0, 'INSERT succeeds with truncated relation FSM');