From 6e9ffcf13a4a0d56f03b4048231066cb5d061299 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 25 Mar 2022 14:23:26 -0400 Subject: [PATCH] Harden TAP tests that intentionally corrupt page checksums. The previous method for doing that was to write zeroes into a predetermined set of page locations. However, there's a roughly 1-in-64K chance that the existing checksum will match by chance, and yesterday several buildfarm animals started to reproducibly see that, resulting in test failures because no checksum mismatch was reported. Since the checksum includes the page LSN, test success depends on the length of the installation's WAL history, which is affected by (at least) the initial catalog contents, the set of locales installed on the system, and the length of the pathname of the test directory. Sooner or later we were going to hit a chance match, and today is that day. Harden these tests by specifically inverting the checksum field and leaving all else alone, thereby guaranteeing that the checksum is incorrect. In passing, fix places that were using seek() to set up for syswrite(), a combination that the Perl docs very explicitly warn against. We've probably escaped problems because no regular buffered I/O is done on these filehandles; but if it ever breaks, we wouldn't deserve or get much sympathy. Although we've only seen problems in HEAD, now that we recognize the environmental dependencies it seems like it might be just a matter of time until someone manages to hit this in back-branch testing. Hence, back-patch to v11 where we started doing this kind of test. Discussion: https://postgr.es/m/3192026.1648185780@sss.pgh.pa.us --- src/bin/pg_basebackup/t/010_pg_basebackup.pl | 33 +++++++------------- src/bin/pg_checksums/t/002_actions.pl | 9 ++---- src/test/perl/PostgresNode.pm | 31 ++++++++++++++++++ 3 files changed, 44 insertions(+), 29 deletions(-) diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl index 208df557b8..ac1ac79470 100644 --- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl +++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl @@ -36,7 +36,7 @@ if (open my $badchars, '>>', "$tempdir/pgdata/FOO\xe0\xe0\xe0BAR") } $node->set_replication_conf(); -system_or_bail 'pg_ctl', '-D', $pgdata, 'reload'; +$node->reload; $node->command_fails( [ 'pg_basebackup', '-D', "$tempdir/backup" ], @@ -494,17 +494,13 @@ my $file_corrupt2 = $node->safe_psql('postgres', q{SELECT b INTO corrupt2 FROM generate_series(1,2) AS b; ALTER TABLE corrupt2 SET (autovacuum_enabled=false); SELECT pg_relation_filepath('corrupt2')} ); -# set page header and block sizes -my $pageheader_size = 24; +# get block size for corruption steps my $block_size = $node->safe_psql('postgres', 'SHOW block_size;'); # induce corruption -system_or_bail 'pg_ctl', '-D', $pgdata, 'stop'; -open $file, '+<', "$pgdata/$file_corrupt1"; -seek($file, $pageheader_size, 0); -syswrite($file, "\0\0\0\0\0\0\0\0\0"); -close $file; -system_or_bail 'pg_ctl', '-D', $pgdata, 'start'; +$node->stop; +$node->corrupt_page_checksum($file_corrupt1, 0); +$node->start; $node->command_checks_all( [ 'pg_basebackup', '-D', "$tempdir/backup_corrupt" ], @@ -515,16 +511,12 @@ $node->command_checks_all( rmtree("$tempdir/backup_corrupt"); # induce further corruption in 5 more blocks -system_or_bail 'pg_ctl', '-D', $pgdata, 'stop'; -open $file, '+<', "$pgdata/$file_corrupt1"; +$node->stop; for my $i (1 .. 5) { - my $offset = $pageheader_size + $i * $block_size; - seek($file, $offset, 0); - syswrite($file, "\0\0\0\0\0\0\0\0\0"); + $node->corrupt_page_checksum($file_corrupt1, $i * $block_size); } -close $file; -system_or_bail 'pg_ctl', '-D', $pgdata, 'start'; +$node->start; $node->command_checks_all( [ 'pg_basebackup', '-D', "$tempdir/backup_corrupt2" ], @@ -535,12 +527,9 @@ $node->command_checks_all( rmtree("$tempdir/backup_corrupt2"); # induce corruption in a second file -system_or_bail 'pg_ctl', '-D', $pgdata, 'stop'; -open $file, '+<', "$pgdata/$file_corrupt2"; -seek($file, $pageheader_size, 0); -syswrite($file, "\0\0\0\0\0\0\0\0\0"); -close $file; -system_or_bail 'pg_ctl', '-D', $pgdata, 'start'; +$node->stop; +$node->corrupt_page_checksum($file_corrupt2, 0); +$node->start; $node->command_checks_all( [ 'pg_basebackup', '-D', "$tempdir/backup_corrupt3" ], diff --git a/src/bin/pg_checksums/t/002_actions.pl b/src/bin/pg_checksums/t/002_actions.pl index b7a64de64c..29a1a2a9cb 100644 --- a/src/bin/pg_checksums/t/002_actions.pl +++ b/src/bin/pg_checksums/t/002_actions.pl @@ -19,6 +19,7 @@ sub check_relation_corruption my $tablespace = shift; my $pgdata = $node->data_dir; + # Create table and discover its filesystem location. $node->safe_psql( 'postgres', "SELECT a INTO $table FROM generate_series(1,10000) AS a; @@ -32,9 +33,6 @@ sub check_relation_corruption my $relfilenode_corrupted = $node->safe_psql('postgres', "SELECT relfilenode FROM pg_class WHERE relname = '$table';"); - # Set page header and block size - my $pageheader_size = 24; - my $block_size = $node->safe_psql('postgres', 'SHOW block_size;'); $node->stop; # Checksums are correct for single relfilenode as the table is not @@ -49,10 +47,7 @@ sub check_relation_corruption ); # Time to create some corruption - open my $file, '+<', "$pgdata/$file_corrupted"; - seek($file, $pageheader_size, 0); - syswrite($file, "\0\0\0\0\0\0\0\0\0"); - close $file; + $node->corrupt_page_checksum($file_corrupted, 0); # Checksum checks on single relfilenode fail $node->command_checks_all( diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm index 07120e097b..83464733fa 100644 --- a/src/test/perl/PostgresNode.pm +++ b/src/test/perl/PostgresNode.pm @@ -2346,6 +2346,37 @@ sub pg_recvlogical_upto =pod +=item $node->corrupt_page_checksum(self, file, page_offset) + +Intentionally corrupt the checksum field of one page in a file. +The server must be stopped for this to work reliably. + +The file name should be specified relative to the cluster datadir. +page_offset had better be a multiple of the cluster's block size. + +=cut + +sub corrupt_page_checksum +{ + my ($self, $file, $page_offset) = @_; + my $pgdata = $self->data_dir; + my $pageheader; + + open my $fh, '+<', "$pgdata/$file" or die "open($file) failed: $!"; + binmode $fh; + sysseek($fh, $page_offset, 0) or die "sysseek failed: $!"; + sysread($fh, $pageheader, 24) or die "sysread failed: $!"; + # This inverts the pd_checksum field (only); see struct PageHeaderData + $pageheader ^= "\0\0\0\0\0\0\0\0\xff\xff"; + sysseek($fh, $page_offset, 0) or die "sysseek failed: $!"; + syswrite($fh, $pageheader) or die "syswrite failed: $!"; + close $fh; + + return; +} + +=pod + =back =cut