Fix various checksum check problems for pg_verify_checksums and base backups

Three issues are fixed in this patch:
- Base backups forgot to ignore files specific to EXEC_BACKEND, leading
to spurious warnings when checksums are enabled, per analysis from me.
- pg_verify_checksums forgot about files specific to EXEC_BACKEND,
leading to failures of the tool on any such build, particularly Windows.
This error was originally found by newly-introduced TAP tests in various
buildfarm members using EXEC_BACKEND.
- pg_verify_checksums forgot to account for temporary files and temporary
paths, which could be valid relation files, without checksums, per
report from Andres Freund.  More tests are added to cover this case.

A new test case which emulates corruption for a file in a different
tablespace is added, coming from Michael Banck, while I have coded
the main code and refactored the test code.

Author: Michael Banck, Michael Paquier
Reviewed-by: Stephen Frost, David Steele
Discussion: https://postgr.es/m/20181021134206.GA14282@paquier.xyz
This commit is contained in:
Michael Paquier 2018-11-30 10:34:45 +09:00
parent a1c91dd110
commit 5c99513975
3 changed files with 121 additions and 46 deletions

View File

@ -189,12 +189,19 @@ static const char *excludeFiles[] =
/*
* List of files excluded from checksum validation.
*
* Entries are plain file names; the array is terminated by a NULL entry.
*
* Note: this list should be kept in sync with what pg_verify_checksums.c
* includes.
*/
static const char *const noChecksumFiles[] = {
"pg_control",
"pg_filenode.map",
"pg_internal.init",
"PG_VERSION",
/* Files specific to EXEC_BACKEND builds */
#ifdef EXEC_BACKEND
"config_exec_params",
"config_exec_params.new",
#endif
NULL,
};

View File

@ -20,6 +20,7 @@
#include "storage/bufpage.h"
#include "storage/checksum.h"
#include "storage/checksum_impl.h"
#include "storage/fd.h"
static int64 files = 0;
@ -49,11 +50,20 @@ usage(void)
printf(_("Report bugs to <pgsql-bugs@postgresql.org>.\n"));
}
/*
* List of files excluded from checksum validation.
*
* Entries are plain file names; the array is terminated by a NULL entry.
*
* Note: this list should be kept in sync with what basebackup.c includes.
*/
static const char *const skip[] = {
"pg_control",
"pg_filenode.map",
"pg_internal.init",
"PG_VERSION",
/* Files specific to EXEC_BACKEND builds */
#ifdef EXEC_BACKEND
"config_exec_params",
"config_exec_params.new",
#endif
NULL,
};
@ -62,13 +72,10 @@ skipfile(const char *fn)
{
const char *const *f;
if (strcmp(fn, ".") == 0 ||
strcmp(fn, "..") == 0)
return true;
for (f = skip; *f; f++)
if (strcmp(*f, fn) == 0)
return true;
return false;
}
@ -146,9 +153,22 @@ scan_directory(const char *basedir, const char *subdir)
char fn[MAXPGPATH];
struct stat st;
if (skipfile(de->d_name))
if (strcmp(de->d_name, ".") == 0 ||
strcmp(de->d_name, "..") == 0)
continue;
/* Skip temporary files */
if (strncmp(de->d_name,
PG_TEMP_FILE_PREFIX,
strlen(PG_TEMP_FILE_PREFIX)) == 0)
continue;
/* Skip temporary folders */
if (strncmp(de->d_name,
PG_TEMP_FILES_DIR,
strlen(PG_TEMP_FILES_DIR)) == 0)
return;
snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name);
if (lstat(fn, &st) < 0)
{
@ -163,6 +183,9 @@ scan_directory(const char *basedir, const char *subdir)
*segmentpath;
BlockNumber segmentno = 0;
if (skipfile(de->d_name))
continue;
/*
* Cut off at the segment boundary (".") to get the segment number
* in order to mix it into the checksum. Then also cut off at the

View File

@ -5,7 +5,74 @@ use strict;
use warnings;
use PostgresNode;
use TestLib;
use Test::More tests => 36;
use Test::More tests => 45;
# Utility routine to create and check a table with corrupted checksums
# on a wanted tablespace.  Note that this stops and starts the node
# multiple times to perform the checks, leaving the node started
# at the end.
#
# Arguments (positional):
#   $node       - node object to operate on; SQL is run on it right away,
#                 so it is expected to be started when this is called
#   $table      - name of the table to create, corrupt and finally drop
#   $tablespace - name of the tablespace the table is moved to
#
# Runs five tests in total (one command_ok, two command_checks_all and
# one more command_ok, with command_checks_all accounting for the rest
# as per its own implementation -- NOTE(review): confirm against the
# planned test count when changing this routine).
sub check_relation_corruption
{
	my $node       = shift;
	my $table      = shift;
	my $tablespace = shift;
	my $pgdata     = $node->data_dir;

	$node->safe_psql('postgres',
		"SELECT a INTO $table FROM generate_series(1,10000) AS a;
		ALTER TABLE $table SET (autovacuum_enabled=false);");

	$node->safe_psql('postgres',
		"ALTER TABLE ".$table." SET TABLESPACE ".$tablespace.";");

	# Both values are looked up after the tablespace move, so they
	# describe the file that is corrupted below.
	my $file_corrupted = $node->safe_psql('postgres',
		"SELECT pg_relation_filepath('$table');");
	my $relfilenode_corrupted = $node->safe_psql('postgres',
		"SELECT relfilenode FROM pg_class WHERE relname = '$table';");

	# Offset at which the corruption is written: just past the page header.
	my $pageheader_size = 24;
	$node->stop;

	# Checksums are correct for single relfilenode as the table is not
	# corrupted yet.
	command_ok(['pg_verify_checksums',  '-D', $pgdata,
		'-r', $relfilenode_corrupted],
		"succeeds for single relfilenode on tablespace $tablespace with offline cluster");

	# Time to create some corruption.  syswrite() bypasses PerlIO
	# buffering, so the file position has to be set with sysseek() and
	# not seek().  The payload is double-quoted so that real NUL bytes
	# are written, and every step is checked so that a failure to corrupt
	# the file is reported here rather than as a confusing test failure
	# later on.
	my $path_corrupted = "$pgdata/$file_corrupted";
	open my $file, '+<', $path_corrupted
	  or die "could not open \"$path_corrupted\": $!";
	binmode $file;
	sysseek($file, $pageheader_size, 0)
	  or die "could not seek in \"$path_corrupted\": $!";
	defined(syswrite($file, "\0" x 9))
	  or die "could not write to \"$path_corrupted\": $!";
	close $file
	  or die "could not close \"$path_corrupted\": $!";

	# Checksum checks on single relfilenode fail
	$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata, '-r',
		$relfilenode_corrupted],
		1,
		[qr/Bad checksums:.*1/],
		[qr/checksum verification failed/],
		"fails with corrupted data for single relfilenode on tablespace $tablespace");

	# Global checksum checks fail as well
	$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata],
		1,
		[qr/Bad checksums:.*1/],
		[qr/checksum verification failed/],
		"fails with corrupted data on tablespace $tablespace");

	# Drop corrupted table again and make sure there is no more corruption.
	$node->start;
	$node->safe_psql('postgres', "DROP TABLE $table;");
	$node->stop;
	$node->command_ok(['pg_verify_checksums', '-D', $pgdata],
		"succeeds again after table drop on tablespace $tablespace");

	$node->start;
	return;
}
# Initialize node with checksums enabled.
my $node = get_new_node('node_checksum');
@ -27,6 +94,12 @@ append_to_file "$pgdata/global/99999_init.123", "";
append_to_file "$pgdata/global/99999_fsm.123", "";
append_to_file "$pgdata/global/99999_vm.123", "";
# These are temporary files and folders with dummy contents, which
# should be ignored by the scan.
append_to_file "$pgdata/global/pgsql_tmp_123", "foo";
mkdir "$pgdata/global/pgsql_tmp";
append_to_file "$pgdata/global/pgsql_tmp/1.1", "foo";
# Checksums pass on a newly-created cluster
command_ok(['pg_verify_checksums', '-D', $pgdata],
"succeeds with offline cluster");
@ -36,47 +109,16 @@ $node->start;
command_fails(['pg_verify_checksums', '-D', $pgdata],
"fails with online cluster");
# Create table to corrupt and get its relfilenode
# Check corruption of table on default tablespace.
check_relation_corruption($node, 'corrupt1', 'pg_default');
# Create tablespace to check corruptions in a non-default tablespace.
my $basedir = $node->basedir;
my $tablespace_dir = "$basedir/ts_corrupt_dir";
mkdir ($tablespace_dir);
$node->safe_psql('postgres',
"SELECT a INTO corrupt1 FROM generate_series(1,10000) AS a;
ALTER TABLE corrupt1 SET (autovacuum_enabled=false);");
my $file_corrupted = $node->safe_psql('postgres',
"SELECT pg_relation_filepath('corrupt1')");
my $relfilenode_corrupted = $node->safe_psql('postgres',
"SELECT relfilenode FROM pg_class WHERE relname = 'corrupt1';");
# Set page header and block size
my $pageheader_size = 24;
my $block_size = $node->safe_psql('postgres', 'SHOW block_size;');
$node->stop;
# Checksums are correct for single relfilenode as the table is not
# corrupted yet.
command_ok(['pg_verify_checksums', '-D', $pgdata,
'-r', $relfilenode_corrupted],
"succeeds for single relfilenode with offline cluster");
# Time to create some corruption
open my $file, '+<', "$pgdata/$file_corrupted";
seek($file, $pageheader_size, 0);
syswrite($file, '\0\0\0\0\0\0\0\0\0');
close $file;
# Global checksum checks fail
$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata],
1,
[qr/Bad checksums:.*1/],
[qr/checksum verification failed/],
'fails with corrupted data');
# Checksum checks on single relfilenode fail
$node->command_checks_all([ 'pg_verify_checksums', '-D', $pgdata, '-r',
$relfilenode_corrupted],
1,
[qr/Bad checksums:.*1/],
[qr/checksum verification failed/],
'fails for corrupted data on single relfilenode');
"CREATE TABLESPACE ts_corrupt LOCATION '$tablespace_dir';");
check_relation_corruption($node, 'corrupt2', 'ts_corrupt');
# Utility routine to check that pg_verify_checksums is able to detect
# correctly-named relation files filled with some corrupted data.
@ -101,6 +143,9 @@ sub fail_corrupt
return;
}
# Stop instance for the follow-up checks.
$node->stop;
# Authorized relation files filled with corrupted data cause the
# checksum checks to fail. Make sure to use file names different
# than the previous ones.