Fix old-fd issues using global barriers everywhere.

Commits 4eb21763 and b74e94dc introduced a way to force every backend to
close all relation files, to fix an ancient Windows-only bug.

This commit extends that behavior to all operating systems and adds
a couple of extra barrier points, to fix a totally different class of
bug: the reuse of relfilenodes in scenarios that have no other kind of
cache invalidation to prevent file descriptor mix-ups.

In all releases, data corruption could occur when you moved a database
to another tablespace and then back again.  Despite that, no back-patch
for now as the infrastructure required is too new and invasive.  In
master only, since commit aa010514, it could also happen when using
CREATE DATABASE with a user-supplied OID or via pg_upgrade.

Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Robert Haas <robertmhaas@gmail.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Discussion: https://postgr.es/m/20220209220004.kb3dgtn2x2k2gtdm%40alap3.anarazel.de
This commit is contained in:
Thomas Munro 2022-05-07 15:19:52 +12:00
parent b74e94dc27
commit e2f65f4255
5 changed files with 241 additions and 25 deletions

View File

@ -1687,10 +1687,8 @@ dropdb(const char *dbname, bool missing_ok, bool force)
*/
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
#if defined(USE_BARRIER_SMGRRELEASE)
/* Close all smgr fds in all backends. */
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
#endif
/*
* Remove all tablespace subdirs belonging to the database.
@ -1940,10 +1938,8 @@ movedb(const char *dbname, const char *tblspcname)
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT
| CHECKPOINT_FLUSH_ALL);
#if defined(USE_BARRIER_SMGRRELEASE)
/* Close all smgr fds in all backends. */
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
#endif
/*
* Now drop all buffers holding data of the target database; they should
@ -3054,6 +3050,9 @@ dbase_redo(XLogReaderState *record)
*/
FlushDatabaseBuffers(xlrec->src_db_id);
/* Close all smgr fds in all backends. */
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
/*
* Copy this subdirectory to the new location
*
@ -3111,10 +3110,8 @@ dbase_redo(XLogReaderState *record)
/* Clean out the xlog relcache too */
XLogDropDatabase(xlrec->db_id);
#if defined(USE_BARRIER_SMGRRELEASE)
/* Close all smgr fds in all backends. */
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
#endif
for (i = 0; i < xlrec->ntablespaces; i++)
{

View File

@ -548,11 +548,10 @@ DropTableSpace(DropTableSpaceStmt *stmt)
* use a global barrier to ask all backends to close all files, and
* wait until they're finished.
*/
#if defined(USE_BARRIER_SMGRRELEASE)
LWLockRelease(TablespaceCreateLock);
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
#endif
/* And now try again. */
if (!destroy_tablespace_directories(tablespaceoid, false))
{
@ -1574,6 +1573,9 @@ tblspc_redo(XLogReaderState *record)
{
xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
/* Close all smgr fds in all backends. */
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
/*
* If we issued a WAL record for a drop tablespace it implies that
* there were no files in it at all when the DROP was done. That means
@ -1591,11 +1593,6 @@ tblspc_redo(XLogReaderState *record)
*/
if (!destroy_tablespace_directories(xlrec->ts_id, true))
{
#if defined(USE_BARRIER_SMGRRELEASE)
/* Close all smgr fds in all backends. */
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
#endif
ResolveRecoveryConflictWithTablespace(xlrec->ts_id);
/*

View File

@ -152,17 +152,6 @@
#define EXEC_BACKEND
#endif
/*
* If USE_BARRIER_SMGRRELEASE is defined, certain code paths that unlink
* directories will ask other backends to close all smgr file descriptors.
* This is enabled on Windows, because otherwise unlinked but still open files
* can prevent rmdir(containing_directory) from succeeding. On other
* platforms, it can be defined to exercise those code paths.
*/
#if defined(WIN32)
#define USE_BARRIER_SMGRRELEASE
#endif
/*
* Define this if your operating system supports link()
*/

View File

@ -9,7 +9,7 @@
#
#-------------------------------------------------------------------------
EXTRA_INSTALL=contrib/test_decoding
EXTRA_INSTALL=contrib/test_decoding contrib/pg_prewarm
subdir = src/test/recovery
top_builddir = ../../..

View File

@ -0,0 +1,233 @@
# Test for file-descriptor mix-ups when relfilenodes are reused: dirty
# buffers written back through a stale fd could end up in an unrelated
# relation's file.  Exercises DROP/CREATE DATABASE with a fixed OID and
# moving a database between tablespaces, on a primary and a streaming
# standby.
use strict;
use warnings;
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;
use File::Basename;

my $node_primary = PostgreSQL::Test::Cluster->new('primary');
$node_primary->init(allows_streaming => 1);
# Tiny shared_buffers so pg_prewarm easily forces eviction of dirty
# buffers; full_page_writes off so WAL replay cannot mask a corrupted
# page.  (The q[...] body is a config-file literal; the '#' line inside
# it is a config comment, not a Perl comment.)
$node_primary->append_conf('postgresql.conf', q[
allow_in_place_tablespaces = true
log_connections=on
# to avoid "repairing" corruption
full_page_writes=off
log_min_messages=debug2
autovacuum_naptime=1s
shared_buffers=1MB
]);
$node_primary->start;

# Create streaming standby linking to primary
my $backup_name = 'my_backup';
$node_primary->backup($backup_name);
my $node_standby = PostgreSQL::Test::Cluster->new('standby');
$node_standby->init_from_backup($node_primary, $backup_name,
has_streaming => 1);
$node_standby->start;

# To avoid hanging while expecting some specific input from a psql
# instance being driven by us, add a timeout high enough that it
# should never trigger even on very slow machines, unless something
# is really wrong.
# NOTE(review): IPC::Run is not use'd directly here — presumably loaded
# via PostgreSQL::Test::Utils; confirm.
my $psql_timeout = IPC::Run::timer(300);

# Interactive psql session against the primary; kept open so its backend
# keeps file descriptors across the DDL below.
my %psql_primary = (stdin => '', stdout => '', stderr => '');
$psql_primary{run} = IPC::Run::start(
[ 'psql', '-XA', '-f', '-', '-d', $node_primary->connstr('postgres') ],
'<',
\$psql_primary{stdin},
'>',
\$psql_primary{stdout},
'2>',
\$psql_primary{stderr},
$psql_timeout);

# Same, against the standby.
my %psql_standby = ('stdin' => '', 'stdout' => '', 'stderr' => '');
$psql_standby{run} = IPC::Run::start(
[ 'psql', '-XA', '-f', '-', '-d', $node_standby->connstr('postgres') ],
'<',
\$psql_standby{stdin},
'>',
\$psql_standby{stdout},
'2>',
\$psql_standby{stderr},
$psql_timeout);

# Create template database with a table that we'll update, to trigger dirty
# rows. Using a template database + preexisting rows makes it a bit easier to
# reproduce, because there's no cache invalidations generated.
$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db_template OID = 50000;");
$node_primary->safe_psql('conflict_db_template', q[
CREATE TABLE large(id serial primary key, dataa text, datab text);
INSERT INTO large(dataa, datab) SELECT g.i::text, 1 FROM generate_series(1, 4000) g(i);]);
# Fixed OID so the later DROP/CREATE reuses the same database OID (and
# hence the same relfilenode paths).
$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;");
$node_primary->safe_psql('postgres', q[
CREATE EXTENSION pg_prewarm;
CREATE TABLE replace_sb(data text);
INSERT INTO replace_sb(data) SELECT random()::text FROM generate_series(1, 15000);]);

# Use longrunning transactions, so that AtEOXact_SMgr doesn't close files
send_query_and_wait(
\%psql_primary,
q[BEGIN;],
qr/BEGIN/m);
send_query_and_wait(
\%psql_standby,
q[BEGIN;],
qr/BEGIN/m);

# Cause lots of dirty rows in shared_buffers
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 1;");

# Now do a bunch of work in another database. That will end up needing to
# write back dirty data from the previous step, opening the relevant file
# descriptors
cause_eviction(\%psql_primary, \%psql_standby);

# drop and recreate database — same OID 50001, so relfilenodes are reused
$node_primary->safe_psql('postgres', "DROP DATABASE conflict_db;");
$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;");
verify($node_primary, $node_standby, 1,
"initial contents as expected");

# Again cause lots of dirty rows in shared_buffers, but use a different update
# value so we can check everything is OK
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 2;");

# Again cause a lot of IO. That'll again write back dirty data, but uses (XXX
# adjust after bugfix) the already opened file descriptor.
# FIXME
cause_eviction(\%psql_primary, \%psql_standby);

# Without the smgr-release barriers, this write could go to the *old*
# (unlinked) file via the stale fd and be lost.
verify($node_primary, $node_standby, 2,
"update to reused relfilenode (due to DB oid conflict) is not lost");

$node_primary->safe_psql('conflict_db', "VACUUM FULL large;");
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 3;");
verify($node_primary, $node_standby, 3,
"restored contents as expected");

# Test for old filehandles after moving a database in / out of tablespace
$node_primary->safe_psql('postgres', q[CREATE TABLESPACE test_tablespace LOCATION '']);

# cause dirty buffers
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 4;");
# cause files to be opened in backend in other database
cause_eviction(\%psql_primary, \%psql_standby);

# move database back / forth
$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace');
$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE pg_default');

# cause dirty buffers
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 5;");
cause_eviction(\%psql_primary, \%psql_standby);

verify($node_primary, $node_standby, 5,
"post move contents as expected");

$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace');
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 7;");
cause_eviction(\%psql_primary, \%psql_standby);
$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 8;");
$node_primary->safe_psql('postgres', 'DROP DATABASE conflict_db');
$node_primary->safe_psql('postgres', 'DROP TABLESPACE test_tablespace');
$node_primary->safe_psql('postgres', 'REINDEX TABLE pg_database');

# explicitly shut down psql instances gracefully - to avoid hangs
# or worse on windows
$psql_primary{stdin} .= "\\q\n";
$psql_primary{run}->finish;
$psql_standby{stdin} .= "\\q\n";
$psql_standby{run}->finish;

$node_primary->stop();
$node_standby->stop();

# Make sure that there weren't crashes during shutdown
command_like([ 'pg_controldata', $node_primary->data_dir ],
qr/Database cluster state:\s+shut down\n/, 'primary shut down ok');
command_like([ 'pg_controldata', $node_standby->data_dir ],
qr/Database cluster state:\s+shut down in recovery\n/, 'standby shut down ok');
done_testing();
# Check that conflict_db's "large" table holds exactly 4000 rows, all with
# datab = $expected, first on the primary and then (after replication has
# caught up) on the standby.  Emits two Test::More checks.
sub verify
{
	my ($primary, $standby, $expected, $msg) = @_;

	my $check_sql =
	  "SELECT datab, count(*) FROM large GROUP BY 1 ORDER BY 1 LIMIT 10";
	my $want = "$expected|4000";

	is($primary->safe_psql('conflict_db', $check_sql),
		$want, "primary: $msg");

	# Ensure the standby has replayed everything before inspecting it.
	$primary->wait_for_catchup($standby);

	is($standby->safe_psql('conflict_db', $check_sql),
		$want, "standby: $msg");
}
# Force heavy buffer-cache churn by prewarming every relation with storage,
# on both the primary and the standby psql sessions.  This evicts dirty
# buffers and therefore makes the backends open (and keep) relation fds.
sub cause_eviction
{
	my ($psql_primary, $psql_standby) = @_;

	my $prewarm_sql =
	  q[SELECT SUM(pg_prewarm(oid)) warmed_buffers FROM pg_class WHERE pg_relation_filenode(oid) != 0;];

	for my $session ($psql_primary, $psql_standby)
	{
		send_query_and_wait($session, $prewarm_sql, qr/warmed_buffers/m);
	}
}
# Send query, wait until string matches.
# Appends $query to the given psql session's stdin and pumps the IPC::Run
# harness until $untl matches the accumulated stdout.  Bails out of the
# whole test run on timeout or if the psql process dies.  On success,
# clears the captured stdout (so the next call can't match stale output)
# and returns 1.
# NOTE(review): $psql_timeout is the file-level 300s timer shared by both
# sessions; it is not reset per query — confirm that's intended.
sub send_query_and_wait
{
my ($psql, $query, $untl) = @_;
my $ret;

# send query
$$psql{stdin} .= $query;
$$psql{stdin} .= "\n";

# wait for query results
$$psql{run}->pump_nb();
while (1)
{
last if $$psql{stdout} =~ /$untl/;

# Shared timer expired: abort the entire test run.
if ($psql_timeout->is_expired)
{
BAIL_OUT("aborting wait: program timed out\n"
. "stream contents: >>$$psql{stdout}<<\n"
. "pattern searched for: $untl\n");
return 0;
}
# psql exited (e.g. its backend crashed): no point waiting further.
if (not $$psql{run}->pumpable())
{
BAIL_OUT("aborting wait: program died\n"
. "stream contents: >>$$psql{stdout}<<\n"
. "pattern searched for: $untl\n");
return 0;
}
# Blocking pump: wait for more output from the child.
$$psql{run}->pump();
}

# Reset captured output for the next wait.
$$psql{stdout} = '';
return 1;
}