Quick exit on log stream child exit in pg_basebackup

If the log streaming child process (thread on Windows) dies during
backup then the whole backup will be aborted at the end of the
backup.  Instead, trap ungraceful termination of the log streaming
child and exit early.  This also adds a TAP test for simulating this
by terminating the responsible backend.

Reviewed-by: Michael Paquier <michael@paquier.xyz>
Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Reviewed-by: Magnus Hagander <magnus@hagander.net>
Discussion: https://postgr.es/m/0F69E282-97F9-4DB7-8D6D-F927AA6340C8@yesql.se
Discussion: https://postgr.es/m/VI1PR83MB0189818B82C19059CB62E26199A89@VI1PR83MB0189.EURPRD83.prod.outlook.com
This commit is contained in:
Daniel Gustafsson 2022-02-23 14:24:43 +01:00
parent c7d7e12039
commit 0475a97f74
2 changed files with 79 additions and 2 deletions

View File

@ -174,6 +174,8 @@ static int bgpipe[2] = {-1, -1};
/* Handle to child process */
static pid_t bgchild = -1;
static bool in_log_streamer = false;
/* Flag to indicate if child process exited unexpectedly */
static volatile sig_atomic_t bgchild_exited = false;
/* End position for xlog streaming, empty string if unknown yet */
static XLogRecPtr xlogendptr;
@ -277,6 +279,18 @@ disconnect_atexit(void)
}
#ifndef WIN32
/*
* If the bgchild exits prematurely and raises a SIGCHLD signal, we can abort
* processing rather than wait until the backup has finished and error out at
* that time. On Windows, we use a background thread which can communicate
* without the need for a signal handler.
*/
static void
sigchld_handler(SIGNAL_ARGS)
{
bgchild_exited = true;
}
/*
* On windows, our background thread dies along with the process. But on
* Unix, if we have started a subprocess, we want to kill it off so it
@ -285,7 +299,7 @@ disconnect_atexit(void)
static void
kill_bgchild_atexit(void)
{
if (bgchild > 0)
if (bgchild > 0 && !bgchild_exited)
kill(bgchild, SIGTERM);
}
#endif
@ -572,17 +586,28 @@ LogStreamerMain(logstreamer_param *param)
stream.do_sync);
if (!ReceiveXlogStream(param->bgconn, &stream))
{
/*
* Any errors will already have been reported in the function process,
* but we need to tell the parent that we didn't shutdown in a nice
* way.
*/
#ifdef WIN32
/*
* In order to signal the main thread of an ungraceful exit we
* set the same flag that we use on Unix to signal SIGCHLD.
*/
bgchild_exited = true;
#endif
return 1;
}
if (!stream.walmethod->finish())
{
pg_log_error("could not finish writing WAL files: %m");
#ifdef WIN32
bgchild_exited = true;
#endif
return 1;
}
@ -1134,6 +1159,12 @@ ReceiveCopyData(PGconn *conn, WriteDataCallback callback,
exit(1);
}
if (bgchild_exited)
{
pg_log_error("background process terminated unexpectedly");
exit(1);
}
(*callback) (r, copybuf, callback_data);
PQfreemem(copybuf);
@ -2882,6 +2913,18 @@ main(int argc, char **argv)
}
atexit(disconnect_atexit);
#ifndef WIN32
/*
* Trap SIGCHLD to be able to handle the WAL stream process exiting. There
* is no SIGCHLD on Windows, there we rely on the background thread setting
* the signal variable on unexpected but graceful exit. If the WAL stream
* thread crashes on Windows it will bring down the entire process as it's
* a thread, so there is nothing to catch should that happen. A crash on
* UNIX will be caught by the signal handler.
*/
pqsignal(SIGCHLD, sigchld_handler);
#endif
/*
* Set umask so that directories/files are created with the same
* permissions as directories/files in the source data directory.

View File

@ -776,4 +776,38 @@ SKIP:
rmtree("$tempdir/backup_gzip3");
}
# Test background stream process terminating before the basebackup has
# finished, the main process should exit gracefully with an error message on
# stderr. To reduce the risk of timing related issues we invoke the base
# backup with rate throttling enabled.
$node->safe_psql('postgres',
q{CREATE TABLE t AS SELECT a FROM generate_series(1,10000) AS a;});
my $sigchld_bb_timeout = IPC::Run::timer(60);
my ($sigchld_bb_stdin, $sigchld_bb_stdout, $sigchld_bb_stderr) = ('', '', '');
my $sigchld_bb = IPC::Run::start(
[
@pg_basebackup_defs, '--wal-method=stream', '-D', "$tempdir/sigchld",
'--max-rate=32', '-d', $node->connstr('postgres')
],
'<',
\$sigchld_bb_stdin,
'>',
\$sigchld_bb_stdout,
'2>',
\$sigchld_bb_stderr,
$sigchld_bb_timeout);
is($node->poll_query_until('postgres',
"SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE " .
"application_name = '010_pg_basebackup.pl' AND wait_event = 'WalSenderMain' " .
"AND backend_type = 'walsender' AND query ~ 'START_REPLICATION'"),
"1",
"Walsender killed");
ok(pump_until($sigchld_bb, $sigchld_bb_timeout, \$sigchld_bb_stderr,
qr/background process terminated unexpectedly/),
'background process exit message');
$sigchld_bb->finish();
done_testing();