From 56a95ee5118bf6d46e261b8d352f7dafac10587d Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Tue, 19 Dec 2017 03:46:14 +0900 Subject: [PATCH] Fix bug in cancellation of non-exclusive backup to avoid assertion failure. Previously an assertion failure occurred when pg_stop_backup() for non-exclusive backup was aborted while it's waiting for WAL files to be archived. This assertion failure happened in do_pg_abort_backup() which was called when a non-exclusive backup was canceled. do_pg_abort_backup() assumes that there is at least one non-exclusive backup running when it's called. But pg_stop_backup() can be canceled even after it marks the end of non-exclusive backup (e.g., during waiting for WAL archiving). This broke the assumption that do_pg_abort_backup() relies on, and which caused an assertion failure. This commit changes do_pg_abort_backup() so that it does nothing when non-exclusive backup has been already marked as completed. That is, the asssumption is also changed, and do_pg_abort_backup() now can handle even the case where it's called when there is no running backup. Backpatch to 9.6 where SQL-callable non-exclusive backup was added. Author: Masahiko Sawada and Michael Paquier Reviewed-By: Robert Haas and Fujii Masao Discussion: https://www.postgresql.org/message-id/CAD21AoD2L1Fu2c==gnVASMyFAAaq3y-AQ2uEVj-zTCGFFjvmDg@mail.gmail.com --- src/backend/access/transam/xlog.c | 36 ++++++++++++++++++++++++---- src/backend/replication/basebackup.c | 5 ++-- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 0791404263..3e9a12dacd 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10628,13 +10628,20 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, /* * Mark that start phase has correctly finished for an exclusive backup. * Session-level locks are updated as well to reflect that state. + * + * Note that CHECK_FOR_INTERRUPTS() must not occur while updating + * backup counters and session-level lock. Otherwise they can be + * updated inconsistently, and which might cause do_pg_abort_backup() + * to fail. */ if (exclusive) { WALInsertLockAcquireExclusive(); XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS; - WALInsertLockRelease(); + + /* Set session-level lock */ sessionBackupState = SESSION_BACKUP_EXCLUSIVE; + WALInsertLockRelease(); } else sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE; @@ -10838,7 +10845,11 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) } /* - * OK to update backup counters and forcePageWrites + * OK to update backup counters, forcePageWrites and session-level lock. + * + * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them. + * Otherwise they can be updated inconsistently, and which might cause + * do_pg_abort_backup() to fail. */ WALInsertLockAcquireExclusive(); if (exclusive) @@ -10862,11 +10873,20 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) { XLogCtl->Insert.forcePageWrites = false; } - WALInsertLockRelease(); - /* Clean up session-level lock */ + /* + * Clean up session-level lock. + * + * You might think that WALInsertLockRelease() can be called + * before cleaning up session-level lock because session-level + * lock doesn't need to be protected with WAL insertion lock. + * But since CHECK_FOR_INTERRUPTS() can occur in it, + * session-level lock must be cleaned up before it. + */ sessionBackupState = SESSION_BACKUP_NONE; + WALInsertLockRelease(); + /* * Read and parse the START WAL LOCATION line (this code is pretty crude, * but we are not expecting any variability in the file format). @@ -11104,8 +11124,16 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) void do_pg_abort_backup(void) { + /* + * Quick exit if session is not keeping around a non-exclusive backup + * already started. + */ + if (sessionBackupState == SESSION_BACKUP_NONE) + return; + WALInsertLockAcquireExclusive(); Assert(XLogCtl->Insert.nonExclusiveBackups > 0); + Assert(sessionBackupState == SESSION_BACKUP_NON_EXCLUSIVE); XLogCtl->Insert.nonExclusiveBackups--; if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE && diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index cd7d391b2f..05ca95bac2 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -215,7 +215,7 @@ perform_base_backup(basebackup_options *opt) * Once do_pg_start_backup has been called, ensure that any failure causes * us to abort the backup so we don't "leak" a backup counter. For this * reason, *all* functionality between do_pg_start_backup() and - * do_pg_stop_backup() should be inside the error cleanup block! + * the end of do_pg_stop_backup() should be inside the error cleanup block! */ PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0); @@ -324,10 +324,11 @@ perform_base_backup(basebackup_options *opt) else pq_putemptymessage('c'); /* CopyDone */ } + + endptr = do_pg_stop_backup(labelfile->data, !opt->nowait, &endtli); } PG_END_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0); - endptr = do_pg_stop_backup(labelfile->data, !opt->nowait, &endtli); if (opt->includewal) {