From 8b34cff3381a6a1d3c3d97141d468d468d2a2551 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 25 Jan 2024 17:07:56 +0900 Subject: [PATCH] Add more LOG messages when starting and ending recovery from a backup Three LOG messages are added in the recovery code paths, providing information that can be useful to track corruption issues depending on the state of the cluster, telling that: - Recovery has started from a backup_label. - Recovery is restarting from a backup start LSN, without a backup_label. - Recovery has completed from a backup. This was originally applied on HEAD as of 1d35f705e191, and there is consensus that this can be useful for older versions. This applies cleanly down to 15, so do it down to this version for now (older versions have heavily refactored the WAL recovery paths, making the change less straight-forward to do). Author: Andres Freund Reviewed-by: David Steele, Laurenz Albe, Michael Paquier Discussion: https://postgr.es/m/20231117041811.vz4vgkthwjnwp2pp@awork3.anarazel.de Backpatch-through: 15 --- src/backend/access/transam/xlogrecovery.c | 34 +++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 166f7b7b79..1503b21671 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -621,6 +621,22 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, if (StandbyModeRequested) EnableStandbyMode(); + /* + * Omitting backup_label when creating a new replica, PITR node etc. + * unfortunately is a common cause of corruption. Logging that + * backup_label was used makes it a bit easier to exclude that as the + * cause of observed corruption. + * + * Do so before we try to read the checkpoint record (which can fail), + * as otherwise it can be hard to understand why a checkpoint other + * than ControlFile->checkPoint is used. + */ + ereport(LOG, + (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u", + LSN_FORMAT_ARGS(RedoStartLSN), + LSN_FORMAT_ARGS(CheckPointLoc), + CheckPointTLI))); + /* * When a backup_label file is present, we want to roll forward from * the checkpoint it identifies, rather than using pg_control. @@ -759,6 +775,16 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, EnableStandbyMode(); } + /* + * For the same reason as when starting up with backup_label present, + * emit a log message when we continue initializing from a base + * backup. + */ + if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) + ereport(LOG, + (errmsg("restarting backup recovery with redo LSN %X/%X", + LSN_FORMAT_ARGS(ControlFile->backupStartPoint)))); + /* Get the last valid checkpoint record. */ CheckPointLoc = ControlFile->checkPoint; CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; @@ -2102,6 +2128,9 @@ CheckRecoveryConsistency(void) if (!XLogRecPtrIsInvalid(backupEndPoint) && backupEndPoint <= lastReplayedEndRecPtr) { + XLogRecPtr saveBackupStartPoint = backupStartPoint; + XLogRecPtr saveBackupEndPoint = backupEndPoint; + elog(DEBUG1, "end of backup reached"); /* @@ -2112,6 +2141,11 @@ CheckRecoveryConsistency(void) backupStartPoint = InvalidXLogRecPtr; backupEndPoint = InvalidXLogRecPtr; backupEndRequired = false; + + ereport(LOG, + (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X", + LSN_FORMAT_ARGS(saveBackupStartPoint), + LSN_FORMAT_ARGS(saveBackupEndPoint)))); } /*