diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index 5d40fb5020..1d4d74b171 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -38,6 +38,7 @@ #include "postmaster/interrupt.h" #include "storage/buf_internals.h" #include "storage/dsm.h" +#include "storage/fd.h" #include "storage/ipc.h" #include "storage/latch.h" #include "storage/lwlock.h" diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 595e02de72..79314c69ab 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -32,6 +32,7 @@ OBJS = \ xlogfuncs.o \ xloginsert.o \ xlogreader.o \ + xlogrecovery.o \ xlogutils.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index c9516e03fa..bb1f106946 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -29,6 +29,7 @@ #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" +#include "access/xlogrecovery.h" #include "access/xlogutils.h" #include "catalog/index.h" #include "catalog/namespace.h" diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 859eb32c48..eb3c516058 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -3,6 +3,30 @@ * xlog.c * PostgreSQL write-ahead log manager * + * The Write-Ahead Log (WAL) functionality is split into several source + * files, in addition to this one: + * + * xloginsert.c - Functions for constructing WAL records + * xlogrecovery.c - WAL recovery and standby code + * xlogreader.c - Facility for reading WAL files and parsing WAL records + * xlogutils.c - Helper functions for WAL redo routines + * + * This file contains functions for coordinating database startup and + * checkpointing, and managing the write-ahead log buffers when the + * system is running. 
+ * + * StartupXLOG() is the main entry point of the startup process. It + * coordinates database startup, performing WAL recovery, and the + * transition from WAL recovery into normal operations. + * + * XLogInsertRecord() inserts a WAL record into the WAL buffers. Most + * callers should not call this directly, but use the functions in + * xloginsert.c to construct the WAL record. XLogFlush() can be used + * to force the WAL to disk. + * + * In addition to those, there are many other functions for interrogating + * the current system state, and for starting/stopping backups. + * * * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -36,12 +60,11 @@ #include "access/xlogarchive.h" #include "access/xloginsert.h" #include "access/xlogreader.h" +#include "access/xlogrecovery.h" #include "access/xlogutils.h" #include "catalog/catversion.h" #include "catalog/pg_control.h" #include "catalog/pg_database.h" -#include "commands/progress.h" -#include "commands/tablespace.h" #include "common/controldata_utils.h" #include "executor/instrument.h" #include "miscadmin.h" @@ -72,7 +95,6 @@ #include "storage/smgr.h" #include "storage/spin.h" #include "storage/sync.h" -#include "utils/builtins.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/ps_status.h" @@ -84,10 +106,6 @@ extern uint32 bootstrap_data_checksum_version; -/* Unsupported old recovery command file names (relative to $PGDATA) */ -#define RECOVERY_COMMAND_FILE "recovery.conf" -#define RECOVERY_COMMAND_DONE "recovery.done" - /* timeline ID to be used when bootstrapping */ #define BootstrapTimeLineID 1 @@ -177,13 +195,6 @@ const struct config_enum_entry archive_mode_options[] = { {NULL, 0, false} }; -const struct config_enum_entry recovery_target_action_options[] = { - {"pause", RECOVERY_TARGET_ACTION_PAUSE, false}, - {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false}, - {"shutdown", 
RECOVERY_TARGET_ACTION_SHUTDOWN, false}, - {NULL, 0, false} -}; - /* * Statistics for current checkpoint are collected in this global struct. * Because only the checkpointer or a stand-alone backend can perform @@ -191,19 +202,6 @@ const struct config_enum_entry recovery_target_action_options[] = { */ CheckpointStatsData CheckpointStats; -/* Local copy of WalRcv->flushedUpto */ -static XLogRecPtr flushedUpto = 0; -static TimeLineID receiveTLI = 0; - -/* - * abortedRecPtr is the start pointer of a broken record at end of WAL when - * recovery completes; missingContrecPtr is the location of the first - * contrecord that went missing. See CreateOverwriteContrecordRecord for - * details. - */ -static XLogRecPtr abortedRecPtr; -static XLogRecPtr missingContrecPtr; - /* * During recovery, lastFullPageWrites keeps track of full_page_writes that * the replayed WAL records indicate. It's initialized with full_page_writes @@ -219,18 +217,6 @@ static bool lastFullPageWrites; */ static bool LocalRecoveryInProgress = true; -/* - * Local copy of SharedHotStandbyActive variable. False actually means "not - * known, need to check the shared state". - */ -static bool LocalHotStandbyActive = false; - -/* - * Local copy of SharedPromoteIsTriggered variable. False actually means "not - * known, need to check the shared state". - */ -static bool LocalPromoteIsTriggered = false; - /* * Local state for XLogInsertAllowed(): * 1: unconditionally allowed to insert XLOG @@ -243,87 +229,6 @@ static bool LocalPromoteIsTriggered = false; */ static int LocalXLogInsertAllowed = -1; -/* - * When ArchiveRecoveryRequested is set, archive recovery was requested, - * ie. signal files were present. When InArchiveRecovery is set, we are - * currently recovering using offline XLOG archives. These variables are only - * valid in the startup process. 
- * - * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're - * currently performing crash recovery using only XLOG files in pg_wal, but - * will switch to using offline XLOG archives as soon as we reach the end of - * WAL in pg_wal. -*/ -bool ArchiveRecoveryRequested = false; -bool InArchiveRecovery = false; - -static bool standby_signal_file_found = false; -static bool recovery_signal_file_found = false; - -/* Buffers dedicated to consistency checks of size BLCKSZ */ -static char *replay_image_masked = NULL; -static char *primary_image_masked = NULL; - -/* options formerly taken from recovery.conf for archive recovery */ -char *recoveryRestoreCommand = NULL; -char *recoveryEndCommand = NULL; -char *archiveCleanupCommand = NULL; -RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET; -bool recoveryTargetInclusive = true; -int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE; -TransactionId recoveryTargetXid; -char *recovery_target_time_string; -static TimestampTz recoveryTargetTime; -const char *recoveryTargetName; -XLogRecPtr recoveryTargetLSN; -int recovery_min_apply_delay = 0; - -/* options formerly taken from recovery.conf for XLOG streaming */ -bool StandbyModeRequested = false; -char *PrimaryConnInfo = NULL; -char *PrimarySlotName = NULL; -char *PromoteTriggerFile = NULL; -bool wal_receiver_create_temp_slot = false; - -/* are we currently in standby mode? 
*/ -bool StandbyMode = false; - -/* - * if recoveryStopsBefore/After returns true, it saves information of the stop - * point here - */ -static TransactionId recoveryStopXid; -static TimestampTz recoveryStopTime; -static XLogRecPtr recoveryStopLSN; -static char recoveryStopName[MAXFNAMELEN]; -static bool recoveryStopAfter; - -/* - * recoveryTargetTimeLineGoal: what the user requested, if any - * - * recoveryTargetTLIRequested: numeric value of requested timeline, if constant - * - * recoveryTargetTLI: the currently understood target timeline; changes - * - * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of - * its known parents, newest first (so recoveryTargetTLI is always the - * first list member). Only these TLIs are expected to be seen in the WAL - * segments we read, and indeed only these TLIs will be considered as - * candidate WAL files to open at all. - * - * curFileTLI: the TLI appearing in the name of the current input WAL file. - * (This is not necessarily the same as the timeline from which we are - * replaying WAL, which StartupXLOG calls replayTLI, because we could be - * scanning data that was copied from an ancestor timeline when the current - * file was created.) During a sequential scan we do not allow this value - * to decrease. - */ -RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST; -TimeLineID recoveryTargetTLIRequested = 0; -TimeLineID recoveryTargetTLI = 0; -static List *expectedTLEs; -static TimeLineID curFileTLI; - /* * ProcLastRecPtr points to the start of the last XLOG record inserted by the * current backend. It is updated for all inserts. XactLastRecEnd points to @@ -374,21 +279,6 @@ static XLogRecPtr RedoRecPtr; */ static bool doPageWrites; -/* Has the recovery code requested a walreceiver wakeup? 
*/ -static bool doRequestWalReceiverReply; - -/* - * RedoStartLSN points to the checkpoint's REDO location which is specified - * in a backup label file, backup history file or control file. In standby - * mode, XLOG streaming usually starts from the position where an invalid - * record was found. But if we fail to read even the initial checkpoint - * record, we use the REDO location instead of the checkpoint location as - * the start position of XLOG streaming. Otherwise we would have to jump - * backwards to the REDO location after reading the checkpoint record, - * because the REDO record can precede the checkpoint record. - */ -static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr; - /*---------- * Shared-memory data structures for XLOG control * @@ -650,12 +540,6 @@ typedef struct XLogCtlData */ RecoveryState SharedRecoveryState; - /* - * SharedHotStandbyActive indicates if we allow hot standby queries to be - * run. Protected by info_lck. - */ - bool SharedHotStandbyActive; - /* * InstallXLogFileSegmentActive indicates whether the checkpointer should * arrange for future segments by recycling and/or PreallocXlogFiles(). @@ -666,12 +550,6 @@ typedef struct XLogCtlData */ bool InstallXLogFileSegmentActive; - /* - * SharedPromoteIsTriggered indicates if a standby promotion has been - * triggered. Protected by info_lck. - */ - bool SharedPromoteIsTriggered; - /* * WalWriterSleeping indicates whether the WAL writer is currently in * low-power mode (and hence should be nudged if an async commit occurs). @@ -679,23 +557,6 @@ typedef struct XLogCtlData */ bool WalWriterSleeping; - /* - * recoveryWakeupLatch is used to wake up the startup process to continue - * WAL replay, if it is waiting for WAL to arrive or failover trigger file - * to appear. - * - * Note that the startup process also uses another latch, its procLatch, - * to wait for recovery conflict. 
If we get rid of recoveryWakeupLatch for - * signaling the startup process in favor of using its procLatch, which - * comports better with possible generic signal handlers using that latch. - * But we should not do that because the startup process doesn't assume - * that it's waken up by walreceiver process or SIGHUP signal handler - * while it's waiting for recovery conflict. The separate latches, - * recoveryWakeupLatch and procLatch, should be used for inter-process - * communication for WAL replay and recovery conflict, respectively. - */ - Latch recoveryWakeupLatch; - /* * During recovery, we keep a copy of the latest checkpoint record here. * lastCheckPointRecPtr points to start of checkpoint record and @@ -708,28 +569,6 @@ typedef struct XLogCtlData XLogRecPtr lastCheckPointEndPtr; CheckPoint lastCheckPoint; - /* - * lastReplayedEndRecPtr points to end+1 of the last record successfully - * replayed. When we're currently replaying a record, ie. in a redo - * function, replayEndRecPtr points to the end+1 of the record being - * replayed, otherwise it's equal to lastReplayedEndRecPtr. - */ - XLogRecPtr lastReplayedEndRecPtr; - TimeLineID lastReplayedTLI; - XLogRecPtr replayEndRecPtr; - TimeLineID replayEndTLI; - /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */ - TimestampTz recoveryLastXTime; - - /* - * timestamp of when we started replaying the current chunk of WAL data, - * only relevant for replication or archive recovery - */ - TimestampTz currentChunkStartTime; - /* Recovery pause state */ - RecoveryPauseState recoveryPauseState; - ConditionVariable recoveryNotPausedCV; - /* * lastFpwDisableRecPtr points to the start of the last replayed * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled. @@ -787,21 +626,6 @@ static int UsableBytesInSegment; */ static XLogwrtResult LogwrtResult = {0, 0}; -/* - * Codes indicating where we got a WAL file from during recovery, or where - * to attempt to get one. 
- */ -typedef enum -{ - XLOG_FROM_ANY = 0, /* request to read WAL from any source */ - XLOG_FROM_ARCHIVE, /* restored using restore_command */ - XLOG_FROM_PG_WAL, /* existing file in pg_wal */ - XLOG_FROM_STREAM /* streamed from primary */ -} XLogSource; - -/* human-readable names for XLogSources, for debugging output */ -static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"}; - /* * openLogFile is -1 or a kernel FD for an open log file segment. * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI. @@ -814,74 +638,17 @@ static int openLogFile = -1; static XLogSegNo openLogSegNo = 0; static TimeLineID openLogTLI = 0; -/* - * These variables are used similarly to the ones above, but for reading - * the XLOG. readOff is the offset of the page just read, readLen - * indicates how much of it has been read into readBuf, and readSource - * indicates where we got the currently open file from. - * Note: we could use Reserve/ReleaseExternalFD to track consumption of - * this FD too; but it doesn't currently seem worthwhile, since the XLOG is - * not read by general-purpose sessions. - */ -static int readFile = -1; -static XLogSegNo readSegNo = 0; -static uint32 readOff = 0; -static uint32 readLen = 0; -static XLogSource readSource = XLOG_FROM_ANY; - -/* - * Keeps track of which source we're currently reading from. This is - * different from readSource in that this is always set, even when we don't - * currently have a WAL file open. If lastSourceFailed is set, our last - * attempt to read from currentSource failed, and we should try another source - * next. - * - * pendingWalRcvRestart is set when a config change occurs that requires a - * walreceiver restart. This is only valid in XLOG_FROM_STREAM state. 
- */ -static XLogSource currentSource = XLOG_FROM_ANY; -static bool lastSourceFailed = false; -static bool pendingWalRcvRestart = false; - -typedef struct XLogPageReadPrivate -{ - int emode; - bool fetching_ckpt; /* are we fetching a checkpoint record? */ - bool randAccess; - TimeLineID replayTLI; -} XLogPageReadPrivate; - -/* - * These variables track when we last obtained some WAL data to process, - * and where we got it from. (XLogReceiptSource is initially the same as - * readSource, but readSource gets reset to zero when we don't have data - * to process right now. It is also different from currentSource, which - * also changes when we try to read from a source and fail, while - * XLogReceiptSource tracks where we last successfully read some WAL.) - */ -static TimestampTz XLogReceiptTime = 0; -static XLogSource XLogReceiptSource = XLOG_FROM_ANY; - /* * Local copies of equivalent fields in the control file. When running - * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we + * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we * expect to replay all the WAL available, and updateMinRecoveryPoint is * switched to false to prevent any updates while replaying records. * Those values are kept consistent as long as crash recovery runs. */ -static XLogRecPtr minRecoveryPoint; -static TimeLineID minRecoveryPointTLI; +static XLogRecPtr LocalMinRecoveryPoint; +static TimeLineID LocalMinRecoveryPointTLI; static bool updateMinRecoveryPoint = true; -/* - * Have we reached a consistent database state? In crash recovery, we have - * to replay all the WAL, so reachedConsistency is never set. During archive - * recovery, the database is consistent once minRecoveryPoint is reached. 
- */ -bool reachedConsistency = false; - -static bool InRedo = false; - /* For WALInsertLockAcquire/Release functions */ static int MyLockNo = 0; static bool holdingAllLocks = false; @@ -890,27 +657,11 @@ static bool holdingAllLocks = false; static MemoryContext walDebugCxt = NULL; #endif -static void readRecoverySignalFile(void); -static void validateRecoveryParameters(void); -static void XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, - TimeLineID newTLI); static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog, TimeLineID newTLI); -static bool recoveryStopsBefore(XLogReaderState *record); -static bool recoveryStopsAfter(XLogReaderState *record); -static char *getRecoveryStopReason(void); -static void ConfirmRecoveryPaused(void); -static void recoveryPausesHere(bool endOfRecovery); -static bool recoveryApplyDelay(XLogReaderState *record); -static void SetLatestXTime(TimestampTz xtime); -static void SetCurrentChunkStartTime(TimestampTz xtime); static void CheckRequiredParameterValues(void); static void XLogReportParameters(void); -static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, - TimeLineID prevTLI, TimeLineID replayTLI); -static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, - XLogReaderState *state); static int LocalSetXLogInsertAllowed(void); static void CreateEndOfRecoveryRecord(void); static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, @@ -922,22 +673,10 @@ static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic); -static bool XLogCheckpointNeeded(XLogSegNo new_segno); static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible); static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, bool find_free, XLogSegNo max_segno, TimeLineID tli); -static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, - XLogSource source, bool notfoundOk); -static 
int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source); -static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, - int reqLen, XLogRecPtr targetRecPtr, char *readBuf); -static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, - bool fetching_ckpt, XLogRecPtr tliRecPtr, - TimeLineID replayTLI, - XLogRecPtr replayLSN); -static void XLogShutdownWalRcv(void); -static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); static void XLogFileClose(void); static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli); static void RemoveTempXlogFiles(void); @@ -949,36 +688,16 @@ static void UpdateLastRemovedPtr(char *filename); static void ValidateXLOGDirectoryStructure(void); static void CleanupBackupHistory(void); static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force); -static XLogRecord *ReadRecord(XLogReaderState *xlogreader, - int emode, bool fetching_ckpt, - TimeLineID replayTLI); -static void CheckRecoveryConsistency(void); static bool PerformRecoveryXLogAction(void); -static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader, - XLogRecPtr RecPtr, int whichChkpt, bool report, - TimeLineID replayTLI); -static bool rescanLatestTimeLine(TimeLineID replayTLI, - XLogRecPtr replayLSN); static void InitControlFile(uint64 sysidentifier); static void WriteControlFile(void); static void ReadControlFile(void); +static void UpdateControlFile(void); static char *str_time(pg_time_t tnow); -static void SetPromoteIsTriggered(void); -static bool CheckForStandbyTrigger(void); -#ifdef WAL_DEBUG -static void xlog_outrec(StringInfo buf, XLogReaderState *record); -#endif -static void xlog_block_info(StringInfo buf, XLogReaderState *record); -static void xlog_outdesc(StringInfo buf, XLogReaderState *record); static void pg_start_backup_callback(int code, Datum arg); static void pg_stop_backup_callback(int code, Datum arg); -static bool read_backup_label(XLogRecPtr *checkPointLoc, - TimeLineID 
*backupLabelTLI, - bool *backupEndRequired, bool *backupFromStandby); -static bool read_tablespace_map(List **tablespaces); -static void rm_redo_error_callback(void *arg); static int get_sync_bit(int method); static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch, @@ -994,7 +713,6 @@ static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli); static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos); static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr); -static void checkXLogConsistency(XLogReaderState *record); static void WALInsertLockAcquire(void); static void WALInsertLockAcquireExclusive(void); @@ -1442,114 +1160,6 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) return true; } -/* - * Checks whether the current buffer page and backup page stored in the - * WAL record are consistent or not. Before comparing the two pages, a - * masking can be applied to the pages to ignore certain areas like hint bits, - * unused space between pd_lower and pd_upper among other things. This - * function should be called once WAL replay has been completed for a - * given record. - */ -static void -checkXLogConsistency(XLogReaderState *record) -{ - RmgrId rmid = XLogRecGetRmid(record); - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blkno; - int block_id; - - /* Records with no backup blocks have no need for consistency checks. */ - if (!XLogRecHasAnyBlockRefs(record)) - return; - - Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0); - - for (block_id = 0; block_id <= record->max_block_id; block_id++) - { - Buffer buf; - Page page; - - if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) - { - /* - * WAL record doesn't contain a block reference with the given id. - * Do nothing. 
- */ - continue; - } - - Assert(XLogRecHasBlockImage(record, block_id)); - - if (XLogRecBlockImageApply(record, block_id)) - { - /* - * WAL record has already applied the page, so bypass the - * consistency check as that would result in comparing the full - * page stored in the record with itself. - */ - continue; - } - - /* - * Read the contents from the current buffer and store it in a - * temporary page. - */ - buf = XLogReadBufferExtended(rnode, forknum, blkno, - RBM_NORMAL_NO_LOG); - if (!BufferIsValid(buf)) - continue; - - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - page = BufferGetPage(buf); - - /* - * Take a copy of the local page where WAL has been applied to have a - * comparison base before masking it... - */ - memcpy(replay_image_masked, page, BLCKSZ); - - /* No need for this page anymore now that a copy is in. */ - UnlockReleaseBuffer(buf); - - /* - * If the block LSN is already ahead of this WAL record, we can't - * expect contents to match. This can happen if recovery is - * restarted. - */ - if (PageGetLSN(replay_image_masked) > record->EndRecPtr) - continue; - - /* - * Read the contents from the backup copy, stored in WAL record and - * store it in a temporary page. There is no need to allocate a new - * page here, a local buffer is fine to hold its contents and a mask - * can be directly applied on it. - */ - if (!RestoreBlockImage(record, block_id, primary_image_masked)) - elog(ERROR, "failed to restore block image"); - - /* - * If masking function is defined, mask both the primary and replay - * images - */ - if (RmgrTable[rmid].rm_mask != NULL) - { - RmgrTable[rmid].rm_mask(replay_image_masked, blkno); - RmgrTable[rmid].rm_mask(primary_image_masked, blkno); - } - - /* Time to compare the primary and replay images. 
*/ - if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0) - { - elog(FATAL, - "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", - rnode.spcNode, rnode.dbNode, rnode.relNode, - forknum, blkno); - } - } -} - /* * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved * area in the WAL. @@ -2435,7 +2045,7 @@ XLOGfileslop(XLogRecPtr lastredoptr) * * Note: it is caller's responsibility that RedoRecPtr is up-to-date. */ -static bool +bool XLogCheckpointNeeded(XLogSegNo new_segno) { XLogSegNo old_segno; @@ -2829,7 +2439,7 @@ static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) { /* Quick check using our local copy of the variable */ - if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint)) + if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint)) return; /* @@ -2843,7 +2453,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) * available is replayed in this case. This also saves from extra locks * taken on the control file from the startup process. */ - if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery) + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery) { updateMinRecoveryPoint = false; return; @@ -2852,12 +2462,12 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); /* update local copy */ - minRecoveryPoint = ControlFile->minRecoveryPoint; - minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; - if (XLogRecPtrIsInvalid(minRecoveryPoint)) + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint)) updateMinRecoveryPoint = false; - else if (force || minRecoveryPoint < lsn) + else if (force || LocalMinRecoveryPoint < lsn) { XLogRecPtr newMinRecoveryPoint; TimeLineID newMinRecoveryPointTLI; @@ -2875,11 +2485,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) * all. 
Instead, we just log a warning and continue with recovery. * (See also the comments about corrupt LSNs in XLogFlush.) */ - SpinLockAcquire(&XLogCtl->info_lck); - newMinRecoveryPoint = XLogCtl->replayEndRecPtr; - newMinRecoveryPointTLI = XLogCtl->replayEndTLI; - SpinLockRelease(&XLogCtl->info_lck); - + newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI); if (!force && newMinRecoveryPoint < lsn) elog(WARNING, "xlog min recovery request %X/%X is past current point %X/%X", @@ -2891,12 +2497,12 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) ControlFile->minRecoveryPoint = newMinRecoveryPoint; ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI; UpdateControlFile(); - minRecoveryPoint = newMinRecoveryPoint; - minRecoveryPointTLI = newMinRecoveryPointTLI; + LocalMinRecoveryPoint = newMinRecoveryPoint; + LocalMinRecoveryPointTLI = newMinRecoveryPointTLI; ereport(DEBUG2, (errmsg_internal("updated min recovery point to %X/%X on timeline %u", - LSN_FORMAT_ARGS(minRecoveryPoint), + LSN_FORMAT_ARGS(newMinRecoveryPoint), newMinRecoveryPointTLI))); } } @@ -3256,11 +2862,11 @@ XLogNeedsFlush(XLogRecPtr record) * which cannot update its local copy of minRecoveryPoint as long as * it has not replayed all WAL available when doing crash recovery. 
*/ - if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery) + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery) updateMinRecoveryPoint = false; /* Quick exit if already known to be updated or cannot be updated */ - if (record <= minRecoveryPoint || !updateMinRecoveryPoint) + if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint) return false; /* @@ -3269,8 +2875,8 @@ XLogNeedsFlush(XLogRecPtr record) */ if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED)) return true; - minRecoveryPoint = ControlFile->minRecoveryPoint; - minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; LWLockRelease(ControlFileLock); /* @@ -3278,11 +2884,11 @@ XLogNeedsFlush(XLogRecPtr record) * process doing crash recovery, which should not update the control * file value if crash recovery is still running. */ - if (XLogRecPtrIsInvalid(minRecoveryPoint)) + if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint)) updateMinRecoveryPoint = false; /* check again */ - if (record <= minRecoveryPoint || !updateMinRecoveryPoint) + if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint) return false; else return true; @@ -3763,192 +3369,6 @@ XLogFileOpen(XLogSegNo segno, TimeLineID tli) return fd; } -/* - * Open a logfile segment for reading (during recovery). - * - * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive. - * Otherwise, it's assumed to be already available in pg_wal. 
- */ -static int -XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, - XLogSource source, bool notfoundOk) -{ - char xlogfname[MAXFNAMELEN]; - char activitymsg[MAXFNAMELEN + 16]; - char path[MAXPGPATH]; - int fd; - - XLogFileName(xlogfname, tli, segno, wal_segment_size); - - switch (source) - { - case XLOG_FROM_ARCHIVE: - /* Report recovery progress in PS display */ - snprintf(activitymsg, sizeof(activitymsg), "waiting for %s", - xlogfname); - set_ps_display(activitymsg); - - if (!RestoreArchivedFile(path, xlogfname, - "RECOVERYXLOG", - wal_segment_size, - InRedo)) - return -1; - break; - - case XLOG_FROM_PG_WAL: - case XLOG_FROM_STREAM: - XLogFilePath(path, tli, segno, wal_segment_size); - break; - - default: - elog(ERROR, "invalid XLogFileRead source %d", source); - } - - /* - * If the segment was fetched from archival storage, replace the existing - * xlog segment (if any) with the archival version. - */ - if (source == XLOG_FROM_ARCHIVE) - { - Assert(!XLogCtl->InstallXLogFileSegmentActive); - KeepFileRestoredFromArchive(path, xlogfname); - - /* - * Set path to point at the new file in pg_wal. - */ - snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname); - } - - fd = BasicOpenFile(path, O_RDONLY | PG_BINARY); - if (fd >= 0) - { - /* Success! */ - curFileTLI = tli; - - /* Report recovery progress in PS display */ - snprintf(activitymsg, sizeof(activitymsg), "recovering %s", - xlogfname); - set_ps_display(activitymsg); - - /* Track source of data in assorted state variables */ - readSource = source; - XLogReceiptSource = source; - /* In FROM_STREAM case, caller tracks receipt time, not me */ - if (source != XLOG_FROM_STREAM) - XLogReceiptTime = GetCurrentTimestamp(); - - return fd; - } - if (errno != ENOENT || !notfoundOk) /* unexpected failure? */ - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", path))); - return -1; -} - -/* - * Open a logfile segment for reading (during recovery). 
- * - * This version searches for the segment with any TLI listed in expectedTLEs. - */ -static int -XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source) -{ - char path[MAXPGPATH]; - ListCell *cell; - int fd; - List *tles; - - /* - * Loop looking for a suitable timeline ID: we might need to read any of - * the timelines listed in expectedTLEs. - * - * We expect curFileTLI on entry to be the TLI of the preceding file in - * sequence, or 0 if there was no predecessor. We do not allow curFileTLI - * to go backwards; this prevents us from picking up the wrong file when a - * parent timeline extends to higher segment numbers than the child we - * want to read. - * - * If we haven't read the timeline history file yet, read it now, so that - * we know which TLIs to scan. We don't save the list in expectedTLEs, - * however, unless we actually find a valid segment. That way if there is - * neither a timeline history file nor a WAL segment in the archive, and - * streaming replication is set up, we'll read the timeline history file - * streamed from the primary when we start streaming, instead of - * recovering with a dummy history generated here. - */ - if (expectedTLEs) - tles = expectedTLEs; - else - tles = readTimeLineHistory(recoveryTargetTLI); - - foreach(cell, tles) - { - TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell); - TimeLineID tli = hent->tli; - - if (tli < curFileTLI) - break; /* don't bother looking at too-old TLIs */ - - /* - * Skip scanning the timeline ID that the logfile segment to read - * doesn't belong to - */ - if (hent->begin != InvalidXLogRecPtr) - { - XLogSegNo beginseg = 0; - - XLByteToSeg(hent->begin, beginseg, wal_segment_size); - - /* - * The logfile segment that doesn't belong to the timeline is - * older or newer than the segment that the timeline started or - * ended at, respectively. It's sufficient to check only the - * starting segment of the timeline here. 
Since the timelines are - * scanned in descending order in this loop, any segments newer - * than the ending segment should belong to newer timeline and - * have already been read before. So it's not necessary to check - * the ending segment of the timeline here. - */ - if (segno < beginseg) - continue; - } - - if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE) - { - fd = XLogFileRead(segno, emode, tli, - XLOG_FROM_ARCHIVE, true); - if (fd != -1) - { - elog(DEBUG1, "got WAL segment from archive"); - if (!expectedTLEs) - expectedTLEs = tles; - return fd; - } - } - - if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL) - { - fd = XLogFileRead(segno, emode, tli, - XLOG_FROM_PG_WAL, true); - if (fd != -1) - { - if (!expectedTLEs) - expectedTLEs = tles; - return fd; - } - } - } - - /* Couldn't find it. For simplicity, complain about front timeline */ - XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size); - errno = ENOENT; - ereport(emode, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", path))); - return -1; -} - /* * Close the current logfile segment for writing. */ @@ -4216,7 +3636,7 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr, * 'switchpoint' is the current point in WAL where we switch to new timeline, * and 'newTLI' is the new timeline we switch to. */ -static void +void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI) { DIR *xldir; @@ -4441,261 +3861,6 @@ CleanupBackupHistory(void) FreeDir(xldir); } -/* - * Attempt to read the next XLOG record. - * - * Before first call, the reader needs to be positioned to the first record - * by calling XLogBeginRead(). - * - * If no valid record is available, returns NULL, or fails if emode is PANIC. - * (emode must be either PANIC, LOG). In standby mode, retries until a valid - * record is available. 
- */ -static XLogRecord * -ReadRecord(XLogReaderState *xlogreader, int emode, - bool fetching_ckpt, TimeLineID replayTLI) -{ - XLogRecord *record; - XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; - - /* Pass through parameters to XLogPageRead */ - private->fetching_ckpt = fetching_ckpt; - private->emode = emode; - private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr); - private->replayTLI = replayTLI; - - /* This is the first attempt to read this page. */ - lastSourceFailed = false; - - for (;;) - { - char *errormsg; - - record = XLogReadRecord(xlogreader, &errormsg); - if (record == NULL) - { - /* - * When not in standby mode we find that WAL ends in an incomplete - * record, keep track of that record. After recovery is done, - * we'll write a record to indicate downstream WAL readers that - * that portion is to be ignored. - */ - if (!StandbyMode && - !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr)) - { - abortedRecPtr = xlogreader->abortedRecPtr; - missingContrecPtr = xlogreader->missingContrecPtr; - } - - if (readFile >= 0) - { - close(readFile); - readFile = -1; - } - - /* - * We only end up here without a message when XLogPageRead() - * failed - in that case we already logged something. In - * StandbyMode that only happens if we have been triggered, so we - * shouldn't loop anymore in that case. - */ - if (errormsg) - ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), - (errmsg_internal("%s", errormsg) /* already translated */ )); - } - - /* - * Check page TLI is one of the expected values. 
- */ - else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs)) - { - char fname[MAXFNAMELEN]; - XLogSegNo segno; - int32 offset; - - XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size); - offset = XLogSegmentOffset(xlogreader->latestPagePtr, - wal_segment_size); - XLogFileName(fname, xlogreader->seg.ws_tli, segno, - wal_segment_size); - ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), - (errmsg("unexpected timeline ID %u in log segment %s, offset %u", - xlogreader->latestPageTLI, - fname, - offset))); - record = NULL; - } - - if (record) - { - /* Great, got a record */ - return record; - } - else - { - /* No valid record available from this source */ - lastSourceFailed = true; - - /* - * If archive recovery was requested, but we were still doing - * crash recovery, switch to archive recovery and retry using the - * offline archive. We have now replayed all the valid WAL in - * pg_wal, so we are presumably now consistent. - * - * We require that there's at least some valid WAL present in - * pg_wal, however (!fetching_ckpt). We could recover using the - * WAL from the archive, even if pg_wal is completely empty, but - * we'd have no idea how far we'd have to replay to reach - * consistency. So err on the safe side and give up. 
- */ - if (!InArchiveRecovery && ArchiveRecoveryRequested && - !fetching_ckpt) - { - ereport(DEBUG1, - (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery"))); - InArchiveRecovery = true; - if (StandbyModeRequested) - StandbyMode = true; - - /* initialize minRecoveryPoint to this record */ - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - ControlFile->state = DB_IN_ARCHIVE_RECOVERY; - if (ControlFile->minRecoveryPoint < xlogreader->EndRecPtr) - { - ControlFile->minRecoveryPoint = xlogreader->EndRecPtr; - ControlFile->minRecoveryPointTLI = replayTLI; - } - /* update local copy */ - minRecoveryPoint = ControlFile->minRecoveryPoint; - minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; - - /* - * The startup process can update its local copy of - * minRecoveryPoint from this point. - */ - updateMinRecoveryPoint = true; - - UpdateControlFile(); - - /* - * We update SharedRecoveryState while holding the lock on - * ControlFileLock so both states are consistent in shared - * memory. - */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE; - SpinLockRelease(&XLogCtl->info_lck); - - LWLockRelease(ControlFileLock); - - CheckRecoveryConsistency(); - - /* - * Before we retry, reset lastSourceFailed and currentSource - * so that we will check the archive next. - */ - lastSourceFailed = false; - currentSource = XLOG_FROM_ANY; - - continue; - } - - /* In standby mode, loop back to retry. Otherwise, give up. */ - if (StandbyMode && !CheckForStandbyTrigger()) - continue; - else - return NULL; - } - } -} - -/* - * Scan for new timelines that might have appeared in the archive since we - * started recovery. - * - * If there are any, the function changes recovery target TLI to the latest - * one and returns 'true'. 
- */ -static bool -rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN) -{ - List *newExpectedTLEs; - bool found; - ListCell *cell; - TimeLineID newtarget; - TimeLineID oldtarget = recoveryTargetTLI; - TimeLineHistoryEntry *currentTle = NULL; - - newtarget = findNewestTimeLine(recoveryTargetTLI); - if (newtarget == recoveryTargetTLI) - { - /* No new timelines found */ - return false; - } - - /* - * Determine the list of expected TLIs for the new TLI - */ - - newExpectedTLEs = readTimeLineHistory(newtarget); - - /* - * If the current timeline is not part of the history of the new timeline, - * we cannot proceed to it. - */ - found = false; - foreach(cell, newExpectedTLEs) - { - currentTle = (TimeLineHistoryEntry *) lfirst(cell); - - if (currentTle->tli == recoveryTargetTLI) - { - found = true; - break; - } - } - if (!found) - { - ereport(LOG, - (errmsg("new timeline %u is not a child of database system timeline %u", - newtarget, - replayTLI))); - return false; - } - - /* - * The current timeline was found in the history file, but check that the - * next timeline was forked off from it *after* the current recovery - * location. - */ - if (currentTle->end < replayLSN) - { - ereport(LOG, - (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X", - newtarget, - replayTLI, - LSN_FORMAT_ARGS(replayLSN)))); - return false; - } - - /* The new timeline history seems valid. Switch target */ - recoveryTargetTLI = newtarget; - list_free_deep(expectedTLEs); - expectedTLEs = newExpectedTLEs; - - /* - * As in StartupXLOG(), try to ensure we have all the history files - * between the old target and new target in pg_wal. - */ - restoreTimeLineHistoryFiles(oldtarget + 1, newtarget); - - ereport(LOG, - (errmsg("new target timeline is %u", - recoveryTargetTLI))); - - return true; -} - /* * I/O routines for pg_control * @@ -5038,7 +4203,7 @@ ReadControlFile(void) * Utility wrapper to update the control file. 
Note that the control * file gets flushed. */ -void +static void UpdateControlFile(void) { update_controlfile(DataDir, ControlFile, true); @@ -5316,16 +4481,12 @@ XLOGShmemInit(void) */ XLogCtl->XLogCacheBlck = XLOGbuffers - 1; XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH; - XLogCtl->SharedHotStandbyActive = false; XLogCtl->InstallXLogFileSegmentActive = false; - XLogCtl->SharedPromoteIsTriggered = false; XLogCtl->WalWriterSleeping = false; SpinLockInit(&XLogCtl->Insert.insertpos_lck); SpinLockInit(&XLogCtl->info_lck); SpinLockInit(&XLogCtl->ulsn_lck); - InitSharedLatch(&XLogCtl->recoveryWakeupLatch); - ConditionVariableInit(&XLogCtl->recoveryNotPausedCV); } /* @@ -5511,175 +4672,6 @@ str_time(pg_time_t tnow) return buf; } -/* - * See if there are any recovery signal files and if so, set state for - * recovery. - * - * See if there is a recovery command file (recovery.conf), and if so - * throw an ERROR since as of PG12 we no longer recognize that. - */ -static void -readRecoverySignalFile(void) -{ - struct stat stat_buf; - - if (IsBootstrapProcessingMode()) - return; - - /* - * Check for old recovery API file: recovery.conf - */ - if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("using recovery command file \"%s\" is not supported", - RECOVERY_COMMAND_FILE))); - - /* - * Remove unused .done file, if present. Ignore if absent. - */ - unlink(RECOVERY_COMMAND_DONE); - - /* - * Check for recovery signal files and if found, fsync them since they - * represent server state information. We don't sweat too much about the - * possibility of fsync failure, however. - * - * If present, standby signal file takes precedence. If neither is present - * then we won't enter archive recovery. 
- */ - if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0) - { - int fd; - - fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY, - S_IRUSR | S_IWUSR); - if (fd >= 0) - { - (void) pg_fsync(fd); - close(fd); - } - standby_signal_file_found = true; - } - else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0) - { - int fd; - - fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY, - S_IRUSR | S_IWUSR); - if (fd >= 0) - { - (void) pg_fsync(fd); - close(fd); - } - recovery_signal_file_found = true; - } - - StandbyModeRequested = false; - ArchiveRecoveryRequested = false; - if (standby_signal_file_found) - { - StandbyModeRequested = true; - ArchiveRecoveryRequested = true; - } - else if (recovery_signal_file_found) - { - StandbyModeRequested = false; - ArchiveRecoveryRequested = true; - } - else - return; - - /* - * We don't support standby mode in standalone backends; that requires - * other processes such as the WAL receiver to be alive. - */ - if (StandbyModeRequested && !IsUnderPostmaster) - ereport(FATAL, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("standby mode is not supported by single-user servers"))); -} - -static void -validateRecoveryParameters(void) -{ - if (!ArchiveRecoveryRequested) - return; - - /* - * Check for compulsory parameters - */ - if (StandbyModeRequested) - { - if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) && - (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0)) - ereport(WARNING, - (errmsg("specified neither primary_conninfo nor restore_command"), - errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there."))); - } - else - { - if (recoveryRestoreCommand == NULL || - strcmp(recoveryRestoreCommand, "") == 0) - ereport(FATAL, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("must specify restore_command when standby mode is not enabled"))); - } - - /* - * Override any inconsistent requests. 
Note that this is a change of - * behaviour in 9.5; prior to this we simply ignored a request to pause if - * hot_standby = off, which was surprising behaviour. - */ - if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE && - !EnableHotStandby) - recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN; - - /* - * Final parsing of recovery_target_time string; see also - * check_recovery_target_time(). - */ - if (recoveryTarget == RECOVERY_TARGET_TIME) - { - recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, - CStringGetDatum(recovery_target_time_string), - ObjectIdGetDatum(InvalidOid), - Int32GetDatum(-1))); - } - - /* - * If user specified recovery_target_timeline, validate it or compute the - * "latest" value. We can't do this until after we've gotten the restore - * command and set InArchiveRecovery, because we need to fetch timeline - * history files from the archive. - */ - if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC) - { - TimeLineID rtli = recoveryTargetTLIRequested; - - /* Timeline 1 does not have a history file, all else should */ - if (rtli != 1 && !existsTimeLineHistory(rtli)) - ereport(FATAL, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("recovery target timeline %u does not exist", - rtli))); - recoveryTargetTLI = rtli; - } - else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) - { - /* We start the "latest" search from pg_control's timeline */ - recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI); - } - else - { - /* - * else we just use the recoveryTargetTLI as already read from - * ControlFile - */ - Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE); - } -} - /* * Initialize the first WAL segment on new timeline. */ @@ -5840,752 +4832,6 @@ CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog, } } -/* - * Extract timestamp from WAL record. 
- * - * If the record contains a timestamp, returns true, and saves the timestamp - * in *recordXtime. If the record type has no timestamp, returns false. - * Currently, only transaction commit/abort records and restore points contain - * timestamps. - */ -static bool -getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime) -{ - uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - uint8 xact_info = info & XLOG_XACT_OPMASK; - uint8 rmid = XLogRecGetRmid(record); - - if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) - { - *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time; - return true; - } - if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT || - xact_info == XLOG_XACT_COMMIT_PREPARED)) - { - *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time; - return true; - } - if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT || - xact_info == XLOG_XACT_ABORT_PREPARED)) - { - *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time; - return true; - } - return false; -} - -/* - * For point-in-time recovery, this function decides whether we want to - * stop applying the XLOG before the current record. - * - * Returns true if we are stopping, false otherwise. If stopping, some - * information is saved in recoveryStopXid et al for use in annotating the - * new timeline's history file. - */ -static bool -recoveryStopsBefore(XLogReaderState *record) -{ - bool stopsHere = false; - uint8 xact_info; - bool isCommit; - TimestampTz recordXtime = 0; - TransactionId recordXid; - - /* - * Ignore recovery target settings when not in archive recovery (meaning - * we are in crash recovery). 
- */ - if (!ArchiveRecoveryRequested) - return false; - - /* Check if we should stop as soon as reaching consistency */ - if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) - { - ereport(LOG, - (errmsg("recovery stopping after reaching consistency"))); - - recoveryStopAfter = false; - recoveryStopXid = InvalidTransactionId; - recoveryStopLSN = InvalidXLogRecPtr; - recoveryStopTime = 0; - recoveryStopName[0] = '\0'; - return true; - } - - /* Check if target LSN has been reached */ - if (recoveryTarget == RECOVERY_TARGET_LSN && - !recoveryTargetInclusive && - record->ReadRecPtr >= recoveryTargetLSN) - { - recoveryStopAfter = false; - recoveryStopXid = InvalidTransactionId; - recoveryStopLSN = record->ReadRecPtr; - recoveryStopTime = 0; - recoveryStopName[0] = '\0'; - ereport(LOG, - (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryStopLSN)))); - return true; - } - - /* Otherwise we only consider stopping before COMMIT or ABORT records. 
*/ - if (XLogRecGetRmid(record) != RM_XACT_ID) - return false; - - xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; - - if (xact_info == XLOG_XACT_COMMIT) - { - isCommit = true; - recordXid = XLogRecGetXid(record); - } - else if (xact_info == XLOG_XACT_COMMIT_PREPARED) - { - xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); - xl_xact_parsed_commit parsed; - - isCommit = true; - ParseCommitRecord(XLogRecGetInfo(record), - xlrec, - &parsed); - recordXid = parsed.twophase_xid; - } - else if (xact_info == XLOG_XACT_ABORT) - { - isCommit = false; - recordXid = XLogRecGetXid(record); - } - else if (xact_info == XLOG_XACT_ABORT_PREPARED) - { - xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); - xl_xact_parsed_abort parsed; - - isCommit = false; - ParseAbortRecord(XLogRecGetInfo(record), - xlrec, - &parsed); - recordXid = parsed.twophase_xid; - } - else - return false; - - if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive) - { - /* - * There can be only one transaction end record with this exact - * transactionid - * - * when testing for an xid, we MUST test for equality only, since - * transactions are numbered in the order they start, not the order - * they complete. A higher numbered xid will complete before you about - * 50% of the time... 
- */ - stopsHere = (recordXid == recoveryTargetXid); - } - - if (recoveryTarget == RECOVERY_TARGET_TIME && - getRecordTimestamp(record, &recordXtime)) - { - /* - * There can be many transactions that share the same commit time, so - * we stop after the last one, if we are inclusive, or stop at the - * first one if we are exclusive - */ - if (recoveryTargetInclusive) - stopsHere = (recordXtime > recoveryTargetTime); - else - stopsHere = (recordXtime >= recoveryTargetTime); - } - - if (stopsHere) - { - recoveryStopAfter = false; - recoveryStopXid = recordXid; - recoveryStopTime = recordXtime; - recoveryStopLSN = InvalidXLogRecPtr; - recoveryStopName[0] = '\0'; - - if (isCommit) - { - ereport(LOG, - (errmsg("recovery stopping before commit of transaction %u, time %s", - recoveryStopXid, - timestamptz_to_str(recoveryStopTime)))); - } - else - { - ereport(LOG, - (errmsg("recovery stopping before abort of transaction %u, time %s", - recoveryStopXid, - timestamptz_to_str(recoveryStopTime)))); - } - } - - return stopsHere; -} - -/* - * Same as recoveryStopsBefore, but called after applying the record. - * - * We also track the timestamp of the latest applied COMMIT/ABORT - * record in XLogCtl->recoveryLastXTime. - */ -static bool -recoveryStopsAfter(XLogReaderState *record) -{ - uint8 info; - uint8 xact_info; - uint8 rmid; - TimestampTz recordXtime; - - /* - * Ignore recovery target settings when not in archive recovery (meaning - * we are in crash recovery). - */ - if (!ArchiveRecoveryRequested) - return false; - - info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - rmid = XLogRecGetRmid(record); - - /* - * There can be many restore points that share the same name; we stop at - * the first one. 
- */ - if (recoveryTarget == RECOVERY_TARGET_NAME && - rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) - { - xl_restore_point *recordRestorePointData; - - recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); - - if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0) - { - recoveryStopAfter = true; - recoveryStopXid = InvalidTransactionId; - recoveryStopLSN = InvalidXLogRecPtr; - (void) getRecordTimestamp(record, &recoveryStopTime); - strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN); - - ereport(LOG, - (errmsg("recovery stopping at restore point \"%s\", time %s", - recoveryStopName, - timestamptz_to_str(recoveryStopTime)))); - return true; - } - } - - /* Check if the target LSN has been reached */ - if (recoveryTarget == RECOVERY_TARGET_LSN && - recoveryTargetInclusive && - record->ReadRecPtr >= recoveryTargetLSN) - { - recoveryStopAfter = true; - recoveryStopXid = InvalidTransactionId; - recoveryStopLSN = record->ReadRecPtr; - recoveryStopTime = 0; - recoveryStopName[0] = '\0'; - ereport(LOG, - (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryStopLSN)))); - return true; - } - - if (rmid != RM_XACT_ID) - return false; - - xact_info = info & XLOG_XACT_OPMASK; - - if (xact_info == XLOG_XACT_COMMIT || - xact_info == XLOG_XACT_COMMIT_PREPARED || - xact_info == XLOG_XACT_ABORT || - xact_info == XLOG_XACT_ABORT_PREPARED) - { - TransactionId recordXid; - - /* Update the last applied transaction timestamp */ - if (getRecordTimestamp(record, &recordXtime)) - SetLatestXTime(recordXtime); - - /* Extract the XID of the committed/aborted transaction */ - if (xact_info == XLOG_XACT_COMMIT_PREPARED) - { - xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); - xl_xact_parsed_commit parsed; - - ParseCommitRecord(XLogRecGetInfo(record), - xlrec, - &parsed); - recordXid = parsed.twophase_xid; - } - else if (xact_info == XLOG_XACT_ABORT_PREPARED) - { - xl_xact_abort *xlrec = 
(xl_xact_abort *) XLogRecGetData(record); - xl_xact_parsed_abort parsed; - - ParseAbortRecord(XLogRecGetInfo(record), - xlrec, - &parsed); - recordXid = parsed.twophase_xid; - } - else - recordXid = XLogRecGetXid(record); - - /* - * There can be only one transaction end record with this exact - * transactionid - * - * when testing for an xid, we MUST test for equality only, since - * transactions are numbered in the order they start, not the order - * they complete. A higher numbered xid will complete before you about - * 50% of the time... - */ - if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive && - recordXid == recoveryTargetXid) - { - recoveryStopAfter = true; - recoveryStopXid = recordXid; - recoveryStopTime = recordXtime; - recoveryStopLSN = InvalidXLogRecPtr; - recoveryStopName[0] = '\0'; - - if (xact_info == XLOG_XACT_COMMIT || - xact_info == XLOG_XACT_COMMIT_PREPARED) - { - ereport(LOG, - (errmsg("recovery stopping after commit of transaction %u, time %s", - recoveryStopXid, - timestamptz_to_str(recoveryStopTime)))); - } - else if (xact_info == XLOG_XACT_ABORT || - xact_info == XLOG_XACT_ABORT_PREPARED) - { - ereport(LOG, - (errmsg("recovery stopping after abort of transaction %u, time %s", - recoveryStopXid, - timestamptz_to_str(recoveryStopTime)))); - } - return true; - } - } - - /* Check if we should stop as soon as reaching consistency */ - if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) - { - ereport(LOG, - (errmsg("recovery stopping after reaching consistency"))); - - recoveryStopAfter = true; - recoveryStopXid = InvalidTransactionId; - recoveryStopTime = 0; - recoveryStopLSN = InvalidXLogRecPtr; - recoveryStopName[0] = '\0'; - return true; - } - - return false; -} - -/* - * Create a comment for the history file to explain why and where - * timeline changed. 
- */ -static char * -getRecoveryStopReason(void) -{ - char reason[200]; - - if (recoveryTarget == RECOVERY_TARGET_XID) - snprintf(reason, sizeof(reason), - "%s transaction %u", - recoveryStopAfter ? "after" : "before", - recoveryStopXid); - else if (recoveryTarget == RECOVERY_TARGET_TIME) - snprintf(reason, sizeof(reason), - "%s %s\n", - recoveryStopAfter ? "after" : "before", - timestamptz_to_str(recoveryStopTime)); - else if (recoveryTarget == RECOVERY_TARGET_LSN) - snprintf(reason, sizeof(reason), - "%s LSN %X/%X\n", - recoveryStopAfter ? "after" : "before", - LSN_FORMAT_ARGS(recoveryStopLSN)); - else if (recoveryTarget == RECOVERY_TARGET_NAME) - snprintf(reason, sizeof(reason), - "at restore point \"%s\"", - recoveryStopName); - else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) - snprintf(reason, sizeof(reason), "reached consistency"); - else - snprintf(reason, sizeof(reason), "no recovery target specified"); - - return pstrdup(reason); -} - -/* - * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED. - * - * endOfRecovery is true if the recovery target is reached and - * the paused state starts at the end of recovery because of - * recovery_target_action=pause, and false otherwise. - */ -static void -recoveryPausesHere(bool endOfRecovery) -{ - /* Don't pause unless users can connect! 
*/ - if (!LocalHotStandbyActive) - return; - - /* Don't pause after standby promotion has been triggered */ - if (LocalPromoteIsTriggered) - return; - - if (endOfRecovery) - ereport(LOG, - (errmsg("pausing at the end of recovery"), - errhint("Execute pg_wal_replay_resume() to promote."))); - else - ereport(LOG, - (errmsg("recovery has paused"), - errhint("Execute pg_wal_replay_resume() to continue."))); - - /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */ - while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) - { - HandleStartupProcInterrupts(); - if (CheckForStandbyTrigger()) - return; - - /* - * If recovery pause is requested then set it paused. While we are in - * the loop, user might resume and pause again so set this every time. - */ - ConfirmRecoveryPaused(); - - /* - * We wait on a condition variable that will wake us as soon as the - * pause ends, but we use a timeout so we can check the above exit - * condition periodically too. - */ - ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000, - WAIT_EVENT_RECOVERY_PAUSE); - } - ConditionVariableCancelSleep(); -} - -/* - * Get the current state of the recovery pause request. - */ -RecoveryPauseState -GetRecoveryPauseState(void) -{ - RecoveryPauseState state; - - SpinLockAcquire(&XLogCtl->info_lck); - state = XLogCtl->recoveryPauseState; - SpinLockRelease(&XLogCtl->info_lck); - - return state; -} - -/* - * Set the recovery pause state. - * - * If recovery pause is requested then sets the recovery pause state to - * 'pause requested' if it is not already 'paused'. Otherwise, sets it - * to 'not paused' to resume the recovery. The recovery pause will be - * confirmed by the ConfirmRecoveryPaused. 
- */ -void -SetRecoveryPause(bool recoveryPause) -{ - SpinLockAcquire(&XLogCtl->info_lck); - - if (!recoveryPause) - XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; - else if (XLogCtl->recoveryPauseState == RECOVERY_NOT_PAUSED) - XLogCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED; - - SpinLockRelease(&XLogCtl->info_lck); - - if (!recoveryPause) - ConditionVariableBroadcast(&XLogCtl->recoveryNotPausedCV); -} - -/* - * Confirm the recovery pause by setting the recovery pause state to - * RECOVERY_PAUSED. - */ -static void -ConfirmRecoveryPaused(void) -{ - /* If recovery pause is requested then set it paused */ - SpinLockAcquire(&XLogCtl->info_lck); - if (XLogCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED) - XLogCtl->recoveryPauseState = RECOVERY_PAUSED; - SpinLockRelease(&XLogCtl->info_lck); -} - -/* - * When recovery_min_apply_delay is set, we wait long enough to make sure - * certain record types are applied at least that interval behind the primary. - * - * Returns true if we waited. - * - * Note that the delay is calculated between the WAL record log time and - * the current time on standby. We would prefer to keep track of when this - * standby received each WAL record, which would allow a more consistent - * approach and one not affected by time synchronisation issues, but that - * is significantly more effort and complexity for little actual gain in - * usability. - */ -static bool -recoveryApplyDelay(XLogReaderState *record) -{ - uint8 xact_info; - TimestampTz xtime; - TimestampTz delayUntil; - long msecs; - - /* nothing to do if no delay configured */ - if (recovery_min_apply_delay <= 0) - return false; - - /* no delay is applied on a database not yet consistent */ - if (!reachedConsistency) - return false; - - /* nothing to do if crash recovery is requested */ - if (!ArchiveRecoveryRequested) - return false; - - /* - * Is it a COMMIT record? - * - * We deliberately choose not to delay aborts since they have no effect on - * MVCC. 
We already allow replay of records that don't have a timestamp, - * so there is already opportunity for issues caused by early conflicts on - * standbys. - */ - if (XLogRecGetRmid(record) != RM_XACT_ID) - return false; - - xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; - - if (xact_info != XLOG_XACT_COMMIT && - xact_info != XLOG_XACT_COMMIT_PREPARED) - return false; - - if (!getRecordTimestamp(record, &xtime)) - return false; - - delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); - - /* - * Exit without arming the latch if it's already past time to apply this - * record - */ - msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil); - if (msecs <= 0) - return false; - - while (true) - { - ResetLatch(&XLogCtl->recoveryWakeupLatch); - - /* - * This might change recovery_min_apply_delay or the trigger file's - * location. - */ - HandleStartupProcInterrupts(); - - if (CheckForStandbyTrigger()) - break; - - /* - * Recalculate delayUntil as recovery_min_apply_delay could have - * changed while waiting in this loop. - */ - delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); - - /* - * Wait for difference between GetCurrentTimestamp() and delayUntil. - */ - msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), - delayUntil); - - if (msecs <= 0) - break; - - elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs); - - (void) WaitLatch(&XLogCtl->recoveryWakeupLatch, - WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, - msecs, - WAIT_EVENT_RECOVERY_APPLY_DELAY); - } - return true; -} - -/* - * Save timestamp of latest processed commit/abort record. - * - * We keep this in XLogCtl, not a simple static variable, so that it can be - * seen by processes other than the startup process. Note in particular - * that CreateRestartPoint is executed in the checkpointer. 
- */ -static void -SetLatestXTime(TimestampTz xtime) -{ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->recoveryLastXTime = xtime; - SpinLockRelease(&XLogCtl->info_lck); -} - -/* - * Fetch timestamp of latest processed commit/abort record. - */ -TimestampTz -GetLatestXTime(void) -{ - TimestampTz xtime; - - SpinLockAcquire(&XLogCtl->info_lck); - xtime = XLogCtl->recoveryLastXTime; - SpinLockRelease(&XLogCtl->info_lck); - - return xtime; -} - -/* - * Save timestamp of the next chunk of WAL records to apply. - * - * We keep this in XLogCtl, not a simple static variable, so that it can be - * seen by all backends. - */ -static void -SetCurrentChunkStartTime(TimestampTz xtime) -{ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->currentChunkStartTime = xtime; - SpinLockRelease(&XLogCtl->info_lck); -} - -/* - * Fetch timestamp of latest processed commit/abort record. - * Startup process maintains an accurate local copy in XLogReceiptTime - */ -TimestampTz -GetCurrentChunkReplayStartTime(void) -{ - TimestampTz xtime; - - SpinLockAcquire(&XLogCtl->info_lck); - xtime = XLogCtl->currentChunkStartTime; - SpinLockRelease(&XLogCtl->info_lck); - - return xtime; -} - -/* - * Returns time of receipt of current chunk of XLOG data, as well as - * whether it was received from streaming replication or from archives. - */ -void -GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream) -{ - /* - * This must be executed in the startup process, since we don't export the - * relevant state to shared memory. 
- */ - Assert(InRecovery); - - *rtime = XLogReceiptTime; - *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM); -} - -/* - * Note that text field supplied is a parameter name and does not require - * translation - */ -static void -RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue) -{ - if (currValue < minValue) - { - if (LocalHotStandbyActive) - { - bool warned_for_promote = false; - - ereport(WARNING, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("hot standby is not possible because of insufficient parameter settings"), - errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", - param_name, - currValue, - minValue))); - - SetRecoveryPause(true); - - ereport(LOG, - (errmsg("recovery has paused"), - errdetail("If recovery is unpaused, the server will shut down."), - errhint("You can then restart the server after making the necessary configuration changes."))); - - while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) - { - HandleStartupProcInterrupts(); - - if (CheckForStandbyTrigger()) - { - if (!warned_for_promote) - ereport(WARNING, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("promotion is not possible because of insufficient parameter settings"), - - /* - * Repeat the detail from above so it's easy to find - * in the log. - */ - errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", - param_name, - currValue, - minValue), - errhint("Restart the server after making the necessary configuration changes."))); - warned_for_promote = true; - } - - /* - * If recovery pause is requested then set it paused. While - * we are in the loop, user might resume and pause again so - * set this every time. - */ - ConfirmRecoveryPaused(); - - /* - * We wait on a condition variable that will wake us as soon - * as the pause ends, but we use a timeout so we can check the - * above conditions periodically too. 
- */ - ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000, - WAIT_EVENT_RECOVERY_PAUSE); - } - ConditionVariableCancelSleep(); - } - - ereport(FATAL, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("recovery aborted because of insufficient parameter settings"), - /* Repeat the detail from above so it's easy to find in the log. */ - errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", - param_name, - currValue, - minValue), - errhint("You can restart the server after making the necessary configuration changes."))); - } -} - /* * Check to see if required parameters are set high enough on this server * for various aspects of recovery operation. @@ -6643,26 +4889,17 @@ StartupXLOG(void) XLogCtlInsert *Insert; CheckPoint checkPoint; bool wasShutdown; - bool reachedRecoveryTarget = false; - bool haveBackupLabel = false; - bool haveTblspcMap = false; - XLogRecPtr RecPtr, - LastRec, - checkPointLoc, - EndOfLog; + bool haveTblspcMap; + bool haveBackupLabel; + XLogRecPtr EndOfLog; TimeLineID EndOfLogTLI; - TimeLineID replayTLI, - newTLI; + TimeLineID newTLI; bool performedWalRecovery; - char *recoveryStopReason; - XLogRecord *record; + EndOfWalRecoveryInfo *endOfRecoveryInfo; + XLogRecPtr abortedRecPtr; + XLogRecPtr missingContrecPtr; TransactionId oldestActiveXID; - bool backupEndRequired = false; - bool backupFromStandby = false; - XLogReaderState *xlogreader; - XLogPageReadPrivate private; bool promoted = false; - struct stat st; /* * We should have an aux process resource owner to use, and we should not @@ -6771,432 +5008,17 @@ StartupXLOG(void) SyncDataDirectory(); } - /*---- BEGIN InitWalRecovery ----*/ - /* - * Initialize on the assumption we want to recover to the latest timeline - * that's active according to pg_control. 
- */ - if (ControlFile->minRecoveryPointTLI > - ControlFile->checkPointCopy.ThisTimeLineID) - recoveryTargetTLI = ControlFile->minRecoveryPointTLI; - else - recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID; - - /* - * Check for signal files, and if so set up state for offline recovery - */ - readRecoverySignalFile(); - validateRecoveryParameters(); - - if (ArchiveRecoveryRequested) - { - if (StandbyModeRequested) - ereport(LOG, - (errmsg("entering standby mode"))); - else if (recoveryTarget == RECOVERY_TARGET_XID) - ereport(LOG, - (errmsg("starting point-in-time recovery to XID %u", - recoveryTargetXid))); - else if (recoveryTarget == RECOVERY_TARGET_TIME) - ereport(LOG, - (errmsg("starting point-in-time recovery to %s", - timestamptz_to_str(recoveryTargetTime)))); - else if (recoveryTarget == RECOVERY_TARGET_NAME) - ereport(LOG, - (errmsg("starting point-in-time recovery to \"%s\"", - recoveryTargetName))); - else if (recoveryTarget == RECOVERY_TARGET_LSN) - ereport(LOG, - (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryTargetLSN)))); - else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) - ereport(LOG, - (errmsg("starting point-in-time recovery to earliest consistent point"))); - else - ereport(LOG, - (errmsg("starting archive recovery"))); - } - - /* - * Take ownership of the wakeup latch if we're going to sleep during - * recovery. 
- */ - if (ArchiveRecoveryRequested) - OwnLatch(&XLogCtl->recoveryWakeupLatch); - - /* Set up XLOG reader facility */ - MemSet(&private, 0, sizeof(XLogPageReadPrivate)); - xlogreader = - XLogReaderAllocate(wal_segment_size, NULL, - XL_ROUTINE(.page_read = &XLogPageRead, - .segment_open = NULL, - .segment_close = wal_segment_close), - &private); - if (!xlogreader) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"), - errdetail("Failed while allocating a WAL reading processor."))); - xlogreader->system_identifier = ControlFile->system_identifier; - - /* - * Allocate two page buffers dedicated to WAL consistency checks. We do - * it this way, rather than just making static arrays, for two reasons: - * (1) no need to waste the storage in most instantiations of the backend; - * (2) a static char array isn't guaranteed to have any particular - * alignment, whereas palloc() will provide MAXALIGN'd storage. - */ - replay_image_masked = (char *) palloc(BLCKSZ); - primary_image_masked = (char *) palloc(BLCKSZ); - - if (read_backup_label(&checkPointLoc, &replayTLI, &backupEndRequired, - &backupFromStandby)) - { - List *tablespaces = NIL; - - /* - * Archive recovery was requested, and thanks to the backup label - * file, we know how far we need to replay to reach consistency. Enter - * archive recovery directly. - */ - InArchiveRecovery = true; - if (StandbyModeRequested) - StandbyMode = true; - - /* - * When a backup_label file is present, we want to roll forward from - * the checkpoint it identifies, rather than using pg_control. 
- */ - record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true, - replayTLI); - if (record != NULL) - { - memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); - wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); - ereport(DEBUG1, - (errmsg_internal("checkpoint record is at %X/%X", - LSN_FORMAT_ARGS(checkPointLoc)))); - InRecovery = true; /* force recovery even if SHUTDOWNED */ - - /* - * Make sure that REDO location exists. This may not be the case - * if there was a crash during an online backup, which left a - * backup_label around that references a WAL segment that's - * already been archived. - */ - if (checkPoint.redo < checkPointLoc) - { - XLogBeginRead(xlogreader, checkPoint.redo); - if (!ReadRecord(xlogreader, LOG, false, - checkPoint.ThisTimeLineID)) - ereport(FATAL, - (errmsg("could not find redo location referenced by checkpoint record"), - errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" - "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" - "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", - DataDir, DataDir, DataDir))); - } - } - else - { - ereport(FATAL, - (errmsg("could not locate required checkpoint record"), - errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" - "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" - "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", - DataDir, DataDir, DataDir))); - wasShutdown = false; /* keep compiler quiet */ - } - - /* read the tablespace_map file if present and create symlinks. 
*/ - if (read_tablespace_map(&tablespaces)) - { - ListCell *lc; - - foreach(lc, tablespaces) - { - tablespaceinfo *ti = lfirst(lc); - char *linkloc; - - linkloc = psprintf("pg_tblspc/%s", ti->oid); - - /* - * Remove the existing symlink if any and Create the symlink - * under PGDATA. - */ - remove_tablespace_symlink(linkloc); - - if (symlink(ti->path, linkloc) < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create symbolic link \"%s\": %m", - linkloc))); - - pfree(ti->oid); - pfree(ti->path); - pfree(ti); - } - - /* set flag to delete it later */ - haveTblspcMap = true; - } - - /* set flag to delete it later */ - haveBackupLabel = true; - } - else - { - /* - * If tablespace_map file is present without backup_label file, there - * is no use of such file. There is no harm in retaining it, but it - * is better to get rid of the map file so that we don't have any - * redundant file in data directory and it will avoid any sort of - * confusion. It seems prudent though to just rename the file out of - * the way rather than delete it completely, also we ignore any error - * that occurs in rename operation as even if map file is present - * without backup_label file, it is harmless. - */ - if (stat(TABLESPACE_MAP, &st) == 0) - { - unlink(TABLESPACE_MAP_OLD); - if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0) - ereport(LOG, - (errmsg("ignoring file \"%s\" because no file \"%s\" exists", - TABLESPACE_MAP, BACKUP_LABEL_FILE), - errdetail("File \"%s\" was renamed to \"%s\".", - TABLESPACE_MAP, TABLESPACE_MAP_OLD))); - else - ereport(LOG, - (errmsg("ignoring file \"%s\" because no file \"%s\" exists", - TABLESPACE_MAP, BACKUP_LABEL_FILE), - errdetail("Could not rename file \"%s\" to \"%s\": %m.", - TABLESPACE_MAP, TABLESPACE_MAP_OLD))); - } - - /* - * It's possible that archive recovery was requested, but we don't - * know how far we need to replay the WAL before we reach consistency. 
- * This can happen for example if a base backup is taken from a - * running server using an atomic filesystem snapshot, without calling - * pg_start/stop_backup. Or if you just kill a running primary server - * and put it into archive recovery by creating a recovery signal - * file. - * - * Our strategy in that case is to perform crash recovery first, - * replaying all the WAL present in pg_wal, and only enter archive - * recovery after that. - * - * But usually we already know how far we need to replay the WAL (up - * to minRecoveryPoint, up to backupEndPoint, or until we see an - * end-of-backup record), and we can enter archive recovery directly. - */ - if (ArchiveRecoveryRequested && - (ControlFile->minRecoveryPoint != InvalidXLogRecPtr || - ControlFile->backupEndRequired || - ControlFile->backupEndPoint != InvalidXLogRecPtr || - ControlFile->state == DB_SHUTDOWNED)) - { - InArchiveRecovery = true; - if (StandbyModeRequested) - StandbyMode = true; - } - - /* Get the last valid checkpoint record. */ - checkPointLoc = ControlFile->checkPoint; - RedoStartLSN = ControlFile->checkPointCopy.redo; - replayTLI = ControlFile->checkPointCopy.ThisTimeLineID; - record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true, - replayTLI); - if (record != NULL) - { - ereport(DEBUG1, - (errmsg_internal("checkpoint record is at %X/%X", - LSN_FORMAT_ARGS(checkPointLoc)))); - } - else - { - /* - * We used to attempt to go back to a secondary checkpoint record - * here, but only when not in standby mode. We now just fail if we - * can't read the last checkpoint because this allows us to - * simplify processing around checkpoints. 
- */ - ereport(PANIC, - (errmsg("could not locate a valid checkpoint record"))); - } - memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); - wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); - } - - /* - * If the location of the checkpoint record is not on the expected - * timeline in the history of the requested timeline, we cannot proceed: - * the backup is not part of the history of the requested timeline. - */ - Assert(expectedTLEs); /* was initialized by reading checkpoint - * record */ - if (tliOfPointInHistory(checkPointLoc, expectedTLEs) != - checkPoint.ThisTimeLineID) - { - XLogRecPtr switchpoint; - - /* - * tliSwitchPoint will throw an error if the checkpoint's timeline is - * not in expectedTLEs at all. - */ - switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL); - ereport(FATAL, - (errmsg("requested timeline %u is not a child of this server's history", - recoveryTargetTLI), - errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.", - LSN_FORMAT_ARGS(ControlFile->checkPoint), - ControlFile->checkPointCopy.ThisTimeLineID, - LSN_FORMAT_ARGS(switchpoint)))); - } - - /* - * The min recovery point should be part of the requested timeline's - * history, too. - */ - if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) && - tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != - ControlFile->minRecoveryPointTLI) - ereport(FATAL, - (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", - recoveryTargetTLI, - LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), - ControlFile->minRecoveryPointTLI))); - - LastRec = RecPtr = checkPointLoc; - - ereport(DEBUG1, - (errmsg_internal("redo record is at %X/%X; shutdown %s", - LSN_FORMAT_ARGS(checkPoint.redo), - wasShutdown ? 
"true" : "false"))); - ereport(DEBUG1, - (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", - U64FromFullTransactionId(checkPoint.nextXid), - checkPoint.nextOid))); - ereport(DEBUG1, - (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", - checkPoint.nextMulti, checkPoint.nextMultiOffset))); - ereport(DEBUG1, - (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u", - checkPoint.oldestXid, checkPoint.oldestXidDB))); - ereport(DEBUG1, - (errmsg_internal("oldest MultiXactId: %u, in database %u", - checkPoint.oldestMulti, checkPoint.oldestMultiDB))); - ereport(DEBUG1, - (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u", - checkPoint.oldestCommitTsXid, - checkPoint.newestCommitTsXid))); - - /* sanity checks on the checkpoint record */ - if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid))) - ereport(PANIC, - (errmsg("invalid next transaction ID"))); - if (checkPoint.redo > checkPointLoc) - ereport(PANIC, - (errmsg("invalid redo in checkpoint record"))); - - /* - * Check whether we need to force recovery from WAL. If it appears to - * have been a clean shutdown and we did not have a recovery signal file, - * then assume no recovery needed. - */ - if (checkPoint.redo < checkPointLoc) - { - if (wasShutdown) - ereport(PANIC, - (errmsg("invalid redo record in shutdown checkpoint"))); - InRecovery = true; - } - else if (ControlFile->state != DB_SHUTDOWNED) - InRecovery = true; - else if (ArchiveRecoveryRequested) - { - /* force recovery due to presence of recovery signal file */ - InRecovery = true; - } - - /* - * If recovery is needed, update our in-memory copy of pg_control to show - * that we are recovering and to show the selected checkpoint as the place - * we are starting from. We also mark pg_control with any minimum recovery - * stop point obtained from a backup history file. + * Prepare for WAL recovery if needed. * - * We don't write the changes to disk yet, though. 
Only do that after - * initializing various subsystems. + * InitWalRecovery analyzes the control file and the backup label file, if + * any. It updates the in-memory ControlFile buffer according to the + * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested. + * It also applies the tablespace map file, if any. */ - if (InRecovery) - { - DBState dbstate_at_startup; - - dbstate_at_startup = ControlFile->state; - if (InArchiveRecovery) - { - ControlFile->state = DB_IN_ARCHIVE_RECOVERY; - } - else - { - ereport(LOG, - (errmsg("database system was not properly shut down; " - "automatic recovery in progress"))); - if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID) - ereport(LOG, - (errmsg("crash recovery starts in timeline %u " - "and has target timeline %u", - ControlFile->checkPointCopy.ThisTimeLineID, - recoveryTargetTLI))); - ControlFile->state = DB_IN_CRASH_RECOVERY; - } - ControlFile->checkPoint = checkPointLoc; - ControlFile->checkPointCopy = checkPoint; - if (InArchiveRecovery) - { - /* initialize minRecoveryPoint if not set yet */ - if (ControlFile->minRecoveryPoint < checkPoint.redo) - { - ControlFile->minRecoveryPoint = checkPoint.redo; - ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID; - } - } - - /* - * Set backupStartPoint if we're starting recovery from a base backup. - * - * Also set backupEndPoint and use minRecoveryPoint as the backup end - * location if we're starting recovery from a base backup which was - * taken from a standby. In this case, the database system status in - * pg_control must indicate that the database was already in recovery. - * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be - * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted - * before reaching this point; e.g. because restore_command or - * primary_conninfo were faulty. - * - * Any other state indicates that the backup somehow became corrupted - * and we can't sensibly continue with recovery. 
- */ - if (haveBackupLabel) - { - ControlFile->backupStartPoint = checkPoint.redo; - ControlFile->backupEndRequired = backupEndRequired; - - if (backupFromStandby) - { - if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY && - dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY) - ereport(FATAL, - (errmsg("backup_label contains data inconsistent with control file"), - errhint("This means that the backup is corrupted and you will " - "have to use another backup for recovery."))); - ControlFile->backupEndPoint = ControlFile->minRecoveryPoint; - } - } - } - - /*---- END InitWalRecovery ----*/ + InitWalRecovery(ControlFile, &wasShutdown, + &haveBackupLabel, &haveTblspcMap); + checkPoint = ControlFile->checkPointCopy; /* initialize shared memory variables from the checkpoint record */ ShmemVariableCache->nextXid = checkPoint.nextXid; @@ -7272,13 +5094,6 @@ StartupXLOG(void) else XLogCtl->unloggedLSN = FirstNormalUnloggedLSN; - /* - * We must replay WAL entries using the same TimeLineID they were created - * under, so temporarily adopt the TLI indicated by the checkpoint (see - * also xlog_redo()). - */ - replayTLI = checkPoint.ThisTimeLineID; - /* * Copy any missing timeline history files between 'now' and the recovery * target timeline from archive to pg_wal. While we don't need those files @@ -7291,7 +5106,7 @@ StartupXLOG(void) * are small, so it's better to copy them unnecessarily than not copy them * and regret later. */ - restoreTimeLineHistoryFiles(replayTLI, recoveryTargetTLI); + restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI); /* * Before running in recovery, scan pg_twophase and fill in its status to @@ -7308,17 +5123,9 @@ StartupXLOG(void) RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; doPageWrites = lastFullPageWrites; - /* - * Start recovery assuming that the final record isn't lost. 
- */ - abortedRecPtr = InvalidXLogRecPtr; - missingContrecPtr = InvalidXLogRecPtr; - /* REDO */ if (InRecovery) { - int rmid; - /* Initialize state for RecoveryInProgress() */ SpinLockAcquire(&XLogCtl->info_lck); if (InArchiveRecovery) @@ -7376,13 +5183,13 @@ StartupXLOG(void) */ if (InArchiveRecovery) { - minRecoveryPoint = ControlFile->minRecoveryPoint; - minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; } else { - minRecoveryPoint = InvalidXLogRecPtr; - minRecoveryPointTLI = 0; + LocalMinRecoveryPoint = InvalidXLogRecPtr; + LocalMinRecoveryPointTLI = 0; } /* @@ -7473,460 +5280,31 @@ StartupXLOG(void) } } - /*---- BEGIN PerformWalRecovery ----*/ - /* - * Initialize shared variables for tracking progress of WAL replay, as - * if we had just replayed the record before the REDO location (or the - * checkpoint record itself, if it's a shutdown checkpoint). + * We're all set for replaying the WAL now. Do it. */ - SpinLockAcquire(&XLogCtl->info_lck); - if (checkPoint.redo < checkPointLoc) - XLogCtl->replayEndRecPtr = checkPoint.redo; - else - XLogCtl->replayEndRecPtr = xlogreader->EndRecPtr; - XLogCtl->replayEndTLI = replayTLI; - XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr; - XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI; - XLogCtl->recoveryLastXTime = 0; - XLogCtl->currentChunkStartTime = 0; - XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; - SpinLockRelease(&XLogCtl->info_lck); - - /* Also ensure XLogReceiptTime has a sane value */ - XLogReceiptTime = GetCurrentTimestamp(); - - /* - * Let postmaster know we've started redo now, so that it can launch - * the archiver if necessary. - */ - if (IsUnderPostmaster) - SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); - - /* - * Allow read-only connections immediately if we're consistent - * already. 
- */ - CheckRecoveryConsistency(); - - /* - * Find the first record that logically follows the checkpoint --- it - * might physically precede it, though. - */ - if (checkPoint.redo < checkPointLoc) - { - /* back up to find the record */ - XLogBeginRead(xlogreader, checkPoint.redo); - record = ReadRecord(xlogreader, PANIC, false, replayTLI); - } - else - { - /* just have to read next record after CheckPoint */ - Assert(RecPtr == checkPointLoc); - record = ReadRecord(xlogreader, LOG, false, replayTLI); - } - - if (record != NULL) - { - ErrorContextCallback errcallback; - TimestampTz xtime; - PGRUsage ru0; - - pg_rusage_init(&ru0); - - InRedo = true; - - /* Initialize resource managers */ - for (rmid = 0; rmid <= RM_MAX_ID; rmid++) - { - if (RmgrTable[rmid].rm_startup != NULL) - RmgrTable[rmid].rm_startup(); - } - - ereport(LOG, - (errmsg("redo starts at %X/%X", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)))); - - /* Prepare to report progress of the redo phase. */ - if (!StandbyMode) - begin_startup_progress_phase(); - - /* - * main redo apply loop - */ - do - { - bool switchedTLI = false; - - if (!StandbyMode) - ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)); - -#ifdef WAL_DEBUG - if (XLOG_DEBUG || - (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) || - (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3)) - { - StringInfoData buf; - - initStringInfo(&buf); - appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), - LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); - xlog_outrec(&buf, xlogreader); - appendStringInfoString(&buf, " - "); - xlog_outdesc(&buf, xlogreader); - elog(LOG, "%s", buf.data); - pfree(buf.data); - } -#endif - - /* Handle interrupt signals of startup process */ - HandleStartupProcInterrupts(); - - /* - * Pause WAL replay, if requested by a hot-standby session via - * SetRecoveryPause(). 
- * - * Note that we intentionally don't take the info_lck spinlock - * here. We might therefore read a slightly stale value of - * the recoveryPause flag, but it can't be very stale (no - * worse than the last spinlock we did acquire). Since a - * pause request is a pretty asynchronous thing anyway, - * possibly responding to it one WAL record later than we - * otherwise would is a minor issue, so it doesn't seem worth - * adding another spinlock cycle to prevent that. - */ - if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState != - RECOVERY_NOT_PAUSED) - recoveryPausesHere(false); - - /* - * Have we reached our recovery target? - */ - if (recoveryStopsBefore(xlogreader)) - { - reachedRecoveryTarget = true; - break; - } - - /* - * If we've been asked to lag the primary, wait on latch until - * enough time has passed. - */ - if (recoveryApplyDelay(xlogreader)) - { - /* - * We test for paused recovery again here. If user sets - * delayed apply, it may be because they expect to pause - * recovery in case of problems, so we must test again - * here otherwise pausing during the delay-wait wouldn't - * work. - */ - if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState != - RECOVERY_NOT_PAUSED) - recoveryPausesHere(false); - } - - /* Setup error traceback support for ereport() */ - errcallback.callback = rm_redo_error_callback; - errcallback.arg = (void *) xlogreader; - errcallback.previous = error_context_stack; - error_context_stack = &errcallback; - - /* - * ShmemVariableCache->nextXid must be beyond record's xid. - */ - AdvanceNextFullTransactionIdPastXid(record->xl_xid); - - /* - * Before replaying this record, check if this record causes - * the current timeline to change. The record is already - * considered to be part of the new timeline, so we update - * replayTLI before replaying it. That's important so that - * replayEndTLI, which is recorded as the minimum recovery - * point's TLI if recovery stops after this record, is set - * correctly. 
- */ - if (record->xl_rmid == RM_XLOG_ID) - { - TimeLineID newReplayTLI = replayTLI; - TimeLineID prevReplayTLI = replayTLI; - uint8 info = record->xl_info & ~XLR_INFO_MASK; - - if (info == XLOG_CHECKPOINT_SHUTDOWN) - { - CheckPoint checkPoint; - - memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); - newReplayTLI = checkPoint.ThisTimeLineID; - prevReplayTLI = checkPoint.PrevTimeLineID; - } - else if (info == XLOG_END_OF_RECOVERY) - { - xl_end_of_recovery xlrec; - - memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery)); - newReplayTLI = xlrec.ThisTimeLineID; - prevReplayTLI = xlrec.PrevTimeLineID; - } - - if (newReplayTLI != replayTLI) - { - /* Check that it's OK to switch to this TLI */ - checkTimeLineSwitch(xlogreader->EndRecPtr, - newReplayTLI, - prevReplayTLI, replayTLI); - - /* Following WAL records should be run with new TLI */ - replayTLI = newReplayTLI; - switchedTLI = true; - } - } - - /* - * Update shared replayEndRecPtr before replaying this record, - * so that XLogFlush will update minRecoveryPoint correctly. - */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->replayEndRecPtr = xlogreader->EndRecPtr; - XLogCtl->replayEndTLI = replayTLI; - SpinLockRelease(&XLogCtl->info_lck); - - /* - * If we are attempting to enter Hot Standby mode, process - * XIDs we see - */ - if (standbyState >= STANDBY_INITIALIZED && - TransactionIdIsValid(record->xl_xid)) - RecordKnownAssignedTransactionIds(record->xl_xid); - - /* Now apply the WAL record itself */ - RmgrTable[record->xl_rmid].rm_redo(xlogreader); - - /* - * After redo, check whether the backup pages associated with - * the WAL record are consistent with the existing pages. This - * check is done only if consistency check is enabled for this - * record. 
- */ - if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0) - checkXLogConsistency(xlogreader); - - /* Pop the error context stack */ - error_context_stack = errcallback.previous; - - /* - * Update lastReplayedEndRecPtr after this record has been - * successfully replayed. - */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr; - XLogCtl->lastReplayedTLI = replayTLI; - SpinLockRelease(&XLogCtl->info_lck); - - /* - * If rm_redo called XLogRequestWalReceiverReply, then we wake - * up the receiver so that it notices the updated - * lastReplayedEndRecPtr and sends a reply to the primary. - */ - if (doRequestWalReceiverReply) - { - doRequestWalReceiverReply = false; - WalRcvForceReply(); - } - - /* Remember this record as the last-applied one */ - LastRec = xlogreader->ReadRecPtr; - - /* Allow read-only connections if we're consistent now */ - CheckRecoveryConsistency(); - - /* Is this a timeline switch? */ - if (switchedTLI) - { - /* - * Before we continue on the new timeline, clean up any - * (possibly bogus) future WAL segments on the old - * timeline. - */ - RemoveNonParentXlogFiles(xlogreader->EndRecPtr, replayTLI); - - /* - * Wake up any walsenders to notice that we are on a new - * timeline. - */ - if (AllowCascadeReplication()) - WalSndWakeup(); - } - - /* Exit loop if we reached inclusive recovery target */ - if (recoveryStopsAfter(xlogreader)) - { - reachedRecoveryTarget = true; - break; - } - - /* Else, try to fetch the next WAL record */ - record = ReadRecord(xlogreader, LOG, false, replayTLI); - } while (record != NULL); - - /* - * end of main redo apply loop - */ - - if (reachedRecoveryTarget) - { - if (!reachedConsistency) - ereport(FATAL, - (errmsg("requested recovery stop point is before consistent recovery point"))); - - /* - * This is the last point where we can restart recovery with a - * new recovery target, if we shutdown and begin again. 
After - * this, Resource Managers may choose to do permanent - * corrective actions at end of recovery. - */ - switch (recoveryTargetAction) - { - case RECOVERY_TARGET_ACTION_SHUTDOWN: - - /* - * exit with special return code to request shutdown - * of postmaster. Log messages issued from - * postmaster. - */ - proc_exit(3); - - case RECOVERY_TARGET_ACTION_PAUSE: - SetRecoveryPause(true); - recoveryPausesHere(true); - - /* drop into promote */ - - case RECOVERY_TARGET_ACTION_PROMOTE: - break; - } - } - - /* Allow resource managers to do any required cleanup. */ - for (rmid = 0; rmid <= RM_MAX_ID; rmid++) - { - if (RmgrTable[rmid].rm_cleanup != NULL) - RmgrTable[rmid].rm_cleanup(); - } - - ereport(LOG, - (errmsg("redo done at %X/%X system usage: %s", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), - pg_rusage_show(&ru0)))); - xtime = GetLatestXTime(); - if (xtime) - ereport(LOG, - (errmsg("last completed transaction was at log time %s", - timestamptz_to_str(xtime)))); - - InRedo = false; - } - else - { - /* there are no WAL records following the checkpoint */ - ereport(LOG, - (errmsg("redo is not required"))); - - } - - /* - * This check is intentionally after the above log messages that - * indicate how far recovery went. - */ - if (ArchiveRecoveryRequested && - recoveryTarget != RECOVERY_TARGET_UNSET && - !reachedRecoveryTarget) - ereport(FATAL, - (errmsg("recovery ended before configured recovery target was reached"))); - - /*---- END PerformWalRecovery ----*/ + PerformWalRecovery(); performedWalRecovery = true; } - /*---- BEGIN FinishWalRecovery ----*/ - /* - * Kill WAL receiver, if it's still running, before we continue to write - * the startup checkpoint and aborted-contrecord records. It will trump - * over these records and subsequent ones if it's still alive when we - * start writing WAL. + * Finish WAL recovery. */ - XLogShutdownWalRcv(); - - /* - * We are now done reading the xlog from stream. 
Turn off streaming - * recovery to force fetching the files (which would be required at end of - * recovery, e.g., timeline history file) from archive or pg_wal. - * - * Note that standby mode must be turned off after killing WAL receiver, - * i.e., calling XLogShutdownWalRcv(). - */ - Assert(!WalRcvStreaming()); - StandbyMode = false; - - /* - * Determine where to start writing WAL next. - * - * When recovery ended in an incomplete record, write a WAL record about - * that and continue after it. In all other cases, re-fetch the last - * valid or last applied record, so we can identify the exact endpoint of - * what we consider the valid portion of WAL. - */ - XLogBeginRead(xlogreader, LastRec); - record = ReadRecord(xlogreader, PANIC, false, replayTLI); - EndOfLog = xlogreader->EndRecPtr; - - /* - * EndOfLogTLI is the TLI in the filename of the XLOG segment containing - * the end-of-log. It could be different from the timeline that EndOfLog - * nominally belongs to, if there was a timeline switch in that segment, - * and we were reading the old WAL from a segment belonging to a higher - * timeline. - */ - EndOfLogTLI = xlogreader->seg.ws_tli; - - if (ArchiveRecoveryRequested) - { - /* - * We are no longer in archive recovery state. - * - * We are now done reading the old WAL. Turn off archive fetching if - * it was active. - */ - Assert(InArchiveRecovery); - InArchiveRecovery = false; - - /* - * If the ending log segment is still open, close it (to avoid - * problems on Windows with trying to rename or delete an open file). 
- */ - if (readFile >= 0) - { - close(readFile); - readFile = -1; - } - } - - recoveryStopReason = getRecoveryStopReason(); - - /*---- END FinishWalRecovery ----*/ + endOfRecoveryInfo = FinishWalRecovery(); + EndOfLog = endOfRecoveryInfo->endOfLog; + EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI; + abortedRecPtr = endOfRecoveryInfo->abortedRecPtr; + missingContrecPtr = endOfRecoveryInfo->missingContrecPtr; /* * Complain if we did not roll forward far enough to render the backup * dump consistent. Note: it is indeed okay to look at the local variable - * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might - * be further ahead --- ControlFile->minRecoveryPoint cannot have been - * advanced beyond the WAL we processed. + * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint + * might be further ahead --- ControlFile->minRecoveryPoint cannot have + * been advanced beyond the WAL we processed. */ if (InRecovery && - (EndOfLog < minRecoveryPoint || + (EndOfLog < LocalMinRecoveryPoint || !XLogRecPtrIsInvalid(ControlFile->backupStartPoint))) { /* @@ -7993,7 +5371,7 @@ StartupXLOG(void) * * In a normal crash recovery, we can just extend the timeline we were in. */ - newTLI = replayTLI; + newTLI = endOfRecoveryInfo->lastRecTLI; if (ArchiveRecoveryRequested) { newTLI = findNewestTimeLine(recoveryTargetTLI) + 1; @@ -8002,8 +5380,8 @@ StartupXLOG(void) /* * Make a writable copy of the last WAL segment. (Note that we also - * have a copy of the last block of the old WAL in readBuf; we will - * use that below.) + * have a copy of the last block of the old WAL in + * endOfRecoveryInfo->lastPage; we will use that below.) */ XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI); @@ -8011,10 +5389,10 @@ StartupXLOG(void) * Remove the signal files out of the way, so that we don't * accidentally re-enter archive recovery mode in a subsequent crash.
*/ - if (standby_signal_file_found) + if (endOfRecoveryInfo->standby_signal_file_found) durable_unlink(STANDBY_SIGNAL_FILE, FATAL); - if (recovery_signal_file_found) + if (endOfRecoveryInfo->recovery_signal_file_found) durable_unlink(RECOVERY_SIGNAL_FILE, FATAL); /* @@ -8028,7 +5406,7 @@ StartupXLOG(void) * between here and writing the end-of-recovery record. */ writeTimeLineHistory(newTLI, recoveryTargetTLI, - EndOfLog, recoveryStopReason); + EndOfLog, endOfRecoveryInfo->recoveryStopReason); ereport(LOG, (errmsg("archive recovery complete"))); @@ -8036,7 +5414,7 @@ StartupXLOG(void) /* Save the selected TimeLineID in shared memory, too */ XLogCtl->InsertTimeLineID = newTLI; - XLogCtl->PrevTimeLineID = replayTLI; + XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI; /* * Actually, if WAL ended in an incomplete record, skip the parts that @@ -8056,11 +5434,11 @@ StartupXLOG(void) * previous incarnation. */ Insert = &XLogCtl->Insert; - Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec); + Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec); Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog); /* - * Tricky point here: readBuf contains the *last* block that the LastRec + * Tricky point here: lastPage contains the *last* block that the LastRec * record spans, not the one it starts in. The last block is indeed the * one we want to use. 
*/ @@ -8069,21 +5447,18 @@ StartupXLOG(void) char *page; int len; int firstIdx; - XLogRecPtr pageBeginPtr; - - pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ); - Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size)); firstIdx = XLogRecPtrToBufIdx(EndOfLog); + len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr; + Assert(len < XLOG_BLCKSZ); /* Copy the valid part of the last block, and zero the rest */ page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; - len = EndOfLog % XLOG_BLCKSZ; - memcpy(page, xlogreader->readBuf, len); + memcpy(page, endOfRecoveryInfo->lastPage, len); memset(page + len, 0, XLOG_BLCKSZ - len); - XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ; - XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ; + XLogCtl->xlblocks[firstIdx] = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ; + XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ; } else { @@ -8138,40 +5513,8 @@ StartupXLOG(void) /* Reload shared-memory state for prepared transactions */ RecoverPreparedTransactions(); - /*---- BEGIN ShutdownWalRecovery ----*/ - /* Shut down xlogreader */ - if (readFile >= 0) - { - close(readFile); - readFile = -1; - } - XLogReaderFree(xlogreader); - - if (ArchiveRecoveryRequested) - { - char recoveryPath[MAXPGPATH]; - - /* - * Since there might be a partial WAL segment named RECOVERYXLOG, get - * rid of it. - */ - snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG"); - unlink(recoveryPath); /* ignore any error */ - - /* Get rid of any remaining recovered timeline-history file, too */ - snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY"); - unlink(recoveryPath); /* ignore any error */ - } - - /* - * We don't need the latch anymore. It's not strictly necessary to disown - * it, but let's do it for the sake of tidiness.
- */ - if (ArchiveRecoveryRequested) - DisownLatch(&XLogCtl->recoveryWakeupLatch); - - /*---- END ShutdownWalRecovery ----*/ + ShutdownWalRecovery(); /* Enable WAL writes for this backend only. */ LocalSetXLogInsertAllowed(); @@ -8181,8 +5524,6 @@ StartupXLOG(void) { Assert(!XLogRecPtrIsInvalid(missingContrecPtr)); CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI); - abortedRecPtr = InvalidXLogRecPtr; - missingContrecPtr = InvalidXLogRecPtr; } /* @@ -8269,99 +5610,72 @@ StartupXLOG(void) } /* - * Checks if recovery has reached a consistent state. When consistency is - * reached and we have a valid starting standby snapshot, tell postmaster - * that it can start accepting read-only connections. + * Callback from PerformWalRecovery(), called when we switch from crash + * recovery to archive recovery mode. Updates the control file accordingly. */ -static void -CheckRecoveryConsistency(void) +void +SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI) { - XLogRecPtr lastReplayedEndRecPtr; - - /* - * During crash recovery, we don't reach a consistent state until we've - * replayed all the WAL. - */ - if (XLogRecPtrIsInvalid(minRecoveryPoint)) - return; - - Assert(InArchiveRecovery); - - /* - * assume that we are called in the startup process, and hence don't need - * a lock to read lastReplayedEndRecPtr - */ - lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr; - - /* - * Have we reached the point where our base backup was completed? - */ - if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) && - ControlFile->backupEndPoint <= lastReplayedEndRecPtr) + /* initialize minRecoveryPoint to this record */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + ControlFile->state = DB_IN_ARCHIVE_RECOVERY; + if (ControlFile->minRecoveryPoint < EndRecPtr) { - /* - * We have reached the end of base backup, as indicated by pg_control. - * The data on disk is now consistent. 
Reset backupStartPoint and - * backupEndPoint, and update minRecoveryPoint to make sure we don't - * allow starting up at an earlier point even if recovery is stopped - * and restarted soon after this. - */ - elog(DEBUG1, "end of backup reached"); + ControlFile->minRecoveryPoint = EndRecPtr; + ControlFile->minRecoveryPointTLI = replayTLI; + } + /* update local copy */ + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + /* + * The startup process can update its local copy of minRecoveryPoint from + * this point. + */ + updateMinRecoveryPoint = true; - if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr) - ControlFile->minRecoveryPoint = lastReplayedEndRecPtr; + UpdateControlFile(); - ControlFile->backupStartPoint = InvalidXLogRecPtr; - ControlFile->backupEndPoint = InvalidXLogRecPtr; - ControlFile->backupEndRequired = false; - UpdateControlFile(); + /* + * We update SharedRecoveryState while holding the lock on ControlFileLock + * so both states are consistent in shared memory. + */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE; + SpinLockRelease(&XLogCtl->info_lck); - LWLockRelease(ControlFileLock); + LWLockRelease(ControlFileLock); +} + +/* + * Callback from PerformWalRecovery(), called when we reach the end of backup. + * Updates the control file accordingly. + */ +void +ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli) +{ + /* + * We have reached the end of base backup, as indicated by pg_control. The + * data on disk is now consistent (unless minRecoveryPoint is further + * ahead, which can happen if we crashed during previous recovery). Reset + * backupStartPoint and backupEndPoint, and update minRecoveryPoint to + * make sure we don't allow starting up at an earlier point even if + * recovery is stopped and restarted soon after this.
+ */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + + if (ControlFile->minRecoveryPoint < EndRecPtr) + { + ControlFile->minRecoveryPoint = EndRecPtr; + ControlFile->minRecoveryPointTLI = tli; } - /* - * Have we passed our safe starting point? Note that minRecoveryPoint is - * known to be incorrectly set if ControlFile->backupEndRequired, until - * the XLOG_BACKUP_END arrives to advise us of the correct - * minRecoveryPoint. All we know prior to that is that we're not - * consistent yet. - */ - if (!reachedConsistency && !ControlFile->backupEndRequired && - minRecoveryPoint <= lastReplayedEndRecPtr && - XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) - { - /* - * Check to see if the XLOG sequence contained any unresolved - * references to uninitialized pages. - */ - XLogCheckInvalidPages(); + ControlFile->backupStartPoint = InvalidXLogRecPtr; + ControlFile->backupEndPoint = InvalidXLogRecPtr; + ControlFile->backupEndRequired = false; + UpdateControlFile(); - reachedConsistency = true; - ereport(LOG, - (errmsg("consistent recovery state reached at %X/%X", - LSN_FORMAT_ARGS(lastReplayedEndRecPtr)))); - } - - /* - * Have we got a valid starting snapshot that will allow queries to be - * run? If so, we can tell postmaster that the database is consistent now, - * enabling connections. - */ - if (standbyState == STANDBY_SNAPSHOT_READY && - !LocalHotStandbyActive && - reachedConsistency && - IsUnderPostmaster) - { - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->SharedHotStandbyActive = true; - SpinLockRelease(&XLogCtl->info_lck); - - LocalHotStandbyActive = true; - - SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY); - } + LWLockRelease(ControlFileLock); } /* @@ -8393,7 +5707,7 @@ PerformRecoveryXLogAction(void) * fully out of recovery mode and already accepting queries. 
*/ if (ArchiveRecoveryRequested && IsUnderPostmaster && - LocalPromoteIsTriggered) + PromoteIsTriggered()) { promoted = true; @@ -8472,47 +5786,6 @@ GetRecoveryState(void) return retval; } -/* - * Is HotStandby active yet? This is only important in special backends - * since normal backends won't ever be able to connect until this returns - * true. Postmaster knows this by way of signal, not via shared memory. - * - * Unlike testing standbyState, this works in any process that's connected to - * shared memory. (And note that standbyState alone doesn't tell the truth - * anyway.) - */ -bool -HotStandbyActive(void) -{ - /* - * We check shared state each time only until Hot Standby is active. We - * can't de-activate Hot Standby, so there's no need to keep checking - * after the shared variable has once been seen true. - */ - if (LocalHotStandbyActive) - return true; - else - { - /* spinlock is essential on machines with weak memory ordering! */ - SpinLockAcquire(&XLogCtl->info_lck); - LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive; - SpinLockRelease(&XLogCtl->info_lck); - - return LocalHotStandbyActive; - } -} - -/* - * Like HotStandbyActive(), but to be used only in WAL replay code, - * where we don't need to ask any other process what the state is. - */ -bool -HotStandbyActiveInReplay(void) -{ - Assert(AmStartupProcess() || !IsPostmasterEnvironment); - return LocalHotStandbyActive; -} - /* * Is this process allowed to insert new WAL records? * @@ -8563,109 +5836,6 @@ LocalSetXLogInsertAllowed(void) return oldXLogAllowed; } -/* - * Subroutine to try to fetch and validate a prior checkpoint record. - * - * whichChkpt identifies the checkpoint (merely for reporting purposes). 
- * 1 for "primary", 0 for "other" (backup_label) - */ -static XLogRecord * -ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, - int whichChkpt, bool report, TimeLineID replayTLI) -{ - XLogRecord *record; - uint8 info; - - if (!XRecOffIsValid(RecPtr)) - { - if (!report) - return NULL; - - switch (whichChkpt) - { - case 1: - ereport(LOG, - (errmsg("invalid primary checkpoint link in control file"))); - break; - default: - ereport(LOG, - (errmsg("invalid checkpoint link in backup_label file"))); - break; - } - return NULL; - } - - XLogBeginRead(xlogreader, RecPtr); - record = ReadRecord(xlogreader, LOG, true, replayTLI); - - if (record == NULL) - { - if (!report) - return NULL; - - switch (whichChkpt) - { - case 1: - ereport(LOG, - (errmsg("invalid primary checkpoint record"))); - break; - default: - ereport(LOG, - (errmsg("invalid checkpoint record"))); - break; - } - return NULL; - } - if (record->xl_rmid != RM_XLOG_ID) - { - switch (whichChkpt) - { - case 1: - ereport(LOG, - (errmsg("invalid resource manager ID in primary checkpoint record"))); - break; - default: - ereport(LOG, - (errmsg("invalid resource manager ID in checkpoint record"))); - break; - } - return NULL; - } - info = record->xl_info & ~XLR_INFO_MASK; - if (info != XLOG_CHECKPOINT_SHUTDOWN && - info != XLOG_CHECKPOINT_ONLINE) - { - switch (whichChkpt) - { - case 1: - ereport(LOG, - (errmsg("invalid xl_info in primary checkpoint record"))); - break; - default: - ereport(LOG, - (errmsg("invalid xl_info in checkpoint record"))); - break; - } - return NULL; - } - if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint)) - { - switch (whichChkpt) - { - case 1: - ereport(LOG, - (errmsg("invalid length of primary checkpoint record"))); - break; - default: - ereport(LOG, - (errmsg("invalid length of checkpoint record"))); - break; - } - return NULL; - } - return record; -} - /* * Return the current Redo pointer from shared memory. 
* @@ -9849,8 +7019,8 @@ CreateRestartPoint(int flags) ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID; /* update local copy */ - minRecoveryPoint = ControlFile->minRecoveryPoint; - minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; } if (flags & CHECKPOINT_IS_SHUTDOWN) ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; @@ -10313,67 +7483,20 @@ UpdateFullPageWrites(void) END_CRIT_SECTION(); } -/* - * Check that it's OK to switch to new timeline during recovery. - * - * 'lsn' is the address of the shutdown checkpoint record we're about to - * replay. (Currently, timeline can only change at a shutdown checkpoint). - */ -static void -checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, - TimeLineID replayTLI) -{ - /* Check that the record agrees on what the current (old) timeline is */ - if (prevTLI != replayTLI) - ereport(PANIC, - (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record", - prevTLI, replayTLI))); - - /* - * The new timeline better be in the list of timelines we expect to see, - * according to the timeline history. It should also not decrease. - */ - if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs)) - ereport(PANIC, - (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", - newTLI, replayTLI))); - - /* - * If we have not yet reached min recovery point, and we're about to - * switch to a timeline greater than the timeline of the min recovery - * point: trouble. After switching to the new timeline, we could not - * possibly visit the min recovery point on the correct timeline anymore. - * This can happen if there is a newer timeline in the archive that - * branched before the timeline the min recovery point is on, and you - * attempt to do PITR to the new timeline. 
- */ - if (!XLogRecPtrIsInvalid(minRecoveryPoint) && - lsn < minRecoveryPoint && - newTLI > minRecoveryPointTLI) - ereport(PANIC, - (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u", - newTLI, - LSN_FORMAT_ARGS(minRecoveryPoint), - minRecoveryPointTLI))); - - /* Looks good */ -} - /* * XLOG resource manager's routines * * Definitions of info values are in include/catalog/pg_control.h, though * not all record types are related to control file updates. + * + * NOTE: Some XLOG record types that are directly related to WAL recovery + * are handled in xlogrecovery_redo(). */ void xlog_redo(XLogReaderState *record) { uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; XLogRecPtr lsn = record->EndRecPtr; - TimeLineID replayTLI; - - /* No other process can change this, so we can read it without a lock. */ - replayTLI = XLogCtl->replayEndTLI; /* * In XLOG rmgr, backup blocks are only used by XLOG_FPI and @@ -10402,6 +7525,7 @@ xlog_redo(XLogReaderState *record) else if (info == XLOG_CHECKPOINT_SHUTDOWN) { CheckPoint checkPoint; + TimeLineID replayTLI; memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); /* In a SHUTDOWN checkpoint, believe the counters exactly */ @@ -10487,6 +7611,7 @@ xlog_redo(XLogReaderState *record) * We should've already switched to the new TLI before replaying this * record. 
*/ + (void) GetCurrentReplayRecPtr(&replayTLI); if (checkPoint.ThisTimeLineID != replayTLI) ereport(PANIC, (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record", @@ -10497,6 +7622,7 @@ xlog_redo(XLogReaderState *record) else if (info == XLOG_CHECKPOINT_ONLINE) { CheckPoint checkPoint; + TimeLineID replayTLI; memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); /* In an ONLINE checkpoint, treat the XID counter as a minimum */ @@ -10543,6 +7669,7 @@ xlog_redo(XLogReaderState *record) SpinLockRelease(&XLogCtl->info_lck); /* TLI should not change in an on-line checkpoint */ + (void) GetCurrentReplayRecPtr(&replayTLI); if (checkPoint.ThisTimeLineID != replayTLI) ereport(PANIC, (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record", @@ -10552,14 +7679,12 @@ xlog_redo(XLogReaderState *record) } else if (info == XLOG_OVERWRITE_CONTRECORD) { - xl_overwrite_contrecord xlrec; - - memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord)); - VerifyOverwriteContrecord(&xlrec, record); + /* nothing to do here, handled in xlogrecovery_redo() */ } else if (info == XLOG_END_OF_RECOVERY) { xl_end_of_recovery xlrec; + TimeLineID replayTLI; memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery)); @@ -10573,6 +7698,7 @@ xlog_redo(XLogReaderState *record) * We should've already switched to the new TLI before replaying this * record. 
*/ + (void) GetCurrentReplayRecPtr(&replayTLI); if (xlrec.ThisTimeLineID != replayTLI) ereport(PANIC, (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record", @@ -10588,7 +7714,7 @@ xlog_redo(XLogReaderState *record) } else if (info == XLOG_RESTORE_POINT) { - /* nothing to do here */ + /* nothing to do here, handled in xlogrecovery.c */ } else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) { @@ -10626,34 +7752,7 @@ xlog_redo(XLogReaderState *record) } else if (info == XLOG_BACKUP_END) { - XLogRecPtr startpoint; - - memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint)); - - if (ControlFile->backupStartPoint == startpoint) - { - /* - * We have reached the end of base backup, the point where - * pg_stop_backup() was done. The data on disk is now consistent. - * Reset backupStartPoint, and update minRecoveryPoint to make - * sure we don't allow starting up at an earlier point even if - * recovery is stopped and restarted soon after this. - */ - elog(DEBUG1, "end of backup reached"); - - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - - if (ControlFile->minRecoveryPoint < lsn) - { - ControlFile->minRecoveryPoint = lsn; - ControlFile->minRecoveryPointTLI = replayTLI; - } - ControlFile->backupStartPoint = InvalidXLogRecPtr; - ControlFile->backupEndRequired = false; - UpdateControlFile(); - - LWLockRelease(ControlFileLock); - } + /* nothing to do here, handled in xlogrecovery_redo() */ } else if (info == XLOG_PARAMETER_CHANGE) { @@ -10681,11 +7780,14 @@ xlog_redo(XLogReaderState *record) */ if (InArchiveRecovery) { - minRecoveryPoint = ControlFile->minRecoveryPoint; - minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + LocalMinRecoveryPoint = ControlFile->minRecoveryPoint; + LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI; } - if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn) + if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn) { + TimeLineID replayTLI; + + (void) 
GetCurrentReplayRecPtr(&replayTLI); ControlFile->minRecoveryPoint = lsn; ControlFile->minRecoveryPointTLI = replayTLI; } @@ -10724,102 +7826,6 @@ xlog_redo(XLogReaderState *record) } } -/* - * Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. - */ -static void -VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state) -{ - if (xlrec->overwritten_lsn != state->overwrittenRecPtr) - elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X", - LSN_FORMAT_ARGS(xlrec->overwritten_lsn), - LSN_FORMAT_ARGS(state->overwrittenRecPtr)); - - ereport(LOG, - (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s", - LSN_FORMAT_ARGS(xlrec->overwritten_lsn), - timestamptz_to_str(xlrec->overwrite_time)))); - - /* Verifying the record should only happen once */ - state->overwrittenRecPtr = InvalidXLogRecPtr; -} - -#ifdef WAL_DEBUG - -static void -xlog_outrec(StringInfo buf, XLogReaderState *record) -{ - appendStringInfo(buf, "prev %X/%X; xid %u", - LSN_FORMAT_ARGS(XLogRecGetPrev(record)), - XLogRecGetXid(record)); - - appendStringInfo(buf, "; len %u", - XLogRecGetDataLen(record)); - - xlog_block_info(buf, record); -} -#endif /* WAL_DEBUG */ - -/* - * Returns a string giving information about all the blocks in an - * XLogRecord. 
- */ -static void -xlog_block_info(StringInfo buf, XLogReaderState *record) -{ - int block_id; - - /* decode block references */ - for (block_id = 0; block_id <= record->max_block_id; block_id++) - { - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blk; - - if (!XLogRecHasBlockRef(record, block_id)) - continue; - - XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk); - if (forknum != MAIN_FORKNUM) - appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u", - block_id, - rnode.spcNode, rnode.dbNode, rnode.relNode, - forknum, - blk); - else - appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u", - block_id, - rnode.spcNode, rnode.dbNode, rnode.relNode, - blk); - if (XLogRecHasBlockImage(record, block_id)) - appendStringInfoString(buf, " FPW"); - } -} - -/* - * Returns a string describing an XLogRecord, consisting of its identity - * optionally followed by a colon, a space, and a further description. - */ -static void -xlog_outdesc(StringInfo buf, XLogReaderState *record) -{ - RmgrId rmid = XLogRecGetRmid(record); - uint8 info = XLogRecGetInfo(record); - const char *id; - - appendStringInfoString(buf, RmgrTable[rmid].rm_name); - appendStringInfoChar(buf, '/'); - - id = RmgrTable[rmid].rm_identify(info); - if (id == NULL) - appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK); - else - appendStringInfo(buf, "%s: ", id); - - RmgrTable[rmid].rm_desc(buf, record); -} - - /* * Return the (possible) sync flag used for opening a file, depending on the * value of the GUC wal_sync_method. @@ -12024,27 +9030,6 @@ register_persistent_abort_backup_handler(void) already_done = true; } -/* - * Get latest redo apply position. - * - * Exported to allow WALReceiver to read the pointer directly. 
- */ -XLogRecPtr -GetXLogReplayRecPtr(TimeLineID *replayTLI) -{ - XLogRecPtr recptr; - TimeLineID tli; - - SpinLockAcquire(&XLogCtl->info_lck); - recptr = XLogCtl->lastReplayedEndRecPtr; - tli = XLogCtl->lastReplayedTLI; - SpinLockRelease(&XLogCtl->info_lck); - - if (replayTLI) - *replayTLI = tli; - return recptr; -} - /* * Get latest WAL insert pointer */ @@ -12087,262 +9072,6 @@ GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli) LWLockRelease(ControlFileLock); } -/* - * read_backup_label: check to see if a backup_label file is present - * - * If we see a backup_label during recovery, we assume that we are recovering - * from a backup dump file, and we therefore roll forward from the checkpoint - * identified by the label file, NOT what pg_control says. This avoids the - * problem that pg_control might have been archived one or more checkpoints - * later than the start of the dump, and so if we rely on it as the start - * point, we will fail to restore a consistent database state. - * - * Returns true if a backup_label was found (and fills the checkpoint - * location and TLI into *checkPointLoc and *backupLabelTLI, respectively); - * returns false if not. If this backup_label came from a streamed backup, - * *backupEndRequired is set to true. If this backup_label was created during - * recovery, *backupFromStandby is set to true. - * - * Also sets the global variable RedoStartLSN with the LSN read from the - * backup file. 
- */ -static bool -read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, - bool *backupEndRequired, bool *backupFromStandby) -{ - char startxlogfilename[MAXFNAMELEN]; - TimeLineID tli_from_walseg, - tli_from_file; - FILE *lfp; - char ch; - char backuptype[20]; - char backupfrom[20]; - char backuplabel[MAXPGPATH]; - char backuptime[128]; - uint32 hi, - lo; - - /* suppress possible uninitialized-variable warnings */ - *checkPointLoc = InvalidXLogRecPtr; - *backupLabelTLI = 0; - *backupEndRequired = false; - *backupFromStandby = false; - - /* - * See if label file is present - */ - lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); - if (!lfp) - { - if (errno != ENOENT) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("could not read file \"%s\": %m", - BACKUP_LABEL_FILE))); - return false; /* it's not there, all is fine */ - } - - /* - * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code - * is pretty crude, but we are not expecting any variability in the file - * format). - */ - if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c", - &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n') - ereport(FATAL, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); - RedoStartLSN = ((uint64) hi) << 32 | lo; - if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c", - &hi, &lo, &ch) != 3 || ch != '\n') - ereport(FATAL, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); - *checkPointLoc = ((uint64) hi) << 32 | lo; - - /* - * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore - * from an older backup anyway, but since the information on it is not - * strictly required, don't error out if it's missing for some reason. 
- */ - if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1) - { - if (strcmp(backuptype, "streamed") == 0) - *backupEndRequired = true; - } - - if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1) - { - if (strcmp(backupfrom, "standby") == 0) - *backupFromStandby = true; - } - - /* - * Parse START TIME and LABEL. Those are not mandatory fields for recovery - * but checking for their presence is useful for debugging and the next - * sanity checks. Cope also with the fact that the result buffers have a - * pre-allocated size, hence if the backup_label file has been generated - * with strings longer than the maximum assumed here an incorrect parsing - * happens. That's fine as only minor consistency checks are done - * afterwards. - */ - if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1) - ereport(DEBUG1, - (errmsg_internal("backup time %s in file \"%s\"", - backuptime, BACKUP_LABEL_FILE))); - - if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1) - ereport(DEBUG1, - (errmsg_internal("backup label %s in file \"%s\"", - backuplabel, BACKUP_LABEL_FILE))); - - /* - * START TIMELINE is new as of 11. Its parsing is not mandatory, still use - * it as a sanity check if present. 
- */ - if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1) - { - if (tli_from_walseg != tli_from_file) - ereport(FATAL, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE), - errdetail("Timeline ID parsed is %u, but expected %u.", - tli_from_file, tli_from_walseg))); - - ereport(DEBUG1, - (errmsg_internal("backup timeline %u in file \"%s\"", - tli_from_file, BACKUP_LABEL_FILE))); - } - - if (ferror(lfp) || FreeFile(lfp)) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("could not read file \"%s\": %m", - BACKUP_LABEL_FILE))); - - *backupLabelTLI = tli_from_walseg; - - return true; -} - -/* - * read_tablespace_map: check to see if a tablespace_map file is present - * - * If we see a tablespace_map file during recovery, we assume that we are - * recovering from a backup dump file, and we therefore need to create symlinks - * as per the information present in tablespace_map file. - * - * Returns true if a tablespace_map file was found (and fills *tablespaces - * with a tablespaceinfo struct for each tablespace listed in the file); - * returns false if not. - */ -static bool -read_tablespace_map(List **tablespaces) -{ - tablespaceinfo *ti; - FILE *lfp; - char str[MAXPGPATH]; - int ch, - i, - n; - bool was_backslash; - - /* - * See if tablespace_map file is present - */ - lfp = AllocateFile(TABLESPACE_MAP, "r"); - if (!lfp) - { - if (errno != ENOENT) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("could not read file \"%s\": %m", - TABLESPACE_MAP))); - return false; /* it's not there, all is fine */ - } - - /* - * Read and parse the link name and path lines from tablespace_map file - * (this code is pretty crude, but we are not expecting any variability in - * the file format). De-escape any backslashes that were inserted. 
- */ - i = 0; - was_backslash = false; - while ((ch = fgetc(lfp)) != EOF) - { - if (!was_backslash && (ch == '\n' || ch == '\r')) - { - if (i == 0) - continue; /* \r immediately followed by \n */ - - /* - * The de-escaped line should contain an OID followed by exactly - * one space followed by a path. The path might start with - * spaces, so don't be too liberal about parsing. - */ - str[i] = '\0'; - n = 0; - while (str[n] && str[n] != ' ') - n++; - if (n < 1 || n >= i - 1) - ereport(FATAL, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); - str[n++] = '\0'; - - ti = palloc0(sizeof(tablespaceinfo)); - ti->oid = pstrdup(str); - ti->path = pstrdup(str + n); - *tablespaces = lappend(*tablespaces, ti); - - i = 0; - continue; - } - else if (!was_backslash && ch == '\\') - was_backslash = true; - else - { - if (i < sizeof(str) - 1) - str[i++] = ch; - was_backslash = false; - } - } - - if (i != 0 || was_backslash) /* last line not terminated? */ - ereport(FATAL, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); - - if (ferror(lfp) || FreeFile(lfp)) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("could not read file \"%s\": %m", - TABLESPACE_MAP))); - - return true; -} - -/* - * Error context callback for errors occurring during rm_redo(). - */ -static void -rm_redo_error_callback(void *arg) -{ - XLogReaderState *record = (XLogReaderState *) arg; - StringInfoData buf; - - initStringInfo(&buf); - xlog_outdesc(&buf, record); - xlog_block_info(&buf, record); - - /* translator: %s is a WAL record description */ - errcontext("WAL redo at %X/%X for %s", - LSN_FORMAT_ARGS(record->ReadRecPtr), - buf.data); - - pfree(buf.data); -} - /* * BackupInProgress: check if online backup mode is active * @@ -12424,715 +9153,8 @@ CancelBackup(void) } } -/* - * Read the XLOG page containing RecPtr into readBuf (if not read already). 
- * Returns number of bytes read, if the page is read successfully, or -1 - * in case of errors. When errors occur, they are ereport'ed, but only - * if they have not been previously reported. - * - * This is responsible for restoring files from archive as needed, as well - * as for waiting for the requested WAL record to arrive in standby mode. - * - * 'emode' specifies the log level used for reporting "file not found" or - * "end of WAL" situations in archive recovery, or in standby mode when a - * trigger file is found. If set to WARNING or below, XLogPageRead() returns - * false in those situations, on higher log levels the ereport() won't - * return. - * - * In standby mode, if after a successful return of XLogPageRead() the - * caller finds the record it's interested in to be broken, it should - * ereport the error with the level determined by - * emode_for_corrupt_record(), and then set lastSourceFailed - * and call XLogPageRead() again with the same arguments. This lets - * XLogPageRead() to try fetching the record from another source, or to - * sleep and retry. - */ -static int -XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, - XLogRecPtr targetRecPtr, char *readBuf) -{ - XLogPageReadPrivate *private = - (XLogPageReadPrivate *) xlogreader->private_data; - int emode = private->emode; - uint32 targetPageOff; - XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; - int r; - - XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size); - targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size); - - /* - * See if we need to switch to a new segment because the requested record - * is not in the currently open one. - */ - if (readFile >= 0 && - !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size)) - { - /* - * Request a restartpoint if we've replayed too much xlog since the - * last one. 
- */ - if (ArchiveRecoveryRequested && IsUnderPostmaster) - { - if (XLogCheckpointNeeded(readSegNo)) - { - (void) GetRedoRecPtr(); - if (XLogCheckpointNeeded(readSegNo)) - RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); - } - } - - close(readFile); - readFile = -1; - readSource = XLOG_FROM_ANY; - } - - XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size); - -retry: - /* See if we need to retrieve more data */ - if (readFile < 0 || - (readSource == XLOG_FROM_STREAM && - flushedUpto < targetPagePtr + reqLen)) - { - if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen, - private->randAccess, - private->fetching_ckpt, - targetRecPtr, - private->replayTLI, - xlogreader->EndRecPtr)) - { - if (readFile >= 0) - close(readFile); - readFile = -1; - readLen = 0; - readSource = XLOG_FROM_ANY; - - return -1; - } - } - - /* - * At this point, we have the right segment open and if we're streaming we - * know the requested record is in it. - */ - Assert(readFile != -1); - - /* - * If the current segment is being streamed from the primary, calculate - * how much of the current page we have received already. We know the - * requested record has been received, but this is for the benefit of - * future calls, to allow quick exit at the top of this function. 
- */ - if (readSource == XLOG_FROM_STREAM) - { - if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ)) - readLen = XLOG_BLCKSZ; - else - readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) - - targetPageOff; - } - else - readLen = XLOG_BLCKSZ; - - /* Read the requested page */ - readOff = targetPageOff; - - pgstat_report_wait_start(WAIT_EVENT_WAL_READ); - r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); - if (r != XLOG_BLCKSZ) - { - char fname[MAXFNAMELEN]; - int save_errno = errno; - - pgstat_report_wait_end(); - XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); - if (r < 0) - { - errno = save_errno; - ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), - (errcode_for_file_access(), - errmsg("could not read from log segment %s, offset %u: %m", - fname, readOff))); - } - else - ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("could not read from log segment %s, offset %u: read %d of %zu", - fname, readOff, r, (Size) XLOG_BLCKSZ))); - goto next_record_is_invalid; - } - pgstat_report_wait_end(); - - Assert(targetSegNo == readSegNo); - Assert(targetPageOff == readOff); - Assert(reqLen <= readLen); - - xlogreader->seg.ws_tli = curFileTLI; - - /* - * Check the page header immediately, so that we can retry immediately if - * it's not valid. This may seem unnecessary, because ReadPageInternal() - * validates the page header anyway, and would propagate the failure up to - * ReadRecord(), which would retry. However, there's a corner case with - * continuation records, if a record is split across two pages such that - * we would need to read the two pages from different sources. For - * example, imagine a scenario where a streaming replica is started up, - * and replay reaches a record that's split across two WAL segments. The - * first page is only available locally, in pg_wal, because it's already - * been recycled on the primary. 
The second page, however, is not present - * in pg_wal, and we should stream it from the primary. There is a - * recycled WAL segment present in pg_wal, with garbage contents, however. - * We would read the first page from the local WAL segment, but when - * reading the second page, we would read the bogus, recycled, WAL - * segment. If we didn't catch that case here, we would never recover, - * because ReadRecord() would retry reading the whole record from the - * beginning. - * - * Of course, this only catches errors in the page header, which is what - * happens in the case of a recycled WAL segment. Other kinds of errors or - * corruption still has the same problem. But this at least fixes the - * common case, which can happen as part of normal operation. - * - * Validating the page header is cheap enough that doing it twice - * shouldn't be a big deal from a performance point of view. - * - * When not in standby mode, an invalid page header should cause recovery - * to end, not retry reading the page, so we don't need to validate the - * page header here for the retry. Instead, ReadPageInternal() is - * responsible for the validation. - */ - if (StandbyMode && - !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf)) - { - /* - * Emit this error right now then retry this page immediately. Use - * errmsg_internal() because the message was already translated. 
- */ - if (xlogreader->errormsg_buf[0]) - ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), - (errmsg_internal("%s", xlogreader->errormsg_buf))); - - /* reset any error XLogReaderValidatePageHeader() might have set */ - xlogreader->errormsg_buf[0] = '\0'; - goto next_record_is_invalid; - } - - return readLen; - -next_record_is_invalid: - lastSourceFailed = true; - - if (readFile >= 0) - close(readFile); - readFile = -1; - readLen = 0; - readSource = XLOG_FROM_ANY; - - /* In standby-mode, keep trying */ - if (StandbyMode) - goto retry; - else - return -1; -} - -/* - * Open the WAL segment containing WAL location 'RecPtr'. - * - * The segment can be fetched via restore_command, or via walreceiver having - * streamed the record, or it can already be present in pg_wal. Checking - * pg_wal is mainly for crash recovery, but it will be polled in standby mode - * too, in case someone copies a new segment directly to pg_wal. That is not - * documented or recommended, though. - * - * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should - * prepare to read WAL starting from RedoStartLSN after this. - * - * 'RecPtr' might not point to the beginning of the record we're interested - * in, it might also point to the page or segment header. In that case, - * 'tliRecPtr' is the position of the WAL record we're interested in. It is - * used to decide which timeline to stream the requested WAL from. - * - * 'replayLSN' is the current replay LSN, so that if we scan for new - * timelines, we can reject a switch to a timeline that branched off before - * this point. - * - * If the record is not immediately available, the function returns false - * if we're not in standby mode. In standby mode, waits for it to become - * available. - * - * When the requested record becomes available, the function opens the file - * containing it (if not open already), and returns true. 
When end of standby - * mode is triggered by the user, and there is no more WAL available, returns - * false. - */ -static bool -WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, - bool fetching_ckpt, XLogRecPtr tliRecPtr, - TimeLineID replayTLI, XLogRecPtr replayLSN) -{ - static TimestampTz last_fail_time = 0; - TimestampTz now; - bool streaming_reply_sent = false; - - /*------- - * Standby mode is implemented by a state machine: - * - * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just - * pg_wal (XLOG_FROM_PG_WAL) - * 2. Check trigger file - * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) - * 4. Rescan timelines - * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1. - * - * Failure to read from the current source advances the state machine to - * the next state. - * - * 'currentSource' indicates the current state. There are no currentSource - * values for "check trigger", "rescan timelines", and "sleep" states, - * those actions are taken when reading from the previous source fails, as - * part of advancing to the next state. - * - * If standby mode is turned off while reading WAL from stream, we move - * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching - * the files (which would be required at end of recovery, e.g., timeline - * history file) from archive or pg_wal. We don't need to kill WAL receiver - * here because it's already stopped when standby mode is turned off at - * the end of recovery. - *------- - */ - if (!InArchiveRecovery) - currentSource = XLOG_FROM_PG_WAL; - else if (currentSource == XLOG_FROM_ANY || - (!StandbyMode && currentSource == XLOG_FROM_STREAM)) - { - lastSourceFailed = false; - currentSource = XLOG_FROM_ARCHIVE; - } - - for (;;) - { - XLogSource oldSource = currentSource; - bool startWalReceiver = false; - - /* - * First check if we failed to read from the current source, and - * advance the state machine if so. 
The failure to read might've - * happened outside this function, e.g when a CRC check fails on a - * record, or within this loop. - */ - if (lastSourceFailed) - { - switch (currentSource) - { - case XLOG_FROM_ARCHIVE: - case XLOG_FROM_PG_WAL: - - /* - * Check to see if the trigger file exists. Note that we - * do this only after failure, so when you create the - * trigger file, we still finish replaying as much as we - * can from archive and pg_wal before failover. - */ - if (StandbyMode && CheckForStandbyTrigger()) - { - XLogShutdownWalRcv(); - return false; - } - - /* - * Not in standby mode, and we've now tried the archive - * and pg_wal. - */ - if (!StandbyMode) - return false; - - /* - * Move to XLOG_FROM_STREAM state, and set to start a - * walreceiver if necessary. - */ - currentSource = XLOG_FROM_STREAM; - startWalReceiver = true; - break; - - case XLOG_FROM_STREAM: - - /* - * Failure while streaming. Most likely, we got here - * because streaming replication was terminated, or - * promotion was triggered. But we also get here if we - * find an invalid record in the WAL streamed from the - * primary, in which case something is seriously wrong. - * There's little chance that the problem will just go - * away, but PANIC is not good for availability either, - * especially in hot standby mode. So, we treat that the - * same as disconnection, and retry from archive/pg_wal - * again. The WAL in the archive should be identical to - * what was streamed, so it's unlikely that it helps, but - * one can hope... - */ - - /* - * We should be able to move to XLOG_FROM_STREAM only in - * standby mode. - */ - Assert(StandbyMode); - - /* - * Before we leave XLOG_FROM_STREAM state, make sure that - * walreceiver is not active, so that it won't overwrite - * WAL that we restore from archive. - */ - if (WalRcvStreaming()) - XLogShutdownWalRcv(); - - /* - * Before we sleep, re-scan for possible new timelines if - * we were requested to recover to the latest timeline. 
- */ - if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) - { - if (rescanLatestTimeLine(replayTLI, replayLSN)) - { - currentSource = XLOG_FROM_ARCHIVE; - break; - } - } - - /* - * XLOG_FROM_STREAM is the last state in our state - * machine, so we've exhausted all the options for - * obtaining the requested WAL. We're going to loop back - * and retry from the archive, but if it hasn't been long - * since last attempt, sleep wal_retrieve_retry_interval - * milliseconds to avoid busy-waiting. - */ - now = GetCurrentTimestamp(); - if (!TimestampDifferenceExceeds(last_fail_time, now, - wal_retrieve_retry_interval)) - { - long wait_time; - - wait_time = wal_retrieve_retry_interval - - TimestampDifferenceMilliseconds(last_fail_time, now); - - (void) WaitLatch(&XLogCtl->recoveryWakeupLatch, - WL_LATCH_SET | WL_TIMEOUT | - WL_EXIT_ON_PM_DEATH, - wait_time, - WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL); - ResetLatch(&XLogCtl->recoveryWakeupLatch); - now = GetCurrentTimestamp(); - - /* Handle interrupt signals of startup process */ - HandleStartupProcInterrupts(); - } - last_fail_time = now; - currentSource = XLOG_FROM_ARCHIVE; - break; - - default: - elog(ERROR, "unexpected WAL source %d", currentSource); - } - } - else if (currentSource == XLOG_FROM_PG_WAL) - { - /* - * We just successfully read a file in pg_wal. We prefer files in - * the archive over ones in pg_wal, so try the next file again - * from the archive first. - */ - if (InArchiveRecovery) - currentSource = XLOG_FROM_ARCHIVE; - } - - if (currentSource != oldSource) - elog(DEBUG2, "switched WAL source from %s to %s after %s", - xlogSourceNames[oldSource], xlogSourceNames[currentSource], - lastSourceFailed ? "failure" : "success"); - - /* - * We've now handled possible failure. Try to read from the chosen - * source. 
- */ - lastSourceFailed = false; - - switch (currentSource) - { - case XLOG_FROM_ARCHIVE: - case XLOG_FROM_PG_WAL: - - /* - * WAL receiver must not be running when reading WAL from - * archive or pg_wal. - */ - Assert(!WalRcvStreaming()); - - /* Close any old file we might have open. */ - if (readFile >= 0) - { - close(readFile); - readFile = -1; - } - /* Reset curFileTLI if random fetch. */ - if (randAccess) - curFileTLI = 0; - - /* - * Try to restore the file from archive, or read an existing - * file from pg_wal. - */ - readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, - currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY : - currentSource); - if (readFile >= 0) - return true; /* success! */ - - /* - * Nope, not found in archive or pg_wal. - */ - lastSourceFailed = true; - break; - - case XLOG_FROM_STREAM: - { - bool havedata; - - /* - * We should be able to move to XLOG_FROM_STREAM only in - * standby mode. - */ - Assert(StandbyMode); - - /* - * First, shutdown walreceiver if its restart has been - * requested -- but no point if we're already slated for - * starting it. - */ - if (pendingWalRcvRestart && !startWalReceiver) - { - XLogShutdownWalRcv(); - - /* - * Re-scan for possible new timelines if we were - * requested to recover to the latest timeline. - */ - if (recoveryTargetTimeLineGoal == - RECOVERY_TARGET_TIMELINE_LATEST) - rescanLatestTimeLine(replayTLI, replayLSN); - - startWalReceiver = true; - } - pendingWalRcvRestart = false; - - /* - * Launch walreceiver if needed. - * - * If fetching_ckpt is true, RecPtr points to the initial - * checkpoint location. In that case, we use RedoStartLSN - * as the streaming start position instead of RecPtr, so - * that when we later jump backwards to start redo at - * RedoStartLSN, we will have the logs streamed already. 
- */ - if (startWalReceiver && - PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0) - { - XLogRecPtr ptr; - TimeLineID tli; - - if (fetching_ckpt) - { - ptr = RedoStartLSN; - tli = ControlFile->checkPointCopy.ThisTimeLineID; - } - else - { - ptr = RecPtr; - - /* - * Use the record begin position to determine the - * TLI, rather than the position we're reading. - */ - tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); - - if (curFileTLI > 0 && tli < curFileTLI) - elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", - LSN_FORMAT_ARGS(tliRecPtr), - tli, curFileTLI); - } - curFileTLI = tli; - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - XLogCtl->InstallXLogFileSegmentActive = true; - LWLockRelease(ControlFileLock); - RequestXLogStreaming(tli, ptr, PrimaryConnInfo, - PrimarySlotName, - wal_receiver_create_temp_slot); - flushedUpto = 0; - } - - /* - * Check if WAL receiver is active or wait to start up. - */ - if (!WalRcvStreaming()) - { - lastSourceFailed = true; - break; - } - - /* - * Walreceiver is active, so see if new data has arrived. - * - * We only advance XLogReceiptTime when we obtain fresh - * WAL from walreceiver and observe that we had already - * processed everything before the most recent "chunk" - * that it flushed to disk. In steady state where we are - * keeping up with the incoming data, XLogReceiptTime will - * be updated on each cycle. When we are behind, - * XLogReceiptTime will not advance, so the grace time - * allotted to conflicting queries will decrease. 
- */ - if (RecPtr < flushedUpto) - havedata = true; - else - { - XLogRecPtr latestChunkStart; - - flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI); - if (RecPtr < flushedUpto && receiveTLI == curFileTLI) - { - havedata = true; - if (latestChunkStart <= RecPtr) - { - XLogReceiptTime = GetCurrentTimestamp(); - SetCurrentChunkStartTime(XLogReceiptTime); - } - } - else - havedata = false; - } - if (havedata) - { - /* - * Great, streamed far enough. Open the file if it's - * not open already. Also read the timeline history - * file if we haven't initialized timeline history - * yet; it should be streamed over and present in - * pg_wal by now. Use XLOG_FROM_STREAM so that source - * info is set correctly and XLogReceiptTime isn't - * changed. - * - * NB: We must set readTimeLineHistory based on - * recoveryTargetTLI, not receiveTLI. Normally they'll - * be the same, but if recovery_target_timeline is - * 'latest' and archiving is configured, then it's - * possible that we managed to retrieve one or more - * new timeline history files from the archive, - * updating recoveryTargetTLI. - */ - if (readFile < 0) - { - if (!expectedTLEs) - expectedTLEs = readTimeLineHistory(recoveryTargetTLI); - readFile = XLogFileRead(readSegNo, PANIC, - receiveTLI, - XLOG_FROM_STREAM, false); - Assert(readFile >= 0); - } - else - { - /* just make sure source info is correct... */ - readSource = XLOG_FROM_STREAM; - XLogReceiptSource = XLOG_FROM_STREAM; - return true; - } - break; - } - - /* - * Data not here yet. Check for trigger, then wait for - * walreceiver to wake us up when new WAL arrives. - */ - if (CheckForStandbyTrigger()) - { - /* - * Note that we don't "return false" immediately here. - * After being triggered, we still want to replay all - * the WAL that was already streamed. 
It's in pg_wal - * now, so we just treat this as a failure, and the - * state machine will move on to replay the streamed - * WAL from pg_wal, and then recheck the trigger and - * exit replay. - */ - lastSourceFailed = true; - break; - } - - /* - * Since we have replayed everything we have received so - * far and are about to start waiting for more WAL, let's - * tell the upstream server our replay location now so - * that pg_stat_replication doesn't show stale - * information. - */ - if (!streaming_reply_sent) - { - WalRcvForceReply(); - streaming_reply_sent = true; - } - - /* - * Wait for more WAL to arrive. Time out after 5 seconds - * to react to a trigger file promptly and to check if the - * WAL receiver is still active. - */ - (void) WaitLatch(&XLogCtl->recoveryWakeupLatch, - WL_LATCH_SET | WL_TIMEOUT | - WL_EXIT_ON_PM_DEATH, - 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM); - ResetLatch(&XLogCtl->recoveryWakeupLatch); - break; - } - - default: - elog(ERROR, "unexpected WAL source %d", currentSource); - } - - /* - * Check for recovery pause here so that we can confirm more quickly - * that a requested pause has actually taken effect. - */ - if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState != - RECOVERY_NOT_PAUSED) - recoveryPausesHere(false); - - /* - * This possibly-long loop needs to handle interrupts of startup - * process. - */ - HandleStartupProcInterrupts(); - } - - return false; /* not reached */ -} - -/* - * Set flag to signal the walreceiver to restart. (The startup process calls - * this on noticing a relevant configuration change.) - */ -void -StartupRequestWalReceiverRestart(void) -{ - if (currentSource == XLOG_FROM_STREAM && WalRcvRunning()) - { - ereport(LOG, - (errmsg("WAL receiver process shutdown requested"))); - - pendingWalRcvRestart = true; - } -} - /* Thin wrapper around ShutdownWalRcv(). 
*/ -static void +void XLogShutdownWalRcv(void) { ShutdownWalRcv(); @@ -13142,153 +9164,25 @@ XLogShutdownWalRcv(void) LWLockRelease(ControlFileLock); } -/* - * Determine what log level should be used to report a corrupt WAL record - * in the current WAL page, previously read by XLogPageRead(). - * - * 'emode' is the error mode that would be used to report a file-not-found - * or legitimate end-of-WAL situation. Generally, we use it as-is, but if - * we're retrying the exact same record that we've tried previously, only - * complain the first time to keep the noise down. However, we only do when - * reading from pg_wal, because we don't expect any invalid records in archive - * or in records streamed from the primary. Files in the archive should be complete, - * and we should never hit the end of WAL because we stop and wait for more WAL - * to arrive before replaying it. - * - * NOTE: This function remembers the RecPtr value it was last called with, - * to suppress repeated messages about the same record. Only call this when - * you are about to ereport(), or you might cause a later message to be - * erroneously suppressed. - */ -static int -emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) -{ - static XLogRecPtr lastComplaint = 0; - - if (readSource == XLOG_FROM_PG_WAL && emode == LOG) - { - if (RecPtr == lastComplaint) - emode = DEBUG1; - else - lastComplaint = RecPtr; - } - return emode; -} - -/* - * Has a standby promotion already been triggered? - * - * Unlike CheckForStandbyTrigger(), this works in any process - * that's connected to shared memory. - */ -bool -PromoteIsTriggered(void) -{ - /* - * We check shared state each time only until a standby promotion is - * triggered. We can't trigger a promotion again, so there's no need to - * keep checking after the shared variable has once been seen true. 
- */ - if (LocalPromoteIsTriggered) - return true; - - SpinLockAcquire(&XLogCtl->info_lck); - LocalPromoteIsTriggered = XLogCtl->SharedPromoteIsTriggered; - SpinLockRelease(&XLogCtl->info_lck); - - return LocalPromoteIsTriggered; -} - -static void -SetPromoteIsTriggered(void) -{ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->SharedPromoteIsTriggered = true; - SpinLockRelease(&XLogCtl->info_lck); - - /* - * Mark the recovery pause state as 'not paused' because the paused state - * ends and promotion continues if a promotion is triggered while recovery - * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly - * return 'paused' while a promotion is ongoing. - */ - SetRecoveryPause(false); - - LocalPromoteIsTriggered = true; -} - -/* - * Check to see whether the user-specified trigger file exists and whether a - * promote request has arrived. If either condition holds, return true. - */ -static bool -CheckForStandbyTrigger(void) -{ - struct stat stat_buf; - - if (LocalPromoteIsTriggered) - return true; - - if (IsPromoteSignaled() && CheckPromoteSignal()) - { - ereport(LOG, (errmsg("received promote request"))); - RemovePromoteSignalFiles(); - ResetPromoteSignaled(); - SetPromoteIsTriggered(); - return true; - } - - if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0) - return false; - - if (stat(PromoteTriggerFile, &stat_buf) == 0) - { - ereport(LOG, - (errmsg("promote trigger file found: %s", PromoteTriggerFile))); - unlink(PromoteTriggerFile); - SetPromoteIsTriggered(); - return true; - } - else if (errno != ENOENT) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat promote trigger file \"%s\": %m", - PromoteTriggerFile))); - - return false; -} - -/* - * Remove the files signaling a standby promotion request. - */ +/* Enable WAL file recycling and preallocation. 
*/ void -RemovePromoteSignalFiles(void) +SetInstallXLogFileSegmentActive(void) { - unlink(PROMOTE_SIGNAL_FILE); + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + XLogCtl->InstallXLogFileSegmentActive = true; + LWLockRelease(ControlFileLock); } -/* - * Check to see if a promote request has arrived. - */ bool -CheckPromoteSignal(void) +IsInstallXLogFileSegmentActive(void) { - struct stat stat_buf; + bool result; - if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0) - return true; + LWLockAcquire(ControlFileLock, LW_SHARED); + result = XLogCtl->InstallXLogFileSegmentActive; + LWLockRelease(ControlFileLock); - return false; -} - -/* - * Wake up startup process to replay newly arrived WAL, or to notice that - * failover has been requested. - */ -void -WakeupRecovery(void) -{ - SetLatch(&XLogCtl->recoveryWakeupLatch); + return result; } /* @@ -13301,12 +9195,3 @@ SetWalWriterSleeping(bool sleeping) XLogCtl->WalWriterSleeping = sleeping; SpinLockRelease(&XLogCtl->info_lck); } - -/* - * Schedule a walreceiver wakeup in the main recovery loop. 
- */ -void -XLogRequestWalReceiverReply(void) -{ - doRequestWalReceiverReply = true; -} diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index d8af5aad58..2f900533cd 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -19,8 +19,8 @@ #include <unistd.h> #include "access/htup_details.h" -#include "access/xlog.h" #include "access/xlog_internal.h" +#include "access/xlogrecovery.h" #include "access/xlogutils.h" #include "catalog/pg_type.h" #include "funcapi.h" diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c new file mode 100644 index 0000000000..d5269ede80 --- /dev/null +++ b/src/backend/access/transam/xlogrecovery.c @@ -0,0 +1,4537 @@ +/*------------------------------------------------------------------------- + * + * xlogrecovery.c + * Functions for WAL recovery, standby mode + * + * This source file contains functions controlling WAL recovery. + * InitWalRecovery() initializes the system for crash or archive recovery, + * or standby mode, depending on configuration options and the state of + * the control file and possible backup label file. PerformWalRecovery() + * performs the actual WAL replay, calling the rmgr-specific redo routines. + * EndWalRecovery() performs end-of-recovery checks and cleanup actions, + * and prepares information needed to initialize the WAL for writes. In + * addition to these three main functions, there are a bunch of functions + * for interrogating recovery state and controlling the recovery process. 
+ * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xlogrecovery.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <ctype.h> +#include <math.h> +#include <time.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <unistd.h> + +#include "access/timeline.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/xlogarchive.h" +#include "access/xlogreader.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "catalog/pg_control.h" +#include "commands/tablespace.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "postmaster/startup.h" +#include "replication/basebackup.h" +#include "replication/walreceiver.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/spin.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/ps_status.h" +#include "utils/pg_rusage.h" + +/* Unsupported old recovery command file names (relative to $PGDATA) */ +#define RECOVERY_COMMAND_FILE "recovery.conf" +#define RECOVERY_COMMAND_DONE "recovery.done" + +/* + * GUC support + */ +const struct config_enum_entry recovery_target_action_options[] = { + {"pause", RECOVERY_TARGET_ACTION_PAUSE, false}, + {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false}, + {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false}, + {NULL, 0, false} +}; + +/* options formerly taken from recovery.conf for archive recovery */ +char *recoveryRestoreCommand = NULL; +char *recoveryEndCommand = NULL; +char *archiveCleanupCommand = NULL; +RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET; +bool recoveryTargetInclusive = true; +int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE; +TransactionId 
recoveryTargetXid; +char *recovery_target_time_string; +TimestampTz recoveryTargetTime; +const char *recoveryTargetName; +XLogRecPtr recoveryTargetLSN; +int recovery_min_apply_delay = 0; + +/* options formerly taken from recovery.conf for XLOG streaming */ +char *PrimaryConnInfo = NULL; +char *PrimarySlotName = NULL; +char *PromoteTriggerFile = NULL; +bool wal_receiver_create_temp_slot = false; + +/* + * recoveryTargetTimeLineGoal: what the user requested, if any + * + * recoveryTargetTLIRequested: numeric value of requested timeline, if constant + * + * recoveryTargetTLI: the currently understood target timeline; changes + * + * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and + * the timelines of its known parents, newest first (so recoveryTargetTLI is + * always the first list member). Only these TLIs are expected to be seen in + * the WAL segments we read, and indeed only these TLIs will be considered as + * candidate WAL files to open at all. + * + * curFileTLI: the TLI appearing in the name of the current input WAL file. + * (This is not necessarily the same as the timeline from which we are + * replaying WAL, which StartupXLOG calls replayTLI, because we could be + * scanning data that was copied from an ancestor timeline when the current + * file was created.) During a sequential scan we do not allow this value + * to decrease. + */ +RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST; +TimeLineID recoveryTargetTLIRequested = 0; +TimeLineID recoveryTargetTLI = 0; +static List *expectedTLEs; +static TimeLineID curFileTLI; + +/* + * When ArchiveRecoveryRequested is set, archive recovery was requested, + * ie. signal files were present. When InArchiveRecovery is set, we are + * currently recovering using offline XLOG archives. These variables are only + * valid in the startup process. 
+ * + * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're + * currently performing crash recovery using only XLOG files in pg_wal, but + * will switch to using offline XLOG archives as soon as we reach the end of + * WAL in pg_wal. +*/ +bool ArchiveRecoveryRequested = false; +bool InArchiveRecovery = false; + +/* + * When StandbyModeRequested is set, standby mode was requested, i.e. + * standby.signal file was present. When StandbyMode is set, we are currently + * in standby mode. These variables are only valid in the startup process. + * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery. + */ +static bool StandbyModeRequested = false; +bool StandbyMode = false; + +/* was a signal file present at startup? */ +static bool standby_signal_file_found = false; +static bool recovery_signal_file_found = false; + +/* + * CheckPointLoc is the position of the checkpoint record that determines + * where to start the replay. It comes from the backup label file or the + * control file. + * + * RedoStartLSN is the checkpoint's REDO location, also from the backup label + * file or the control file. In standby mode, XLOG streaming usually starts + * from the position where an invalid record was found. But if we fail to + * read even the initial checkpoint record, we use the REDO location instead + * of the checkpoint location as the start position of XLOG streaming. + * Otherwise we would have to jump backwards to the REDO location after + * reading the checkpoint record, because the REDO record can precede the + * checkpoint record. + */ +static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr; +static TimeLineID CheckPointTLI = 0; +static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr; +static TimeLineID RedoStartTLI = 0; + +/* + * Local copy of SharedHotStandbyActive variable. False actually means "not + * known, need to check the shared state". 
+ */ +static bool LocalHotStandbyActive = false; + +/* + * Local copy of SharedPromoteIsTriggered variable. False actually means "not + * known, need to check the shared state". + */ +static bool LocalPromoteIsTriggered = false; + +/* Has the recovery code requested a walreceiver wakeup? */ +static bool doRequestWalReceiverReply; + +/* XLogReader object used to parse the WAL records */ +static XLogReaderState *xlogreader = NULL; + +/* Parameters passed down from ReadRecord to the XLogPageRead callback. */ +typedef struct XLogPageReadPrivate +{ + int emode; + bool fetching_ckpt; /* are we fetching a checkpoint record? */ + bool randAccess; + TimeLineID replayTLI; +} XLogPageReadPrivate; + +/* flag to tell XLogPageRead that we have started replaying */ +static bool InRedo = false; + +/* + * Codes indicating where we got a WAL file from during recovery, or where + * to attempt to get one. + */ +typedef enum +{ + XLOG_FROM_ANY = 0, /* request to read WAL from any source */ + XLOG_FROM_ARCHIVE, /* restored using restore_command */ + XLOG_FROM_PG_WAL, /* existing file in pg_wal */ + XLOG_FROM_STREAM /* streamed from primary */ +} XLogSource; + +/* human-readable names for XLogSources, for debugging output */ +static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"}; + +/* + * readFile is -1 or a kernel FD for the log file segment that's currently + * open for reading. readSegNo identifies the segment. readOff is the offset + * of the page just read, readLen indicates how much of it has been read into + * readBuf, and readSource indicates where we got the currently open file from. + * + * Note: we could use Reserve/ReleaseExternalFD to track consumption of this + * FD too (like for openLogFile in xlog.c); but it doesn't currently seem + * worthwhile, since the XLOG is not read by general-purpose sessions. 
+ */ +static int readFile = -1; +static XLogSegNo readSegNo = 0; +static uint32 readOff = 0; +static uint32 readLen = 0; +static XLogSource readSource = XLOG_FROM_ANY; + +/* + * Keeps track of which source we're currently reading from. This is + * different from readSource in that this is always set, even when we don't + * currently have a WAL file open. If lastSourceFailed is set, our last + * attempt to read from currentSource failed, and we should try another source + * next. + * + * pendingWalRcvRestart is set when a config change occurs that requires a + * walreceiver restart. This is only valid in XLOG_FROM_STREAM state. + */ +static XLogSource currentSource = XLOG_FROM_ANY; +static bool lastSourceFailed = false; +static bool pendingWalRcvRestart = false; + +/* + * These variables track when we last obtained some WAL data to process, + * and where we got it from. (XLogReceiptSource is initially the same as + * readSource, but readSource gets reset to zero when we don't have data + * to process right now. It is also different from currentSource, which + * also changes when we try to read from a source and fail, while + * XLogReceiptSource tracks where we last successfully read some WAL.) + */ +static TimestampTz XLogReceiptTime = 0; +static XLogSource XLogReceiptSource = XLOG_FROM_ANY; + +/* Local copy of WalRcv->flushedUpto */ +static XLogRecPtr flushedUpto = 0; +static TimeLineID receiveTLI = 0; + +/* + * Copy of minRecoveryPoint and backupEndPoint from the control file. + * + * In order to reach consistency, we must replay the WAL up to + * minRecoveryPoint. If backupEndRequired is true, we must also reach + * backupEndPoint, or if it's invalid, an end-of-backup record corresponding + * to backupStartPoint. + * + * Note: In archive recovery, after consistency has been reached, the + * functions in xlog.c will start updating minRecoveryPoint in the control + * file. 
But this copy of minRecoveryPoint variable reflects the value at the + * beginning of recovery, and is *not* updated after consistency is reached. + */ +static XLogRecPtr minRecoveryPoint; +static TimeLineID minRecoveryPointTLI; + +static XLogRecPtr backupStartPoint; +static XLogRecPtr backupEndPoint; +static bool backupEndRequired = false; + +/* + * Have we reached a consistent database state? In crash recovery, we have + * to replay all the WAL, so reachedConsistency is never set. During archive + * recovery, the database is consistent once minRecoveryPoint is reached. + * + * Consistent state means that the system is internally consistent, all + * the WAL has been replayed up to a certain point, and importantly, there + * is no trace of later actions on disk. + */ +bool reachedConsistency = false; + +/* Buffers dedicated to consistency checks of size BLCKSZ */ +static char *replay_image_masked = NULL; +static char *primary_image_masked = NULL; + + +/* + * Shared-memory state for WAL recovery. + */ +typedef struct XLogRecoveryCtlData +{ + /* + * SharedHotStandbyActive indicates if we allow hot standby queries to be + * run. Protected by info_lck. + */ + bool SharedHotStandbyActive; + + /* + * SharedPromoteIsTriggered indicates if a standby promotion has been + * triggered. Protected by info_lck. + */ + bool SharedPromoteIsTriggered; + + /* + * recoveryWakeupLatch is used to wake up the startup process to continue + * WAL replay, if it is waiting for WAL to arrive or for a failover trigger + * file to appear. + * + * Note that the startup process also uses another latch, its procLatch, + * to wait for recovery conflict. We could get rid of recoveryWakeupLatch + * for signaling the startup process in favor of using its procLatch, which + * comports better with possible generic signal handlers using that latch.
+ * But we should not do that because the startup process doesn't assume + * that it's waken up by walreceiver process or SIGHUP signal handler + * while it's waiting for recovery conflict. The separate latches, + * recoveryWakeupLatch and procLatch, should be used for inter-process + * communication for WAL replay and recovery conflict, respectively. + */ + Latch recoveryWakeupLatch; + + /* + * Last record successfully replayed. + */ + XLogRecPtr lastReplayedReadRecPtr; /* start position */ + XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */ + TimeLineID lastReplayedTLI; /* timeline */ + + /* + * When we're currently replaying a record, ie. in a redo function, + * replayEndRecPtr points to the end+1 of the record being replayed, + * otherwise it's equal to lastReplayedEndRecPtr. + */ + XLogRecPtr replayEndRecPtr; + TimeLineID replayEndTLI; + /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */ + TimestampTz recoveryLastXTime; + + /* + * timestamp of when we started replaying the current chunk of WAL data, + * only relevant for replication or archive recovery + */ + TimestampTz currentChunkStartTime; + /* Recovery pause state */ + RecoveryPauseState recoveryPauseState; + ConditionVariable recoveryNotPausedCV; + + slock_t info_lck; /* locks shared variables shown above */ +} XLogRecoveryCtlData; + +static XLogRecoveryCtlData *XLogRecoveryCtl = NULL; + +/* + * abortedRecPtr is the start pointer of a broken record at end of WAL when + * recovery completes; missingContrecPtr is the location of the first + * contrecord that went missing. See CreateOverwriteContrecordRecord for + * details. 
+ */ +static XLogRecPtr abortedRecPtr; +static XLogRecPtr missingContrecPtr; + +/* + * if recoveryStopsBefore/After returns true, it saves information of the stop + * point here + */ +static TransactionId recoveryStopXid; +static TimestampTz recoveryStopTime; +static XLogRecPtr recoveryStopLSN; +static char recoveryStopName[MAXFNAMELEN]; +static bool recoveryStopAfter; + +/* prototypes for local functions */ +static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI); + +static void readRecoverySignalFile(void); +static void validateRecoveryParameters(void); +static bool read_backup_label(XLogRecPtr *checkPointLoc, + TimeLineID *backupLabelTLI, + bool *backupEndRequired, bool *backupFromStandby); +static bool read_tablespace_map(List **tablespaces); + +static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI); +static void CheckRecoveryConsistency(void); +static void rm_redo_error_callback(void *arg); +#ifdef WAL_DEBUG +static void xlog_outrec(StringInfo buf, XLogReaderState *record); +#endif +static void xlog_block_info(StringInfo buf, XLogReaderState *record); +static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, + TimeLineID prevTLI, TimeLineID replayTLI); +static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime); +static void verifyBackupPageConsistency(XLogReaderState *record); + +static bool recoveryStopsBefore(XLogReaderState *record); +static bool recoveryStopsAfter(XLogReaderState *record); +static char *getRecoveryStopReason(void); +static void recoveryPausesHere(bool endOfRecovery); +static bool recoveryApplyDelay(XLogReaderState *record); +static void ConfirmRecoveryPaused(void); + +static XLogRecord *ReadRecord(XLogReaderState *xlogreader, + int emode, bool fetching_ckpt, TimeLineID replayTLI); + +static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *readBuf); +static bool 
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, + bool fetching_ckpt, + XLogRecPtr tliRecPtr, + TimeLineID replayTLI, + XLogRecPtr replayLSN); +static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); +static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, + int whichChkpt, bool report, TimeLineID replayTLI); +static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN); +static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, + XLogSource source, bool notfoundOk); +static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source); + +static bool CheckForStandbyTrigger(void); +static void SetPromoteIsTriggered(void); +static bool HotStandbyActiveInReplay(void); + +static void SetCurrentChunkStartTime(TimestampTz xtime); +static void SetLatestXTime(TimestampTz xtime); + +/* + * Initialization of shared memory for WAL recovery + */ +Size +XLogRecoveryShmemSize(void) +{ + Size size; + + /* XLogRecoveryCtl */ + size = sizeof(XLogRecoveryCtlData); + + return size; +} + +void +XLogRecoveryShmemInit(void) +{ + bool found; + + XLogRecoveryCtl = (XLogRecoveryCtlData *) + ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found); + if (found) + return; + memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData)); + + SpinLockInit(&XLogRecoveryCtl->info_lck); + InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV); +} + +/* + * Prepare the system for WAL recovery, if needed. + * + * This is called by StartupXLOG() which coordinates the server startup + * sequence. This function analyzes the control file and the backup label + * file, if any, and figures out whether we need to perform crash recovery or + * archive recovery, and how far we need to replay the WAL to reach a + * consistent state. 
+ * + * This doesn't yet change the on-disk state, except for creating the symlinks + * from the tablespace map file, if any, and for fetching WAL files needed to + * find the checkpoint record. On entry, the caller has already read the + * control file into memory, and passes it as argument. This function updates + * it to reflect the recovery state, and the caller is expected to write it + * back to disk after initializing other subsystems, but before calling + * PerformWalRecovery(). + * + * This initializes some global variables like ArchiveRecoveryRequested, + * StandbyModeRequested and InRecovery. + */ +void +InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, + bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr) +{ + XLogPageReadPrivate *private; + struct stat st; + bool wasShutdown; + XLogRecord *record; + DBState dbstate_at_startup; + bool haveTblspcMap = false; + bool haveBackupLabel = false; + CheckPoint checkPoint; + bool backupFromStandby = false; + + dbstate_at_startup = ControlFile->state; + + /* + * Initialize on the assumption we want to recover to the latest timeline + * that's active according to pg_control.
+ */ + if (ControlFile->minRecoveryPointTLI > + ControlFile->checkPointCopy.ThisTimeLineID) + recoveryTargetTLI = ControlFile->minRecoveryPointTLI; + else + recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID; + + /* + * Check for signal files, and if so set up state for offline recovery + */ + readRecoverySignalFile(); + validateRecoveryParameters(); + + if (ArchiveRecoveryRequested) + { + if (StandbyModeRequested) + ereport(LOG, + (errmsg("entering standby mode"))); + else if (recoveryTarget == RECOVERY_TARGET_XID) + ereport(LOG, + (errmsg("starting point-in-time recovery to XID %u", + recoveryTargetXid))); + else if (recoveryTarget == RECOVERY_TARGET_TIME) + ereport(LOG, + (errmsg("starting point-in-time recovery to %s", + timestamptz_to_str(recoveryTargetTime)))); + else if (recoveryTarget == RECOVERY_TARGET_NAME) + ereport(LOG, + (errmsg("starting point-in-time recovery to \"%s\"", + recoveryTargetName))); + else if (recoveryTarget == RECOVERY_TARGET_LSN) + ereport(LOG, + (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryTargetLSN)))); + else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) + ereport(LOG, + (errmsg("starting point-in-time recovery to earliest consistent point"))); + else + ereport(LOG, + (errmsg("starting archive recovery"))); + } + + /* + * Take ownership of the wakeup latch if we're going to sleep during + * recovery. 
+ */ + if (ArchiveRecoveryRequested) + OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + + private = palloc0(sizeof(XLogPageReadPrivate)); + xlogreader = + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.page_read = &XLogPageRead, + .segment_open = NULL, + .segment_close = wal_segment_close), + private); + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while allocating a WAL reading processor."))); + xlogreader->system_identifier = ControlFile->system_identifier; + + /* + * Allocate two page buffers dedicated to WAL consistency checks. We do + * it this way, rather than just making static arrays, for two reasons: + * (1) no need to waste the storage in most instantiations of the backend; + * (2) a static char array isn't guaranteed to have any particular + * alignment, whereas palloc() will provide MAXALIGN'd storage. + */ + replay_image_masked = (char *) palloc(BLCKSZ); + primary_image_masked = (char *) palloc(BLCKSZ); + + if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired, + &backupFromStandby)) + { + List *tablespaces = NIL; + + /* + * Archive recovery was requested, and thanks to the backup label + * file, we know how far we need to replay to reach consistency. Enter + * archive recovery directly. + */ + InArchiveRecovery = true; + if (StandbyModeRequested) + StandbyMode = true; + + /* + * When a backup_label file is present, we want to roll forward from + * the checkpoint it identifies, rather than using pg_control. 
+ */ + record = ReadCheckpointRecord(xlogreader, CheckPointLoc, 0, true, CheckPointTLI); + if (record != NULL) + { + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); + ereport(DEBUG1, + (errmsg_internal("checkpoint record is at %X/%X", + LSN_FORMAT_ARGS(CheckPointLoc)))); + InRecovery = true; /* force recovery even if SHUTDOWNED */ + + /* + * Make sure that REDO location exists. This may not be the case + * if there was a crash during an online backup, which left a + * backup_label around that references a WAL segment that's + * already been archived. + */ + if (checkPoint.redo < CheckPointLoc) + { + XLogBeginRead(xlogreader, checkPoint.redo); + if (!ReadRecord(xlogreader, LOG, false, + checkPoint.ThisTimeLineID)) + ereport(FATAL, + (errmsg("could not find redo location referenced by checkpoint record"), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir))); + } + } + else + { + ereport(FATAL, + (errmsg("could not locate required checkpoint record"), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir))); + wasShutdown = false; /* keep compiler quiet */ + } + + /* Read the tablespace_map file if present and create symlinks. 
*/ + if (read_tablespace_map(&tablespaces)) + { + ListCell *lc; + + foreach(lc, tablespaces) + { + tablespaceinfo *ti = lfirst(lc); + char *linkloc; + + linkloc = psprintf("pg_tblspc/%s", ti->oid); + + /* + * Remove the existing symlink if any and Create the symlink + * under PGDATA. + */ + remove_tablespace_symlink(linkloc); + + if (symlink(ti->path, linkloc) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create symbolic link \"%s\": %m", + linkloc))); + + pfree(ti->oid); + pfree(ti->path); + pfree(ti); + } + + /* tell the caller to delete it later */ + haveTblspcMap = true; + } + + /* tell the caller to delete it later */ + haveBackupLabel = true; + } + else + { + /* + * If tablespace_map file is present without backup_label file, there + * is no use of such file. There is no harm in retaining it, but it + * is better to get rid of the map file so that we don't have any + * redundant file in data directory and it will avoid any sort of + * confusion. It seems prudent though to just rename the file out of + * the way rather than delete it completely, also we ignore any error + * that occurs in rename operation as even if map file is present + * without backup_label file, it is harmless. + */ + if (stat(TABLESPACE_MAP, &st) == 0) + { + unlink(TABLESPACE_MAP_OLD); + if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0) + ereport(LOG, + (errmsg("ignoring file \"%s\" because no file \"%s\" exists", + TABLESPACE_MAP, BACKUP_LABEL_FILE), + errdetail("File \"%s\" was renamed to \"%s\".", + TABLESPACE_MAP, TABLESPACE_MAP_OLD))); + else + ereport(LOG, + (errmsg("ignoring file \"%s\" because no file \"%s\" exists", + TABLESPACE_MAP, BACKUP_LABEL_FILE), + errdetail("Could not rename file \"%s\" to \"%s\": %m.", + TABLESPACE_MAP, TABLESPACE_MAP_OLD))); + } + + /* + * It's possible that archive recovery was requested, but we don't + * know how far we need to replay the WAL before we reach consistency. 
+ * This can happen for example if a base backup is taken from a + * running server using an atomic filesystem snapshot, without calling + * pg_start/stop_backup. Or if you just kill a running primary server + * and put it into archive recovery by creating a recovery signal + * file. + * + * Our strategy in that case is to perform crash recovery first, + * replaying all the WAL present in pg_wal, and only enter archive + * recovery after that. + * + * But usually we already know how far we need to replay the WAL (up + * to minRecoveryPoint, up to backupEndPoint, or until we see an + * end-of-backup record), and we can enter archive recovery directly. + */ + if (ArchiveRecoveryRequested && + (ControlFile->minRecoveryPoint != InvalidXLogRecPtr || + ControlFile->backupEndRequired || + ControlFile->backupEndPoint != InvalidXLogRecPtr || + ControlFile->state == DB_SHUTDOWNED)) + { + InArchiveRecovery = true; + if (StandbyModeRequested) + StandbyMode = true; + } + + /* Get the last valid checkpoint record. */ + CheckPointLoc = ControlFile->checkPoint; + CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; + RedoStartLSN = ControlFile->checkPointCopy.redo; + RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID; + record = ReadCheckpointRecord(xlogreader, CheckPointLoc, 1, true, + CheckPointTLI); + if (record != NULL) + { + ereport(DEBUG1, + (errmsg_internal("checkpoint record is at %X/%X", + LSN_FORMAT_ARGS(CheckPointLoc)))); + } + else + { + /* + * We used to attempt to go back to a secondary checkpoint record + * here, but only when not in standby mode. We now just fail if we + * can't read the last checkpoint because this allows us to + * simplify processing around checkpoints. 
+ */ + ereport(PANIC, + (errmsg("could not locate a valid checkpoint record"))); + } + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); + } + + /* + * If the location of the checkpoint record is not on the expected + * timeline in the history of the requested timeline, we cannot proceed: + * the backup is not part of the history of the requested timeline. + */ + Assert(expectedTLEs); /* was initialized by reading checkpoint + * record */ + if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) != + CheckPointTLI) + { + XLogRecPtr switchpoint; + + /* + * tliSwitchPoint will throw an error if the checkpoint's timeline is + * not in expectedTLEs at all. + */ + switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL); + ereport(FATAL, + (errmsg("requested timeline %u is not a child of this server's history", + recoveryTargetTLI), + errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.", + LSN_FORMAT_ARGS(ControlFile->checkPoint), + ControlFile->checkPointCopy.ThisTimeLineID, + LSN_FORMAT_ARGS(switchpoint)))); + } + + /* + * The min recovery point should be part of the requested timeline's + * history, too. + */ + if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) && + tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != + ControlFile->minRecoveryPointTLI) + ereport(FATAL, + (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", + recoveryTargetTLI, + LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), + ControlFile->minRecoveryPointTLI))); + + ereport(DEBUG1, + (errmsg_internal("redo record is at %X/%X; shutdown %s", + LSN_FORMAT_ARGS(checkPoint.redo), + wasShutdown ? 
"true" : "false"))); + ereport(DEBUG1, + (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", + U64FromFullTransactionId(checkPoint.nextXid), + checkPoint.nextOid))); + ereport(DEBUG1, + (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", + checkPoint.nextMulti, checkPoint.nextMultiOffset))); + ereport(DEBUG1, + (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u", + checkPoint.oldestXid, checkPoint.oldestXidDB))); + ereport(DEBUG1, + (errmsg_internal("oldest MultiXactId: %u, in database %u", + checkPoint.oldestMulti, checkPoint.oldestMultiDB))); + ereport(DEBUG1, + (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u", + checkPoint.oldestCommitTsXid, + checkPoint.newestCommitTsXid))); + if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid))) + ereport(PANIC, + (errmsg("invalid next transaction ID"))); + + /* sanity check */ + if (checkPoint.redo > CheckPointLoc) + ereport(PANIC, + (errmsg("invalid redo in checkpoint record"))); + + /* + * Check whether we need to force recovery from WAL. If it appears to + * have been a clean shutdown and we did not have a recovery signal file, + * then assume no recovery needed. + */ + if (checkPoint.redo < CheckPointLoc) + { + if (wasShutdown) + ereport(PANIC, + (errmsg("invalid redo record in shutdown checkpoint"))); + InRecovery = true; + } + else if (ControlFile->state != DB_SHUTDOWNED) + InRecovery = true; + else if (ArchiveRecoveryRequested) + { + /* force recovery due to presence of recovery signal file */ + InRecovery = true; + } + + /* + * Update pg_control to show that we are recovering and to show the + * selected checkpoint as the place we are starting from. We also mark + * pg_control with any minimum recovery stop point obtained from a backup + * history file. 
+ */ + if (InArchiveRecovery) + { + ControlFile->state = DB_IN_ARCHIVE_RECOVERY; + } + else + { + ereport(LOG, + (errmsg("database system was not properly shut down; " + "automatic recovery in progress"))); + if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID) + ereport(LOG, + (errmsg("crash recovery starts in timeline %u " + "and has target timeline %u", + ControlFile->checkPointCopy.ThisTimeLineID, + recoveryTargetTLI))); + ControlFile->state = DB_IN_CRASH_RECOVERY; + } + ControlFile->checkPoint = CheckPointLoc; + ControlFile->checkPointCopy = checkPoint; + if (InArchiveRecovery) + { + /* initialize minRecoveryPoint if not set yet */ + if (ControlFile->minRecoveryPoint < checkPoint.redo) + { + ControlFile->minRecoveryPoint = checkPoint.redo; + ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID; + } + } + + /* + * Set backupStartPoint if we're starting recovery from a base backup. + * + * Also set backupEndPoint and use minRecoveryPoint as the backup end + * location if we're starting recovery from a base backup which was taken + * from a standby. In this case, the database system status in pg_control + * must indicate that the database was already in recovery. Usually that + * will be DB_IN_ARCHIVE_RECOVERY but also can be + * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted before + * reaching this point; e.g. because restore_command or primary_conninfo + * were faulty. + * + * Any other state indicates that the backup somehow became corrupted and + * we can't sensibly continue with recovery. 
+ */ + if (haveBackupLabel) + { + ControlFile->backupStartPoint = checkPoint.redo; + ControlFile->backupEndRequired = backupEndRequired; + + if (backupFromStandby) + { + if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY && + dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY) + ereport(FATAL, + (errmsg("backup_label contains data inconsistent with control file"), + errhint("This means that the backup is corrupted and you will " + "have to use another backup for recovery."))); + ControlFile->backupEndPoint = ControlFile->minRecoveryPoint; + } + } + + /* remember these, so that we know when we have reached consistency */ + backupStartPoint = ControlFile->backupStartPoint; + backupEndRequired = ControlFile->backupEndRequired; + backupEndPoint = ControlFile->backupEndPoint; + if (InArchiveRecovery) + { + minRecoveryPoint = ControlFile->minRecoveryPoint; + minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; + } + else + { + minRecoveryPoint = InvalidXLogRecPtr; + minRecoveryPointTLI = 0; + } + + /* + * Start recovery assuming that the final record isn't lost. + */ + abortedRecPtr = InvalidXLogRecPtr; + missingContrecPtr = InvalidXLogRecPtr; + + *wasShutdown_ptr = wasShutdown; + *haveBackupLabel_ptr = haveBackupLabel; + *haveTblspcMap_ptr = haveTblspcMap; +} + +/* + * See if there are any recovery signal files and if so, set state for + * recovery. + * + * See if there is a recovery command file (recovery.conf), and if so + * throw an ERROR since as of PG12 we no longer recognize that. + */ +static void +readRecoverySignalFile(void) +{ + struct stat stat_buf; + + if (IsBootstrapProcessingMode()) + return; + + /* + * Check for old recovery API file: recovery.conf + */ + if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("using recovery command file \"%s\" is not supported", + RECOVERY_COMMAND_FILE))); + + /* + * Remove unused .done file, if present. Ignore if absent. 
+ */ + unlink(RECOVERY_COMMAND_DONE); + + /* + * Check for recovery signal files and if found, fsync them since they + * represent server state information. We don't sweat too much about the + * possibility of fsync failure, however. + * + * If present, standby signal file takes precedence. If neither is present + * then we won't enter archive recovery. + */ + if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0) + { + int fd; + + fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd >= 0) + { + (void) pg_fsync(fd); + close(fd); + } + standby_signal_file_found = true; + } + else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0) + { + int fd; + + fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd >= 0) + { + (void) pg_fsync(fd); + close(fd); + } + recovery_signal_file_found = true; + } + + StandbyModeRequested = false; + ArchiveRecoveryRequested = false; + if (standby_signal_file_found) + { + StandbyModeRequested = true; + ArchiveRecoveryRequested = true; + } + else if (recovery_signal_file_found) + { + StandbyModeRequested = false; + ArchiveRecoveryRequested = true; + } + else + return; + + /* + * We don't support standby mode in standalone backends; that requires + * other processes such as the WAL receiver to be alive. 
+ */ + if (StandbyModeRequested && !IsUnderPostmaster) + ereport(FATAL, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("standby mode is not supported by single-user servers"))); +} + +static void +validateRecoveryParameters(void) +{ + if (!ArchiveRecoveryRequested) + return; + + /* + * Check for compulsory parameters + */ + if (StandbyModeRequested) + { + if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) && + (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0)) + ereport(WARNING, + (errmsg("specified neither primary_conninfo nor restore_command"), + errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there."))); + } + else + { + if (recoveryRestoreCommand == NULL || + strcmp(recoveryRestoreCommand, "") == 0) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("must specify restore_command when standby mode is not enabled"))); + } + + /* + * Override any inconsistent requests. Note that this is a change of + * behaviour in 9.5; prior to this we simply ignored a request to pause if + * hot_standby = off, which was surprising behaviour. + */ + if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE && + !EnableHotStandby) + recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN; + + /* + * Final parsing of recovery_target_time string; see also + * check_recovery_target_time(). + */ + if (recoveryTarget == RECOVERY_TARGET_TIME) + { + recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, + CStringGetDatum(recovery_target_time_string), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1))); + } + + /* + * If user specified recovery_target_timeline, validate it or compute the + * "latest" value. We can't do this until after we've gotten the restore + * command and set InArchiveRecovery, because we need to fetch timeline + * history files from the archive. 
+ */ + if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC) + { + TimeLineID rtli = recoveryTargetTLIRequested; + + /* Timeline 1 does not have a history file, all else should */ + if (rtli != 1 && !existsTimeLineHistory(rtli)) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery target timeline %u does not exist", + rtli))); + recoveryTargetTLI = rtli; + } + else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) + { + /* We start the "latest" search from pg_control's timeline */ + recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI); + } + else + { + /* + * else we just use the recoveryTargetTLI as already read from + * ControlFile + */ + Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE); + } +} + +/* + * read_backup_label: check to see if a backup_label file is present + * + * If we see a backup_label during recovery, we assume that we are recovering + * from a backup dump file, and we therefore roll forward from the checkpoint + * identified by the label file, NOT what pg_control says. This avoids the + * problem that pg_control might have been archived one or more checkpoints + * later than the start of the dump, and so if we rely on it as the start + * point, we will fail to restore a consistent database state. + * + * Returns true if a backup_label was found (and fills the checkpoint + * location and TLI into *checkPointLoc and *backupLabelTLI, respectively); + * returns false if not. If this backup_label came from a streamed backup, + * *backupEndRequired is set to true. If this backup_label was created during + * recovery, *backupFromStandby is set to true. + * + * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN + * and TLI read from the backup file. 
+ */ +static bool +read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, + bool *backupEndRequired, bool *backupFromStandby) +{ + char startxlogfilename[MAXFNAMELEN]; + TimeLineID tli_from_walseg, + tli_from_file; + FILE *lfp; + char ch; + char backuptype[20]; + char backupfrom[20]; + char backuplabel[MAXPGPATH]; + char backuptime[128]; + uint32 hi, + lo; + + /* suppress possible uninitialized-variable warnings */ + *checkPointLoc = InvalidXLogRecPtr; + *backupLabelTLI = 0; + *backupEndRequired = false; + *backupFromStandby = false; + + /* + * See if label file is present + */ + lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); + if (!lfp) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + return false; /* it's not there, all is fine */ + } + + /* + * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code + * is pretty crude, but we are not expecting any variability in the file + * format). + */ + if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c", + &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n') + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + RedoStartLSN = ((uint64) hi) << 32 | lo; + RedoStartTLI = tli_from_walseg; + if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c", + &hi, &lo, &ch) != 3 || ch != '\n') + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); + *checkPointLoc = ((uint64) hi) << 32 | lo; + *backupLabelTLI = tli_from_walseg; + + /* + * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore + * from an older backup anyway, but since the information on it is not + * strictly required, don't error out if it's missing for some reason. 
+ */ + if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1) + { + if (strcmp(backuptype, "streamed") == 0) + *backupEndRequired = true; + } + + if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1) + { + if (strcmp(backupfrom, "standby") == 0) + *backupFromStandby = true; + } + + /* + * Parse START TIME and LABEL. Those are not mandatory fields for recovery + * but checking for their presence is useful for debugging and the next + * sanity checks. Cope also with the fact that the result buffers have a + * pre-allocated size, hence if the backup_label file has been generated + * with strings longer than the maximum assumed here an incorrect parsing + * happens. That's fine as only minor consistency checks are done + * afterwards. + */ + if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1) + ereport(DEBUG1, + (errmsg_internal("backup time %s in file \"%s\"", + backuptime, BACKUP_LABEL_FILE))); + + if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1) + ereport(DEBUG1, + (errmsg_internal("backup label %s in file \"%s\"", + backuplabel, BACKUP_LABEL_FILE))); + + /* + * START TIMELINE is new as of 11. Its parsing is not mandatory, still use + * it as a sanity check if present. 
+ */ + if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1) + { + if (tli_from_walseg != tli_from_file) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE), + errdetail("Timeline ID parsed is %u, but expected %u.", + tli_from_file, tli_from_walseg))); + + ereport(DEBUG1, + (errmsg_internal("backup timeline %u in file \"%s\"", + tli_from_file, BACKUP_LABEL_FILE))); + } + + if (ferror(lfp) || FreeFile(lfp)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + BACKUP_LABEL_FILE))); + + return true; +} + +/* + * read_tablespace_map: check to see if a tablespace_map file is present + * + * If we see a tablespace_map file during recovery, we assume that we are + * recovering from a backup dump file, and we therefore need to create symlinks + * as per the information present in tablespace_map file. + * + * Returns true if a tablespace_map file was found (and fills *tablespaces + * with a tablespaceinfo struct for each tablespace listed in the file); + * returns false if not. + */ +static bool +read_tablespace_map(List **tablespaces) +{ + tablespaceinfo *ti; + FILE *lfp; + char str[MAXPGPATH]; + int ch, + i, + n; + bool was_backslash; + + /* + * See if tablespace_map file is present + */ + lfp = AllocateFile(TABLESPACE_MAP, "r"); + if (!lfp) + { + if (errno != ENOENT) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + TABLESPACE_MAP))); + return false; /* it's not there, all is fine */ + } + + /* + * Read and parse the link name and path lines from tablespace_map file + * (this code is pretty crude, but we are not expecting any variability in + * the file format). De-escape any backslashes that were inserted. 
+ */ + i = 0; + was_backslash = false; + while ((ch = fgetc(lfp)) != EOF) + { + if (!was_backslash && (ch == '\n' || ch == '\r')) + { + if (i == 0) + continue; /* \r immediately followed by \n */ + + /* + * The de-escaped line should contain an OID followed by exactly + * one space followed by a path. The path might start with + * spaces, so don't be too liberal about parsing. + */ + str[i] = '\0'; + n = 0; + while (str[n] && str[n] != ' ') + n++; + if (n < 1 || n >= i - 1) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); + str[n++] = '\0'; + + ti = palloc0(sizeof(tablespaceinfo)); + ti->oid = pstrdup(str); + ti->path = pstrdup(str + n); + *tablespaces = lappend(*tablespaces, ti); + + i = 0; + continue; + } + else if (!was_backslash && ch == '\\') + was_backslash = true; + else + { + if (i < sizeof(str) - 1) + str[i++] = ch; + was_backslash = false; + } + } + + if (i != 0 || was_backslash) /* last line not terminated? */ + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); + + if (ferror(lfp) || FreeFile(lfp)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + TABLESPACE_MAP))); + + return true; +} + +/* + * Finish WAL recovery. + * + * This does not close the 'xlogreader' yet, because in some cases the caller + * still wants to re-read the last checkpoint record by calling + * ReadCheckPointRecord(). + * + * Returns the position of the last valid or applied record, after which new + * WAL should be appended, information about why recovery was ended, and some + * other things. See the WalRecoveryResult struct for details. 
+ */ +EndOfWalRecoveryInfo * +FinishWalRecovery(void) +{ + EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo)); + XLogRecPtr lastRec; + TimeLineID lastRecTLI; + XLogRecPtr endOfLog; + + /* + * Kill WAL receiver, if it's still running, before we continue to write + * the startup checkpoint and aborted-contrecord records. It will trump + * over these records and subsequent ones if it's still alive when we + * start writing WAL. + */ + XLogShutdownWalRcv(); + + /* + * We are now done reading the xlog from stream. Turn off streaming + * recovery to force fetching the files (which would be required at end of + * recovery, e.g., timeline history file) from archive or pg_wal. + * + * Note that standby mode must be turned off after killing WAL receiver, + * i.e., calling XLogShutdownWalRcv(). + */ + Assert(!WalRcvStreaming()); + StandbyMode = false; + + /* + * Determine where to start writing WAL next. + * + * Re-fetch the last valid or last applied record, so we can identify the + * exact endpoint of what we consider the valid portion of WAL. There may + * be an incomplete continuation record after that, in which case + * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will + * write a special OVERWRITE_CONTRECORD message to mark that the rest of + * it is intentionally missing. See CreateOverwriteContrecordRecord(). + * + * An important side-effect of this is to load the last page into + * xlogreader. The caller uses it to initialize the WAL for writing. + */ + if (!InRecovery) + { + lastRec = CheckPointLoc; + lastRecTLI = CheckPointTLI; + } + else + { + lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr; + lastRecTLI = XLogRecoveryCtl->lastReplayedTLI; + } + XLogBeginRead(xlogreader, lastRec); + (void) ReadRecord(xlogreader, PANIC, false, lastRecTLI); + endOfLog = xlogreader->EndRecPtr; + + /* + * Remember the TLI in the filename of the XLOG segment containing the + * end-of-log. 
It could be different from the timeline that endOfLog + * nominally belongs to, if there was a timeline switch in that segment, + * and we were reading the old WAL from a segment belonging to a higher + * timeline. + */ + result->endOfLogTLI = xlogreader->seg.ws_tli; + + if (ArchiveRecoveryRequested) + { + /* + * We are no longer in archive recovery state. + * + * We are now done reading the old WAL. Turn off archive fetching if + * it was active. + */ + Assert(InArchiveRecovery); + InArchiveRecovery = false; + + /* + * If the ending log segment is still open, close it (to avoid + * problems on Windows with trying to rename or delete an open file). + */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + } + + /* + * Copy the last partial block to the caller, for initializing the WAL + * buffer for appending new WAL. + */ + if (endOfLog % XLOG_BLCKSZ != 0) + { + char *page; + int len; + XLogRecPtr pageBeginPtr; + + pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ); + Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size)); + + /* Copy the valid part of the last block */ + len = endOfLog % XLOG_BLCKSZ; + page = palloc(len); + memcpy(page, xlogreader->readBuf, len); + + result->lastPageBeginPtr = pageBeginPtr; + result->lastPage = page; + } + else + { + /* There is no partial block to copy. */ + result->lastPageBeginPtr = endOfLog; + result->lastPage = NULL; + } + + /* + * Create a comment for the history file to explain why and where timeline + * changed. 
+ */ + result->recoveryStopReason = getRecoveryStopReason(); + + result->lastRec = lastRec; + result->lastRecTLI = lastRecTLI; + result->endOfLog = endOfLog; + + result->abortedRecPtr = abortedRecPtr; + result->missingContrecPtr = missingContrecPtr; + + result->standby_signal_file_found = standby_signal_file_found; + result->recovery_signal_file_found = recovery_signal_file_found; + + return result; +} + +/* + * Clean up the WAL reader and leftovers from restoring WAL from archive + */ +void +ShutdownWalRecovery(void) +{ + char recoveryPath[MAXPGPATH]; + + /* Shut down xlogreader */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + XLogReaderFree(xlogreader); + + if (ArchiveRecoveryRequested) + { + /* + * Since there might be a partial WAL segment named RECOVERYXLOG, get + * rid of it. + */ + snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG"); + unlink(recoveryPath); /* ignore any error */ + + /* Get rid of any remaining recovered timeline-history file, too */ + snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY"); + unlink(recoveryPath); /* ignore any error */ + } + + /* + * We don't need the latch anymore. It's not strictly necessary to disown + * it, but let's do it for the sake of tidiness. + */ + if (ArchiveRecoveryRequested) + DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch); +} + +/* + * Perform WAL recovery. + * + * If the system was shut down cleanly, this is never called. + */ +void +PerformWalRecovery(void) +{ + int rmid; + XLogRecord *record; + bool reachedRecoveryTarget = false; + TimeLineID replayTLI; + + /* + * Initialize shared variables for tracking progress of WAL replay, as if + * we had just replayed the record before the REDO location (or the + * checkpoint record itself, if it's a shutdown checkpoint). 
+ */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + if (RedoStartLSN < CheckPointLoc) + { + XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr; + XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN; + XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI; + } + else + { + XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr; + XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr; + XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI; + } + XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr; + XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI; + XLogRecoveryCtl->recoveryLastXTime = 0; + XLogRecoveryCtl->currentChunkStartTime = 0; + XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* Also ensure XLogReceiptTime has a sane value */ + XLogReceiptTime = GetCurrentTimestamp(); + + /* + * Let postmaster know we've started redo now, so that it can launch the + * archiver if necessary. + */ + if (IsUnderPostmaster) + SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); + + /* + * Allow read-only connections immediately if we're consistent already. + */ + CheckRecoveryConsistency(); + + /* + * Find the first record that logically follows the checkpoint --- it + * might physically precede it, though. 
+ */ + if (RedoStartLSN < CheckPointLoc) + { + /* back up to find the record */ + replayTLI = RedoStartTLI; + XLogBeginRead(xlogreader, RedoStartLSN); + record = ReadRecord(xlogreader, PANIC, false, replayTLI); + } + else + { + /* just have to read next record after CheckPoint */ + Assert(xlogreader->ReadRecPtr == CheckPointLoc); + replayTLI = CheckPointTLI; + record = ReadRecord(xlogreader, LOG, false, replayTLI); + } + + if (record != NULL) + { + TimestampTz xtime; + PGRUsage ru0; + + pg_rusage_init(&ru0); + + InRedo = true; + + /* Initialize resource managers */ + for (rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_startup != NULL) + RmgrTable[rmid].rm_startup(); + } + + ereport(LOG, + (errmsg("redo starts at %X/%X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)))); + + /* Prepare to report progress of the redo phase. */ + if (!StandbyMode) + begin_startup_progress_phase(); + + /* + * main redo apply loop + */ + do + { + if (!StandbyMode) + ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)); + +#ifdef WAL_DEBUG + if (XLOG_DEBUG || + (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) || + (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3)) + { + StringInfoData buf; + + initStringInfo(&buf); + appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), + LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); + xlog_outrec(&buf, xlogreader); + appendStringInfoString(&buf, " - "); + xlog_outdesc(&buf, xlogreader); + elog(LOG, "%s", buf.data); + pfree(buf.data); + } +#endif + + /* Handle interrupt signals of startup process */ + HandleStartupProcInterrupts(); + + /* + * Pause WAL replay, if requested by a hot-standby session via + * SetRecoveryPause(). + * + * Note that we intentionally don't take the info_lck spinlock + * here. 
We might therefore read a slightly stale value of the + * recoveryPause flag, but it can't be very stale (no worse than + * the last spinlock we did acquire). Since a pause request is a + * pretty asynchronous thing anyway, possibly responding to it one + * WAL record later than we otherwise would is a minor issue, so + * it doesn't seem worth adding another spinlock cycle to prevent + * that. + */ + if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + + /* + * Have we reached our recovery target? + */ + if (recoveryStopsBefore(xlogreader)) + { + reachedRecoveryTarget = true; + break; + } + + /* + * If we've been asked to lag the primary, wait on latch until + * enough time has passed. + */ + if (recoveryApplyDelay(xlogreader)) + { + /* + * We test for paused recovery again here. If user sets + * delayed apply, it may be because they expect to pause + * recovery in case of problems, so we must test again here + * otherwise pausing during the delay-wait wouldn't work. + */ + if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + } + + /* + * Apply the record + */ + ApplyWalRecord(xlogreader, record, &replayTLI); + + /* Exit loop if we reached inclusive recovery target */ + if (recoveryStopsAfter(xlogreader)) + { + reachedRecoveryTarget = true; + break; + } + + /* Else, try to fetch the next WAL record */ + record = ReadRecord(xlogreader, LOG, false, replayTLI); + } while (record != NULL); + + /* + * end of main redo apply loop + */ + + if (reachedRecoveryTarget) + { + if (!reachedConsistency) + ereport(FATAL, + (errmsg("requested recovery stop point is before consistent recovery point"))); + + /* + * This is the last point where we can restart recovery with a new + * recovery target, if we shutdown and begin again. 
After this, + * Resource Managers may choose to do permanent corrective actions + * at end of recovery. + */ + switch (recoveryTargetAction) + { + case RECOVERY_TARGET_ACTION_SHUTDOWN: + + /* + * exit with special return code to request shutdown of + * postmaster. Log messages issued from postmaster. + */ + proc_exit(3); + + case RECOVERY_TARGET_ACTION_PAUSE: + SetRecoveryPause(true); + recoveryPausesHere(true); + + /* drop into promote */ + + case RECOVERY_TARGET_ACTION_PROMOTE: + break; + } + } + + /* Allow resource managers to do any required cleanup. */ + for (rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_cleanup != NULL) + RmgrTable[rmid].rm_cleanup(); + } + + ereport(LOG, + (errmsg("redo done at %X/%X system usage: %s", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), + pg_rusage_show(&ru0)))); + xtime = GetLatestXTime(); + if (xtime) + ereport(LOG, + (errmsg("last completed transaction was at log time %s", + timestamptz_to_str(xtime)))); + + InRedo = false; + } + else + { + /* there are no WAL records following the checkpoint */ + ereport(LOG, + (errmsg("redo is not required"))); + + } + + /* + * This check is intentionally after the above log messages that indicate + * how far recovery went. + */ + if (ArchiveRecoveryRequested && + recoveryTarget != RECOVERY_TARGET_UNSET && + !reachedRecoveryTarget) + ereport(FATAL, + (errmsg("recovery ended before configured recovery target was reached"))); +} + +/* + * Subroutine of PerformWalRecovery, to apply one WAL record. + */ +static void +ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI) +{ + ErrorContextCallback errcallback; + bool switchedTLI = false; + + /* Setup error traceback support for ereport() */ + errcallback.callback = rm_redo_error_callback; + errcallback.arg = (void *) xlogreader; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* + * ShmemVariableCache->nextXid must be beyond record's xid. 
+ */ + AdvanceNextFullTransactionIdPastXid(record->xl_xid); + + /* + * Before replaying this record, check if this record causes the current + * timeline to change. The record is already considered to be part of the + * new timeline, so we update replayTLI before replaying it. That's + * important so that replayEndTLI, which is recorded as the minimum + * recovery point's TLI if recovery stops after this record, is set + * correctly. + */ + if (record->xl_rmid == RM_XLOG_ID) + { + TimeLineID newReplayTLI = *replayTLI; + TimeLineID prevReplayTLI = *replayTLI; + uint8 info = record->xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_CHECKPOINT_SHUTDOWN) + { + CheckPoint checkPoint; + + memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + newReplayTLI = checkPoint.ThisTimeLineID; + prevReplayTLI = checkPoint.PrevTimeLineID; + } + else if (info == XLOG_END_OF_RECOVERY) + { + xl_end_of_recovery xlrec; + + memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery)); + newReplayTLI = xlrec.ThisTimeLineID; + prevReplayTLI = xlrec.PrevTimeLineID; + } + + if (newReplayTLI != *replayTLI) + { + /* Check that it's OK to switch to this TLI */ + checkTimeLineSwitch(xlogreader->EndRecPtr, + newReplayTLI, prevReplayTLI, *replayTLI); + + /* Following WAL records should be run with new TLI */ + *replayTLI = newReplayTLI; + switchedTLI = true; + } + } + + /* + * Update shared replayEndRecPtr before replaying this record, so that + * XLogFlush will update minRecoveryPoint correctly. 
+ */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr; + XLogRecoveryCtl->replayEndTLI = *replayTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* + * If we are attempting to enter Hot Standby mode, process XIDs we see + */ + if (standbyState >= STANDBY_INITIALIZED && + TransactionIdIsValid(record->xl_xid)) + RecordKnownAssignedTransactionIds(record->xl_xid); + + /* + * Some XLOG record types that are related to recovery are processed + * directly here, rather than in xlog_redo() + */ + if (record->xl_rmid == RM_XLOG_ID) + xlogrecovery_redo(xlogreader, *replayTLI); + + /* Now apply the WAL record itself */ + RmgrTable[record->xl_rmid].rm_redo(xlogreader); + + /* + * After redo, check whether the backup pages associated with the WAL + * record are consistent with the existing pages. This check is done only + * if consistency check is enabled for this record. + */ + if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0) + verifyBackupPageConsistency(xlogreader); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + /* + * Update lastReplayedEndRecPtr after this record has been successfully + * replayed. + */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr; + XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr; + XLogRecoveryCtl->lastReplayedTLI = *replayTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* + * If rm_redo called XLogRequestWalReceiverReply, then we wake up the + * receiver so that it notices the updated lastReplayedEndRecPtr and sends + * a reply to the primary. + */ + if (doRequestWalReceiverReply) + { + doRequestWalReceiverReply = false; + WalRcvForceReply(); + } + + /* Allow read-only connections if we're consistent now */ + CheckRecoveryConsistency(); + + /* Is this a timeline switch? 
*/ + if (switchedTLI) + { + /* + * Before we continue on the new timeline, clean up any (possibly + * bogus) future WAL segments on the old timeline. + */ + RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI); + + /* + * Wake up any walsenders to notice that we are on a new timeline. + */ + if (AllowCascadeReplication()) + WalSndWakeup(); + } +} + +/* + * Some XLOG RM record types that are directly related to WAL recovery are + * handled here rather than in the xlog_redo() + */ +static void +xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + XLogRecPtr lsn = record->EndRecPtr; + + Assert(XLogRecGetRmid(record) == RM_XLOG_ID); + + if (info == XLOG_OVERWRITE_CONTRECORD) + { + /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */ + xl_overwrite_contrecord xlrec; + + memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord)); + if (xlrec.overwritten_lsn != record->overwrittenRecPtr) + elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X", + LSN_FORMAT_ARGS(xlrec.overwritten_lsn), + LSN_FORMAT_ARGS(record->overwrittenRecPtr)); + + ereport(LOG, + (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s", + LSN_FORMAT_ARGS(xlrec.overwritten_lsn), + timestamptz_to_str(xlrec.overwrite_time)))); + + /* Verifying the record should only happen once */ + record->overwrittenRecPtr = InvalidXLogRecPtr; + } + else if (info == XLOG_BACKUP_END) + { + XLogRecPtr startpoint; + + memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint)); + + if (backupStartPoint == startpoint) + { + /* + * We have reached the end of base backup, the point where + * pg_stop_backup() was done. The data on disk is now consistent + * (assuming we have also reached minRecoveryPoint). Set + * backupEndPoint to the current LSN, so that the next call to + * CheckRecoveryConsistency() will notice it and do the + * end-of-backup processing. 
+ */ + elog(DEBUG1, "end of backup record reached"); + + backupEndPoint = lsn; + } + else + elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X", + LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint)); + } +} + +/* + * Checks if recovery has reached a consistent state. When consistency is + * reached and we have a valid starting standby snapshot, tell postmaster + * that it can start accepting read-only connections. + */ +static void +CheckRecoveryConsistency(void) +{ + XLogRecPtr lastReplayedEndRecPtr; + TimeLineID lastReplayedTLI; + + /* + * During crash recovery, we don't reach a consistent state until we've + * replayed all the WAL. + */ + if (XLogRecPtrIsInvalid(minRecoveryPoint)) + return; + + Assert(InArchiveRecovery); + + /* + * assume that we are called in the startup process, and hence don't need + * a lock to read lastReplayedEndRecPtr + */ + lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr; + lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI; + + /* + * Have we reached the point where our base backup was completed? + */ + if (!XLogRecPtrIsInvalid(backupEndPoint) && + backupEndPoint <= lastReplayedEndRecPtr) + { + elog(DEBUG1, "end of backup reached"); + + /* + * We have reached the end of base backup, as indicated by pg_control. + * Update the control file accordingly. + */ + ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI); + backupStartPoint = InvalidXLogRecPtr; + backupEndPoint = InvalidXLogRecPtr; + backupEndRequired = false; + } + + /* + * Have we passed our safe starting point? Note that minRecoveryPoint is + * known to be incorrectly set if ControlFile->backupEndRequired, until + * the XLOG_BACKUP_END arrives to advise us of the correct + * minRecoveryPoint. All we know prior to that is that we're not + * consistent yet. 
+ */ + if (!reachedConsistency && !backupEndRequired && + minRecoveryPoint <= lastReplayedEndRecPtr) + { + /* + * Check to see if the XLOG sequence contained any unresolved + * references to uninitialized pages. + */ + XLogCheckInvalidPages(); + + reachedConsistency = true; + ereport(LOG, + (errmsg("consistent recovery state reached at %X/%X", + LSN_FORMAT_ARGS(lastReplayedEndRecPtr)))); + } + + /* + * Have we got a valid starting snapshot that will allow queries to be + * run? If so, we can tell postmaster that the database is consistent now, + * enabling connections. + */ + if (standbyState == STANDBY_SNAPSHOT_READY && + !LocalHotStandbyActive && + reachedConsistency && + IsUnderPostmaster) + { + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->SharedHotStandbyActive = true; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + LocalHotStandbyActive = true; + + SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY); + } +} + +/* + * Error context callback for errors occurring during rm_redo(). + */ +static void +rm_redo_error_callback(void *arg) +{ + XLogReaderState *record = (XLogReaderState *) arg; + StringInfoData buf; + + initStringInfo(&buf); + xlog_outdesc(&buf, record); + xlog_block_info(&buf, record); + + /* translator: %s is a WAL record description */ + errcontext("WAL redo at %X/%X for %s", + LSN_FORMAT_ARGS(record->ReadRecPtr), + buf.data); + + pfree(buf.data); +} + +/* + * Returns a string describing an XLogRecord, consisting of its identity + * optionally followed by a colon, a space, and a further description. 
+ */ +void +xlog_outdesc(StringInfo buf, XLogReaderState *record) +{ + RmgrId rmid = XLogRecGetRmid(record); + uint8 info = XLogRecGetInfo(record); + const char *id; + + appendStringInfoString(buf, RmgrTable[rmid].rm_name); + appendStringInfoChar(buf, '/'); + + id = RmgrTable[rmid].rm_identify(info); + if (id == NULL) + appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK); + else + appendStringInfo(buf, "%s: ", id); + + RmgrTable[rmid].rm_desc(buf, record); +} + +#ifdef WAL_DEBUG + +static void +xlog_outrec(StringInfo buf, XLogReaderState *record) +{ + appendStringInfo(buf, "prev %X/%X; xid %u", + LSN_FORMAT_ARGS(XLogRecGetPrev(record)), + XLogRecGetXid(record)); + + appendStringInfo(buf, "; len %u", + XLogRecGetDataLen(record)); + + xlog_block_info(buf, record); +} +#endif /* WAL_DEBUG */ + +/* + * Returns a string giving information about all the blocks in an + * XLogRecord. + */ +static void +xlog_block_info(StringInfo buf, XLogReaderState *record) +{ + int block_id; + + /* decode block references */ + for (block_id = 0; block_id <= record->max_block_id; block_id++) + { + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blk; + + if (!XLogRecHasBlockRef(record, block_id)) + continue; + + XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk); + if (forknum != MAIN_FORKNUM) + appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, + blk); + else + appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u", + block_id, + rnode.spcNode, rnode.dbNode, rnode.relNode, + blk); + if (XLogRecHasBlockImage(record, block_id)) + appendStringInfoString(buf, " FPW"); + } +} + + +/* + * Check that it's OK to switch to new timeline during recovery. + * + * 'lsn' is the address of the shutdown checkpoint record we're about to + * replay. (Currently, timeline can only change at a shutdown checkpoint). 
+ */ +static void +checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, + TimeLineID replayTLI) +{ + /* Check that the record agrees on what the current (old) timeline is */ + if (prevTLI != replayTLI) + ereport(PANIC, + (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record", + prevTLI, replayTLI))); + + /* + * The new timeline better be in the list of timelines we expect to see, + * according to the timeline history. It should also not decrease. + */ + if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs)) + ereport(PANIC, + (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", + newTLI, replayTLI))); + + /* + * If we have not yet reached min recovery point, and we're about to + * switch to a timeline greater than the timeline of the min recovery + * point: trouble. After switching to the new timeline, we could not + * possibly visit the min recovery point on the correct timeline anymore. + * This can happen if there is a newer timeline in the archive that + * branched before the timeline the min recovery point is on, and you + * attempt to do PITR to the new timeline. + */ + if (!XLogRecPtrIsInvalid(minRecoveryPoint) && + lsn < minRecoveryPoint && + newTLI > minRecoveryPointTLI) + ereport(PANIC, + (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u", + newTLI, + LSN_FORMAT_ARGS(minRecoveryPoint), + minRecoveryPointTLI))); + + /* Looks good */ +} + + +/* + * Extract timestamp from WAL record. + * + * If the record contains a timestamp, returns true, and saves the timestamp + * in *recordXtime. If the record type has no timestamp, returns false. + * Currently, only transaction commit/abort records and restore points contain + * timestamps. 
+ */ +static bool +getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + uint8 xact_info = info & XLOG_XACT_OPMASK; + uint8 rmid = XLogRecGetRmid(record); + + if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) + { + *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time; + return true; + } + if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED)) + { + *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time; + return true; + } + if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED)) + { + *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time; + return true; + } + return false; +} + +/* + * Checks whether the current buffer page and backup page stored in the + * WAL record are consistent or not. Before comparing the two pages, a + * masking can be applied to the pages to ignore certain areas like hint bits, + * unused space between pd_lower and pd_upper among other things. This + * function should be called once WAL replay has been completed for a + * given record. + */ +static void +verifyBackupPageConsistency(XLogReaderState *record) +{ + RmgrId rmid = XLogRecGetRmid(record); + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + int block_id; + + /* Records with no backup blocks have no need for consistency checks. */ + if (!XLogRecHasAnyBlockRefs(record)) + return; + + Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0); + + for (block_id = 0; block_id <= record->max_block_id; block_id++) + { + Buffer buf; + Page page; + + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + { + /* + * WAL record doesn't contain a block reference with the given id. + * Do nothing. 
+ */ + continue; + } + + Assert(XLogRecHasBlockImage(record, block_id)); + + if (XLogRecBlockImageApply(record, block_id)) + { + /* + * WAL record has already applied the page, so bypass the + * consistency check as that would result in comparing the full + * page stored in the record with itself. + */ + continue; + } + + /* + * Read the contents from the current buffer and store it in a + * temporary page. + */ + buf = XLogReadBufferExtended(rnode, forknum, blkno, + RBM_NORMAL_NO_LOG); + if (!BufferIsValid(buf)) + continue; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + + /* + * Take a copy of the local page where WAL has been applied to have a + * comparison base before masking it... + */ + memcpy(replay_image_masked, page, BLCKSZ); + + /* No need for this page anymore now that a copy is in. */ + UnlockReleaseBuffer(buf); + + /* + * If the block LSN is already ahead of this WAL record, we can't + * expect contents to match. This can happen if recovery is + * restarted. + */ + if (PageGetLSN(replay_image_masked) > record->EndRecPtr) + continue; + + /* + * Read the contents from the backup copy, stored in WAL record and + * store it in a temporary page. There is no need to allocate a new + * page here, a local buffer is fine to hold its contents and a mask + * can be directly applied on it. + */ + if (!RestoreBlockImage(record, block_id, primary_image_masked)) + elog(ERROR, "failed to restore block image"); + + /* + * If masking function is defined, mask both the primary and replay + * images + */ + if (RmgrTable[rmid].rm_mask != NULL) + { + RmgrTable[rmid].rm_mask(replay_image_masked, blkno); + RmgrTable[rmid].rm_mask(primary_image_masked, blkno); + } + + /* Time to compare the primary and replay images. 
*/ + if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0) + { + elog(FATAL, + "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, blkno); + } + } +} + +/* + * For point-in-time recovery, this function decides whether we want to + * stop applying the XLOG before the current record. + * + * Returns true if we are stopping, false otherwise. If stopping, some + * information is saved in recoveryStopXid et al for use in annotating the + * new timeline's history file. + */ +static bool +recoveryStopsBefore(XLogReaderState *record) +{ + bool stopsHere = false; + uint8 xact_info; + bool isCommit; + TimestampTz recordXtime = 0; + TransactionId recordXid; + + /* + * Ignore recovery target settings when not in archive recovery (meaning + * we are in crash recovery). + */ + if (!ArchiveRecoveryRequested) + return false; + + /* Check if we should stop as soon as reaching consistency */ + if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) + { + ereport(LOG, + (errmsg("recovery stopping after reaching consistency"))); + + recoveryStopAfter = false; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + return true; + } + + /* Check if target LSN has been reached */ + if (recoveryTarget == RECOVERY_TARGET_LSN && + !recoveryTargetInclusive && + record->ReadRecPtr >= recoveryTargetLSN) + { + recoveryStopAfter = false; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = record->ReadRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + ereport(LOG, + (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryStopLSN)))); + return true; + } + + /* Otherwise we only consider stopping before COMMIT or ABORT records. 
*/ + if (XLogRecGetRmid(record) != RM_XACT_ID) + return false; + + xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (xact_info == XLOG_XACT_COMMIT) + { + isCommit = true; + recordXid = XLogRecGetXid(record); + } + else if (xact_info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + isCommit = true; + ParseCommitRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else if (xact_info == XLOG_XACT_ABORT) + { + isCommit = false; + recordXid = XLogRecGetXid(record); + } + else if (xact_info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + isCommit = false; + ParseAbortRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else + return false; + + if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive) + { + /* + * There can be only one transaction end record with this exact + * transactionid + * + * when testing for an xid, we MUST test for equality only, since + * transactions are numbered in the order they start, not the order + * they complete. A higher numbered xid will complete before you about + * 50% of the time... 
+ */ + stopsHere = (recordXid == recoveryTargetXid); + } + + if (recoveryTarget == RECOVERY_TARGET_TIME && + getRecordTimestamp(record, &recordXtime)) + { + /* + * There can be many transactions that share the same commit time, so + * we stop after the last one, if we are inclusive, or stop at the + * first one if we are exclusive + */ + if (recoveryTargetInclusive) + stopsHere = (recordXtime > recoveryTargetTime); + else + stopsHere = (recordXtime >= recoveryTargetTime); + } + + if (stopsHere) + { + recoveryStopAfter = false; + recoveryStopXid = recordXid; + recoveryStopTime = recordXtime; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + + if (isCommit) + { + ereport(LOG, + (errmsg("recovery stopping before commit of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + else + { + ereport(LOG, + (errmsg("recovery stopping before abort of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + } + + return stopsHere; +} + +/* + * Same as recoveryStopsBefore, but called after applying the record. + * + * We also track the timestamp of the latest applied COMMIT/ABORT + * record in XLogRecoveryCtl->recoveryLastXTime. + */ +static bool +recoveryStopsAfter(XLogReaderState *record) +{ + uint8 info; + uint8 xact_info; + uint8 rmid; + TimestampTz recordXtime; + + /* + * Ignore recovery target settings when not in archive recovery (meaning + * we are in crash recovery). + */ + if (!ArchiveRecoveryRequested) + return false; + + info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + rmid = XLogRecGetRmid(record); + + /* + * There can be many restore points that share the same name; we stop at + * the first one. 
+ */ + if (recoveryTarget == RECOVERY_TARGET_NAME && + rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) + { + xl_restore_point *recordRestorePointData; + + recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); + + if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0) + { + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = InvalidXLogRecPtr; + (void) getRecordTimestamp(record, &recoveryStopTime); + strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN); + + ereport(LOG, + (errmsg("recovery stopping at restore point \"%s\", time %s", + recoveryStopName, + timestamptz_to_str(recoveryStopTime)))); + return true; + } + } + + /* Check if the target LSN has been reached */ + if (recoveryTarget == RECOVERY_TARGET_LSN && + recoveryTargetInclusive && + record->ReadRecPtr >= recoveryTargetLSN) + { + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopLSN = record->ReadRecPtr; + recoveryStopTime = 0; + recoveryStopName[0] = '\0'; + ereport(LOG, + (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"", + LSN_FORMAT_ARGS(recoveryStopLSN)))); + return true; + } + + if (rmid != RM_XACT_ID) + return false; + + xact_info = info & XLOG_XACT_OPMASK; + + if (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED || + xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED) + { + TransactionId recordXid; + + /* Update the last applied transaction timestamp */ + if (getRecordTimestamp(record, &recordXtime)) + SetLatestXTime(recordXtime); + + /* Extract the XID of the committed/aborted transaction */ + if (xact_info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + ParseCommitRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else if (xact_info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = 
(xl_xact_abort *) XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + ParseAbortRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else + recordXid = XLogRecGetXid(record); + + /* + * There can be only one transaction end record with this exact + * transactionid + * + * when testing for an xid, we MUST test for equality only, since + * transactions are numbered in the order they start, not the order + * they complete. A higher numbered xid will complete before you about + * 50% of the time... + */ + if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive && + recordXid == recoveryTargetXid) + { + recoveryStopAfter = true; + recoveryStopXid = recordXid; + recoveryStopTime = recordXtime; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + + if (xact_info == XLOG_XACT_COMMIT || + xact_info == XLOG_XACT_COMMIT_PREPARED) + { + ereport(LOG, + (errmsg("recovery stopping after commit of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + else if (xact_info == XLOG_XACT_ABORT || + xact_info == XLOG_XACT_ABORT_PREPARED) + { + ereport(LOG, + (errmsg("recovery stopping after abort of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + return true; + } + } + + /* Check if we should stop as soon as reaching consistency */ + if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) + { + ereport(LOG, + (errmsg("recovery stopping after reaching consistency"))); + + recoveryStopAfter = true; + recoveryStopXid = InvalidTransactionId; + recoveryStopTime = 0; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + return true; + } + + return false; +} + +/* + * Create a comment for the history file to explain why and where + * timeline changed. 
+ */ +static char * +getRecoveryStopReason(void) +{ + char reason[200]; + + if (recoveryTarget == RECOVERY_TARGET_XID) + snprintf(reason, sizeof(reason), + "%s transaction %u", + recoveryStopAfter ? "after" : "before", + recoveryStopXid); + else if (recoveryTarget == RECOVERY_TARGET_TIME) + snprintf(reason, sizeof(reason), + "%s %s\n", + recoveryStopAfter ? "after" : "before", + timestamptz_to_str(recoveryStopTime)); + else if (recoveryTarget == RECOVERY_TARGET_LSN) + snprintf(reason, sizeof(reason), + "%s LSN %X/%X\n", + recoveryStopAfter ? "after" : "before", + LSN_FORMAT_ARGS(recoveryStopLSN)); + else if (recoveryTarget == RECOVERY_TARGET_NAME) + snprintf(reason, sizeof(reason), + "at restore point \"%s\"", + recoveryStopName); + else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) + snprintf(reason, sizeof(reason), "reached consistency"); + else + snprintf(reason, sizeof(reason), "no recovery target specified"); + + return pstrdup(reason); +} + +/* + * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED. + * + * endOfRecovery is true if the recovery target is reached and + * the paused state starts at the end of recovery because of + * recovery_target_action=pause, and false otherwise. + */ +static void +recoveryPausesHere(bool endOfRecovery) +{ + /* Don't pause unless users can connect! 
*/ + if (!LocalHotStandbyActive) + return; + + /* Don't pause after standby promotion has been triggered */ + if (LocalPromoteIsTriggered) + return; + + if (endOfRecovery) + ereport(LOG, + (errmsg("pausing at the end of recovery"), + errhint("Execute pg_wal_replay_resume() to promote."))); + else + ereport(LOG, + (errmsg("recovery has paused"), + errhint("Execute pg_wal_replay_resume() to continue."))); + + /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */ + while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) + { + HandleStartupProcInterrupts(); + if (CheckForStandbyTrigger()) + return; + + /* + * If recovery pause is requested then set it paused. While we are in + * the loop, user might resume and pause again so set this every time. + */ + ConfirmRecoveryPaused(); + + /* + * We wait on a condition variable that will wake us as soon as the + * pause ends, but we use a timeout so we can check the above exit + * condition periodically too. + */ + ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000, + WAIT_EVENT_RECOVERY_PAUSE); + } + ConditionVariableCancelSleep(); +} + +/* + * When recovery_min_apply_delay is set, we wait long enough to make sure + * certain record types are applied at least that interval behind the primary. + * + * Returns true if we waited. + * + * Note that the delay is calculated between the WAL record log time and + * the current time on standby. We would prefer to keep track of when this + * standby received each WAL record, which would allow a more consistent + * approach and one not affected by time synchronisation issues, but that + * is significantly more effort and complexity for little actual gain in + * usability. 
+ */ +static bool +recoveryApplyDelay(XLogReaderState *record) +{ + uint8 xact_info; + TimestampTz xtime; + TimestampTz delayUntil; + long msecs; + + /* nothing to do if no delay configured */ + if (recovery_min_apply_delay <= 0) + return false; + + /* no delay is applied on a database not yet consistent */ + if (!reachedConsistency) + return false; + + /* nothing to do if crash recovery is requested */ + if (!ArchiveRecoveryRequested) + return false; + + /* + * Is it a COMMIT record? + * + * We deliberately choose not to delay aborts since they have no effect on + * MVCC. We already allow replay of records that don't have a timestamp, + * so there is already opportunity for issues caused by early conflicts on + * standbys. + */ + if (XLogRecGetRmid(record) != RM_XACT_ID) + return false; + + xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (xact_info != XLOG_XACT_COMMIT && + xact_info != XLOG_XACT_COMMIT_PREPARED) + return false; + + if (!getRecordTimestamp(record, &xtime)) + return false; + + delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); + + /* + * Exit without arming the latch if it's already past time to apply this + * record + */ + msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil); + if (msecs <= 0) + return false; + + while (true) + { + ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + + /* + * This might change recovery_min_apply_delay or the trigger file's + * location. + */ + HandleStartupProcInterrupts(); + + if (CheckForStandbyTrigger()) + break; + + /* + * Recalculate delayUntil as recovery_min_apply_delay could have + * changed while waiting in this loop. + */ + delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); + + /* + * Wait for difference between GetCurrentTimestamp() and delayUntil. 
+ */ + msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), + delayUntil); + + if (msecs <= 0) + break; + + elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs); + + (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + msecs, + WAIT_EVENT_RECOVERY_APPLY_DELAY); + } + return true; +} + +/* + * Get the current state of the recovery pause request. + * + * The shared state is read while holding XLogRecoveryCtl->info_lck. + */ +RecoveryPauseState +GetRecoveryPauseState(void) +{ + RecoveryPauseState state; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + state = XLogRecoveryCtl->recoveryPauseState; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return state; +} + +/* + * Set the recovery pause state. + * + * If recovery pause is requested, sets the recovery pause state to + * 'pause requested', but only if it is currently 'not paused'; a pause that + * is already requested or confirmed is left untouched. Otherwise, sets it + * to 'not paused' to resume the recovery and wakes up anyone waiting on + * recoveryNotPausedCV. The recovery pause will be confirmed by + * ConfirmRecoveryPaused(). + */ +void +SetRecoveryPause(bool recoveryPause) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + + if (!recoveryPause) + XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; + else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED) + XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED; + + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + if (!recoveryPause) + ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV); +} + +/* + * Confirm the recovery pause by setting the recovery pause state to + * RECOVERY_PAUSED, if a pause has been requested. Does nothing otherwise. + */ +static void +ConfirmRecoveryPaused(void) +{ + /* If recovery pause is requested then set it paused */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED) + XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED; + SpinLockRelease(&XLogRecoveryCtl->info_lck); +} + + +/* + * Attempt to read the next XLOG record.
+ * + * Before first call, the reader needs to be positioned to the first record + * by calling XLogBeginRead(). + * + * If no valid record is available, returns NULL, or fails if emode is PANIC. + * (emode must be either PANIC, LOG). In standby mode, retries until a valid + * record is available. + */ +static XLogRecord * +ReadRecord(XLogReaderState *xlogreader, int emode, + bool fetching_ckpt, TimeLineID replayTLI) +{ + XLogRecord *record; + XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; + + /* Pass through parameters to XLogPageRead */ + private->fetching_ckpt = fetching_ckpt; + private->emode = emode; + private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr); + private->replayTLI = replayTLI; + + /* This is the first attempt to read this page. */ + lastSourceFailed = false; + + for (;;) + { + char *errormsg; + + record = XLogReadRecord(xlogreader, &errormsg); + if (record == NULL) + { + /* + * If we are not in standby mode and find that WAL ends in an + * incomplete record, keep track of that record. After recovery is + * done, we'll write a record to indicate to downstream WAL readers + * that that portion is to be ignored. + */ + if (!StandbyMode && + !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr)) + { + abortedRecPtr = xlogreader->abortedRecPtr; + missingContrecPtr = xlogreader->missingContrecPtr; + } + + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + + /* + * We only end up here without a message when XLogPageRead() + * failed - in that case we already logged something. In + * StandbyMode that only happens if we have been triggered, so we + * shouldn't loop anymore in that case. + */ + if (errormsg) + ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), + (errmsg_internal("%s", errormsg) /* already translated */ )); + } + + /* + * Check page TLI is one of the expected values.
+ */ + else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs)) + { + char fname[MAXFNAMELEN]; + XLogSegNo segno; + int32 offset; + + XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size); + offset = XLogSegmentOffset(xlogreader->latestPagePtr, + wal_segment_size); + XLogFileName(fname, xlogreader->seg.ws_tli, segno, + wal_segment_size); + ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), + (errmsg("unexpected timeline ID %u in log segment %s, offset %u", + xlogreader->latestPageTLI, + fname, + offset))); + record = NULL; + } + + if (record) + { + /* Great, got a record */ + return record; + } + else + { + /* No valid record available from this source */ + lastSourceFailed = true; + + /* + * If archive recovery was requested, but we were still doing + * crash recovery, switch to archive recovery and retry using the + * offline archive. We have now replayed all the valid WAL in + * pg_wal, so we are presumably now consistent. + * + * We require that there's at least some valid WAL present in + * pg_wal, however (!fetching_ckpt). We could recover using the + * WAL from the archive, even if pg_wal is completely empty, but + * we'd have no idea how far we'd have to replay to reach + * consistency. So err on the safe side and give up. + */ + if (!InArchiveRecovery && ArchiveRecoveryRequested && + !fetching_ckpt) + { + ereport(DEBUG1, + (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery"))); + InArchiveRecovery = true; + if (StandbyModeRequested) + StandbyMode = true; + + SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI); + minRecoveryPoint = xlogreader->EndRecPtr; + minRecoveryPointTLI = replayTLI; + + CheckRecoveryConsistency(); + + /* + * Before we retry, reset lastSourceFailed and currentSource + * so that we will check the archive next. + */ + lastSourceFailed = false; + currentSource = XLOG_FROM_ANY; + + continue; + } + + /* In standby mode, loop back to retry. Otherwise, give up. 
*/ + if (StandbyMode && !CheckForStandbyTrigger()) + continue; + else + return NULL; + } + } +} + +/* + * Read the XLOG page containing targetPagePtr into readBuf (if not read + * already). Returns number of bytes read, if the page is read successfully, + * or -1 in case of errors. When errors occur, they are ereport'ed, but only + * if they have not been previously reported. + * + * This is responsible for restoring files from archive as needed, as well + * as for waiting for the requested WAL record to arrive in standby mode. + * + * 'emode' specifies the log level used for reporting "file not found" or + * "end of WAL" situations in archive recovery, or in standby mode when a + * trigger file is found. If set to WARNING or below, XLogPageRead() returns + * -1 in those situations; on higher log levels the ereport() won't + * return. + * + * In standby mode, if after a successful return of XLogPageRead() the + * caller finds the record it's interested in to be broken, it should + * ereport the error with the level determined by + * emode_for_corrupt_record(), and then set lastSourceFailed + * and call XLogPageRead() again with the same arguments. This lets + * XLogPageRead() try fetching the record from another source, or + * sleep and retry. + */ +static int +XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *readBuf) +{ + XLogPageReadPrivate *private = + (XLogPageReadPrivate *) xlogreader->private_data; + int emode = private->emode; + uint32 targetPageOff; + XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; + int r; + + XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size); + targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size); + + /* + * See if we need to switch to a new segment because the requested record + * is not in the currently open one.
+ */ + if (readFile >= 0 && + !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size)) + { + /* + * Request a restartpoint if we've replayed too much xlog since the + * last one. + */ + if (ArchiveRecoveryRequested && IsUnderPostmaster) + { + if (XLogCheckpointNeeded(readSegNo)) + { + (void) GetRedoRecPtr(); + if (XLogCheckpointNeeded(readSegNo)) + RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); + } + } + + close(readFile); + readFile = -1; + readSource = XLOG_FROM_ANY; + } + + XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size); + +retry: + /* See if we need to retrieve more data */ + if (readFile < 0 || + (readSource == XLOG_FROM_STREAM && + flushedUpto < targetPagePtr + reqLen)) + { + if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen, + private->randAccess, + private->fetching_ckpt, + targetRecPtr, + private->replayTLI, + xlogreader->EndRecPtr)) + { + if (readFile >= 0) + close(readFile); + readFile = -1; + readLen = 0; + readSource = XLOG_FROM_ANY; + + return -1; + } + } + + /* + * At this point, we have the right segment open and if we're streaming we + * know the requested record is in it. + */ + Assert(readFile != -1); + + /* + * If the current segment is being streamed from the primary, calculate + * how much of the current page we have received already. We know the + * requested record has been received, but this is for the benefit of + * future calls, to allow quick exit at the top of this function. 
+ */ + if (readSource == XLOG_FROM_STREAM) + { + if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ)) + readLen = XLOG_BLCKSZ; + else + readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) - + targetPageOff; + } + else + readLen = XLOG_BLCKSZ; + + /* Read the requested page */ + readOff = targetPageOff; + + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); + r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); + if (r != XLOG_BLCKSZ) + { + char fname[MAXFNAMELEN]; + int save_errno = errno; + + pgstat_report_wait_end(); + XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); + if (r < 0) + { + errno = save_errno; + ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), + (errcode_for_file_access(), + errmsg("could not read from log segment %s, offset %u: %m", + fname, readOff))); + } + else + ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read from log segment %s, offset %u: read %d of %zu", + fname, readOff, r, (Size) XLOG_BLCKSZ))); + goto next_record_is_invalid; + } + pgstat_report_wait_end(); + + Assert(targetSegNo == readSegNo); + Assert(targetPageOff == readOff); + Assert(reqLen <= readLen); + + xlogreader->seg.ws_tli = curFileTLI; + + /* + * Check the page header immediately, so that we can retry immediately if + * it's not valid. This may seem unnecessary, because ReadPageInternal() + * validates the page header anyway, and would propagate the failure up to + * ReadRecord(), which would retry. However, there's a corner case with + * continuation records, if a record is split across two pages such that + * we would need to read the two pages from different sources. For + * example, imagine a scenario where a streaming replica is started up, + * and replay reaches a record that's split across two WAL segments. The + * first page is only available locally, in pg_wal, because it's already + * been recycled on the primary. 
The second page, however, is not present + * in pg_wal, and we should stream it from the primary. There is a + * recycled WAL segment present in pg_wal, with garbage contents, however. + * We would read the first page from the local WAL segment, but when + * reading the second page, we would read the bogus, recycled, WAL + * segment. If we didn't catch that case here, we would never recover, + * because ReadRecord() would retry reading the whole record from the + * beginning. + * + * Of course, this only catches errors in the page header, which is what + * happens in the case of a recycled WAL segment. Other kinds of errors or + * corruption still has the same problem. But this at least fixes the + * common case, which can happen as part of normal operation. + * + * Validating the page header is cheap enough that doing it twice + * shouldn't be a big deal from a performance point of view. + * + * When not in standby mode, an invalid page header should cause recovery + * to end, not retry reading the page, so we don't need to validate the + * page header here for the retry. Instead, ReadPageInternal() is + * responsible for the validation. + */ + if (StandbyMode && + !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf)) + { + /* + * Emit this error right now then retry this page immediately. Use + * errmsg_internal() because the message was already translated. 
+ */ + if (xlogreader->errormsg_buf[0]) + ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), + (errmsg_internal("%s", xlogreader->errormsg_buf))); + + /* reset any error XLogReaderValidatePageHeader() might have set */ + xlogreader->errormsg_buf[0] = '\0'; + goto next_record_is_invalid; + } + + return readLen; + +next_record_is_invalid: + lastSourceFailed = true; + + if (readFile >= 0) + close(readFile); + readFile = -1; + readLen = 0; + readSource = XLOG_FROM_ANY; + + /* In standby-mode, keep trying */ + if (StandbyMode) + goto retry; + else + return -1; +} + +/* + * Open the WAL segment containing WAL location 'RecPtr'. + * + * The segment can be fetched via restore_command, or via walreceiver having + * streamed the record, or it can already be present in pg_wal. Checking + * pg_wal is mainly for crash recovery, but it will be polled in standby mode + * too, in case someone copies a new segment directly to pg_wal. That is not + * documented or recommended, though. + * + * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should + * prepare to read WAL starting from RedoStartLSN after this. + * + * 'RecPtr' might not point to the beginning of the record we're interested + * in, it might also point to the page or segment header. In that case, + * 'tliRecPtr' is the position of the WAL record we're interested in. It is + * used to decide which timeline to stream the requested WAL from. + * + * 'replayLSN' is the current replay LSN, so that if we scan for new + * timelines, we can reject a switch to a timeline that branched off before + * this point. + * + * If the record is not immediately available, the function returns false + * if we're not in standby mode. In standby mode, waits for it to become + * available. + * + * When the requested record becomes available, the function opens the file + * containing it (if not open already), and returns true. 
When end of standby + * mode is triggered by the user, and there is no more WAL available, returns + * false. + */ +static bool +WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, + bool fetching_ckpt, XLogRecPtr tliRecPtr, + TimeLineID replayTLI, XLogRecPtr replayLSN) +{ + static TimestampTz last_fail_time = 0; + TimestampTz now; + bool streaming_reply_sent = false; + + /*------- + * Standby mode is implemented by a state machine: + * + * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just + * pg_wal (XLOG_FROM_PG_WAL) + * 2. Check trigger file + * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) + * 4. Rescan timelines + * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1. + * + * Failure to read from the current source advances the state machine to + * the next state. + * + * 'currentSource' indicates the current state. There are no currentSource + * values for "check trigger", "rescan timelines", and "sleep" states, + * those actions are taken when reading from the previous source fails, as + * part of advancing to the next state. + * + * If standby mode is turned off while reading WAL from stream, we move + * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching + * the files (which would be required at end of recovery, e.g., timeline + * history file) from archive or pg_wal. We don't need to kill WAL receiver + * here because it's already stopped when standby mode is turned off at + * the end of recovery. + *------- + */ + if (!InArchiveRecovery) + currentSource = XLOG_FROM_PG_WAL; + else if (currentSource == XLOG_FROM_ANY || + (!StandbyMode && currentSource == XLOG_FROM_STREAM)) + { + lastSourceFailed = false; + currentSource = XLOG_FROM_ARCHIVE; + } + + for (;;) + { + XLogSource oldSource = currentSource; + bool startWalReceiver = false; + + /* + * First check if we failed to read from the current source, and + * advance the state machine if so. 
The failure to read might've + * happened outside this function, e.g when a CRC check fails on a + * record, or within this loop. + */ + if (lastSourceFailed) + { + switch (currentSource) + { + case XLOG_FROM_ARCHIVE: + case XLOG_FROM_PG_WAL: + + /* + * Check to see if the trigger file exists. Note that we + * do this only after failure, so when you create the + * trigger file, we still finish replaying as much as we + * can from archive and pg_wal before failover. + */ + if (StandbyMode && CheckForStandbyTrigger()) + { + XLogShutdownWalRcv(); + return false; + } + + /* + * Not in standby mode, and we've now tried the archive + * and pg_wal. + */ + if (!StandbyMode) + return false; + + /* + * Move to XLOG_FROM_STREAM state, and set to start a + * walreceiver if necessary. + */ + currentSource = XLOG_FROM_STREAM; + startWalReceiver = true; + break; + + case XLOG_FROM_STREAM: + + /* + * Failure while streaming. Most likely, we got here + * because streaming replication was terminated, or + * promotion was triggered. But we also get here if we + * find an invalid record in the WAL streamed from the + * primary, in which case something is seriously wrong. + * There's little chance that the problem will just go + * away, but PANIC is not good for availability either, + * especially in hot standby mode. So, we treat that the + * same as disconnection, and retry from archive/pg_wal + * again. The WAL in the archive should be identical to + * what was streamed, so it's unlikely that it helps, but + * one can hope... + */ + + /* + * We should be able to move to XLOG_FROM_STREAM only in + * standby mode. + */ + Assert(StandbyMode); + + /* + * Before we leave XLOG_FROM_STREAM state, make sure that + * walreceiver is not active, so that it won't overwrite + * WAL that we restore from archive. + */ + if (WalRcvStreaming()) + XLogShutdownWalRcv(); + + /* + * Before we sleep, re-scan for possible new timelines if + * we were requested to recover to the latest timeline. 
+ */ + if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) + { + if (rescanLatestTimeLine(replayTLI, replayLSN)) + { + currentSource = XLOG_FROM_ARCHIVE; + break; + } + } + + /* + * XLOG_FROM_STREAM is the last state in our state + * machine, so we've exhausted all the options for + * obtaining the requested WAL. We're going to loop back + * and retry from the archive, but if it hasn't been long + * since last attempt, sleep wal_retrieve_retry_interval + * milliseconds to avoid busy-waiting. + */ + now = GetCurrentTimestamp(); + if (!TimestampDifferenceExceeds(last_fail_time, now, + wal_retrieve_retry_interval)) + { + long wait_time; + + wait_time = wal_retrieve_retry_interval - + TimestampDifferenceMilliseconds(last_fail_time, now); + + elog(LOG, "waiting for WAL to become available at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + + (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | + WL_EXIT_ON_PM_DEATH, + wait_time, + WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL); + ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + now = GetCurrentTimestamp(); + + /* Handle interrupt signals of startup process */ + HandleStartupProcInterrupts(); + } + last_fail_time = now; + currentSource = XLOG_FROM_ARCHIVE; + break; + + default: + elog(ERROR, "unexpected WAL source %d", currentSource); + } + } + else if (currentSource == XLOG_FROM_PG_WAL) + { + /* + * We just successfully read a file in pg_wal. We prefer files in + * the archive over ones in pg_wal, so try the next file again + * from the archive first. + */ + if (InArchiveRecovery) + currentSource = XLOG_FROM_ARCHIVE; + } + + if (currentSource != oldSource) + elog(DEBUG2, "switched WAL source from %s to %s after %s", + xlogSourceNames[oldSource], xlogSourceNames[currentSource], + lastSourceFailed ? "failure" : "success"); + + /* + * We've now handled possible failure. Try to read from the chosen + * source. 
+ */ + lastSourceFailed = false; + + switch (currentSource) + { + case XLOG_FROM_ARCHIVE: + case XLOG_FROM_PG_WAL: + + /* + * WAL receiver must not be running when reading WAL from + * archive or pg_wal. + */ + Assert(!WalRcvStreaming()); + + /* Close any old file we might have open. */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + /* Reset curFileTLI if random fetch. */ + if (randAccess) + curFileTLI = 0; + + /* + * Try to restore the file from archive, or read an existing + * file from pg_wal. + */ + readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, + currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY : + currentSource); + if (readFile >= 0) + return true; /* success! */ + + /* + * Nope, not found in archive or pg_wal. + */ + lastSourceFailed = true; + break; + + case XLOG_FROM_STREAM: + { + bool havedata; + + /* + * We should be able to move to XLOG_FROM_STREAM only in + * standby mode. + */ + Assert(StandbyMode); + + /* + * First, shutdown walreceiver if its restart has been + * requested -- but no point if we're already slated for + * starting it. + */ + if (pendingWalRcvRestart && !startWalReceiver) + { + XLogShutdownWalRcv(); + + /* + * Re-scan for possible new timelines if we were + * requested to recover to the latest timeline. + */ + if (recoveryTargetTimeLineGoal == + RECOVERY_TARGET_TIMELINE_LATEST) + rescanLatestTimeLine(replayTLI, replayLSN); + + startWalReceiver = true; + } + pendingWalRcvRestart = false; + + /* + * Launch walreceiver if needed. + * + * If fetching_ckpt is true, RecPtr points to the initial + * checkpoint location. In that case, we use RedoStartLSN + * as the streaming start position instead of RecPtr, so + * that when we later jump backwards to start redo at + * RedoStartLSN, we will have the logs streamed already. 
+ */ + if (startWalReceiver && + PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0) + { + XLogRecPtr ptr; + TimeLineID tli; + + if (fetching_ckpt) + { + ptr = RedoStartLSN; + tli = RedoStartTLI; + } + else + { + ptr = RecPtr; + + /* + * Use the record begin position to determine the + * TLI, rather than the position we're reading. + */ + tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); + + if (curFileTLI > 0 && tli < curFileTLI) + elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", + LSN_FORMAT_ARGS(tliRecPtr), + tli, curFileTLI); + } + curFileTLI = tli; + SetInstallXLogFileSegmentActive(); + RequestXLogStreaming(tli, ptr, PrimaryConnInfo, + PrimarySlotName, + wal_receiver_create_temp_slot); + flushedUpto = 0; + } + + /* + * Check if WAL receiver is active or wait to start up. + */ + if (!WalRcvStreaming()) + { + lastSourceFailed = true; + break; + } + + /* + * Walreceiver is active, so see if new data has arrived. + * + * We only advance XLogReceiptTime when we obtain fresh + * WAL from walreceiver and observe that we had already + * processed everything before the most recent "chunk" + * that it flushed to disk. In steady state where we are + * keeping up with the incoming data, XLogReceiptTime will + * be updated on each cycle. When we are behind, + * XLogReceiptTime will not advance, so the grace time + * allotted to conflicting queries will decrease. + */ + if (RecPtr < flushedUpto) + havedata = true; + else + { + XLogRecPtr latestChunkStart; + + flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI); + if (RecPtr < flushedUpto && receiveTLI == curFileTLI) + { + havedata = true; + if (latestChunkStart <= RecPtr) + { + XLogReceiptTime = GetCurrentTimestamp(); + SetCurrentChunkStartTime(XLogReceiptTime); + } + } + else + havedata = false; + } + if (havedata) + { + /* + * Great, streamed far enough. Open the file if it's + * not open already. 
Also read the timeline history + * file if we haven't initialized timeline history + * yet; it should be streamed over and present in + * pg_wal by now. Use XLOG_FROM_STREAM so that source + * info is set correctly and XLogReceiptTime isn't + * changed. + * + * NB: We must set readTimeLineHistory based on + * recoveryTargetTLI, not receiveTLI. Normally they'll + * be the same, but if recovery_target_timeline is + * 'latest' and archiving is configured, then it's + * possible that we managed to retrieve one or more + * new timeline history files from the archive, + * updating recoveryTargetTLI. + */ + if (readFile < 0) + { + if (!expectedTLEs) + expectedTLEs = readTimeLineHistory(recoveryTargetTLI); + readFile = XLogFileRead(readSegNo, PANIC, + receiveTLI, + XLOG_FROM_STREAM, false); + Assert(readFile >= 0); + } + else + { + /* just make sure source info is correct... */ + readSource = XLOG_FROM_STREAM; + XLogReceiptSource = XLOG_FROM_STREAM; + return true; + } + break; + } + + /* + * Data not here yet. Check for trigger, then wait for + * walreceiver to wake us up when new WAL arrives. + */ + if (CheckForStandbyTrigger()) + { + /* + * Note that we don't "return false" immediately here. + * After being triggered, we still want to replay all + * the WAL that was already streamed. It's in pg_wal + * now, so we just treat this as a failure, and the + * state machine will move on to replay the streamed + * WAL from pg_wal, and then recheck the trigger and + * exit replay. + */ + lastSourceFailed = true; + break; + } + + /* + * Since we have replayed everything we have received so + * far and are about to start waiting for more WAL, let's + * tell the upstream server our replay location now so + * that pg_stat_replication doesn't show stale + * information. + */ + if (!streaming_reply_sent) + { + WalRcvForceReply(); + streaming_reply_sent = true; + } + + /* + * Wait for more WAL to arrive. 
Time out after 5 seconds + * to react to a trigger file promptly and to check if the + * WAL receiver is still active. + */ + (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | + WL_EXIT_ON_PM_DEATH, + 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM); + ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + break; + } + + default: + elog(ERROR, "unexpected WAL source %d", currentSource); + } + + /* + * Check for recovery pause here so that we can confirm more quickly + * that a requested pause has actually taken effect. + */ + if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != + RECOVERY_NOT_PAUSED) + recoveryPausesHere(false); + + /* + * This possibly-long loop needs to handle interrupts of startup + * process. + */ + HandleStartupProcInterrupts(); + } + + return false; /* not reached */ +} + + +/* + * Determine what log level should be used to report a corrupt WAL record + * in the current WAL page, previously read by XLogPageRead(). + * + * 'emode' is the error mode that would be used to report a file-not-found + * or legitimate end-of-WAL situation. Generally, we use it as-is, but if + * we're retrying the exact same record that we've tried previously, only + * complain the first time to keep the noise down. However, we only do this when + * reading from pg_wal, because we don't expect any invalid records in archive + * or in records streamed from the primary. Files in the archive should be complete, + * and we should never hit the end of WAL because we stop and wait for more WAL + * to arrive before replaying it. + * + * NOTE: This function remembers the RecPtr value it was last called with, + * to suppress repeated messages about the same record. Only call this when + * you are about to ereport(), or you might cause a later message to be + * erroneously suppressed. 
+ */ +static int +emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) +{ + static XLogRecPtr lastComplaint = 0; + + if (readSource == XLOG_FROM_PG_WAL && emode == LOG) + { + if (RecPtr == lastComplaint) + emode = DEBUG1; + else + lastComplaint = RecPtr; + } + return emode; +} + + +/* + * Subroutine to try to fetch and validate a prior checkpoint record. + * + * whichChkpt identifies the checkpoint (merely for reporting purposes). + * 1 for "primary", 0 for "other" (backup_label) + */ +static XLogRecord * +ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, + int whichChkpt, bool report, TimeLineID replayTLI) +{ + XLogRecord *record; + uint8 info; + + Assert(xlogreader != NULL); + + if (!XRecOffIsValid(RecPtr)) + { + if (!report) + return NULL; + + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid primary checkpoint link in control file"))); + break; + default: + ereport(LOG, + (errmsg("invalid checkpoint link in backup_label file"))); + break; + } + return NULL; + } + + XLogBeginRead(xlogreader, RecPtr); + record = ReadRecord(xlogreader, LOG, true, replayTLI); + + if (record == NULL) + { + if (!report) + return NULL; + + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid checkpoint record"))); + break; + } + return NULL; + } + if (record->xl_rmid != RM_XLOG_ID) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid resource manager ID in primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid resource manager ID in checkpoint record"))); + break; + } + return NULL; + } + info = record->xl_info & ~XLR_INFO_MASK; + if (info != XLOG_CHECKPOINT_SHUTDOWN && + info != XLOG_CHECKPOINT_ONLINE) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid xl_info in primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid xl_info in checkpoint 
record"))); + break; + } + return NULL; + } + if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint)) + { + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid length of primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid length of checkpoint record"))); + break; + } + return NULL; + } + return record; +} + +/* + * Scan for new timelines that might have appeared in the archive since we + * started recovery. + * + * If there are any, the function changes recovery target TLI to the latest + * one and returns 'true'. + */ +static bool +rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN) +{ + List *newExpectedTLEs; + bool found; + ListCell *cell; + TimeLineID newtarget; + TimeLineID oldtarget = recoveryTargetTLI; + TimeLineHistoryEntry *currentTle = NULL; + + newtarget = findNewestTimeLine(recoveryTargetTLI); + if (newtarget == recoveryTargetTLI) + { + /* No new timelines found */ + return false; + } + + /* + * Determine the list of expected TLIs for the new TLI + */ + + newExpectedTLEs = readTimeLineHistory(newtarget); + + /* + * If the current timeline is not part of the history of the new timeline, + * we cannot proceed to it. + */ + found = false; + foreach(cell, newExpectedTLEs) + { + currentTle = (TimeLineHistoryEntry *) lfirst(cell); + + if (currentTle->tli == recoveryTargetTLI) + { + found = true; + break; + } + } + if (!found) + { + ereport(LOG, + (errmsg("new timeline %u is not a child of database system timeline %u", + newtarget, + replayTLI))); + return false; + } + + /* + * The current timeline was found in the history file, but check that the + * next timeline was forked off from it *after* the current recovery + * location. 
+ */ + if (currentTle->end < replayLSN) + { + ereport(LOG, + (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X", + newtarget, + replayTLI, + LSN_FORMAT_ARGS(replayLSN)))); + return false; + } + + /* The new timeline history seems valid. Switch target */ + recoveryTargetTLI = newtarget; + list_free_deep(expectedTLEs); + expectedTLEs = newExpectedTLEs; + + /* + * As in StartupXLOG(), try to ensure we have all the history files + * between the old target and new target in pg_wal. + */ + restoreTimeLineHistoryFiles(oldtarget + 1, newtarget); + + ereport(LOG, + (errmsg("new target timeline is %u", + recoveryTargetTLI))); + + return true; +} + + +/* + * Open a logfile segment for reading (during recovery). + * + * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive. + * Otherwise, it's assumed to be already available in pg_wal. + */ +static int +XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, + XLogSource source, bool notfoundOk) +{ + char xlogfname[MAXFNAMELEN]; + char activitymsg[MAXFNAMELEN + 16]; + char path[MAXPGPATH]; + int fd; + + XLogFileName(xlogfname, tli, segno, wal_segment_size); + + switch (source) + { + case XLOG_FROM_ARCHIVE: + /* Report recovery progress in PS display */ + snprintf(activitymsg, sizeof(activitymsg), "waiting for %s", + xlogfname); + set_ps_display(activitymsg); + + if (!RestoreArchivedFile(path, xlogfname, + "RECOVERYXLOG", + wal_segment_size, + InRedo)) + return -1; + break; + + case XLOG_FROM_PG_WAL: + case XLOG_FROM_STREAM: + XLogFilePath(path, tli, segno, wal_segment_size); + break; + + default: + elog(ERROR, "invalid XLogFileRead source %d", source); + } + + /* + * If the segment was fetched from archival storage, replace the existing + * xlog segment (if any) with the archival version. 
+ */ + if (source == XLOG_FROM_ARCHIVE) + { + Assert(!IsInstallXLogFileSegmentActive()); + KeepFileRestoredFromArchive(path, xlogfname); + + /* + * Set path to point at the new file in pg_wal. + */ + snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname); + } + + fd = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (fd >= 0) + { + /* Success! */ + curFileTLI = tli; + + /* Report recovery progress in PS display */ + snprintf(activitymsg, sizeof(activitymsg), "recovering %s", + xlogfname); + set_ps_display(activitymsg); + + /* Track source of data in assorted state variables */ + readSource = source; + XLogReceiptSource = source; + /* In FROM_STREAM case, caller tracks receipt time, not me */ + if (source != XLOG_FROM_STREAM) + XLogReceiptTime = GetCurrentTimestamp(); + + return fd; + } + if (errno != ENOENT || !notfoundOk) /* unexpected failure? */ + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return -1; +} + +/* + * Open a logfile segment for reading (during recovery). + * + * This version searches for the segment with any TLI listed in expectedTLEs. + */ +static int +XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source) +{ + char path[MAXPGPATH]; + ListCell *cell; + int fd; + List *tles; + + /* + * Loop looking for a suitable timeline ID: we might need to read any of + * the timelines listed in expectedTLEs. + * + * We expect curFileTLI on entry to be the TLI of the preceding file in + * sequence, or 0 if there was no predecessor. We do not allow curFileTLI + * to go backwards; this prevents us from picking up the wrong file when a + * parent timeline extends to higher segment numbers than the child we + * want to read. + * + * If we haven't read the timeline history file yet, read it now, so that + * we know which TLIs to scan. We don't save the list in expectedTLEs, + * however, unless we actually find a valid segment. 
That way if there is + * neither a timeline history file nor a WAL segment in the archive, and + * streaming replication is set up, we'll read the timeline history file + * streamed from the primary when we start streaming, instead of + * recovering with a dummy history generated here. + */ + if (expectedTLEs) + tles = expectedTLEs; + else + tles = readTimeLineHistory(recoveryTargetTLI); + + foreach(cell, tles) + { + TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell); + TimeLineID tli = hent->tli; + + if (tli < curFileTLI) + break; /* don't bother looking at too-old TLIs */ + + /* + * Skip scanning the timeline ID that the logfile segment to read + * doesn't belong to + */ + if (hent->begin != InvalidXLogRecPtr) + { + XLogSegNo beginseg = 0; + + XLByteToSeg(hent->begin, beginseg, wal_segment_size); + + /* + * The logfile segment that doesn't belong to the timeline is + * older or newer than the segment that the timeline started or + * ended at, respectively. It's sufficient to check only the + * starting segment of the timeline here. Since the timelines are + * scanned in descending order in this loop, any segments newer + * than the ending segment should belong to newer timeline and + * have already been read before. So it's not necessary to check + * the ending segment of the timeline here. + */ + if (segno < beginseg) + continue; + } + + if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE) + { + fd = XLogFileRead(segno, emode, tli, + XLOG_FROM_ARCHIVE, true); + if (fd != -1) + { + elog(DEBUG1, "got WAL segment from archive"); + if (!expectedTLEs) + expectedTLEs = tles; + return fd; + } + } + + if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL) + { + fd = XLogFileRead(segno, emode, tli, + XLOG_FROM_PG_WAL, true); + if (fd != -1) + { + if (!expectedTLEs) + expectedTLEs = tles; + return fd; + } + } + } + + /* Couldn't find it. 
For simplicity, complain about front timeline */ + XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size); + errno = ENOENT; + ereport(emode, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return -1; +} + +/* + * Set flag to signal the walreceiver to restart. (The startup process calls + * this on noticing a relevant configuration change.) + */ +void +StartupRequestWalReceiverRestart(void) +{ + if (currentSource == XLOG_FROM_STREAM && WalRcvRunning()) + { + ereport(LOG, + (errmsg("WAL receiver process shutdown requested"))); + + pendingWalRcvRestart = true; + } +} + + +/* + * Has a standby promotion already been triggered? + * + * Unlike CheckForStandbyTrigger(), this works in any process + * that's connected to shared memory. + */ +bool +PromoteIsTriggered(void) +{ + /* + * We check shared state each time only until a standby promotion is + * triggered. We can't trigger a promotion again, so there's no need to + * keep checking after the shared variable has once been seen true. + */ + if (LocalPromoteIsTriggered) + return true; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return LocalPromoteIsTriggered; +} + +static void +SetPromoteIsTriggered(void) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->SharedPromoteIsTriggered = true; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + /* + * Mark the recovery pause state as 'not paused' because the paused state + * ends and promotion continues if a promotion is triggered while recovery + * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly + * return 'paused' while a promotion is ongoing. + */ + SetRecoveryPause(false); + + LocalPromoteIsTriggered = true; +} + +/* + * Check to see whether the user-specified trigger file exists and whether a + * promote request has arrived. If either condition holds, return true. 
+ */ +static bool +CheckForStandbyTrigger(void) +{ + struct stat stat_buf; + + if (LocalPromoteIsTriggered) + return true; + + if (IsPromoteSignaled() && CheckPromoteSignal()) + { + ereport(LOG, (errmsg("received promote request"))); + RemovePromoteSignalFiles(); + ResetPromoteSignaled(); + SetPromoteIsTriggered(); + return true; + } + + if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0) + return false; + + if (stat(PromoteTriggerFile, &stat_buf) == 0) + { + ereport(LOG, + (errmsg("promote trigger file found: %s", PromoteTriggerFile))); + unlink(PromoteTriggerFile); + SetPromoteIsTriggered(); + return true; + } + else if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat promote trigger file \"%s\": %m", + PromoteTriggerFile))); + + return false; +} + +/* + * Remove the files signaling a standby promotion request. + */ +void +RemovePromoteSignalFiles(void) +{ + unlink(PROMOTE_SIGNAL_FILE); +} + +/* + * Check to see if a promote request has arrived. + */ +bool +CheckPromoteSignal(void) +{ + struct stat stat_buf; + + if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0) + return true; + + return false; +} + +/* + * Wake up startup process to replay newly arrived WAL, or to notice that + * failover has been requested. + */ +void +WakeupRecovery(void) +{ + SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); +} + +/* + * Schedule a walreceiver wakeup in the main recovery loop. + */ +void +XLogRequestWalReceiverReply(void) +{ + doRequestWalReceiverReply = true; +} + +/* + * Is HotStandby active yet? This is only important in special backends + * since normal backends won't ever be able to connect until this returns + * true. Postmaster knows this by way of signal, not via shared memory. + * + * Unlike testing standbyState, this works in any process that's connected to + * shared memory. (And note that standbyState alone doesn't tell the truth + * anyway.) 
+ */ +bool +HotStandbyActive(void) +{ + /* + * We check shared state each time only until Hot Standby is active. We + * can't de-activate Hot Standby, so there's no need to keep checking + * after the shared variable has once been seen true. + */ + if (LocalHotStandbyActive) + return true; + else + { + /* spinlock is essential on machines with weak memory ordering! */ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return LocalHotStandbyActive; + } +} + +/* + * Like HotStandbyActive(), but to be used only in WAL replay code, + * where we don't need to ask any other process what the state is. + */ +static bool +HotStandbyActiveInReplay(void) +{ + Assert(AmStartupProcess() || !IsPostmasterEnvironment); + return LocalHotStandbyActive; +} + +/* + * Get latest redo apply position. + * + * Exported to allow WALReceiver to read the pointer directly. + */ +XLogRecPtr +GetXLogReplayRecPtr(TimeLineID *replayTLI) +{ + XLogRecPtr recptr; + TimeLineID tli; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + recptr = XLogRecoveryCtl->lastReplayedEndRecPtr; + tli = XLogRecoveryCtl->lastReplayedTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + if (replayTLI) + *replayTLI = tli; + return recptr; +} + + +/* + * Get position of the last applied record, or of the record being applied. + * + * This is different from GetXLogReplayRecPtr() in that if a WAL + * record is currently being applied, this includes that record. + */ +XLogRecPtr +GetCurrentReplayRecPtr(TimeLineID *replayEndTLI) +{ + XLogRecPtr recptr; + TimeLineID tli; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + recptr = XLogRecoveryCtl->replayEndRecPtr; + tli = XLogRecoveryCtl->replayEndTLI; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + if (replayEndTLI) + *replayEndTLI = tli; + return recptr; +} + +/* + * Save timestamp of latest processed commit/abort record. 
+ * + * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be + * seen by processes other than the startup process. Note in particular + * that CreateRestartPoint is executed in the checkpointer. + */ +static void +SetLatestXTime(TimestampTz xtime) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->recoveryLastXTime = xtime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); +} + +/* + * Fetch timestamp of latest processed commit/abort record. + */ +TimestampTz +GetLatestXTime(void) +{ + TimestampTz xtime; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + xtime = XLogRecoveryCtl->recoveryLastXTime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return xtime; +} + +/* + * Save timestamp of the next chunk of WAL records to apply. + * + * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be + * seen by all backends. + */ +static void +SetCurrentChunkStartTime(TimestampTz xtime) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + XLogRecoveryCtl->currentChunkStartTime = xtime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); +} + +/* + * Fetch timestamp of the next chunk of WAL records to apply. + * Startup process maintains an accurate local copy in XLogReceiptTime. + */ +TimestampTz +GetCurrentChunkReplayStartTime(void) +{ + TimestampTz xtime; + + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + xtime = XLogRecoveryCtl->currentChunkStartTime; + SpinLockRelease(&XLogRecoveryCtl->info_lck); + + return xtime; +} + +/* + * Returns time of receipt of current chunk of XLOG data, as well as + * whether it was received from streaming replication or from archives. + */ +void +GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream) +{ + /* + * This must be executed in the startup process, since we don't export the + * relevant state to shared memory. 
+ */ + Assert(InRecovery); + + *rtime = XLogReceiptTime; + *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM); +} + +/* + * Note that text field supplied is a parameter name and does not require + * translation + */ +void +RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue) +{ + if (currValue < minValue) + { + if (HotStandbyActiveInReplay()) + { + bool warned_for_promote = false; + + ereport(WARNING, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("hot standby is not possible because of insufficient parameter settings"), + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue))); + + SetRecoveryPause(true); + + ereport(LOG, + (errmsg("recovery has paused"), + errdetail("If recovery is unpaused, the server will shut down."), + errhint("You can then restart the server after making the necessary configuration changes."))); + + while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) + { + HandleStartupProcInterrupts(); + + if (CheckForStandbyTrigger()) + { + if (!warned_for_promote) + ereport(WARNING, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("promotion is not possible because of insufficient parameter settings"), + + /* + * Repeat the detail from above so it's easy to find + * in the log. + */ + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue), + errhint("Restart the server after making the necessary configuration changes."))); + warned_for_promote = true; + } + + /* + * If recovery pause is requested then set it paused. While + * we are in the loop, user might resume and pause again so + * set this every time. + */ + ConfirmRecoveryPaused(); + + /* + * We wait on a condition variable that will wake us as soon + * as the pause ends, but we use a timeout so we can check the + * above conditions periodically too. 
+ */ + ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000, + WAIT_EVENT_RECOVERY_PAUSE); + } + ConditionVariableCancelSleep(); + } + + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("recovery aborted because of insufficient parameter settings"), + /* Repeat the detail from above so it's easy to find in the log. */ + errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", + param_name, + currValue, + minValue), + errhint("You can restart the server after making the necessary configuration changes."))); + } +} diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 90e1c48390..54d5f20734 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -20,7 +20,7 @@ #include #include "access/timeline.h" -#include "access/xlog.h" +#include "access/xlogrecovery.h" #include "access/xlog_internal.h" #include "access/xlogutils.h" #include "miscadmin.h" @@ -46,8 +46,8 @@ bool ignore_invalid_pages = false; * process you're running in, use RecoveryInProgress() but only after shared * memory startup and lock initialization. * - * This is updated from xlog.c, but lives here because it's mostly read by - * WAL redo functions. + * This is updated from xlog.c and xlogrecovery.c, but lives here because + * it's mostly read by WAL redo functions. 
*/ bool InRecovery = false; diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 23f691cd47..4488e3a443 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -38,6 +38,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" +#include "access/xlogrecovery.h" #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 32992bafa0..735fed490b 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -95,6 +95,7 @@ #include "access/transam.h" #include "access/xlog.h" +#include "access/xlogrecovery.h" #include "catalog/pg_control.h" #include "common/file_perm.h" #include "common/ip.h" diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index 9bae16bfc7..29cf8f18e1 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -20,6 +20,7 @@ #include "postgres.h" #include "access/xlog.h" +#include "access/xlogrecovery.h" #include "access/xlogutils.h" #include "libpq/pqsignal.h" #include "miscadmin.h" diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c index 4d71e71f68..c29e82307f 100644 --- a/src/backend/replication/logical/logicalfuncs.c +++ b/src/backend/replication/logical/logicalfuncs.c @@ -19,6 +19,7 @@ #include "access/xact.h" #include "access/xlog_internal.h" +#include "access/xlogrecovery.h" #include "access/xlogutils.h" #include "catalog/pg_type.h" #include "fmgr.h" diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index ae6316d908..5149ebccb0 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -14,6 +14,7 @@ #include "access/htup_details.h" #include "access/xlog_internal.h" +#include "access/xlogrecovery.h" #include "access/xlogutils.h" 
#include "funcapi.h" #include "miscadmin.h" diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index b39fce8c23..ceaff097b9 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -56,6 +56,7 @@ #include "access/transam.h" #include "access/xlog_internal.h" #include "access/xlogarchive.h" +#include "access/xlogrecovery.h" #include "catalog/pg_authid.h" #include "catalog/pg_type.h" #include "common/ip.h" diff --git a/src/backend/replication/walreceiverfuncs.c b/src/backend/replication/walreceiverfuncs.c index c50728ea22..90798b9d53 100644 --- a/src/backend/replication/walreceiverfuncs.c +++ b/src/backend/replication/walreceiverfuncs.c @@ -23,6 +23,7 @@ #include <signal.h> #include "access/xlog_internal.h" +#include "access/xlogrecovery.h" #include "pgstat.h" #include "postmaster/startup.h" #include "replication/walreceiver.h" diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 655760fee3..a1dadd4c6a 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -55,6 +55,7 @@ #include "access/xact.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" +#include "access/xlogrecovery.h" #include "access/xlogutils.h" #include "catalog/pg_authid.h" #include "catalog/pg_type.h" diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 9f26e41c46..cd4ebe2fc5 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -22,6 +22,7 @@ #include "access/subtrans.h" #include "access/syncscan.h" #include "access/twophase.h" +#include "access/xlogrecovery.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -119,6 +120,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, PredicateLockShmemSize()); size = add_size(size, ProcGlobalShmemSize()); size = add_size(size, XLOGShmemSize()); + size = add_size(size, XLogRecoveryShmemSize()); size = 
add_size(size, CLOGShmemSize()); size = add_size(size, CommitTsShmemSize()); size = add_size(size, SUBTRANSShmemSize()); @@ -241,6 +243,7 @@ CreateSharedMemoryAndSemaphores(void) * Set up xlog, clog, and buffers */ XLOGShmemInit(); + XLogRecoveryShmemInit(); CLOGShmemInit(); CommitTsShmemInit(); SUBTRANSShmemInit(); diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 87ac0f74b2..27361ac861 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -20,6 +20,7 @@ #include "access/twophase.h" #include "access/xact.h" #include "access/xloginsert.h" +#include "access/xlogrecovery.h" #include "access/xlogutils.h" #include "miscadmin.h" #include "pgstat.h" diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index 543f691f2d..e161d57761 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -29,6 +29,7 @@ #include "portability/instr_time.h" #include "postmaster/bgwriter.h" #include "storage/bufmgr.h" +#include "storage/fd.h" #include "storage/ipc.h" #include "storage/md.h" #include "utils/hsearch.h" diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index e2fe219aa8..568ac62c2a 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -41,6 +41,7 @@ #include "access/twophase.h" #include "access/xact.h" #include "access/xlog_internal.h" +#include "access/xlogrecovery.h" #include "catalog/namespace.h" #include "catalog/pg_authid.h" #include "catalog/storage.h" diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index a4b1c1286f..4b45ac64db 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -11,13 +11,11 @@ #ifndef XLOG_H #define XLOG_H -#include "access/rmgr.h" #include "access/xlogdefs.h" #include "access/xlogreader.h" #include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" -#include "storage/fd.h" /* Sync methods */ @@ -28,36 +26,10 @@ 
#define SYNC_METHOD_OPEN_DSYNC 4 /* for O_DSYNC */ extern int sync_method; -/* - * Recovery target type. - * Only set during a Point in Time recovery, not when in standby mode. - */ -typedef enum -{ - RECOVERY_TARGET_UNSET, - RECOVERY_TARGET_XID, - RECOVERY_TARGET_TIME, - RECOVERY_TARGET_NAME, - RECOVERY_TARGET_LSN, - RECOVERY_TARGET_IMMEDIATE -} RecoveryTargetType; - -/* - * Recovery target TimeLine goal - */ -typedef enum -{ - RECOVERY_TARGET_TIMELINE_CONTROLFILE, - RECOVERY_TARGET_TIMELINE_LATEST, - RECOVERY_TARGET_TIMELINE_NUMERIC -} RecoveryTargetTimeLineGoal; - extern XLogRecPtr ProcLastRecPtr; extern XLogRecPtr XactLastRecEnd; extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; -extern bool reachedConsistency; - /* these variables are GUC parameters related to XLOG */ extern int wal_segment_size; extern int min_wal_size_mb; @@ -77,34 +49,10 @@ extern bool wal_recycle; extern bool *wal_consistency_checking; extern char *wal_consistency_checking_string; extern bool log_checkpoints; -extern char *recoveryRestoreCommand; -extern char *recoveryEndCommand; -extern char *archiveCleanupCommand; -extern bool recoveryTargetInclusive; -extern int recoveryTargetAction; -extern int recovery_min_apply_delay; -extern char *PrimaryConnInfo; -extern char *PrimarySlotName; -extern bool wal_receiver_create_temp_slot; extern bool track_wal_io_timing; -/* indirectly set via GUC system */ -extern TransactionId recoveryTargetXid; -extern char *recovery_target_time_string; -extern const char *recoveryTargetName; -extern XLogRecPtr recoveryTargetLSN; -extern RecoveryTargetType recoveryTarget; -extern char *PromoteTriggerFile; -extern RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal; -extern TimeLineID recoveryTargetTLIRequested; -extern TimeLineID recoveryTargetTLI; - extern int CheckPointSegments; -/* option set locally in startup process only when signal files exist */ -extern bool StandbyModeRequested; -extern bool StandbyMode; - /* Archive modes */ typedef enum ArchiveMode { 
@@ -138,14 +86,6 @@ typedef enum RecoveryState RECOVERY_STATE_DONE /* currently in production */ } RecoveryState; -/* Recovery pause states */ -typedef enum RecoveryPauseState -{ - RECOVERY_NOT_PAUSED, /* pause not requested */ - RECOVERY_PAUSE_REQUESTED, /* pause requested, but not yet paused */ - RECOVERY_PAUSED /* recovery is paused */ -} RecoveryPauseState; - extern PGDLLIMPORT int wal_level; /* Is WAL archiving enabled (always or only while server is running normally)? */ @@ -274,19 +214,10 @@ extern void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli); extern bool RecoveryInProgress(void); extern RecoveryState GetRecoveryState(void); -extern bool HotStandbyActive(void); -extern bool HotStandbyActiveInReplay(void); extern bool XLogInsertAllowed(void); -extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream); -extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); extern XLogRecPtr GetXLogInsertRecPtr(void); extern XLogRecPtr GetXLogWriteRecPtr(void); -extern RecoveryPauseState GetRecoveryPauseState(void); -extern void SetRecoveryPause(bool recoveryPause); -extern TimestampTz GetLatestXTime(void); -extern TimestampTz GetCurrentChunkReplayStartTime(void); -extern void UpdateControlFile(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); extern bool DataChecksumsEnabled(void); @@ -310,19 +241,23 @@ extern XLogRecPtr GetInsertRecPtr(void); extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI); extern TimeLineID GetWALInsertionTimeLine(void); extern XLogRecPtr GetLastImportantRecPtr(void); -extern void RemovePromoteSignalFiles(void); -extern bool PromoteIsTriggered(void); -extern bool CheckPromoteSignal(void); -extern void WakeupRecovery(void); extern void SetWalWriterSleeping(bool sleeping); -extern void StartupRequestWalReceiverRestart(void); -extern void XLogRequestWalReceiverReply(void); - extern void assign_max_wal_size(int newval, void *extra); extern void 
assign_checkpoint_completion_target(double newval, void *extra); +/* + * Routines used by xlogrecovery.c to call back into xlog.c during recovery. + */ +extern void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI); +extern bool XLogCheckpointNeeded(XLogSegNo new_segno); +extern void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI); +extern void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli); +extern void SetInstallXLogFileSegmentActive(void); +extern bool IsInstallXLogFileSegmentActive(void); +extern void XLogShutdownWalRcv(void); + /* * Routines to start, stop, and get status of a base backup. */ diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h new file mode 100644 index 0000000000..75a0f5fe5e --- /dev/null +++ b/src/include/access/xlogrecovery.h @@ -0,0 +1,157 @@ +/* + * xlogrecovery.h + * + * Functions for WAL recovery and standby mode + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xlogrecovery.h + */ +#ifndef XLOGRECOVERY_H +#define XLOGRECOVERY_H + +#include "access/xlogreader.h" +#include "catalog/pg_control.h" +#include "lib/stringinfo.h" +#include "utils/timestamp.h" + +/* + * Recovery target type. + * Only set during a Point in Time recovery, not when in standby mode. 
+ */ +typedef enum +{ + RECOVERY_TARGET_UNSET, + RECOVERY_TARGET_XID, + RECOVERY_TARGET_TIME, + RECOVERY_TARGET_NAME, + RECOVERY_TARGET_LSN, + RECOVERY_TARGET_IMMEDIATE +} RecoveryTargetType; + +/* + * Recovery target TimeLine goal + */ +typedef enum +{ + RECOVERY_TARGET_TIMELINE_CONTROLFILE, + RECOVERY_TARGET_TIMELINE_LATEST, + RECOVERY_TARGET_TIMELINE_NUMERIC +} RecoveryTargetTimeLineGoal; + +/* Recovery pause states */ +typedef enum RecoveryPauseState +{ + RECOVERY_NOT_PAUSED, /* pause not requested */ + RECOVERY_PAUSE_REQUESTED, /* pause requested, but not yet paused */ + RECOVERY_PAUSED /* recovery is paused */ +} RecoveryPauseState; + +/* User-settable GUC parameters */ +extern bool recoveryTargetInclusive; +extern int recoveryTargetAction; +extern int recovery_min_apply_delay; +extern char *PrimaryConnInfo; +extern char *PrimarySlotName; +extern char *recoveryRestoreCommand; +extern char *recoveryEndCommand; +extern char *archiveCleanupCommand; + +/* indirectly set via GUC system */ +extern TransactionId recoveryTargetXid; +extern char *recovery_target_time_string; +extern TimestampTz recoveryTargetTime; +extern const char *recoveryTargetName; +extern XLogRecPtr recoveryTargetLSN; +extern RecoveryTargetType recoveryTarget; +extern char *PromoteTriggerFile; +extern bool wal_receiver_create_temp_slot; +extern RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal; +extern TimeLineID recoveryTargetTLIRequested; +extern TimeLineID recoveryTargetTLI; + +/* Have we already reached a consistent database state? */ +extern bool reachedConsistency; + +/* Are we currently in standby mode? */ +extern bool StandbyMode; + +extern Size XLogRecoveryShmemSize(void); +extern void XLogRecoveryShmemInit(void); + +extern void InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdownPtr, bool *haveBackupLabel, bool *haveTblspcMap); +extern void PerformWalRecovery(void); + +/* + * FinishWalRecovery() returns this. 
It contains information about the point + * where recovery ended, and why it ended. + */ +typedef struct +{ + /* + * Information about the last valid or applied record, after which new WAL + * can be appended. 'lastRec' is the position where the last record + * starts, and 'endOfLog' is its end. 'lastPage' is a copy of the last + * partial page that contains endOfLog (or NULL if endOfLog is exactly at + * page boundary). 'lastPageBeginPtr' is the position where the last page + * begins. + * + * endOfLogTLI is the TLI in the filename of the XLOG segment containing + * the last applied record. It could be different from lastRecTLI, if + * there was a timeline switch in that segment, and we were reading the + * old WAL from a segment belonging to a higher timeline. + */ + XLogRecPtr lastRec; /* start of last valid or applied record */ + TimeLineID lastRecTLI; + XLogRecPtr endOfLog; /* end of last valid or applied record */ + TimeLineID endOfLogTLI; + + XLogRecPtr lastPageBeginPtr; /* LSN of page that contains endOfLog */ + char *lastPage; /* copy of the last page, up to endOfLog */ + + /* + * abortedRecPtr is the start pointer of a broken record at end of WAL + * when recovery completes; missingContrecPtr is the location of the first + * contrecord that went missing. See CreateOverwriteContrecordRecord for + * details. + */ + XLogRecPtr abortedRecPtr; + XLogRecPtr missingContrecPtr; + + /* short human-readable string describing why recovery ended */ + char *recoveryStopReason; + + /* + * If standby or recovery signal file was found, these flags are set + * accordingly. 
+ */ + bool standby_signal_file_found; + bool recovery_signal_file_found; +} EndOfWalRecoveryInfo; + +extern EndOfWalRecoveryInfo *FinishWalRecovery(void); +extern void ShutdownWalRecovery(void); +extern void RemovePromoteSignalFiles(void); + +extern bool HotStandbyActive(void); +extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); +extern RecoveryPauseState GetRecoveryPauseState(void); +extern void SetRecoveryPause(bool recoveryPause); +extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream); +extern TimestampTz GetLatestXTime(void); +extern TimestampTz GetCurrentChunkReplayStartTime(void); +extern XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI); + +extern bool PromoteIsTriggered(void); +extern bool CheckPromoteSignal(void); +extern void WakeupRecovery(void); + +extern void StartupRequestWalReceiverRestart(void); +extern void XLogRequestWalReceiverReply(void); + +extern void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue); + +extern void xlog_outdesc(StringInfo buf, XLogReaderState *record); + +#endif /* XLOGRECOVERY_H */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index bfb7802f2d..15684f53ba 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -607,6 +607,7 @@ EndDirectModify_function EndForeignInsert_function EndForeignModify_function EndForeignScan_function +EndOfWalRecoveryInfo EndSampleScan_function EnumItem EolType @@ -2945,6 +2946,7 @@ XLogRecordBlockCompressHeader XLogRecordBlockHeader XLogRecordBlockImageHeader XLogRecordBuffer +XLogRecoveryCtlData XLogRedoAction XLogSegNo XLogSource