2022-02-16 08:30:38 +01:00
/*-------------------------------------------------------------------------
*
* xlogrecovery . c
* Functions for WAL recovery , standby mode
*
* This source file contains functions controlling WAL recovery .
* InitWalRecovery ( ) initializes the system for crash or archive recovery ,
* or standby mode , depending on configuration options and the state of
* the control file and possible backup label file . PerformWalRecovery ( )
* performs the actual WAL replay , calling the rmgr - specific redo routines .
* EndWalRecovery ( ) performs end - of - recovery checks and cleanup actions ,
* and prepares information needed to initialize the WAL for writes . In
* addition to these three main functions , there are a bunch of functions
* for interrogating recovery state and controlling the recovery process .
*
*
* Portions Copyright ( c ) 1996 - 2022 , PostgreSQL Global Development Group
* Portions Copyright ( c ) 1994 , Regents of the University of California
*
* src / backend / access / transam / xlogrecovery . c
*
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
# include "postgres.h"
# include <ctype.h>
# include <math.h>
# include <time.h>
# include <sys/stat.h>
# include <sys/time.h>
# include <unistd.h>
# include "access/timeline.h"
# include "access/transam.h"
# include "access/xact.h"
# include "access/xlog_internal.h"
# include "access/xlogarchive.h"
2022-04-07 09:28:40 +02:00
# include "access/xlogprefetcher.h"
2022-02-16 08:30:38 +01:00
# include "access/xlogreader.h"
# include "access/xlogrecovery.h"
# include "access/xlogutils.h"
# include "catalog/pg_control.h"
# include "commands/tablespace.h"
# include "miscadmin.h"
# include "pgstat.h"
# include "postmaster/bgwriter.h"
# include "postmaster/startup.h"
# include "replication/basebackup.h"
# include "replication/walreceiver.h"
# include "storage/fd.h"
# include "storage/ipc.h"
# include "storage/latch.h"
# include "storage/pmsignal.h"
# include "storage/proc.h"
# include "storage/procarray.h"
# include "storage/spin.h"
# include "utils/builtins.h"
# include "utils/guc.h"
# include "utils/ps_status.h"
# include "utils/pg_rusage.h"
/*
 * Unsupported old recovery command file names (relative to $PGDATA).
 *
 * NOTE(review): presumably these names are kept only so that leftover files
 * from older releases can be detected -- confirm at the use sites, which are
 * outside this part of the file.
 */
# define RECOVERY_COMMAND_FILE "recovery.conf"
# define RECOVERY_COMMAND_DONE "recovery.done"
/*
* GUC support
*/
/*
 * Table of accepted names for the recovery target action setting, each paired
 * with its RECOVERY_TARGET_ACTION_* value.  The last entry has a NULL name
 * (presumably the list terminator -- confirm against the code that walks
 * config_enum_entry arrays).
 *
 * NOTE(review): the third member is false in every row; its meaning comes
 * from struct config_enum_entry, which is declared elsewhere.
 */
const struct config_enum_entry recovery_target_action_options [ ] = {
{ " pause " , RECOVERY_TARGET_ACTION_PAUSE , false } ,
{ " promote " , RECOVERY_TARGET_ACTION_PROMOTE , false } ,
{ " shutdown " , RECOVERY_TARGET_ACTION_SHUTDOWN , false } ,
{ NULL , 0 , false }
} ;
/* options formerly taken from recovery.conf for archive recovery */
char * recoveryRestoreCommand = NULL ;
char * recoveryEndCommand = NULL ;
char * archiveCleanupCommand = NULL ;
/*
 * Which kind of recovery target was requested.  InitWalRecovery() uses this
 * to pick the matching target variable below (XID, time, name or LSN) when
 * it logs the start of point-in-time recovery.  Starts as
 * RECOVERY_TARGET_UNSET.
 */
RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET ;
bool recoveryTargetInclusive = true ;
/* one of the RECOVERY_TARGET_ACTION_* values; defaults to PAUSE */
int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE ;
/* target values; only the one selected by recoveryTarget is reported */
TransactionId recoveryTargetXid ;
char * recovery_target_time_string ;
TimestampTz recoveryTargetTime ;
const char * recoveryTargetName ;
XLogRecPtr recoveryTargetLSN ;
int recovery_min_apply_delay = 0 ;
/* options formerly taken from recovery.conf for XLOG streaming */
char * PrimaryConnInfo = NULL ;
char * PrimarySlotName = NULL ;
char * PromoteTriggerFile = NULL ;
bool wal_receiver_create_temp_slot = false ;
/*
 * recoveryTargetTimeLineGoal: what the user requested, if any
 *
 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
 *
 * recoveryTargetTLI: the currently understood target timeline.
 * InitWalRecovery() first sets it to the larger of the control file's
 * minRecoveryPointTLI and checkPointCopy.ThisTimeLineID; it can change
 * afterwards.
 * NOTE(review): the original text trailed off after "changes"; presumably the
 * value moves when a newer timeline is detected (see rescanLatestTimeLine) --
 * confirm.
 *
 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
 * the timelines of its known parents, newest first (so recoveryTargetTLI is
 * always the first list member).  Only these TLIs are expected to be seen in
 * the WAL segments we read, and indeed only these TLIs will be considered as
 * candidate WAL files to open at all.
 *
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 * (This is not necessarily the same as the timeline from which we are
 * replaying WAL, which StartupXLOG calls replayTLI, because we could be
 * scanning data that was copied from an ancestor timeline when the current
 * file was created.)  During a sequential scan we do not allow this value
 * to decrease.
 */
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST ;
TimeLineID recoveryTargetTLIRequested = 0 ;
TimeLineID recoveryTargetTLI = 0 ;
static List * expectedTLEs ;
static TimeLineID curFileTLI ;
/*
 * When ArchiveRecoveryRequested is set, archive recovery was requested,
 * i.e. signal files were present.  When InArchiveRecovery is set, we are
 * currently recovering using offline XLOG archives.  These variables are only
 * valid in the startup process.
 *
 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 * currently performing crash recovery using only XLOG files in pg_wal, but
 * will switch to using offline XLOG archives as soon as we reach the end of
 * WAL in pg_wal.
 *
 * InitWalRecovery() turns InArchiveRecovery on right away in two cases: when
 * a backup label file was read, or when archive recovery was requested and
 * the control file already says how far to replay (or the state is
 * DB_SHUTDOWNED).
 */
bool ArchiveRecoveryRequested = false ;
bool InArchiveRecovery = false ;
/*
 * When StandbyModeRequested is set, standby mode was requested, i.e.
 * standby.signal file was present.  When StandbyMode is set, we are currently
 * in standby mode.  These variables are only valid in the startup process.
 * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery:
 * InitWalRecovery() sets StandbyMode at the same points where it sets
 * InArchiveRecovery, provided StandbyModeRequested is true.
 */
static bool StandbyModeRequested = false ;
bool StandbyMode = false ;
/* was a signal file present at startup? */
static bool standby_signal_file_found = false ;
static bool recovery_signal_file_found = false ;
/*
 * CheckPointLoc is the position of the checkpoint record that determines
 * where to start the replay.  It comes from the backup label file or the
 * control file.
 *
 * RedoStartLSN is the checkpoint's REDO location, also from the backup label
 * file or the control file.  In standby mode, XLOG streaming usually starts
 * from the position where an invalid record was found.  But if we fail to
 * read even the initial checkpoint record, we use the REDO location instead
 * of the checkpoint location as the start position of XLOG streaming.
 * Otherwise we would have to jump backwards to the REDO location after
 * reading the checkpoint record, because the REDO record can precede the
 * checkpoint record.
 *
 * CheckPointTLI and RedoStartTLI are the timelines that go with those two
 * locations.  When no backup label is used, InitWalRecovery() fills all four
 * from the control file (checkPoint, checkPointCopy.redo and
 * checkPointCopy.ThisTimeLineID).
 */
static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr ;
static TimeLineID CheckPointTLI = 0 ;
static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr ;
static TimeLineID RedoStartTLI = 0 ;
/*
 * Local copy of the SharedHotStandbyActive variable kept in
 * XLogRecoveryCtlData.  False actually means "not known, need to check the
 * shared state".
 */
static bool LocalHotStandbyActive = false ;
/*
 * Local copy of the SharedPromoteIsTriggered variable kept in
 * XLogRecoveryCtlData.  False actually means "not known, need to check the
 * shared state".
 */
static bool LocalPromoteIsTriggered = false ;
/* Has the recovery code requested a walreceiver wakeup? */
static bool doRequestWalReceiverReply ;
/*
 * XLogReader object used to parse the WAL records.  Allocated in
 * InitWalRecovery() with XLogPageRead as its page_read callback.
 */
static XLogReaderState * xlogreader = NULL ;
2022-04-07 09:28:40 +02:00
/*
 * XLogPrefetcher object used to consume WAL records with read-ahead.
 * Created in InitWalRecovery() on top of xlogreader.
 */
static XLogPrefetcher * xlogprefetcher = NULL ;
2022-02-16 08:30:38 +01:00
/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
/*
 * Per-reader private state: InitWalRecovery() allocates one zeroed instance
 * and hands it to XLogReaderAllocate() as the reader's private data.
 */
typedef struct XLogPageReadPrivate
{
/* elog level given to ReadRecord() (e.g. LOG); NOTE(review): presumably the
 * level at which read problems are reported -- confirm in XLogPageRead */
int emode ;
bool fetching_ckpt ; /* are we fetching a checkpoint record? */
/* NOTE(review): presumably set when the read is not a sequential
 * continuation of the previous one -- confirm where it is assigned */
bool randAccess ;
/* timeline given to ReadRecord() as replayTLI */
TimeLineID replayTLI ;
} XLogPageReadPrivate ;
/* flag to tell XLogPageRead that we have started replaying */
static bool InRedo = false ;
/*
* Codes indicating where we got a WAL file from during recovery , or where
* to attempt to get one .
*/
typedef enum
{
	XLOG_FROM_ANY = 0,			/* no preference: read WAL from any source */
	XLOG_FROM_ARCHIVE = 1,		/* fetched with restore_command */
	XLOG_FROM_PG_WAL = 2,		/* file already present in pg_wal */
	XLOG_FROM_STREAM = 3		/* received by streaming from the primary */
} XLogSource;

/*
 * Printable name of each XLogSource, indexed by the enum value; used for
 * debugging output.
 */
static const char *const xlogSourceNames[] = {
	[XLOG_FROM_ANY] = " any ",
	[XLOG_FROM_ARCHIVE] = " archive ",
	[XLOG_FROM_PG_WAL] = " pg_wal ",
	[XLOG_FROM_STREAM] = " stream "
};
/*
 * readFile is -1 or a kernel FD for the log file segment that's currently
 * open for reading.  readSegNo identifies the segment.  readOff is the offset
 * of the page just read, readLen indicates how much of it has been read into
 * readBuf, and readSource indicates where we got the currently open file from.
 *
 * Initially no segment is open: readFile starts at -1 and readSource at
 * XLOG_FROM_ANY.
 *
 * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
 * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
 * worthwhile, since the XLOG is not read by general-purpose sessions.
 */
static int readFile = - 1 ;
static XLogSegNo readSegNo = 0 ;
static uint32 readOff = 0 ;
static uint32 readLen = 0 ;
static XLogSource readSource = XLOG_FROM_ANY ;
/*
 * Keeps track of which source we're currently reading from.  This is
 * different from readSource in that this is always set, even when we don't
 * currently have a WAL file open.  If lastSourceFailed is set, our last
 * attempt to read from currentSource failed, and we should try another source
 * next.
 *
 * pendingWalRcvRestart is set when a config change occurs that requires a
 * walreceiver restart.  This is only valid in XLOG_FROM_STREAM state.
 *
 * At startup currentSource is XLOG_FROM_ANY and both flags are false.
 */
static XLogSource currentSource = XLOG_FROM_ANY ;
static bool lastSourceFailed = false ;
static bool pendingWalRcvRestart = false ;
/*
 * These variables track when we last obtained some WAL data to process,
 * and where we got it from.  (XLogReceiptSource is initially the same as
 * readSource, but readSource gets reset to zero, i.e. XLOG_FROM_ANY, when we
 * don't have data to process right now.  It is also different from
 * currentSource, which also changes when we try to read from a source and
 * fail, while XLogReceiptSource tracks where we last successfully read some
 * WAL.)
 */
static TimestampTz XLogReceiptTime = 0 ;
static XLogSource XLogReceiptSource = XLOG_FROM_ANY ;
/*
 * Local copy of WalRcv->flushedUpto.
 * NOTE(review): receiveTLI is presumably the timeline that goes with
 * flushedUpto -- confirm where the two are assigned.
 */
static XLogRecPtr flushedUpto = 0 ;
static TimeLineID receiveTLI = 0 ;
/*
 * Copy of minRecoveryPoint, backupStartPoint and backupEndPoint from the
 * control file.
 *
 * In order to reach consistency, we must replay the WAL up to
 * minRecoveryPoint.  If backupEndRequired is true, we must also reach
 * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
 * to backupStartPoint.
 *
 * backupEndRequired is also passed to read_backup_label() by
 * InitWalRecovery(), so a backup label file can set it.
 *
 * Note: In archive recovery, after consistency has been reached, the
 * functions in xlog.c will start updating minRecoveryPoint in the control
 * file.  But this copy of minRecoveryPoint variable reflects the value at the
 * beginning of recovery, and is *not* updated after consistency is reached.
 */
static XLogRecPtr minRecoveryPoint ;
static TimeLineID minRecoveryPointTLI ;
static XLogRecPtr backupStartPoint ;
static XLogRecPtr backupEndPoint ;
static bool backupEndRequired = false ;
/*
 * Have we reached a consistent database state?  In crash recovery, we have
 * to replay all the WAL, so reachedConsistency is never set.  During archive
 * recovery, the database is consistent once minRecoveryPoint is reached.
 *
 * Consistent state means that the system is internally consistent, all
 * the WAL has been replayed up to a certain point, and importantly, there
 * is no trace of later actions on disk.
 */
bool reachedConsistency = false ;
/*
 * Buffers dedicated to consistency checks, each of size BLCKSZ.  Both are
 * palloc'd in InitWalRecovery() and stay NULL until then.
 */
static char * replay_image_masked = NULL ;
static char * primary_image_masked = NULL ;
/*
* Shared - memory state for WAL recovery .
*/
/*
 * Control structure for WAL recovery.  XLogRecoveryShmemInit() obtains it
 * from ShmemInitStruct(), zeroes it on first creation and initializes its
 * spinlock, latch and condition variable.
 */
typedef struct XLogRecoveryCtlData
{
/*
 * SharedHotStandbyActive indicates if we allow hot standby queries to be
 * run.  Protected by info_lck.
 */
bool SharedHotStandbyActive ;
/*
 * SharedPromoteIsTriggered indicates if a standby promotion has been
 * triggered.  Protected by info_lck.
 */
bool SharedPromoteIsTriggered ;
/*
 * recoveryWakeupLatch is used to wake up the startup process to continue
 * WAL replay, if it is waiting for WAL to arrive or failover trigger file
 * to appear.
 *
 * Note that the startup process also uses another latch, its procLatch,
 * to wait for recovery conflict.  Getting rid of recoveryWakeupLatch and
 * signaling the startup process through its procLatch instead would
 * comport better with possible generic signal handlers using that latch.
 * But we should not do that, because the startup process doesn't expect
 * to be woken up by the walreceiver process or the SIGHUP signal handler
 * while it's waiting for recovery conflict.  The separate latches,
 * recoveryWakeupLatch and procLatch, should be used for inter-process
 * communication for WAL replay and recovery conflict, respectively.
 */
Latch recoveryWakeupLatch ;
/*
 * Last record successfully replayed.
 */
XLogRecPtr lastReplayedReadRecPtr ; /* start position */
XLogRecPtr lastReplayedEndRecPtr ; /* end+1 position */
TimeLineID lastReplayedTLI ; /* timeline */
/*
 * When we're currently replaying a record, i.e. in a redo function,
 * replayEndRecPtr points to the end+1 of the record being replayed,
 * otherwise it's equal to lastReplayedEndRecPtr.
 */
XLogRecPtr replayEndRecPtr ;
TimeLineID replayEndTLI ;
/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
TimestampTz recoveryLastXTime ;
/*
 * timestamp of when we started replaying the current chunk of WAL data,
 * only relevant for replication or archive recovery
 */
TimestampTz currentChunkStartTime ;
/*
 * Recovery pause state.
 * NOTE(review): recoveryNotPausedCV is presumably what waiters sleep on
 * until recovery is no longer paused -- confirm at its use sites.
 */
RecoveryPauseState recoveryPauseState ;
ConditionVariable recoveryNotPausedCV ;
slock_t info_lck ; /* locks shared variables shown above */
} XLogRecoveryCtlData ;
/* Pointer to the structure above; assigned by XLogRecoveryShmemInit(). */
static XLogRecoveryCtlData * XLogRecoveryCtl = NULL ;
/*
 * abortedRecPtr is the start pointer of a broken record at end of WAL when
 * recovery completes; missingContrecPtr is the location of the first
 * contrecord that went missing.  See CreateOverwriteContrecordRecord for
 * details.
 */
static XLogRecPtr abortedRecPtr ;
static XLogRecPtr missingContrecPtr ;
/*
 * When recoveryStopsBefore() or recoveryStopsAfter() returns true, it saves
 * information about the stop point here.
 *
 * NOTE(review): recoveryStopAfter presumably records whether the stop point
 * lies after (true) or before (false) the record in question -- confirm in
 * those two functions.
 */
static TransactionId recoveryStopXid ;
static TimestampTz recoveryStopTime ;
static XLogRecPtr recoveryStopLSN ;
static char recoveryStopName [ MAXFNAMELEN ] ;
static bool recoveryStopAfter ;
/* prototypes for local functions */
static void ApplyWalRecord ( XLogReaderState * xlogreader , XLogRecord * record , TimeLineID * replayTLI ) ;
static void readRecoverySignalFile ( void ) ;
static void validateRecoveryParameters ( void ) ;
static bool read_backup_label ( XLogRecPtr * checkPointLoc ,
TimeLineID * backupLabelTLI ,
bool * backupEndRequired , bool * backupFromStandby ) ;
static bool read_tablespace_map ( List * * tablespaces ) ;
static void xlogrecovery_redo ( XLogReaderState * record , TimeLineID replayTLI ) ;
static void CheckRecoveryConsistency ( void ) ;
static void rm_redo_error_callback ( void * arg ) ;
# ifdef WAL_DEBUG
static void xlog_outrec ( StringInfo buf , XLogReaderState * record ) ;
# endif
static void xlog_block_info ( StringInfo buf , XLogReaderState * record ) ;
static void checkTimeLineSwitch ( XLogRecPtr lsn , TimeLineID newTLI ,
TimeLineID prevTLI , TimeLineID replayTLI ) ;
static bool getRecordTimestamp ( XLogReaderState * record , TimestampTz * recordXtime ) ;
static void verifyBackupPageConsistency ( XLogReaderState * record ) ;
static bool recoveryStopsBefore ( XLogReaderState * record ) ;
static bool recoveryStopsAfter ( XLogReaderState * record ) ;
static char * getRecoveryStopReason ( void ) ;
static void recoveryPausesHere ( bool endOfRecovery ) ;
static bool recoveryApplyDelay ( XLogReaderState * record ) ;
static void ConfirmRecoveryPaused ( void ) ;
2022-04-07 09:28:40 +02:00
static XLogRecord * ReadRecord ( XLogPrefetcher * xlogprefetcher ,
int emode , bool fetching_ckpt ,
TimeLineID replayTLI ) ;
2022-02-16 08:30:38 +01:00
static int XLogPageRead ( XLogReaderState * xlogreader , XLogRecPtr targetPagePtr ,
int reqLen , XLogRecPtr targetRecPtr , char * readBuf ) ;
2022-04-07 09:28:40 +02:00
static XLogPageReadResult WaitForWALToBecomeAvailable ( XLogRecPtr RecPtr ,
bool randAccess ,
bool fetching_ckpt ,
XLogRecPtr tliRecPtr ,
TimeLineID replayTLI ,
XLogRecPtr replayLSN ,
bool nonblocking ) ;
2022-02-16 08:30:38 +01:00
static int emode_for_corrupt_record ( int emode , XLogRecPtr RecPtr ) ;
2022-04-07 09:28:40 +02:00
static XLogRecord * ReadCheckpointRecord ( XLogPrefetcher * xlogprefetcher , XLogRecPtr RecPtr ,
2022-02-16 08:30:38 +01:00
int whichChkpt , bool report , TimeLineID replayTLI ) ;
static bool rescanLatestTimeLine ( TimeLineID replayTLI , XLogRecPtr replayLSN ) ;
static int XLogFileRead ( XLogSegNo segno , int emode , TimeLineID tli ,
XLogSource source , bool notfoundOk ) ;
static int XLogFileReadAnyTLI ( XLogSegNo segno , int emode , XLogSource source ) ;
static bool CheckForStandbyTrigger ( void ) ;
static void SetPromoteIsTriggered ( void ) ;
static bool HotStandbyActiveInReplay ( void ) ;
static void SetCurrentChunkStartTime ( TimestampTz xtime ) ;
static void SetLatestXTime ( TimestampTz xtime ) ;
/*
* Initialization of shared memory for WAL recovery
*/
/*
 * Report how many bytes of shared memory WAL recovery needs.
 *
 * The only shared object is the XLogRecoveryCtlData control structure.
 */
Size
XLogRecoveryShmemSize(void)
{
	return sizeof(XLogRecoveryCtlData);
}
void
XLogRecoveryShmemInit ( void )
{
bool found ;
XLogRecoveryCtl = ( XLogRecoveryCtlData * )
ShmemInitStruct ( " XLOG Recovery Ctl " , XLogRecoveryShmemSize ( ) , & found ) ;
if ( found )
return ;
memset ( XLogRecoveryCtl , 0 , sizeof ( XLogRecoveryCtlData ) ) ;
SpinLockInit ( & XLogRecoveryCtl - > info_lck ) ;
InitSharedLatch ( & XLogRecoveryCtl - > recoveryWakeupLatch ) ;
ConditionVariableInit ( & XLogRecoveryCtl - > recoveryNotPausedCV ) ;
}
/*
* Prepare the system for WAL recovery , if needed .
*
* This is called by StartupXLOG ( ) which coordinates the server startup
* sequence . This function analyzes the control file and the backup label
* file , if any , and figures out whether we need to perform crash recovery or
* archive recovery , and how far we need to replay the WAL to reach a
* consistent state .
*
* This doesn ' t yet change the on - disk state , except for creating the symlinks
* from table space map file if any , and for fetching WAL files needed to find
* the checkpoint record . On entry , the caller has already read the control
* file into memory , and passes it as argument . This function updates it to
* reflect the recovery state , and the caller is expected to write it back to
* disk after initializing other subsystems , but before calling
* PerformWalRecovery ( ) .
*
* This initializes some global variables like ArchiveRecoveryRequested ,
* StandbyModeRequested and InRecovery .
*/
void
InitWalRecovery ( ControlFileData * ControlFile , bool * wasShutdown_ptr ,
bool * haveBackupLabel_ptr , bool * haveTblspcMap_ptr )
{
XLogPageReadPrivate * private ;
struct stat st ;
bool wasShutdown ;
XLogRecord * record ;
DBState dbstate_at_startup ;
bool haveTblspcMap = false ;
bool haveBackupLabel = false ;
CheckPoint checkPoint ;
bool backupFromStandby = false ;
dbstate_at_startup = ControlFile - > state ;
/*
* Initialize on the assumption we want to recover to the latest timeline
* that ' s active according to pg_control .
*/
if ( ControlFile - > minRecoveryPointTLI >
ControlFile - > checkPointCopy . ThisTimeLineID )
recoveryTargetTLI = ControlFile - > minRecoveryPointTLI ;
else
recoveryTargetTLI = ControlFile - > checkPointCopy . ThisTimeLineID ;
/*
* Check for signal files , and if so set up state for offline recovery
*/
readRecoverySignalFile ( ) ;
validateRecoveryParameters ( ) ;
if ( ArchiveRecoveryRequested )
{
if ( StandbyModeRequested )
ereport ( LOG ,
( errmsg ( " entering standby mode " ) ) ) ;
else if ( recoveryTarget = = RECOVERY_TARGET_XID )
ereport ( LOG ,
( errmsg ( " starting point-in-time recovery to XID %u " ,
recoveryTargetXid ) ) ) ;
else if ( recoveryTarget = = RECOVERY_TARGET_TIME )
ereport ( LOG ,
( errmsg ( " starting point-in-time recovery to %s " ,
timestamptz_to_str ( recoveryTargetTime ) ) ) ) ;
else if ( recoveryTarget = = RECOVERY_TARGET_NAME )
ereport ( LOG ,
( errmsg ( " starting point-in-time recovery to \" %s \" " ,
recoveryTargetName ) ) ) ;
else if ( recoveryTarget = = RECOVERY_TARGET_LSN )
ereport ( LOG ,
( errmsg ( " starting point-in-time recovery to WAL location (LSN) \" %X/%X \" " ,
LSN_FORMAT_ARGS ( recoveryTargetLSN ) ) ) ) ;
else if ( recoveryTarget = = RECOVERY_TARGET_IMMEDIATE )
ereport ( LOG ,
( errmsg ( " starting point-in-time recovery to earliest consistent point " ) ) ) ;
else
ereport ( LOG ,
( errmsg ( " starting archive recovery " ) ) ) ;
}
/*
* Take ownership of the wakeup latch if we ' re going to sleep during
* recovery .
*/
if ( ArchiveRecoveryRequested )
OwnLatch ( & XLogRecoveryCtl - > recoveryWakeupLatch ) ;
private = palloc0 ( sizeof ( XLogPageReadPrivate ) ) ;
xlogreader =
XLogReaderAllocate ( wal_segment_size , NULL ,
XL_ROUTINE ( . page_read = & XLogPageRead ,
. segment_open = NULL ,
. segment_close = wal_segment_close ) ,
private ) ;
if ( ! xlogreader )
ereport ( ERROR ,
( errcode ( ERRCODE_OUT_OF_MEMORY ) ,
errmsg ( " out of memory " ) ,
errdetail ( " Failed while allocating a WAL reading processor. " ) ) ) ;
xlogreader - > system_identifier = ControlFile - > system_identifier ;
2022-04-07 09:28:40 +02:00
/*
* Set the WAL decode buffer size . This limits how far ahead we can read
* in the WAL .
*/
XLogReaderSetDecodeBuffer ( xlogreader , NULL , wal_decode_buffer_size ) ;
/* Create a WAL prefetcher. */
xlogprefetcher = XLogPrefetcherAllocate ( xlogreader ) ;
2022-02-16 08:30:38 +01:00
/*
* Allocate two page buffers dedicated to WAL consistency checks . We do
* it this way , rather than just making static arrays , for two reasons :
* ( 1 ) no need to waste the storage in most instantiations of the backend ;
* ( 2 ) a static char array isn ' t guaranteed to have any particular
* alignment , whereas palloc ( ) will provide MAXALIGN ' d storage .
*/
replay_image_masked = ( char * ) palloc ( BLCKSZ ) ;
primary_image_masked = ( char * ) palloc ( BLCKSZ ) ;
if ( read_backup_label ( & CheckPointLoc , & CheckPointTLI , & backupEndRequired ,
& backupFromStandby ) )
{
List * tablespaces = NIL ;
/*
* Archive recovery was requested , and thanks to the backup label
* file , we know how far we need to replay to reach consistency . Enter
* archive recovery directly .
*/
InArchiveRecovery = true ;
if ( StandbyModeRequested )
StandbyMode = true ;
/*
* When a backup_label file is present , we want to roll forward from
* the checkpoint it identifies , rather than using pg_control .
*/
2022-04-07 09:28:40 +02:00
record = ReadCheckpointRecord ( xlogprefetcher , CheckPointLoc , 0 , true ,
CheckPointTLI ) ;
2022-02-16 08:30:38 +01:00
if ( record ! = NULL )
{
memcpy ( & checkPoint , XLogRecGetData ( xlogreader ) , sizeof ( CheckPoint ) ) ;
wasShutdown = ( ( record - > xl_info & ~ XLR_INFO_MASK ) = = XLOG_CHECKPOINT_SHUTDOWN ) ;
ereport ( DEBUG1 ,
( errmsg_internal ( " checkpoint record is at %X/%X " ,
LSN_FORMAT_ARGS ( CheckPointLoc ) ) ) ) ;
InRecovery = true ; /* force recovery even if SHUTDOWNED */
/*
* Make sure that REDO location exists . This may not be the case
* if there was a crash during an online backup , which left a
* backup_label around that references a WAL segment that ' s
* already been archived .
*/
if ( checkPoint . redo < CheckPointLoc )
{
2022-04-07 09:28:40 +02:00
XLogPrefetcherBeginRead ( xlogprefetcher , checkPoint . redo ) ;
if ( ! ReadRecord ( xlogprefetcher , LOG , false ,
2022-02-16 08:30:38 +01:00
checkPoint . ThisTimeLineID ) )
ereport ( FATAL ,
( errmsg ( " could not find redo location referenced by checkpoint record " ) ,
errhint ( " If you are restoring from a backup, touch \" %s/recovery.signal \" and add required recovery options. \n "
" If you are not restoring from a backup, try removing the file \" %s/backup_label \" . \n "
" Be careful: removing \" %s/backup_label \" will result in a corrupt cluster if restoring from a backup. " ,
DataDir , DataDir , DataDir ) ) ) ;
}
}
else
{
ereport ( FATAL ,
( errmsg ( " could not locate required checkpoint record " ) ,
errhint ( " If you are restoring from a backup, touch \" %s/recovery.signal \" and add required recovery options. \n "
" If you are not restoring from a backup, try removing the file \" %s/backup_label \" . \n "
" Be careful: removing \" %s/backup_label \" will result in a corrupt cluster if restoring from a backup. " ,
DataDir , DataDir , DataDir ) ) ) ;
wasShutdown = false ; /* keep compiler quiet */
}
/* Read the tablespace_map file if present and create symlinks. */
if ( read_tablespace_map ( & tablespaces ) )
{
ListCell * lc ;
foreach ( lc , tablespaces )
{
tablespaceinfo * ti = lfirst ( lc ) ;
char * linkloc ;
linkloc = psprintf ( " pg_tblspc/%s " , ti - > oid ) ;
/*
* Remove the existing symlink if any and Create the symlink
* under PGDATA .
*/
remove_tablespace_symlink ( linkloc ) ;
if ( symlink ( ti - > path , linkloc ) < 0 )
ereport ( ERROR ,
( errcode_for_file_access ( ) ,
errmsg ( " could not create symbolic link \" %s \" : %m " ,
linkloc ) ) ) ;
pfree ( ti - > oid ) ;
pfree ( ti - > path ) ;
pfree ( ti ) ;
}
/* tell the caller to delete it later */
haveTblspcMap = true ;
}
/* tell the caller to delete it later */
haveBackupLabel = true ;
}
else
{
/*
* If tablespace_map file is present without backup_label file , there
* is no use of such file . There is no harm in retaining it , but it
* is better to get rid of the map file so that we don ' t have any
* redundant file in data directory and it will avoid any sort of
* confusion . It seems prudent though to just rename the file out of
* the way rather than delete it completely , also we ignore any error
* that occurs in rename operation as even if map file is present
* without backup_label file , it is harmless .
*/
if ( stat ( TABLESPACE_MAP , & st ) = = 0 )
{
unlink ( TABLESPACE_MAP_OLD ) ;
if ( durable_rename ( TABLESPACE_MAP , TABLESPACE_MAP_OLD , DEBUG1 ) = = 0 )
ereport ( LOG ,
( errmsg ( " ignoring file \" %s \" because no file \" %s \" exists " ,
TABLESPACE_MAP , BACKUP_LABEL_FILE ) ,
errdetail ( " File \" %s \" was renamed to \" %s \" . " ,
TABLESPACE_MAP , TABLESPACE_MAP_OLD ) ) ) ;
else
ereport ( LOG ,
( errmsg ( " ignoring file \" %s \" because no file \" %s \" exists " ,
TABLESPACE_MAP , BACKUP_LABEL_FILE ) ,
errdetail ( " Could not rename file \" %s \" to \" %s \" : %m. " ,
TABLESPACE_MAP , TABLESPACE_MAP_OLD ) ) ) ;
}
/*
* It ' s possible that archive recovery was requested , but we don ' t
* know how far we need to replay the WAL before we reach consistency .
* This can happen for example if a base backup is taken from a
* running server using an atomic filesystem snapshot , without calling
* pg_start / stop_backup . Or if you just kill a running primary server
* and put it into archive recovery by creating a recovery signal
* file .
*
* Our strategy in that case is to perform crash recovery first ,
* replaying all the WAL present in pg_wal , and only enter archive
* recovery after that .
*
* But usually we already know how far we need to replay the WAL ( up
* to minRecoveryPoint , up to backupEndPoint , or until we see an
* end - of - backup record ) , and we can enter archive recovery directly .
*/
if ( ArchiveRecoveryRequested & &
( ControlFile - > minRecoveryPoint ! = InvalidXLogRecPtr | |
ControlFile - > backupEndRequired | |
ControlFile - > backupEndPoint ! = InvalidXLogRecPtr | |
ControlFile - > state = = DB_SHUTDOWNED ) )
{
InArchiveRecovery = true ;
if ( StandbyModeRequested )
StandbyMode = true ;
}
/* Get the last valid checkpoint record. */
CheckPointLoc = ControlFile - > checkPoint ;
CheckPointTLI = ControlFile - > checkPointCopy . ThisTimeLineID ;
RedoStartLSN = ControlFile - > checkPointCopy . redo ;
RedoStartTLI = ControlFile - > checkPointCopy . ThisTimeLineID ;
2022-04-07 09:28:40 +02:00
record = ReadCheckpointRecord ( xlogprefetcher , CheckPointLoc , 1 , true ,
2022-02-16 08:30:38 +01:00
CheckPointTLI ) ;
if ( record ! = NULL )
{
ereport ( DEBUG1 ,
( errmsg_internal ( " checkpoint record is at %X/%X " ,
LSN_FORMAT_ARGS ( CheckPointLoc ) ) ) ) ;
}
else
{
/*
* We used to attempt to go back to a secondary checkpoint record
* here , but only when not in standby mode . We now just fail if we
* can ' t read the last checkpoint because this allows us to
* simplify processing around checkpoints .
*/
ereport ( PANIC ,
( errmsg ( " could not locate a valid checkpoint record " ) ) ) ;
}
memcpy ( & checkPoint , XLogRecGetData ( xlogreader ) , sizeof ( CheckPoint ) ) ;
wasShutdown = ( ( record - > xl_info & ~ XLR_INFO_MASK ) = = XLOG_CHECKPOINT_SHUTDOWN ) ;
}
/*
* If the location of the checkpoint record is not on the expected
* timeline in the history of the requested timeline , we cannot proceed :
* the backup is not part of the history of the requested timeline .
*/
Assert ( expectedTLEs ) ; /* was initialized by reading checkpoint
* record */
if ( tliOfPointInHistory ( CheckPointLoc , expectedTLEs ) ! =
CheckPointTLI )
{
XLogRecPtr switchpoint ;
/*
* tliSwitchPoint will throw an error if the checkpoint ' s timeline is
* not in expectedTLEs at all .
*/
switchpoint = tliSwitchPoint ( ControlFile - > checkPointCopy . ThisTimeLineID , expectedTLEs , NULL ) ;
ereport ( FATAL ,
( errmsg ( " requested timeline %u is not a child of this server's history " ,
recoveryTargetTLI ) ,
errdetail ( " Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X. " ,
LSN_FORMAT_ARGS ( ControlFile - > checkPoint ) ,
ControlFile - > checkPointCopy . ThisTimeLineID ,
LSN_FORMAT_ARGS ( switchpoint ) ) ) ) ;
}
/*
* The min recovery point should be part of the requested timeline ' s
* history , too .
*/
if ( ! XLogRecPtrIsInvalid ( ControlFile - > minRecoveryPoint ) & &
tliOfPointInHistory ( ControlFile - > minRecoveryPoint - 1 , expectedTLEs ) ! =
ControlFile - > minRecoveryPointTLI )
ereport ( FATAL ,
( errmsg ( " requested timeline %u does not contain minimum recovery point %X/%X on timeline %u " ,
recoveryTargetTLI ,
LSN_FORMAT_ARGS ( ControlFile - > minRecoveryPoint ) ,
ControlFile - > minRecoveryPointTLI ) ) ) ;
ereport ( DEBUG1 ,
( errmsg_internal ( " redo record is at %X/%X; shutdown %s " ,
LSN_FORMAT_ARGS ( checkPoint . redo ) ,
wasShutdown ? " true " : " false " ) ) ) ;
ereport ( DEBUG1 ,
( errmsg_internal ( " next transaction ID: " UINT64_FORMAT " ; next OID: %u " ,
U64FromFullTransactionId ( checkPoint . nextXid ) ,
checkPoint . nextOid ) ) ) ;
ereport ( DEBUG1 ,
( errmsg_internal ( " next MultiXactId: %u; next MultiXactOffset: %u " ,
checkPoint . nextMulti , checkPoint . nextMultiOffset ) ) ) ;
ereport ( DEBUG1 ,
( errmsg_internal ( " oldest unfrozen transaction ID: %u, in database %u " ,
checkPoint . oldestXid , checkPoint . oldestXidDB ) ) ) ;
ereport ( DEBUG1 ,
( errmsg_internal ( " oldest MultiXactId: %u, in database %u " ,
checkPoint . oldestMulti , checkPoint . oldestMultiDB ) ) ) ;
ereport ( DEBUG1 ,
( errmsg_internal ( " commit timestamp Xid oldest/newest: %u/%u " ,
checkPoint . oldestCommitTsXid ,
checkPoint . newestCommitTsXid ) ) ) ;
if ( ! TransactionIdIsNormal ( XidFromFullTransactionId ( checkPoint . nextXid ) ) )
ereport ( PANIC ,
( errmsg ( " invalid next transaction ID " ) ) ) ;
/* sanity check */
if ( checkPoint . redo > CheckPointLoc )
ereport ( PANIC ,
( errmsg ( " invalid redo in checkpoint record " ) ) ) ;
/*
* Check whether we need to force recovery from WAL . If it appears to
* have been a clean shutdown and we did not have a recovery signal file ,
* then assume no recovery needed .
*/
if ( checkPoint . redo < CheckPointLoc )
{
if ( wasShutdown )
ereport ( PANIC ,
( errmsg ( " invalid redo record in shutdown checkpoint " ) ) ) ;
InRecovery = true ;
}
else if ( ControlFile - > state ! = DB_SHUTDOWNED )
InRecovery = true ;
else if ( ArchiveRecoveryRequested )
{
/* force recovery due to presence of recovery signal file */
InRecovery = true ;
}
/*
2022-02-16 22:15:08 +01:00
* If recovery is needed , update our in - memory copy of pg_control to show
* that we are recovering and to show the selected checkpoint as the place
* we are starting from . We also mark pg_control with any minimum recovery
* stop point obtained from a backup history file .
*
* We don ' t write the changes to disk yet , though . Only do that after
* initializing various subsystems .
2022-02-16 08:30:38 +01:00
*/
2022-02-16 22:15:08 +01:00
if ( InRecovery )
2022-02-16 08:30:38 +01:00
{
2022-02-16 22:15:08 +01:00
if ( InArchiveRecovery )
{
ControlFile - > state = DB_IN_ARCHIVE_RECOVERY ;
}
else
{
2022-02-16 08:30:38 +01:00
ereport ( LOG ,
2022-02-16 22:15:08 +01:00
( errmsg ( " database system was not properly shut down; "
" automatic recovery in progress " ) ) ) ;
if ( recoveryTargetTLI > ControlFile - > checkPointCopy . ThisTimeLineID )
ereport ( LOG ,
( errmsg ( " crash recovery starts in timeline %u "
" and has target timeline %u " ,
ControlFile - > checkPointCopy . ThisTimeLineID ,
recoveryTargetTLI ) ) ) ;
ControlFile - > state = DB_IN_CRASH_RECOVERY ;
}
ControlFile - > checkPoint = CheckPointLoc ;
ControlFile - > checkPointCopy = checkPoint ;
if ( InArchiveRecovery )
2022-02-16 08:30:38 +01:00
{
2022-02-16 22:15:08 +01:00
/* initialize minRecoveryPoint if not set yet */
if ( ControlFile - > minRecoveryPoint < checkPoint . redo )
{
ControlFile - > minRecoveryPoint = checkPoint . redo ;
ControlFile - > minRecoveryPointTLI = checkPoint . ThisTimeLineID ;
}
2022-02-16 08:30:38 +01:00
}
2022-02-16 22:15:08 +01:00
/*
* Set backupStartPoint if we ' re starting recovery from a base backup .
*
* Also set backupEndPoint and use minRecoveryPoint as the backup end
* location if we ' re starting recovery from a base backup which was
* taken from a standby . In this case , the database system status in
* pg_control must indicate that the database was already in recovery .
* Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
* DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
* before reaching this point ; e . g . because restore_command or
* primary_conninfo were faulty .
*
* Any other state indicates that the backup somehow became corrupted
* and we can ' t sensibly continue with recovery .
*/
if ( haveBackupLabel )
2022-02-16 08:30:38 +01:00
{
2022-02-16 22:15:08 +01:00
ControlFile - > backupStartPoint = checkPoint . redo ;
ControlFile - > backupEndRequired = backupEndRequired ;
if ( backupFromStandby )
{
if ( dbstate_at_startup ! = DB_IN_ARCHIVE_RECOVERY & &
dbstate_at_startup ! = DB_SHUTDOWNED_IN_RECOVERY )
ereport ( FATAL ,
( errmsg ( " backup_label contains data inconsistent with control file " ) ,
errhint ( " This means that the backup is corrupted and you will "
" have to use another backup for recovery. " ) ) ) ;
ControlFile - > backupEndPoint = ControlFile - > minRecoveryPoint ;
}
2022-02-16 08:30:38 +01:00
}
}
/* remember these, so that we know when we have reached consistency */
backupStartPoint = ControlFile - > backupStartPoint ;
backupEndRequired = ControlFile - > backupEndRequired ;
backupEndPoint = ControlFile - > backupEndPoint ;
if ( InArchiveRecovery )
{
minRecoveryPoint = ControlFile - > minRecoveryPoint ;
minRecoveryPointTLI = ControlFile - > minRecoveryPointTLI ;
}
else
{
minRecoveryPoint = InvalidXLogRecPtr ;
minRecoveryPointTLI = 0 ;
}
/*
* Start recovery assuming that the final record isn ' t lost .
*/
abortedRecPtr = InvalidXLogRecPtr ;
missingContrecPtr = InvalidXLogRecPtr ;
* wasShutdown_ptr = wasShutdown ;
* haveBackupLabel_ptr = haveBackupLabel ;
* haveTblspcMap_ptr = haveTblspcMap ;
}
/*
 * Best-effort fsync of one recovery signal file.
 *
 * Signal files represent server state information, so we try to flush them
 * to disk.  We don't sweat too much about failure, however: if the file
 * cannot be opened nothing happens, and the result of the fsync itself is
 * deliberately discarded.
 */
static void
fsyncSignalFileIfPossible(const char *signalFile)
{
	int			sigfd;

	sigfd = BasicOpenFilePerm(signalFile, O_RDWR | PG_BINARY,
							  S_IRUSR | S_IWUSR);
	if (sigfd < 0)
		return;

	(void) pg_fsync(sigfd);
	close(sigfd);
}

/*
 * Look for recovery signal files and, if any is present, set up the state
 * that requests archive recovery (and possibly standby mode).
 *
 * An old-style recovery command file (recovery.conf) is rejected with a
 * FATAL error, since as of PG12 it is no longer recognized.
 *
 * Nothing at all is done in bootstrap processing mode.
 */
static void
readRecoverySignalFile(void)
{
	struct stat statbuf;

	if (IsBootstrapProcessingMode())
		return;

	/* Refuse to start if the old recovery API file is still lying around. */
	if (stat(RECOVERY_COMMAND_FILE, &statbuf) == 0)
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg(" using recovery command file \" %s \" is not supported ",
						RECOVERY_COMMAND_FILE)));

	/* Drop an unused .done file if there is one; its absence is fine. */
	unlink(RECOVERY_COMMAND_DONE);

	/*
	 * Probe for the signal files.  The standby signal file takes precedence:
	 * the recovery signal file is only looked at when the standby one is
	 * absent.  If neither exists we won't enter archive recovery.
	 */
	if (stat(STANDBY_SIGNAL_FILE, &statbuf) == 0)
	{
		fsyncSignalFileIfPossible(STANDBY_SIGNAL_FILE);
		standby_signal_file_found = true;
	}
	else if (stat(RECOVERY_SIGNAL_FILE, &statbuf) == 0)
	{
		fsyncSignalFileIfPossible(RECOVERY_SIGNAL_FILE);
		recovery_signal_file_found = true;
	}

	/*
	 * Standby mode is requested only by the standby signal file; archive
	 * recovery is requested by either file.
	 */
	StandbyModeRequested = standby_signal_file_found;
	ArchiveRecoveryRequested = (standby_signal_file_found ||
								recovery_signal_file_found);

	if (!ArchiveRecoveryRequested)
		return;

	/*
	 * Standby mode needs other processes, such as the WAL receiver, to be
	 * alive, so it cannot be supported in a standalone backend.
	 */
	if (StandbyModeRequested && !IsUnderPostmaster)
		ereport(FATAL,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg(" standby mode is not supported by single-user servers ")));
}
static void
validateRecoveryParameters ( void )
{
if ( ! ArchiveRecoveryRequested )
return ;
/*
* Check for compulsory parameters
*/
if ( StandbyModeRequested )
{
if ( ( PrimaryConnInfo = = NULL | | strcmp ( PrimaryConnInfo , " " ) = = 0 ) & &
( recoveryRestoreCommand = = NULL | | strcmp ( recoveryRestoreCommand , " " ) = = 0 ) )
ereport ( WARNING ,
( errmsg ( " specified neither primary_conninfo nor restore_command " ) ,
errhint ( " The database server will regularly poll the pg_wal subdirectory to check for files placed there. " ) ) ) ;
}
else
{
if ( recoveryRestoreCommand = = NULL | |
strcmp ( recoveryRestoreCommand , " " ) = = 0 )
ereport ( FATAL ,
( errcode ( ERRCODE_INVALID_PARAMETER_VALUE ) ,
errmsg ( " must specify restore_command when standby mode is not enabled " ) ) ) ;
}
/*
* Override any inconsistent requests . Note that this is a change of
* behaviour in 9.5 ; prior to this we simply ignored a request to pause if
* hot_standby = off , which was surprising behaviour .
*/
if ( recoveryTargetAction = = RECOVERY_TARGET_ACTION_PAUSE & &
! EnableHotStandby )
recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN ;
/*
* Final parsing of recovery_target_time string ; see also
* check_recovery_target_time ( ) .
*/
if ( recoveryTarget = = RECOVERY_TARGET_TIME )
{
recoveryTargetTime = DatumGetTimestampTz ( DirectFunctionCall3 ( timestamptz_in ,
CStringGetDatum ( recovery_target_time_string ) ,
ObjectIdGetDatum ( InvalidOid ) ,
Int32GetDatum ( - 1 ) ) ) ;
}
/*
* If user specified recovery_target_timeline , validate it or compute the
* " latest " value . We can ' t do this until after we ' ve gotten the restore
* command and set InArchiveRecovery , because we need to fetch timeline
* history files from the archive .
*/
if ( recoveryTargetTimeLineGoal = = RECOVERY_TARGET_TIMELINE_NUMERIC )
{
TimeLineID rtli = recoveryTargetTLIRequested ;
/* Timeline 1 does not have a history file, all else should */
if ( rtli ! = 1 & & ! existsTimeLineHistory ( rtli ) )
ereport ( FATAL ,
( errcode ( ERRCODE_INVALID_PARAMETER_VALUE ) ,
errmsg ( " recovery target timeline %u does not exist " ,
rtli ) ) ) ;
recoveryTargetTLI = rtli ;
}
else if ( recoveryTargetTimeLineGoal = = RECOVERY_TARGET_TIMELINE_LATEST )
{
/* We start the "latest" search from pg_control's timeline */
recoveryTargetTLI = findNewestTimeLine ( recoveryTargetTLI ) ;
}
else
{
/*
* else we just use the recoveryTargetTLI as already read from
* ControlFile
*/
Assert ( recoveryTargetTimeLineGoal = = RECOVERY_TARGET_TIMELINE_CONTROLFILE ) ;
}
}
/*
 * read_backup_label: check to see if a backup_label file is present
 *
 * If we see a backup_label during recovery, we assume that we are recovering
 * from a backup dump file, and we therefore roll forward from the checkpoint
 * identified by the label file, NOT what pg_control says.  This avoids the
 * problem that pg_control might have been archived one or more checkpoints
 * later than the start of the dump, and so if we rely on it as the start
 * point, we will fail to restore a consistent database state.
 *
 * Returns true if a backup_label was found (and fills the checkpoint
 * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
 * returns false if not.  If this backup_label came from a streamed backup,
 * *backupEndRequired is set to true.  If this backup_label was created during
 * recovery, *backupFromStandby is set to true.  All four output parameters
 * are reset to InvalidXLogRecPtr / 0 / false before the file is looked at,
 * so they are well-defined when false is returned.
 *
 * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
 * and TLI read from the backup file.
 *
 * Errors are reported at FATAL level: a failure to open the file for any
 * reason other than ENOENT, malformed START WAL LOCATION or CHECKPOINT
 * LOCATION lines, a START TIMELINE value that disagrees with the TLI taken
 * from the WAL file name, or a read/close failure at the end.
 */
static bool
read_backup_label ( XLogRecPtr * checkPointLoc , TimeLineID * backupLabelTLI ,
bool * backupEndRequired , bool * backupFromStandby )
{
/* receives what follows the first 8 hex digits of the WAL file name */
char startxlogfilename [ MAXFNAMELEN ] ;
/* TLI from the WAL file name, and TLI from the START TIMELINE line */
TimeLineID tli_from_walseg ,
tli_from_file ;
FILE * lfp ;
/* character read right after a mandatory line, to check its terminator */
char ch ;
/* the 20-byte buffers below are filled with a field width of 19 */
char backuptype [ 20 ] ;
char backupfrom [ 20 ] ;
/* filled with a field width of 1023; assumes MAXPGPATH >= 1024 - TODO confirm */
char backuplabel [ MAXPGPATH ] ;
/* filled with a field width of 127 */
char backuptime [ 128 ] ;
/* high and low 32-bit halves of an LSN as written in the file */
uint32 hi ,
lo ;
/* suppress possible uninitialized-variable warnings */
* checkPointLoc = InvalidXLogRecPtr ;
* backupLabelTLI = 0 ;
* backupEndRequired = false ;
* backupFromStandby = false ;
/*
 * See if label file is present.  A missing file (ENOENT) is the normal
 * "no backup label" case; any other open failure is FATAL.
 */
lfp = AllocateFile ( BACKUP_LABEL_FILE , " r " ) ;
if ( ! lfp )
{
if ( errno ! = ENOENT )
ereport ( FATAL ,
( errcode_for_file_access ( ) ,
errmsg ( " could not read file \" %s \" : %m " ,
BACKUP_LABEL_FILE ) ) ) ;
return false ; /* it's not there, all is fine */
}
/*
 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
 * is pretty crude, but we are not expecting any variability in the file
 * format).
 *
 * Both lines are mandatory: every conversion must succeed and the
 * character read after the line must be the expected terminator,
 * otherwise we give up with FATAL.
 */
if ( fscanf ( lfp , " START WAL LOCATION: %X/%X (file %08X%16s)%c " ,
& hi , & lo , & tli_from_walseg , startxlogfilename , & ch ) ! = 5 | | ch ! = ' \n ' )
ereport ( FATAL ,
( errcode ( ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ) ,
errmsg ( " invalid data in file \" %s \" " , BACKUP_LABEL_FILE ) ) ) ;
/* The start location and the TLI from the file name become the redo start */
RedoStartLSN = ( ( uint64 ) hi ) < < 32 | lo ;
RedoStartTLI = tli_from_walseg ;
if ( fscanf ( lfp , " CHECKPOINT LOCATION: %X/%X%c " ,
& hi , & lo , & ch ) ! = 3 | | ch ! = ' \n ' )
ereport ( FATAL ,
( errcode ( ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ) ,
errmsg ( " invalid data in file \" %s \" " , BACKUP_LABEL_FILE ) ) ) ;
/* Hand the checkpoint location and the file-name TLI back to the caller */
* checkPointLoc = ( ( uint64 ) hi ) < < 32 | lo ;
* backupLabelTLI = tli_from_walseg ;
/*
 * BACKUP METHOD lets us know if this was a typical backup ("streamed",
 * which could mean either pg_basebackup or the pg_backup_start/stop
 * method was used) or if this label came from somewhere else (the only
 * other option today being from pg_rewind).  If this was a streamed
 * backup then we know that we need to play through until we get to the
 * end of the WAL which was generated during the backup (at which point
 * we will have reached consistency and backupEndRequired will be reset
 * to be false).
 *
 * The line is optional: if it does not parse, *backupEndRequired simply
 * stays false.
 */
if ( fscanf ( lfp , " BACKUP METHOD: %19s \n " , backuptype ) = = 1 )
{
if ( strcmp ( backuptype , " streamed " ) = = 0 )
* backupEndRequired = true ;
}
2022-04-06 20:41:03 +02:00
/*
 * BACKUP FROM lets us know if this was from a primary or a standby.  If
 * it was from a standby, we'll double-check that the control file state
 * matches that of a standby.  (That check is not done here; this function
 * only reports the fact through *backupFromStandby.)
 *
 * This line is optional as well.
 */
2022-02-16 08:30:38 +01:00
if ( fscanf ( lfp , " BACKUP FROM: %19s \n " , backupfrom ) = = 1 )
{
if ( strcmp ( backupfrom , " standby " ) = = 0 )
* backupFromStandby = true ;
}
/*
 * Parse START TIME and LABEL.  Those are not mandatory fields for recovery
 * but checking for their presence is useful for debugging and the next
 * sanity checks.  Cope also with the fact that the result buffers have a
 * pre-allocated size, hence if the backup_label file has been generated
 * with strings longer than the maximum assumed here an incorrect parsing
 * happens.  That's fine as only minor consistency checks are done
 * afterwards.
 *
 * The values are only logged at DEBUG1; nothing else uses them.
 */
if ( fscanf ( lfp , " START TIME: %127[^ \n ] \n " , backuptime ) = = 1 )
ereport ( DEBUG1 ,
( errmsg_internal ( " backup time %s in file \" %s \" " ,
backuptime , BACKUP_LABEL_FILE ) ) ) ;
if ( fscanf ( lfp , " LABEL: %1023[^ \n ] \n " , backuplabel ) = = 1 )
ereport ( DEBUG1 ,
( errmsg_internal ( " backup label %s in file \" %s \" " ,
backuplabel , BACKUP_LABEL_FILE ) ) ) ;
/*
 * START TIMELINE is new as of 11.  Its parsing is not mandatory, still use
 * it as a sanity check if present: it must agree with the TLI taken from
 * the WAL file name in the START WAL LOCATION line.
 */
if ( fscanf ( lfp , " START TIMELINE: %u \n " , & tli_from_file ) = = 1 )
{
if ( tli_from_walseg ! = tli_from_file )
ereport ( FATAL ,
( errcode ( ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ) ,
errmsg ( " invalid data in file \" %s \" " , BACKUP_LABEL_FILE ) ,
errdetail ( " Timeline ID parsed is %u, but expected %u. " ,
tli_from_file , tli_from_walseg ) ) ) ;
ereport ( DEBUG1 ,
( errmsg_internal ( " backup timeline %u in file \" %s \" " ,
tli_from_file , BACKUP_LABEL_FILE ) ) ) ;
}
/* A pending stream error, or a failure to release the file, is FATAL */
if ( ferror ( lfp ) | | FreeFile ( lfp ) )
ereport ( FATAL ,
( errcode_for_file_access ( ) ,
errmsg ( " could not read file \" %s \" : %m " ,
BACKUP_LABEL_FILE ) ) ) ;
return true ;
}
/*
 * read_tablespace_map: check to see if a tablespace_map file is present
 *
 * If we see a tablespace_map file during recovery, we assume that we are
 * recovering from a backup dump file, and we therefore need to create symlinks
 * as per the information present in tablespace_map file.
 *
 * Returns true if a tablespace_map file was found (and appends to *tablespaces
 * a tablespaceinfo struct for each tablespace listed in the file);
 * returns false if not.
 *
 * Errors are reported at FATAL level: a failure to open the file for any
 * reason other than ENOENT, a line that does not have the expected shape,
 * an unterminated last line, or a read/close failure at the end.
 */
static bool
read_tablespace_map ( List * * tablespaces )
{
tablespaceinfo * ti ;
FILE * lfp ;
/* the de-escaped text of the line currently being collected */
char str [ MAXPGPATH ] ;
/* ch: current input character; i: bytes stored in str; n: scan index */
int ch ,
i ,
n ;
/* true when the previous character was an escaping backslash */
bool was_backslash ;
/*
 * See if tablespace_map file is present.  A missing file (ENOENT) is not
 * an error; any other open failure is FATAL.
 */
lfp = AllocateFile ( TABLESPACE_MAP , " r " ) ;
if ( ! lfp )
{
if ( errno ! = ENOENT )
ereport ( FATAL ,
( errcode_for_file_access ( ) ,
errmsg ( " could not read file \" %s \" : %m " ,
TABLESPACE_MAP ) ) ) ;
return false ; /* it's not there, all is fine */
}
/*
 * Read and parse the link name and path lines from tablespace_map file
 * (this code is pretty crude, but we are not expecting any variability in
 * the file format).  De-escape any backslashes that were inserted.
 *
 * The file is consumed one character at a time.  An unescaped line
 * terminator completes the current line; an unescaped backslash is
 * dropped and causes the next character to be stored literally; every
 * other character is appended to str.
 */
i = 0 ;
was_backslash = false ;
while ( ( ch = fgetc ( lfp ) ) ! = EOF )
{
if ( ! was_backslash & & ( ch = = ' \n ' | | ch = = ' \r ' ) )
{
if ( i = = 0 )
continue ; /* nothing collected: e.g. \r immediately followed by \n */
/*
 * The de-escaped line should contain an OID followed by exactly
 * one space followed by a path.  The path might start with
 * spaces, so don't be too liberal about parsing.
 */
str [ i ] = ' \0 ' ;
/* advance n to the first separator, or to the end of the line */
n = 0 ;
while ( str [ n ] & & str [ n ] ! = ' ' )
n + + ;
/* reject an empty OID part, a missing separator, or an empty path */
if ( n < 1 | | n > = i - 1 )
ereport ( FATAL ,
( errcode ( ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ) ,
errmsg ( " invalid data in file \" %s \" " , TABLESPACE_MAP ) ) ) ;
/* cut the line in two at the separator; n now indexes the path */
str [ n + + ] = ' \0 ' ;
ti = palloc0 ( sizeof ( tablespaceinfo ) ) ;
ti - > oid = pstrdup ( str ) ;
ti - > path = pstrdup ( str + n ) ;
* tablespaces = lappend ( * tablespaces , ti ) ;
/* start collecting the next line */
i = 0 ;
continue ;
}
else if ( ! was_backslash & & ch = = ' \\ ' )
was_backslash = true ;
else
{
/* characters that do not fit into str are silently dropped */
if ( i < sizeof ( str ) - 1 )
str [ i + + ] = ch ;
was_backslash = false ;
}
}
if ( i ! = 0 | | was_backslash ) /* last line not terminated? */
ereport ( FATAL ,
( errcode ( ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ) ,
errmsg ( " invalid data in file \" %s \" " , TABLESPACE_MAP ) ) ) ;
/* A pending stream error, or a failure to release the file, is FATAL */
if ( ferror ( lfp ) | | FreeFile ( lfp ) )
ereport ( FATAL ,
( errcode_for_file_access ( ) ,
errmsg ( " could not read file \" %s \" : %m " ,
TABLESPACE_MAP ) ) ) ;
return true ;
}
/*
* Finish WAL recovery .
*
* This does not close the ' xlogreader ' yet , because in some cases the caller
* still wants to re - read the last checkpoint record by calling
* ReadCheckPointRecord ( ) .
*
* Returns the position of the last valid or applied record , after which new
* WAL should be appended , information about why recovery was ended , and some
* other things . See the WalRecoveryResult struct for details .
*/
EndOfWalRecoveryInfo *
FinishWalRecovery ( void )
{
EndOfWalRecoveryInfo * result = palloc ( sizeof ( EndOfWalRecoveryInfo ) ) ;
XLogRecPtr lastRec ;
TimeLineID lastRecTLI ;
XLogRecPtr endOfLog ;
/*
* Kill WAL receiver , if it ' s still running , before we continue to write
* the startup checkpoint and aborted - contrecord records . It will trump
* over these records and subsequent ones if it ' s still alive when we
* start writing WAL .
*/
XLogShutdownWalRcv ( ) ;
/*
* We are now done reading the xlog from stream . Turn off streaming
* recovery to force fetching the files ( which would be required at end of
* recovery , e . g . , timeline history file ) from archive or pg_wal .
*
* Note that standby mode must be turned off after killing WAL receiver ,
* i . e . , calling XLogShutdownWalRcv ( ) .
*/
Assert ( ! WalRcvStreaming ( ) ) ;
StandbyMode = false ;
/*
* Determine where to start writing WAL next .
*
* Re - fetch the last valid or last applied record , so we can identify the
* exact endpoint of what we consider the valid portion of WAL . There may
* be an incomplete continuation record after that , in which case
* ' abortedRecPtr ' and ' missingContrecPtr ' are set and the caller will
* write a special OVERWRITE_CONTRECORD message to mark that the rest of
* it is intentionally missing . See CreateOverwriteContrecordRecord ( ) .
*
* An important side - effect of this is to load the last page into
* xlogreader . The caller uses it to initialize the WAL for writing .
*/
if ( ! InRecovery )
{
lastRec = CheckPointLoc ;
lastRecTLI = CheckPointTLI ;
}
else
{
lastRec = XLogRecoveryCtl - > lastReplayedReadRecPtr ;
lastRecTLI = XLogRecoveryCtl - > lastReplayedTLI ;
}
2022-04-07 09:28:40 +02:00
XLogPrefetcherBeginRead ( xlogprefetcher , lastRec ) ;
( void ) ReadRecord ( xlogprefetcher , PANIC , false , lastRecTLI ) ;
2022-02-16 08:30:38 +01:00
endOfLog = xlogreader - > EndRecPtr ;
/*
* Remember the TLI in the filename of the XLOG segment containing the
* end - of - log . It could be different from the timeline that endOfLog
* nominally belongs to , if there was a timeline switch in that segment ,
* and we were reading the old WAL from a segment belonging to a higher
* timeline .
*/
result - > endOfLogTLI = xlogreader - > seg . ws_tli ;
if ( ArchiveRecoveryRequested )
{
/*
* We are no longer in archive recovery state .
*
* We are now done reading the old WAL . Turn off archive fetching if
* it was active .
*/
Assert ( InArchiveRecovery ) ;
InArchiveRecovery = false ;
/*
* If the ending log segment is still open , close it ( to avoid
* problems on Windows with trying to rename or delete an open file ) .
*/
if ( readFile > = 0 )
{
close ( readFile ) ;
readFile = - 1 ;
}
}
/*
* Copy the last partial block to the caller , for initializing the WAL
* buffer for appending new WAL .
*/
if ( endOfLog % XLOG_BLCKSZ ! = 0 )
{
char * page ;
int len ;
XLogRecPtr pageBeginPtr ;
pageBeginPtr = endOfLog - ( endOfLog % XLOG_BLCKSZ ) ;
Assert ( readOff = = XLogSegmentOffset ( pageBeginPtr , wal_segment_size ) ) ;
/* Copy the valid part of the last block */
len = endOfLog % XLOG_BLCKSZ ;
page = palloc ( len ) ;
memcpy ( page , xlogreader - > readBuf , len ) ;
result - > lastPageBeginPtr = pageBeginPtr ;
result - > lastPage = page ;
}
else
{
/* There is no partial block to copy. */
result - > lastPageBeginPtr = endOfLog ;
result - > lastPage = NULL ;
}
/*
* Create a comment for the history file to explain why and where timeline
* changed .
*/
result - > recoveryStopReason = getRecoveryStopReason ( ) ;
result - > lastRec = lastRec ;
result - > lastRecTLI = lastRecTLI ;
result - > endOfLog = endOfLog ;
result - > abortedRecPtr = abortedRecPtr ;
result - > missingContrecPtr = missingContrecPtr ;
result - > standby_signal_file_found = standby_signal_file_found ;
result - > recovery_signal_file_found = recovery_signal_file_found ;
return result ;
}
/*
* Clean up the WAL reader and leftovers from restoring WAL from archive
*/
void
ShutdownWalRecovery ( void )
{
char recoveryPath [ MAXPGPATH ] ;
2022-04-07 09:28:40 +02:00
/* Final update of pg_stat_recovery_prefetch. */
XLogPrefetcherComputeStats ( xlogprefetcher ) ;
2022-02-16 08:30:38 +01:00
/* Shut down xlogreader */
if ( readFile > = 0 )
{
close ( readFile ) ;
readFile = - 1 ;
}
XLogReaderFree ( xlogreader ) ;
2022-04-07 09:28:40 +02:00
XLogPrefetcherFree ( xlogprefetcher ) ;
2022-02-16 08:30:38 +01:00
if ( ArchiveRecoveryRequested )
{
/*
* Since there might be a partial WAL segment named RECOVERYXLOG , get
* rid of it .
*/
snprintf ( recoveryPath , MAXPGPATH , XLOGDIR " /RECOVERYXLOG " ) ;
unlink ( recoveryPath ) ; /* ignore any error */
/* Get rid of any remaining recovered timeline-history file, too */
snprintf ( recoveryPath , MAXPGPATH , XLOGDIR " /RECOVERYHISTORY " ) ;
unlink ( recoveryPath ) ; /* ignore any error */
}
/*
* We don ' t need the latch anymore . It ' s not strictly necessary to disown
* it , but let ' s do it for the sake of tidiness .
*/
if ( ArchiveRecoveryRequested )
DisownLatch ( & XLogRecoveryCtl - > recoveryWakeupLatch ) ;
}
/*
 * Perform WAL recovery.
 *
 * If the system was shut down cleanly, this is never called.
 *
 * The function seeds the shared replay-progress fields, reads the first
 * record to replay, and then runs the main redo loop, applying one record
 * per iteration through ApplyWalRecord() until either no further record can
 * be read or a recovery target is reached.  When a recovery target was
 * reached, the configured recoveryTargetAction is carried out afterwards.
 *
 * Reports FATAL if a recovery target was reached before consistency, or if
 * a recovery target was configured for archive recovery but replay ended
 * without reaching it.
 */
void
PerformWalRecovery ( void )
{
XLogRecord * record ;
bool reachedRecoveryTarget = false ;
/* timeline used for reading/replaying; ApplyWalRecord() may update it */
TimeLineID replayTLI ;
/*
 * Initialize shared variables for tracking progress of WAL replay, as if
 * we had just replayed the record before the REDO location (or the
 * checkpoint record itself, if it's a shutdown checkpoint).
 */
SpinLockAcquire ( & XLogRecoveryCtl - > info_lck ) ;
if ( RedoStartLSN < CheckPointLoc )
{
XLogRecoveryCtl - > lastReplayedReadRecPtr = InvalidXLogRecPtr ;
XLogRecoveryCtl - > lastReplayedEndRecPtr = RedoStartLSN ;
XLogRecoveryCtl - > lastReplayedTLI = RedoStartTLI ;
}
else
{
XLogRecoveryCtl - > lastReplayedReadRecPtr = xlogreader - > ReadRecPtr ;
XLogRecoveryCtl - > lastReplayedEndRecPtr = xlogreader - > EndRecPtr ;
XLogRecoveryCtl - > lastReplayedTLI = CheckPointTLI ;
}
/* The "replay end" position starts out equal to the "last replayed" one */
XLogRecoveryCtl - > replayEndRecPtr = XLogRecoveryCtl - > lastReplayedEndRecPtr ;
XLogRecoveryCtl - > replayEndTLI = XLogRecoveryCtl - > lastReplayedTLI ;
XLogRecoveryCtl - > recoveryLastXTime = 0 ;
XLogRecoveryCtl - > currentChunkStartTime = 0 ;
XLogRecoveryCtl - > recoveryPauseState = RECOVERY_NOT_PAUSED ;
SpinLockRelease ( & XLogRecoveryCtl - > info_lck ) ;
/* Also ensure XLogReceiptTime has a sane value */
XLogReceiptTime = GetCurrentTimestamp ( ) ;
/*
 * Let postmaster know we've started redo now, so that it can launch the
 * archiver if necessary.
 */
if ( IsUnderPostmaster )
SendPostmasterSignal ( PMSIGNAL_RECOVERY_STARTED ) ;
/*
 * Allow read-only connections immediately if we're consistent already.
 */
CheckRecoveryConsistency ( ) ;
/*
 * Find the first record that logically follows the checkpoint --- it
 * might physically precede it, though.
 */
if ( RedoStartLSN < CheckPointLoc )
{
/* back up to find the record; it is read with emode PANIC */
replayTLI = RedoStartTLI ;
2022-04-07 09:28:40 +02:00
XLogPrefetcherBeginRead ( xlogprefetcher , RedoStartLSN ) ;
record = ReadRecord ( xlogprefetcher , PANIC , false , replayTLI ) ;
2022-02-16 08:30:38 +01:00
}
else
{
/*
 * Just have to read next record after CheckPoint.  This read uses
 * emode LOG, and a NULL result is handled below as "redo is not
 * required".
 */
Assert ( xlogreader - > ReadRecPtr = = CheckPointLoc ) ;
replayTLI = CheckPointTLI ;
2022-04-07 09:28:40 +02:00
record = ReadRecord ( xlogprefetcher , LOG , false , replayTLI ) ;
2022-02-16 08:30:38 +01:00
}
if ( record ! = NULL )
{
TimestampTz xtime ;
/* resource usage snapshot, reported in the "redo done" message */
PGRUsage ru0 ;
pg_rusage_init ( & ru0 ) ;
/* InRedo stays true from here until after the "redo done" messages */
InRedo = true ;
2022-04-07 07:26:43 +02:00
RmgrStartup ( ) ;
2022-02-16 08:30:38 +01:00
ereport ( LOG ,
( errmsg ( " redo starts at %X/%X " ,
LSN_FORMAT_ARGS ( xlogreader - > ReadRecPtr ) ) ) ) ;
/* Prepare to report progress of the redo phase. */
if ( ! StandbyMode )
begin_startup_progress_phase ( ) ;
/*
 * main redo apply loop
 *
 * Each iteration handles the record already held in 'record' and then
 * reads the next one; the loop ends when no record is returned or
 * when a recovery target check asks to stop.
 */
do
{
/* Progress is only reported when not in standby mode */
if ( ! StandbyMode )
ereport_startup_progress ( " redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X " ,
LSN_FORMAT_ARGS ( xlogreader - > ReadRecPtr ) ) ;
# ifdef WAL_DEBUG
/* Debug builds only: log a description of the record about to be replayed */
if ( XLOG_DEBUG | |
2022-04-07 08:24:00 +02:00
( record - > xl_rmid = = RM_XACT_ID & & trace_recovery_messages < = DEBUG2 ) | |
( record - > xl_rmid ! = RM_XACT_ID & & trace_recovery_messages < = DEBUG3 ) )
2022-02-16 08:30:38 +01:00
{
StringInfoData buf ;
initStringInfo ( & buf ) ;
appendStringInfo ( & buf , " REDO @ %X/%X; LSN %X/%X: " ,
LSN_FORMAT_ARGS ( xlogreader - > ReadRecPtr ) ,
LSN_FORMAT_ARGS ( xlogreader - > EndRecPtr ) ) ;
xlog_outrec ( & buf , xlogreader ) ;
appendStringInfoString ( & buf , " - " ) ;
xlog_outdesc ( & buf , xlogreader ) ;
elog ( LOG , " %s " , buf . data ) ;
pfree ( buf . data ) ;
}
# endif
/* Handle interrupt signals of startup process */
HandleStartupProcInterrupts ( ) ;
/*
 * Pause WAL replay, if requested by a hot-standby session via
 * SetRecoveryPause().
 *
 * Note that we intentionally don't take the info_lck spinlock
 * here.  We might therefore read a slightly stale value of the
 * recoveryPause flag, but it can't be very stale (no worse than
 * the last spinlock we did acquire).  Since a pause request is a
 * pretty asynchronous thing anyway, possibly responding to it one
 * WAL record later than we otherwise would is a minor issue, so
 * it doesn't seem worth adding another spinlock cycle to prevent
 * that.
 */
if ( ( ( volatile XLogRecoveryCtlData * ) XLogRecoveryCtl ) - > recoveryPauseState ! =
RECOVERY_NOT_PAUSED )
recoveryPausesHere ( false ) ;
/*
 * Have we reached our recovery target?  If so, stop before applying
 * this record.
 */
if ( recoveryStopsBefore ( xlogreader ) )
{
reachedRecoveryTarget = true ;
break ;
}
/*
 * If we've been asked to lag the primary, wait on latch until
 * enough time has passed.
 */
if ( recoveryApplyDelay ( xlogreader ) )
{
/*
 * We test for paused recovery again here.  If user sets
 * delayed apply, it may be because they expect to pause
 * recovery in case of problems, so we must test again here
 * otherwise pausing during the delay-wait wouldn't work.
 */
if ( ( ( volatile XLogRecoveryCtlData * ) XLogRecoveryCtl ) - > recoveryPauseState ! =
RECOVERY_NOT_PAUSED )
recoveryPausesHere ( false ) ;
}
/*
 * Apply the record.  replayTLI is passed by reference so that a
 * timeline switch carried by the record is seen by later reads.
 */
ApplyWalRecord ( xlogreader , record , & replayTLI ) ;
/* Exit loop if we reached inclusive recovery target */
if ( recoveryStopsAfter ( xlogreader ) )
{
reachedRecoveryTarget = true ;
break ;
}
/* Else, try to fetch the next WAL record */
2022-04-07 09:28:40 +02:00
record = ReadRecord ( xlogprefetcher , LOG , false , replayTLI ) ;
2022-02-16 08:30:38 +01:00
} while ( record ! = NULL ) ;
/*
 * end of main redo apply loop
 */
if ( reachedRecoveryTarget )
{
/* Stopping at a target before consistency was reached is FATAL */
if ( ! reachedConsistency )
ereport ( FATAL ,
( errmsg ( " requested recovery stop point is before consistent recovery point " ) ) ) ;
/*
 * This is the last point where we can restart recovery with a new
 * recovery target, if we shutdown and begin again.  After this,
 * Resource Managers may choose to do permanent corrective actions
 * at end of recovery.
 */
switch ( recoveryTargetAction )
{
case RECOVERY_TARGET_ACTION_SHUTDOWN :
/*
 * exit with special return code to request shutdown of
 * postmaster.  Log messages issued from postmaster.
 */
proc_exit ( 3 ) ;
case RECOVERY_TARGET_ACTION_PAUSE :
SetRecoveryPause ( true ) ;
recoveryPausesHere ( true ) ;
/* drop into promote: intentional fallthrough, no break here */
case RECOVERY_TARGET_ACTION_PROMOTE :
break ;
}
}
2022-04-07 07:26:43 +02:00
RmgrCleanup ( ) ;
2022-02-16 08:30:38 +01:00
ereport ( LOG ,
( errmsg ( " redo done at %X/%X system usage: %s " ,
LSN_FORMAT_ARGS ( xlogreader - > ReadRecPtr ) ,
pg_rusage_show ( & ru0 ) ) ) ) ;
/* Log the latest transaction time, if GetLatestXTime() returned one */
xtime = GetLatestXTime ( ) ;
if ( xtime )
ereport ( LOG ,
( errmsg ( " last completed transaction was at log time %s " ,
timestamptz_to_str ( xtime ) ) ) ) ;
InRedo = false ;
}
else
{
/* there are no WAL records following the checkpoint */
ereport ( LOG ,
( errmsg ( " redo is not required " ) ) ) ;
}
/*
 * This check is intentionally after the above log messages that indicate
 * how far recovery went.
 */
if ( ArchiveRecoveryRequested & &
recoveryTarget ! = RECOVERY_TARGET_UNSET & &
! reachedRecoveryTarget )
ereport ( FATAL ,
( errmsg ( " recovery ended before configured recovery target was reached " ) ) ) ;
}
/*
 * Subroutine of PerformWalRecovery, to apply one WAL record.
 *
 * 'xlogreader' holds the decoded record and 'record' points at its header.
 * '*replayTLI' is the timeline being replayed; it is updated in place when
 * the record switches to a new timeline.
 */
static void
ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
{
	ErrorContextCallback redoErrContext;
	bool		tliChanged = false;

	/* Setup error traceback support for ereport() */
	redoErrContext.callback = rm_redo_error_callback;
	redoErrContext.arg = (void *) xlogreader;
	redoErrContext.previous = error_context_stack;
	error_context_stack = &redoErrContext;

	/*
	 * ShmemVariableCache->nextXid must be beyond record's xid.
	 */
	AdvanceNextFullTransactionIdPastXid(record->xl_xid);

	/*
	 * Before replaying this record, check if this record causes the current
	 * timeline to change.  The record is already considered to be part of
	 * the new timeline, so we update replayTLI before replaying it.  That's
	 * important so that replayEndTLI, which is recorded as the minimum
	 * recovery point's TLI if recovery stops after this record, is set
	 * correctly.
	 */
	if (record->xl_rmid == RM_XLOG_ID)
	{
		uint8		info = record->xl_info & ~XLR_INFO_MASK;
		TimeLineID	nextTLI = *replayTLI;
		TimeLineID	priorTLI = *replayTLI;

		/* Only these two record types can carry a timeline switch. */
		switch (info)
		{
			case XLOG_CHECKPOINT_SHUTDOWN:
				{
					CheckPoint	shutdownCkpt;

					memcpy(&shutdownCkpt, XLogRecGetData(xlogreader),
						   sizeof(CheckPoint));
					nextTLI = shutdownCkpt.ThisTimeLineID;
					priorTLI = shutdownCkpt.PrevTimeLineID;
				}
				break;

			case XLOG_END_OF_RECOVERY:
				{
					xl_end_of_recovery eorRec;

					memcpy(&eorRec, XLogRecGetData(xlogreader),
						   sizeof(xl_end_of_recovery));
					nextTLI = eorRec.ThisTimeLineID;
					priorTLI = eorRec.PrevTimeLineID;
				}
				break;

			default:
				break;
		}

		if (nextTLI != *replayTLI)
		{
			/* Check that it's OK to switch to this TLI */
			checkTimeLineSwitch(xlogreader->EndRecPtr,
								nextTLI, priorTLI, *replayTLI);

			/* Following WAL records should be run with new TLI */
			*replayTLI = nextTLI;
			tliChanged = true;
		}
	}

	/*
	 * Update shared replayEndRecPtr before replaying this record, so that
	 * XLogFlush will update minRecoveryPoint correctly.
	 */
	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
	XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
	XLogRecoveryCtl->replayEndTLI = *replayTLI;
	SpinLockRelease(&XLogRecoveryCtl->info_lck);

	/*
	 * If we are attempting to enter Hot Standby mode, process XIDs we see
	 */
	if (TransactionIdIsValid(record->xl_xid) &&
		standbyState >= STANDBY_INITIALIZED)
		RecordKnownAssignedTransactionIds(record->xl_xid);

	/*
	 * Some XLOG record types that are related to recovery are processed
	 * directly here, rather than in xlog_redo()
	 */
	if (record->xl_rmid == RM_XLOG_ID)
		xlogrecovery_redo(xlogreader, *replayTLI);

	/* Now apply the WAL record itself */
	GetRmgr(record->xl_rmid).rm_redo(xlogreader);

	/*
	 * After redo, check whether the backup pages associated with the WAL
	 * record are consistent with the existing pages.  This check is done
	 * only if consistency check is enabled for this record.
	 */
	if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
		verifyBackupPageConsistency(xlogreader);

	/* Pop the error context stack */
	error_context_stack = redoErrContext.previous;

	/*
	 * Update lastReplayedEndRecPtr after this record has been successfully
	 * replayed.
	 */
	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
	XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
	XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
	XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
	SpinLockRelease(&XLogRecoveryCtl->info_lck);

	/*
	 * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
	 * receiver so that it notices the updated lastReplayedEndRecPtr and
	 * sends a reply to the primary.
	 */
	if (doRequestWalReceiverReply)
	{
		doRequestWalReceiverReply = false;
		WalRcvForceReply();
	}

	/* Allow read-only connections if we're consistent now */
	CheckRecoveryConsistency();

	/* The rest is only needed when this record was a timeline switch. */
	if (!tliChanged)
		return;

	/*
	 * Before we continue on the new timeline, clean up any (possibly bogus)
	 * future WAL segments on the old timeline.
	 */
	RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);

	/*
	 * Wake up any walsenders to notice that we are on a new timeline.
	 */
	if (AllowCascadeReplication())
		WalSndWakeup();

	/* Reset the prefetcher. */
	XLogPrefetchReconfigure();
}
/*
 * Some XLOG RM record types that are directly related to WAL recovery are
 * handled here rather than in xlog_redo().
 */
static void
xlogrecovery_redo ( XLogReaderState * record , TimeLineID replayTLI )
{
uint8 info = XLogRecGetInfo ( record ) & ~ XLR_INFO_MASK ;
XLogRecPtr lsn = record - > EndRecPtr ;
Assert ( XLogRecGetRmid ( record ) = = RM_XLOG_ID ) ;
if ( info = = XLOG_OVERWRITE_CONTRECORD )
{
/* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
xl_overwrite_contrecord xlrec ;
memcpy ( & xlrec , XLogRecGetData ( record ) , sizeof ( xl_overwrite_contrecord ) ) ;
if ( xlrec . overwritten_lsn ! = record - > overwrittenRecPtr )
elog ( FATAL , " mismatching overwritten LSN %X/%X -> %X/%X " ,
LSN_FORMAT_ARGS ( xlrec . overwritten_lsn ) ,
LSN_FORMAT_ARGS ( record - > overwrittenRecPtr ) ) ;
2022-03-23 18:22:10 +01:00
/* We have safely skipped the aborted record */
abortedRecPtr = InvalidXLogRecPtr ;
missingContrecPtr = InvalidXLogRecPtr ;
2022-02-16 08:30:38 +01:00
ereport ( LOG ,
( errmsg ( " successfully skipped missing contrecord at %X/%X, overwritten at %s " ,
LSN_FORMAT_ARGS ( xlrec . overwritten_lsn ) ,
timestamptz_to_str ( xlrec . overwrite_time ) ) ) ) ;
/* Verifying the record should only happen once */
record - > overwrittenRecPtr = InvalidXLogRecPtr ;
}
else if ( info = = XLOG_BACKUP_END )
{
XLogRecPtr startpoint ;
memcpy ( & startpoint , XLogRecGetData ( record ) , sizeof ( startpoint ) ) ;
if ( backupStartPoint = = startpoint )
{
/*
* We have reached the end of base backup , the point where
2022-04-06 20:41:03 +02:00
* pg_backup_stop ( ) was done . The data on disk is now consistent
2022-02-16 08:30:38 +01:00
* ( assuming we have also reached minRecoveryPoint ) . Set
* backupEndPoint to the current LSN , so that the next call to
* CheckRecoveryConsistency ( ) will notice it and do the
* end - of - backup processing .
*/
elog ( DEBUG1 , " end of backup record reached " ) ;
backupEndPoint = lsn ;
}
else
elog ( DEBUG1 , " saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X " ,
LSN_FORMAT_ARGS ( startpoint ) , LSN_FORMAT_ARGS ( backupStartPoint ) ) ;
}
}
/*
* Checks if recovery has reached a consistent state . When consistency is
* reached and we have a valid starting standby snapshot , tell postmaster
* that it can start accepting read - only connections .
*/
static void
CheckRecoveryConsistency ( void )
{
XLogRecPtr lastReplayedEndRecPtr ;
TimeLineID lastReplayedTLI ;
/*
* During crash recovery , we don ' t reach a consistent state until we ' ve
* replayed all the WAL .
*/
if ( XLogRecPtrIsInvalid ( minRecoveryPoint ) )
return ;
Assert ( InArchiveRecovery ) ;
/*
* assume that we are called in the startup process , and hence don ' t need
* a lock to read lastReplayedEndRecPtr
*/
lastReplayedEndRecPtr = XLogRecoveryCtl - > lastReplayedEndRecPtr ;
lastReplayedTLI = XLogRecoveryCtl - > lastReplayedTLI ;
/*
* Have we reached the point where our base backup was completed ?
*/
if ( ! XLogRecPtrIsInvalid ( backupEndPoint ) & &
backupEndPoint < = lastReplayedEndRecPtr )
{
elog ( DEBUG1 , " end of backup reached " ) ;
/*
* We have reached the end of base backup , as indicated by pg_control .
* Update the control file accordingly .
*/
ReachedEndOfBackup ( lastReplayedEndRecPtr , lastReplayedTLI ) ;
backupStartPoint = InvalidXLogRecPtr ;
backupEndPoint = InvalidXLogRecPtr ;
backupEndRequired = false ;
}
/*
* Have we passed our safe starting point ? Note that minRecoveryPoint is
2022-04-06 20:41:03 +02:00
* known to be incorrectly set if recovering from a backup , until
2022-02-16 08:30:38 +01:00
* the XLOG_BACKUP_END arrives to advise us of the correct
* minRecoveryPoint . All we know prior to that is that we ' re not
* consistent yet .
*/
if ( ! reachedConsistency & & ! backupEndRequired & &
minRecoveryPoint < = lastReplayedEndRecPtr )
{
/*
* Check to see if the XLOG sequence contained any unresolved
* references to uninitialized pages .
*/
XLogCheckInvalidPages ( ) ;
reachedConsistency = true ;
ereport ( LOG ,
( errmsg ( " consistent recovery state reached at %X/%X " ,
LSN_FORMAT_ARGS ( lastReplayedEndRecPtr ) ) ) ) ;
}
/*
* Have we got a valid starting snapshot that will allow queries to be
* run ? If so , we can tell postmaster that the database is consistent now ,
* enabling connections .
*/
if ( standbyState = = STANDBY_SNAPSHOT_READY & &
! LocalHotStandbyActive & &
reachedConsistency & &
IsUnderPostmaster )
{
SpinLockAcquire ( & XLogRecoveryCtl - > info_lck ) ;
XLogRecoveryCtl - > SharedHotStandbyActive = true ;
SpinLockRelease ( & XLogRecoveryCtl - > info_lck ) ;
LocalHotStandbyActive = true ;
SendPostmasterSignal ( PMSIGNAL_BEGIN_HOT_STANDBY ) ;
}
}
/*
* Error context callback for errors occurring during rm_redo ( ) .
*/
static void
rm_redo_error_callback ( void * arg )
{
XLogReaderState * record = ( XLogReaderState * ) arg ;
StringInfoData buf ;
initStringInfo ( & buf ) ;
xlog_outdesc ( & buf , record ) ;
xlog_block_info ( & buf , record ) ;
/* translator: %s is a WAL record description */
errcontext ( " WAL redo at %X/%X for %s " ,
LSN_FORMAT_ARGS ( record - > ReadRecPtr ) ,
buf . data ) ;
pfree ( buf . data ) ;
}
/*
 * Append to 'buf' a description of the WAL record held by 'record'.
 *
 * The output consists of the resource manager's name, a '/' separator, the
 * record's identity as reported by the resource manager's rm_identify
 * callback (or an "UNKNOWN" marker carrying the rmgr-specific info bits when
 * rm_identify returns NULL), and finally whatever the resource manager's
 * rm_desc callback appends.
 *
 * Nothing is returned; the text accumulates in 'buf'.
 */
void
xlog_outdesc(StringInfo buf, XLogReaderState *record)
{
	RmgrData	rmgr = GetRmgr(XLogRecGetRmid(record));
	uint8		info = XLogRecGetInfo(record);
	const char *id;

	appendStringInfoString(buf, rmgr.rm_name);

	/* The separator must be a single character, not a multi-char constant. */
	appendStringInfoChar(buf, '/');

	id = rmgr.rm_identify(info);
	if (id == NULL)
		appendStringInfo(buf, " UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
	else
		appendStringInfo(buf, " %s: ", id);

	rmgr.rm_desc(buf, record);
}
#ifdef WAL_DEBUG

/*
 * Append to 'buf' the previous-record pointer, transaction id and data
 * length of the WAL record held by 'record', followed by its block
 * references.  Only compiled when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	XLogRecPtr	prevRec = XLogRecGetPrev(record);

	appendStringInfo(buf, " prev %X/%X; xid %u ",
					 LSN_FORMAT_ARGS(prevRec),
					 XLogRecGetXid(record));

	appendStringInfo(buf, " ; len %u ", XLogRecGetDataLen(record));

	xlog_block_info(buf, record);
}

#endif							/* WAL_DEBUG */
/*
 * Append to 'buf' information about all the blocks referenced by the WAL
 * record held by 'record'.
 *
 * One entry is emitted per block reference that is present.  The fork number
 * is only spelled out for forks other than the main fork, and a marker is
 * appended for references that carry a block image.
 */
static void
xlog_block_info(StringInfo buf, XLogReaderState *record)
{
	const int	lastBlockId = XLogRecMaxBlockId(record);
	int			id;

	/* decode block references */
	for (id = 0; id <= lastBlockId; id++)
	{
		RelFileNode rel;
		ForkNumber	fork;
		BlockNumber blkno;

		/* Skip ids for which the record holds no reference. */
		if (XLogRecHasBlockRef(record, id))
		{
			XLogRecGetBlockTag(record, id, &rel, &fork, &blkno);

			if (fork == MAIN_FORKNUM)
				appendStringInfo(buf, " ; blkref #%d: rel %u/%u/%u, blk %u ",
								 id,
								 rel.spcNode, rel.dbNode, rel.relNode,
								 blkno);
			else
				appendStringInfo(buf, " ; blkref #%d: rel %u/%u/%u, fork %u, blk %u ",
								 id,
								 rel.spcNode, rel.dbNode, rel.relNode,
								 fork,
								 blkno);

			if (XLogRecHasBlockImage(record, id))
				appendStringInfoString(buf, " FPW ");
		}
	}
}
/*
 * Check that it's OK to switch to a new timeline during recovery.
 *
 * 'lsn' is the address of the shutdown checkpoint record we're about to
 * replay.  (Currently, timeline can only change at a shutdown checkpoint.)
 * 'newTLI' and 'prevTLI' are the timeline IDs carried by that record, and
 * 'replayTLI' is the timeline currently being replayed.
 *
 * Any violation is reported at PANIC level; on success the function simply
 * returns.
 */
static void
checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
					TimeLineID replayTLI)
{
	bool		movesBackwards;
	bool		minRecoveryPending;

	/* Check that the record agrees on what the current (old) timeline is */
	if (replayTLI != prevTLI)
		ereport(PANIC,
				(errmsg(" unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record ",
						prevTLI, replayTLI)));

	/*
	 * The new timeline better be in the list of timelines we expect to see,
	 * according to the timeline history.  It should also not decrease.
	 */
	movesBackwards = (newTLI < replayTLI);
	if (movesBackwards || !tliInHistory(newTLI, expectedTLEs))
		ereport(PANIC,
				(errmsg(" unexpected timeline ID %u (after %u) in checkpoint record ",
						newTLI, replayTLI)));

	/*
	 * If we have not yet reached min recovery point, and we're about to
	 * switch to a timeline greater than the timeline of the min recovery
	 * point: trouble.  After switching to the new timeline, we could not
	 * possibly visit the min recovery point on the correct timeline anymore.
	 * This can happen if there is a newer timeline in the archive that
	 * branched before the timeline the min recovery point is on, and you
	 * attempt to do PITR to the new timeline.
	 */
	minRecoveryPending = !XLogRecPtrIsInvalid(minRecoveryPoint) &&
		lsn < minRecoveryPoint;
	if (minRecoveryPending && newTLI > minRecoveryPointTLI)
		ereport(PANIC,
				(errmsg(" unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u ",
						newTLI,
						LSN_FORMAT_ARGS(minRecoveryPoint),
						minRecoveryPointTLI)));

	/* Looks good */
}
/*
 * Extract the timestamp from a WAL record.
 *
 * If the record contains a timestamp, returns true and stores the timestamp
 * in *recordXtime.  If the record type has no timestamp, returns false and
 * leaves *recordXtime untouched.  Currently, only transaction commit/abort
 * records (plain or prepared) and restore points contain timestamps.
 */
static bool
getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	uint8		rmid = XLogRecGetRmid(record);

	if (rmid == RM_XLOG_ID)
	{
		/* Among XLOG records, only restore points carry a timestamp. */
		if (info != XLOG_RESTORE_POINT)
			return false;

		*recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
		return true;
	}

	if (rmid == RM_XACT_ID)
	{
		/* Dispatch on the transaction operation encoded in the info bits. */
		switch (info & XLOG_XACT_OPMASK)
		{
			case XLOG_XACT_COMMIT:
			case XLOG_XACT_COMMIT_PREPARED:
				*recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
				return true;

			case XLOG_XACT_ABORT:
			case XLOG_XACT_ABORT_PREPARED:
				*recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
				return true;

			default:
				break;
		}
	}

	return false;
}
/*
 * Check whether the current buffer page and the backup page stored in the
 * WAL record are consistent.  Before comparing the two pages, masking can be
 * applied to ignore certain areas like hint bits and unused space between
 * pd_lower and pd_upper, among other things.  This function should be called
 * once WAL replay has been completed for a given record.
 *
 * An inconsistency is reported at FATAL level; failure to restore the block
 * image from the record is reported at ERROR level.
 */
static void
verifyBackupPageConsistency(XLogReaderState *record)
{
	RmgrData	rmgr = GetRmgr(XLogRecGetRmid(record));
	int			id;

	/* Records with no backup blocks have no need for consistency checks. */
	if (!XLogRecHasAnyBlockRefs(record))
		return;

	Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);

	for (id = 0; id <= XLogRecMaxBlockId(record); id++)
	{
		RelFileNode rel;
		ForkNumber	fork;
		BlockNumber blk;
		Buffer		buffer;
		Page		current;

		/*
		 * If the WAL record doesn't contain a block reference with this id,
		 * there is nothing to check.
		 */
		if (!XLogRecGetBlockTag(record, id, &rel, &fork, &blk))
			continue;

		Assert(XLogRecHasBlockImage(record, id));

		/*
		 * If the WAL record has already applied the page, bypass the
		 * consistency check as that would result in comparing the full page
		 * stored in the record with itself.
		 */
		if (XLogRecBlockImageApply(record, id))
			continue;

		/*
		 * Read the contents from the current buffer and store it in a
		 * temporary page.
		 */
		buffer = XLogReadBufferExtended(rel, fork, blk,
										RBM_NORMAL_NO_LOG,
										InvalidBuffer);
		if (!BufferIsValid(buffer))
			continue;

		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		current = BufferGetPage(buffer);

		/*
		 * Take a copy of the local page where WAL has been applied to have a
		 * comparison base before masking it...
		 */
		memcpy(replay_image_masked, current, BLCKSZ);

		/* No need for this page anymore now that a copy is in. */
		UnlockReleaseBuffer(buffer);

		/*
		 * If the block LSN is already ahead of this WAL record, we can't
		 * expect contents to match.  This can happen if recovery is
		 * restarted.
		 */
		if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
			continue;

		/*
		 * Read the contents from the backup copy, stored in WAL record and
		 * store it in a temporary page.  There is no need to allocate a new
		 * page here, a local buffer is fine to hold its contents and a mask
		 * can be directly applied on it.
		 */
		if (!RestoreBlockImage(record, id, primary_image_masked))
			elog(ERROR, " failed to restore block image ");

		/*
		 * If a masking function is defined, mask both the primary and
		 * replay images.
		 */
		if (rmgr.rm_mask != NULL)
		{
			rmgr.rm_mask(replay_image_masked, blk);
			rmgr.rm_mask(primary_image_masked, blk);
		}

		/* Time to compare the primary and replay images. */
		if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) == 0)
			continue;

		elog(FATAL,
			 " inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u ",
			 rel.spcNode, rel.dbNode, rel.relNode,
			 fork, blk);
	}
}
/*
 * For point-in-time recovery, this function decides whether we want to stop
 * applying the XLOG before the current record.
 *
 * Returns true if we are stopping, false otherwise.  If stopping, some
 * information is saved in recoveryStopXid et al for use in annotating the
 * new timeline's history file.
 */
static bool
recoveryStopsBefore(XLogReaderState *record)
{
	bool		stopsHere = false;
	uint8		xact_info;
	bool		isCommit;
	TimestampTz recordXtime = 0;
	TransactionId recordXid;

	/*
	 * Ignore recovery target settings when not in archive recovery (meaning
	 * we are in crash recovery).
	 */
	if (!ArchiveRecoveryRequested)
		return false;

	/* Check if we should stop as soon as reaching consistency */
	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
	{
		ereport(LOG,
				(errmsg(" recovery stopping after reaching consistency ")));

		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopTime = 0;
		/* Reset the stop name to the empty string (a single NUL byte). */
		recoveryStopName[0] = '\0';
		return true;
	}

	/* Check if target LSN has been reached */
	if (recoveryTarget == RECOVERY_TARGET_LSN &&
		!recoveryTargetInclusive &&
		record->ReadRecPtr >= recoveryTargetLSN)
	{
		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = record->ReadRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		ereport(LOG,
				(errmsg(" recovery stopping before WAL location (LSN) \" %X/%X \" ",
						LSN_FORMAT_ARGS(recoveryStopLSN))));
		return true;
	}

	/* Otherwise we only consider stopping before COMMIT or ABORT records. */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	/*
	 * Work out whether this is a commit or an abort, and which transaction
	 * it ends.  For the two-phase variants the XID comes from the parsed
	 * record (twophase_xid) rather than from the record header.
	 */
	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	if (xact_info == XLOG_XACT_COMMIT)
	{
		isCommit = true;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
	{
		xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
		xl_xact_parsed_commit parsed;

		isCommit = true;
		ParseCommitRecord(XLogRecGetInfo(record),
						  xlrec,
						  &parsed);
		recordXid = parsed.twophase_xid;
	}
	else if (xact_info == XLOG_XACT_ABORT)
	{
		isCommit = false;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_ABORT_PREPARED)
	{
		xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
		xl_xact_parsed_abort parsed;

		isCommit = false;
		ParseAbortRecord(XLogRecGetInfo(record),
						 xlrec,
						 &parsed);
		recordXid = parsed.twophase_xid;
	}
	else
		return false;

	if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
	{
		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete.  A higher numbered xid will complete before you
		 * about 50% of the time...
		 */
		stopsHere = (recordXid == recoveryTargetXid);
	}

	if (recoveryTarget == RECOVERY_TARGET_TIME &&
		getRecordTimestamp(record, &recordXtime))
	{
		/*
		 * There can be many transactions that share the same commit time, so
		 * we stop after the last one, if we are inclusive, or stop at the
		 * first one if we are exclusive
		 */
		if (recoveryTargetInclusive)
			stopsHere = (recordXtime > recoveryTargetTime);
		else
			stopsHere = (recordXtime >= recoveryTargetTime);
	}

	if (stopsHere)
	{
		recoveryStopAfter = false;
		recoveryStopXid = recordXid;
		recoveryStopTime = recordXtime;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopName[0] = '\0';

		if (isCommit)
		{
			ereport(LOG,
					(errmsg(" recovery stopping before commit of transaction %u, time %s ",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
		else
		{
			ereport(LOG,
					(errmsg(" recovery stopping before abort of transaction %u, time %s ",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
	}

	return stopsHere;
}
/*
 * Same as recoveryStopsBefore, but called after applying the record.
 *
 * We also track the timestamp of the latest applied COMMIT/ABORT record by
 * passing it to SetLatestXTime().
 *
 * Returns true if recovery should stop after this record, false otherwise.
 * When stopping, the recoveryStop* variables are filled in to describe the
 * stop point.
 */
static bool
recoveryStopsAfter(XLogReaderState *record)
{
	uint8		info;
	uint8		xact_info;
	uint8		rmid;

	/* Start from a defined value, as recoveryStopsBefore() does. */
	TimestampTz recordXtime = 0;

	/*
	 * Ignore recovery target settings when not in archive recovery (meaning
	 * we are in crash recovery).
	 */
	if (!ArchiveRecoveryRequested)
		return false;

	info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	rmid = XLogRecGetRmid(record);

	/*
	 * There can be many restore points that share the same name; we stop at
	 * the first one.
	 */
	if (recoveryTarget == RECOVERY_TARGET_NAME &&
		rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
	{
		xl_restore_point *recordRestorePointData;

		recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);

		if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
		{
			recoveryStopAfter = true;
			recoveryStopXid = InvalidTransactionId;
			recoveryStopLSN = InvalidXLogRecPtr;
			(void) getRecordTimestamp(record, &recoveryStopTime);
			strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);

			ereport(LOG,
					(errmsg(" recovery stopping at restore point \" %s \" , time %s ",
							recoveryStopName,
							timestamptz_to_str(recoveryStopTime))));
			return true;
		}
	}

	/* Check if the target LSN has been reached */
	if (recoveryTarget == RECOVERY_TARGET_LSN &&
		recoveryTargetInclusive &&
		record->ReadRecPtr >= recoveryTargetLSN)
	{
		recoveryStopAfter = true;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = record->ReadRecPtr;
		recoveryStopTime = 0;
		/* Reset the stop name to the empty string (a single NUL byte). */
		recoveryStopName[0] = '\0';
		ereport(LOG,
				(errmsg(" recovery stopping after WAL location (LSN) \" %X/%X \" ",
						LSN_FORMAT_ARGS(recoveryStopLSN))));
		return true;
	}

	if (rmid != RM_XACT_ID)
		return false;

	xact_info = info & XLOG_XACT_OPMASK;

	if (xact_info == XLOG_XACT_COMMIT ||
		xact_info == XLOG_XACT_COMMIT_PREPARED ||
		xact_info == XLOG_XACT_ABORT ||
		xact_info == XLOG_XACT_ABORT_PREPARED)
	{
		TransactionId recordXid;

		/* Update the last applied transaction timestamp */
		if (getRecordTimestamp(record, &recordXtime))
			SetLatestXTime(recordXtime);

		/* Extract the XID of the committed/aborted transaction */
		if (xact_info == XLOG_XACT_COMMIT_PREPARED)
		{
			xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
			xl_xact_parsed_commit parsed;

			ParseCommitRecord(XLogRecGetInfo(record),
							  xlrec,
							  &parsed);
			recordXid = parsed.twophase_xid;
		}
		else if (xact_info == XLOG_XACT_ABORT_PREPARED)
		{
			xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
			xl_xact_parsed_abort parsed;

			ParseAbortRecord(XLogRecGetInfo(record),
							 xlrec,
							 &parsed);
			recordXid = parsed.twophase_xid;
		}
		else
			recordXid = XLogRecGetXid(record);

		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete.  A higher numbered xid will complete before you
		 * about 50% of the time...
		 */
		if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
			recordXid == recoveryTargetXid)
		{
			recoveryStopAfter = true;
			recoveryStopXid = recordXid;
			recoveryStopTime = recordXtime;
			recoveryStopLSN = InvalidXLogRecPtr;
			recoveryStopName[0] = '\0';

			if (xact_info == XLOG_XACT_COMMIT ||
				xact_info == XLOG_XACT_COMMIT_PREPARED)
			{
				ereport(LOG,
						(errmsg(" recovery stopping after commit of transaction %u, time %s ",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			}
			else
			{
				/*
				 * The enclosing test admits only commit and abort records,
				 * so anything that is not a commit here is an abort.
				 */
				ereport(LOG,
						(errmsg(" recovery stopping after abort of transaction %u, time %s ",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			}
			return true;
		}
	}

	/* Check if we should stop as soon as reaching consistency */
	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
	{
		ereport(LOG,
				(errmsg(" recovery stopping after reaching consistency ")));

		recoveryStopAfter = true;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopTime = 0;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopName[0] = '\0';
		return true;
	}

	return false;
}
/*
 * Create a comment for the history file to explain why and where the
 * timeline changed.
 *
 * The text is chosen by the configured recovery target type and built from
 * the recoveryStop* variables.  The result is a pstrdup'd copy of a local
 * buffer.
 */
static char *
getRecoveryStopReason(void)
{
	char		text[200];
	const char *position = recoveryStopAfter ? " after " : " before ";

	switch (recoveryTarget)
	{
		case RECOVERY_TARGET_XID:
			snprintf(text, sizeof(text),
					 " %s transaction %u ",
					 position,
					 recoveryStopXid);
			break;

		case RECOVERY_TARGET_TIME:
			snprintf(text, sizeof(text),
					 " %s %s \n ",
					 position,
					 timestamptz_to_str(recoveryStopTime));
			break;

		case RECOVERY_TARGET_LSN:
			snprintf(text, sizeof(text),
					 " %s LSN %X/%X \n ",
					 position,
					 LSN_FORMAT_ARGS(recoveryStopLSN));
			break;

		case RECOVERY_TARGET_NAME:
			snprintf(text, sizeof(text),
					 " at restore point \" %s \" ",
					 recoveryStopName);
			break;

		case RECOVERY_TARGET_IMMEDIATE:
			snprintf(text, sizeof(text), " reached consistency ");
			break;

		default:
			snprintf(text, sizeof(text), " no recovery target specified ");
			break;
	}

	return pstrdup(text);
}
/*
 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
 *
 * endOfRecovery is true if the recovery target is reached and the paused
 * state starts at the end of recovery because of recovery_target_action =
 * pause, and false otherwise.  It only selects which message is logged.
 */
static void
recoveryPausesHere(bool endOfRecovery)
{
	/*
	 * Don't pause unless users can connect, and don't pause after standby
	 * promotion has been triggered.
	 */
	if (!LocalHotStandbyActive || LocalPromoteIsTriggered)
		return;

	if (!endOfRecovery)
		ereport(LOG,
				(errmsg(" recovery has paused "),
				 errhint(" Execute pg_wal_replay_resume() to continue. ")));
	else
		ereport(LOG,
				(errmsg(" pausing at the end of recovery "),
				 errhint(" Execute pg_wal_replay_resume() to promote. ")));

	/* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
	for (;;)
	{
		if (GetRecoveryPauseState() == RECOVERY_NOT_PAUSED)
			break;

		HandleStartupProcInterrupts();

		/* A standby trigger ends the pause immediately. */
		if (CheckForStandbyTrigger())
			return;

		/*
		 * If recovery pause is requested then set it paused.  While we are
		 * in the loop, user might resume and pause again so set this every
		 * time.
		 */
		ConfirmRecoveryPaused();

		/*
		 * We wait on a condition variable that will wake us as soon as the
		 * pause ends, but we use a timeout so we can check the above exit
		 * condition periodically too.
		 */
		ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV,
									1000,
									WAIT_EVENT_RECOVERY_PAUSE);
	}

	ConditionVariableCancelSleep();
}
/*
 * When recovery_min_apply_delay is set, we wait long enough to make sure
 * certain record types are applied at least that interval behind the
 * primary.
 *
 * Returns true if we waited.
 *
 * Note that the delay is calculated between the WAL record log time and the
 * current time on standby.  We would prefer to keep track of when this
 * standby received each WAL record, which would allow a more consistent
 * approach and one not affected by time synchronisation issues, but that is
 * significantly more effort and complexity for little actual gain in
 * usability.
 */
static bool
recoveryApplyDelay(XLogReaderState *record)
{
	uint8		xactOp;
	TimestampTz recordTime;
	TimestampTz applyAt;
	long		waitMs;

	/*
	 * Nothing to do if no delay is configured, if the database is not yet
	 * consistent, or if crash recovery is requested.
	 */
	if (recovery_min_apply_delay <= 0 ||
		!reachedConsistency ||
		!ArchiveRecoveryRequested)
		return false;

	/*
	 * Is it a COMMIT record?
	 *
	 * We deliberately choose not to delay aborts since they have no effect
	 * on MVCC.  We already allow replay of records that don't have a
	 * timestamp, so there is already opportunity for issues caused by early
	 * conflicts on standbys.
	 */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xactOp = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
	if (!(xactOp == XLOG_XACT_COMMIT ||
		  xactOp == XLOG_XACT_COMMIT_PREPARED))
		return false;

	if (!getRecordTimestamp(record, &recordTime))
		return false;

	applyAt = TimestampTzPlusMilliseconds(recordTime, recovery_min_apply_delay);

	/*
	 * Exit without arming the latch if it's already past time to apply this
	 * record
	 */
	waitMs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), applyAt);
	if (waitMs <= 0)
		return false;

	for (;;)
	{
		ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);

		/*
		 * This might change recovery_min_apply_delay or the trigger file's
		 * location.
		 */
		HandleStartupProcInterrupts();

		if (CheckForStandbyTrigger())
			break;

		/*
		 * Recalculate the target time as recovery_min_apply_delay could
		 * have changed while waiting in this loop.
		 */
		applyAt = TimestampTzPlusMilliseconds(recordTime,
											  recovery_min_apply_delay);

		/* Wait for the difference between now and the target time. */
		waitMs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
												 applyAt);
		if (waitMs <= 0)
			break;

		elog(DEBUG2, " recovery apply delay %ld milliseconds ", waitMs);

		(void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
						 waitMs,
						 WAIT_EVENT_RECOVERY_APPLY_DELAY);
	}

	return true;
}
/*
 * Get the current state of the recovery pause request.
 *
 * The shared value is read while holding info_lck and returned by value.
 */
RecoveryPauseState
GetRecoveryPauseState(void)
{
	RecoveryPauseState snapshot;

	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
	snapshot = XLogRecoveryCtl->recoveryPauseState;
	SpinLockRelease(&XLogRecoveryCtl->info_lck);

	return snapshot;
}
/*
 * Set the recovery pause state.
 *
 * If recovery pause is requested then sets the recovery pause state to
 * 'pause requested' if it is not already 'paused'.  Otherwise, sets it to
 * 'not paused' to resume the recovery.  The recovery pause will be confirmed
 * by ConfirmRecoveryPaused().
 *
 * When resuming, waiters on recoveryNotPausedCV are woken after the lock has
 * been released.
 */
void
SetRecoveryPause(bool recoveryPause)
{
	SpinLockAcquire(&XLogRecoveryCtl->info_lck);

	if (recoveryPause)
	{
		/* Only a not-paused state can move to pause-requested. */
		if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
			XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
	}
	else
		XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;

	SpinLockRelease(&XLogRecoveryCtl->info_lck);

	if (!recoveryPause)
		ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
}
/*
* Confirm the recovery pause by setting the recovery pause state to
* RECOVERY_PAUSED .
*/
static void
ConfirmRecoveryPaused ( void )
{
/* If recovery pause is requested then set it paused */
SpinLockAcquire ( & XLogRecoveryCtl - > info_lck ) ;
if ( XLogRecoveryCtl - > recoveryPauseState = = RECOVERY_PAUSE_REQUESTED )
XLogRecoveryCtl - > recoveryPauseState = RECOVERY_PAUSED ;
SpinLockRelease ( & XLogRecoveryCtl - > info_lck ) ;
}
/*
* Attempt to read the next XLOG record .
*
* Before first call , the reader needs to be positioned to the first record
2022-04-07 09:28:40 +02:00
* by calling XLogPrefetcherBeginRead ( ) .
2022-02-16 08:30:38 +01:00
*
* If no valid record is available , returns NULL , or fails if emode is PANIC .
* ( emode must be either PANIC , LOG ) . In standby mode , retries until a valid
* record is available .
*/
static XLogRecord *
2022-04-07 09:28:40 +02:00
ReadRecord ( XLogPrefetcher * xlogprefetcher , int emode ,
2022-02-16 08:30:38 +01:00
bool fetching_ckpt , TimeLineID replayTLI )
{
XLogRecord * record ;
2022-04-07 09:28:40 +02:00
XLogReaderState * xlogreader = XLogPrefetcherGetReader ( xlogprefetcher ) ;
2022-02-16 08:30:38 +01:00
XLogPageReadPrivate * private = ( XLogPageReadPrivate * ) xlogreader - > private_data ;
/* Pass through parameters to XLogPageRead */
private - > fetching_ckpt = fetching_ckpt ;
private - > emode = emode ;
private - > randAccess = ( xlogreader - > ReadRecPtr = = InvalidXLogRecPtr ) ;
private - > replayTLI = replayTLI ;
/* This is the first attempt to read this page. */
lastSourceFailed = false ;
for ( ; ; )
{
char * errormsg ;
2022-04-07 09:28:40 +02:00
record = XLogPrefetcherReadRecord ( xlogprefetcher , & errormsg ) ;
2022-02-16 08:30:38 +01:00
if ( record = = NULL )
{
/*
* When not in standby mode we find that WAL ends in an incomplete
* record , keep track of that record . After recovery is done ,
2022-04-11 10:49:41 +02:00
* we ' ll write a record to indicate to downstream WAL readers that
2022-02-16 08:30:38 +01:00
* that portion is to be ignored .
*/
if ( ! StandbyMode & &
! XLogRecPtrIsInvalid ( xlogreader - > abortedRecPtr ) )
{
abortedRecPtr = xlogreader - > abortedRecPtr ;
missingContrecPtr = xlogreader - > missingContrecPtr ;
}
if ( readFile > = 0 )
{
close ( readFile ) ;
readFile = - 1 ;
}
/*
* We only end up here without a message when XLogPageRead ( )
* failed - in that case we already logged something . In
* StandbyMode that only happens if we have been triggered , so we
* shouldn ' t loop anymore in that case .
*/
if ( errormsg )
ereport ( emode_for_corrupt_record ( emode , xlogreader - > EndRecPtr ) ,
( errmsg_internal ( " %s " , errormsg ) /* already translated */ ) ) ;
}
/*
* Check page TLI is one of the expected values .
*/
else if ( ! tliInHistory ( xlogreader - > latestPageTLI , expectedTLEs ) )
{
char fname [ MAXFNAMELEN ] ;
XLogSegNo segno ;
int32 offset ;
XLByteToSeg ( xlogreader - > latestPagePtr , segno , wal_segment_size ) ;
offset = XLogSegmentOffset ( xlogreader - > latestPagePtr ,
wal_segment_size ) ;
XLogFileName ( fname , xlogreader - > seg . ws_tli , segno ,
wal_segment_size ) ;
ereport ( emode_for_corrupt_record ( emode , xlogreader - > EndRecPtr ) ,
( errmsg ( " unexpected timeline ID %u in log segment %s, offset %u " ,
xlogreader - > latestPageTLI ,
fname ,
offset ) ) ) ;
record = NULL ;
}
if ( record )
{
/* Great, got a record */
return record ;
}
else
{
/* No valid record available from this source */
lastSourceFailed = true ;
/*
* If archive recovery was requested , but we were still doing
* crash recovery , switch to archive recovery and retry using the
* offline archive . We have now replayed all the valid WAL in
* pg_wal , so we are presumably now consistent .
*
* We require that there ' s at least some valid WAL present in
* pg_wal , however ( ! fetching_ckpt ) . We could recover using the
* WAL from the archive , even if pg_wal is completely empty , but
* we ' d have no idea how far we ' d have to replay to reach
* consistency . So err on the safe side and give up .
*/
if ( ! InArchiveRecovery & & ArchiveRecoveryRequested & &
! fetching_ckpt )
{
ereport ( DEBUG1 ,
( errmsg_internal ( " reached end of WAL in pg_wal, entering archive recovery " ) ) ) ;
InArchiveRecovery = true ;
if ( StandbyModeRequested )
StandbyMode = true ;
SwitchIntoArchiveRecovery ( xlogreader - > EndRecPtr , replayTLI ) ;
minRecoveryPoint = xlogreader - > EndRecPtr ;
minRecoveryPointTLI = replayTLI ;
CheckRecoveryConsistency ( ) ;
/*
* Before we retry , reset lastSourceFailed and currentSource
* so that we will check the archive next .
*/
lastSourceFailed = false ;
currentSource = XLOG_FROM_ANY ;
continue ;
}
/* In standby mode, loop back to retry. Otherwise, give up. */
if ( StandbyMode & & ! CheckForStandbyTrigger ( ) )
continue ;
else
return NULL ;
}
}
}
/*
 * Read the XLOG page containing RecPtr into readBuf (if not read already).
 * Returns number of bytes read, if the page is read successfully, or
 * XLREAD_FAIL in case of errors.  When errors occur, they are ereport'ed, but
 * only if they have not been previously reported.
 *
 * While prefetching, xlogreader->nonblocking may be set.  In that case,
 * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
 *
 * This is responsible for restoring files from archive as needed, as well
 * as for waiting for the requested WAL record to arrive in standby mode.
 *
 * 'emode' specifies the log level used for reporting "file not found" or
 * "end of WAL" situations in archive recovery, or in standby mode when a
 * trigger file is found.  If set to WARNING or below, XLogPageRead() returns
 * XLREAD_FAIL in those situations, on higher log levels the ereport() won't
 * return.
 *
 * In standby mode, if after a successful return of XLogPageRead() the
 * caller finds the record it's interested in to be broken, it should
 * ereport the error with the level determined by
 * emode_for_corrupt_record(), and then set lastSourceFailed
 * and call XLogPageRead() again with the same arguments.  This lets
 * XLogPageRead() try fetching the record from another source, or
 * sleep and retry.
 */
static int
XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
			 XLogRecPtr targetRecPtr, char *readBuf)
{
	XLogPageReadPrivate *private =
	(XLogPageReadPrivate *) xlogreader->private_data;
	int			emode = private->emode;
	uint32		targetPageOff;
	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
	int			r;

	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);

	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
	if (readFile >= 0 &&
		!XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
	{
		/*
		 * Request a restartpoint if we've replayed too much xlog since the
		 * last one.
		 */
		if (ArchiveRecoveryRequested && IsUnderPostmaster)
		{
			if (XLogCheckpointNeeded(readSegNo))
			{
				(void) GetRedoRecPtr();
				if (XLogCheckpointNeeded(readSegNo))
					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
			}
		}

		close(readFile);
		readFile = -1;
		readSource = XLOG_FROM_ANY;
	}

	XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);

retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM &&
		 flushedUpto < targetPagePtr + reqLen))
	{
		/*
		 * In nonblocking mode, if the segment is already open for streaming
		 * but not enough WAL has been flushed yet, report that instead of
		 * waiting.
		 */
		if (readFile >= 0 &&
			xlogreader->nonblocking &&
			readSource == XLOG_FROM_STREAM &&
			flushedUpto < targetPagePtr + reqLen)
			return XLREAD_WOULDBLOCK;

		switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
											private->randAccess,
											private->fetching_ckpt,
											targetRecPtr,
											private->replayTLI,
											xlogreader->EndRecPtr,
											xlogreader->nonblocking))
		{
			case XLREAD_WOULDBLOCK:
				return XLREAD_WOULDBLOCK;
			case XLREAD_FAIL:
				if (readFile >= 0)
					close(readFile);
				readFile = -1;
				readLen = 0;
				readSource = XLOG_FROM_ANY;
				return XLREAD_FAIL;
			case XLREAD_SUCCESS:
				break;
		}
	}

	/*
	 * At this point, we have the right segment open and if we're streaming we
	 * know the requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from the primary, calculate
	 * how much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit of
	 * future calls, to allow quick exit at the top of this function.
	 */
	if (readSource == XLOG_FROM_STREAM)
	{
		if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
			readLen = XLOG_BLCKSZ;
		else
			readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
				targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	/* Read the requested page */
	readOff = targetPageOff;

	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
	r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
	if (r != XLOG_BLCKSZ)
	{
		char		fname[MAXFNAMELEN];
		int			save_errno = errno;	/* preserve across the calls below */

		pgstat_report_wait_end();
		XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
		if (r < 0)
		{
			errno = save_errno;
			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
					(errcode_for_file_access(),
					 errmsg("could not read from log segment %s, offset %u: %m",
							fname, readOff)));
		}
		else
			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read from log segment %s, offset %u: read %d of %zu",
							fname, readOff, r, (Size) XLOG_BLCKSZ)));
		goto next_record_is_invalid;
	}
	pgstat_report_wait_end();

	Assert(targetSegNo == readSegNo);
	Assert(targetPageOff == readOff);
	Assert(reqLen <= readLen);

	xlogreader->seg.ws_tli = curFileTLI;

	/*
	 * Check the page header immediately, so that we can retry immediately if
	 * it's not valid. This may seem unnecessary, because ReadPageInternal()
	 * validates the page header anyway, and would propagate the failure up to
	 * ReadRecord(), which would retry. However, there's a corner case with
	 * continuation records, if a record is split across two pages such that
	 * we would need to read the two pages from different sources. For
	 * example, imagine a scenario where a streaming replica is started up,
	 * and replay reaches a record that's split across two WAL segments. The
	 * first page is only available locally, in pg_wal, because it's already
	 * been recycled on the primary. The second page, however, is not present
	 * in pg_wal, and we should stream it from the primary. There is a
	 * recycled WAL segment present in pg_wal, with garbage contents, however.
	 * We would read the first page from the local WAL segment, but when
	 * reading the second page, we would read the bogus, recycled, WAL
	 * segment. If we didn't catch that case here, we would never recover,
	 * because ReadRecord() would retry reading the whole record from the
	 * beginning.
	 *
	 * Of course, this only catches errors in the page header, which is what
	 * happens in the case of a recycled WAL segment. Other kinds of errors or
	 * corruption still have the same problem. But this at least fixes the
	 * common case, which can happen as part of normal operation.
	 *
	 * Validating the page header is cheap enough that doing it twice
	 * shouldn't be a big deal from a performance point of view.
	 *
	 * When not in standby mode, an invalid page header should cause recovery
	 * to end, not retry reading the page, so we don't need to validate the
	 * page header here for the retry. Instead, ReadPageInternal() is
	 * responsible for the validation.
	 */
	if (StandbyMode &&
		!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
	{
		/*
		 * Emit this error right now then retry this page immediately.  Use
		 * errmsg_internal() because the message was already translated.
		 */
		if (xlogreader->errormsg_buf[0])
			ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
					(errmsg_internal("%s", xlogreader->errormsg_buf)));

		/* reset any error XLogReaderValidatePageHeader() might have set */
		xlogreader->errormsg_buf[0] = '\0';
		goto next_record_is_invalid;
	}

	return readLen;

next_record_is_invalid:
	/* Mark the source as failed and forget the currently open segment */
	lastSourceFailed = true;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = XLOG_FROM_ANY;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return XLREAD_FAIL;
}
/*
* Open the WAL segment containing WAL location ' RecPtr ' .
*
* The segment can be fetched via restore_command , or via walreceiver having
* streamed the record , or it can already be present in pg_wal . Checking
* pg_wal is mainly for crash recovery , but it will be polled in standby mode
* too , in case someone copies a new segment directly to pg_wal . That is not
* documented or recommended , though .
*
* If ' fetching_ckpt ' is true , we ' re fetching a checkpoint record , and should
* prepare to read WAL starting from RedoStartLSN after this .
*
* ' RecPtr ' might not point to the beginning of the record we ' re interested
* in , it might also point to the page or segment header . In that case ,
* ' tliRecPtr ' is the position of the WAL record we ' re interested in . It is
* used to decide which timeline to stream the requested WAL from .
*
* ' replayLSN ' is the current replay LSN , so that if we scan for new
* timelines , we can reject a switch to a timeline that branched off before
* this point .
*
* If the record is not immediately available , the function returns false
* if we ' re not in standby mode . In standby mode , waits for it to become
* available .
*
* When the requested record becomes available , the function opens the file
2022-04-07 09:28:40 +02:00
* containing it ( if not open already ) , and returns XLREAD_SUCCESS . When end
* of standby mode is triggered by the user , and there is no more WAL
* available , returns XLREAD_FAIL .
*
* If nonblocking is true , then give up immediately if we can ' t satisfy the
* request , returning XLREAD_WOULDBLOCK instead of waiting .
2022-02-16 08:30:38 +01:00
*/
2022-04-07 09:28:40 +02:00
static XLogPageReadResult
2022-02-16 08:30:38 +01:00
WaitForWALToBecomeAvailable ( XLogRecPtr RecPtr , bool randAccess ,
bool fetching_ckpt , XLogRecPtr tliRecPtr ,
2022-04-07 09:28:40 +02:00
TimeLineID replayTLI , XLogRecPtr replayLSN ,
bool nonblocking )
2022-02-16 08:30:38 +01:00
{
static TimestampTz last_fail_time = 0 ;
TimestampTz now ;
bool streaming_reply_sent = false ;
/*-------
* Standby mode is implemented by a state machine :
*
* 1. Read from either archive or pg_wal ( XLOG_FROM_ARCHIVE ) , or just
* pg_wal ( XLOG_FROM_PG_WAL )
* 2. Check trigger file
* 3. Read from primary server via walreceiver ( XLOG_FROM_STREAM )
* 4. Rescan timelines
* 5. Sleep wal_retrieve_retry_interval milliseconds , and loop back to 1.
*
* Failure to read from the current source advances the state machine to
* the next state .
*
* ' currentSource ' indicates the current state . There are no currentSource
* values for " check trigger " , " rescan timelines " , and " sleep " states ,
* those actions are taken when reading from the previous source fails , as
* part of advancing to the next state .
*
* If standby mode is turned off while reading WAL from stream , we move
* to XLOG_FROM_ARCHIVE and reset lastSourceFailed , to force fetching
* the files ( which would be required at end of recovery , e . g . , timeline
* history file ) from archive or pg_wal . We don ' t need to kill WAL receiver
* here because it ' s already stopped when standby mode is turned off at
* the end of recovery .
* - - - - - - -
*/
if ( ! InArchiveRecovery )
currentSource = XLOG_FROM_PG_WAL ;
else if ( currentSource = = XLOG_FROM_ANY | |
( ! StandbyMode & & currentSource = = XLOG_FROM_STREAM ) )
{
lastSourceFailed = false ;
currentSource = XLOG_FROM_ARCHIVE ;
}
for ( ; ; )
{
XLogSource oldSource = currentSource ;
bool startWalReceiver = false ;
/*
* First check if we failed to read from the current source , and
* advance the state machine if so . The failure to read might ' ve
* happened outside this function , e . g when a CRC check fails on a
* record , or within this loop .
*/
if ( lastSourceFailed )
{
2022-04-07 09:28:40 +02:00
/*
* Don ' t allow any retry loops to occur during nonblocking
* readahead . Let the caller process everything that has been
* decoded already first .
*/
if ( nonblocking )
return XLREAD_WOULDBLOCK ;
2022-02-16 08:30:38 +01:00
switch ( currentSource )
{
case XLOG_FROM_ARCHIVE :
case XLOG_FROM_PG_WAL :
/*
* Check to see if the trigger file exists . Note that we
* do this only after failure , so when you create the
* trigger file , we still finish replaying as much as we
* can from archive and pg_wal before failover .
*/
if ( StandbyMode & & CheckForStandbyTrigger ( ) )
{
XLogShutdownWalRcv ( ) ;
2022-04-07 09:28:40 +02:00
return XLREAD_FAIL ;
2022-02-16 08:30:38 +01:00
}
/*
* Not in standby mode , and we ' ve now tried the archive
* and pg_wal .
*/
if ( ! StandbyMode )
2022-04-07 09:28:40 +02:00
return XLREAD_FAIL ;
2022-02-16 08:30:38 +01:00
/*
* Move to XLOG_FROM_STREAM state , and set to start a
* walreceiver if necessary .
*/
currentSource = XLOG_FROM_STREAM ;
startWalReceiver = true ;
break ;
case XLOG_FROM_STREAM :
/*
* Failure while streaming . Most likely , we got here
* because streaming replication was terminated , or
* promotion was triggered . But we also get here if we
* find an invalid record in the WAL streamed from the
* primary , in which case something is seriously wrong .
* There ' s little chance that the problem will just go
* away , but PANIC is not good for availability either ,
* especially in hot standby mode . So , we treat that the
* same as disconnection , and retry from archive / pg_wal
* again . The WAL in the archive should be identical to
* what was streamed , so it ' s unlikely that it helps , but
* one can hope . . .
*/
/*
* We should be able to move to XLOG_FROM_STREAM only in
* standby mode .
*/
Assert ( StandbyMode ) ;
/*
* Before we leave XLOG_FROM_STREAM state , make sure that
* walreceiver is not active , so that it won ' t overwrite
* WAL that we restore from archive .
*/
if ( WalRcvStreaming ( ) )
XLogShutdownWalRcv ( ) ;
/*
* Before we sleep , re - scan for possible new timelines if
* we were requested to recover to the latest timeline .
*/
if ( recoveryTargetTimeLineGoal = = RECOVERY_TARGET_TIMELINE_LATEST )
{
if ( rescanLatestTimeLine ( replayTLI , replayLSN ) )
{
currentSource = XLOG_FROM_ARCHIVE ;
break ;
}
}
/*
* XLOG_FROM_STREAM is the last state in our state
* machine , so we ' ve exhausted all the options for
* obtaining the requested WAL . We ' re going to loop back
* and retry from the archive , but if it hasn ' t been long
* since last attempt , sleep wal_retrieve_retry_interval
* milliseconds to avoid busy - waiting .
*/
now = GetCurrentTimestamp ( ) ;
if ( ! TimestampDifferenceExceeds ( last_fail_time , now ,
wal_retrieve_retry_interval ) )
{
long wait_time ;
wait_time = wal_retrieve_retry_interval -
TimestampDifferenceMilliseconds ( last_fail_time , now ) ;
elog ( LOG , " waiting for WAL to become available at %X/%X " ,
LSN_FORMAT_ARGS ( RecPtr ) ) ;
( void ) WaitLatch ( & XLogRecoveryCtl - > recoveryWakeupLatch ,
WL_LATCH_SET | WL_TIMEOUT |
WL_EXIT_ON_PM_DEATH ,
wait_time ,
WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL ) ;
ResetLatch ( & XLogRecoveryCtl - > recoveryWakeupLatch ) ;
now = GetCurrentTimestamp ( ) ;
/* Handle interrupt signals of startup process */
HandleStartupProcInterrupts ( ) ;
}
last_fail_time = now ;
currentSource = XLOG_FROM_ARCHIVE ;
break ;
default :
elog ( ERROR , " unexpected WAL source %d " , currentSource ) ;
}
}
else if ( currentSource = = XLOG_FROM_PG_WAL )
{
/*
* We just successfully read a file in pg_wal . We prefer files in
* the archive over ones in pg_wal , so try the next file again
* from the archive first .
*/
if ( InArchiveRecovery )
currentSource = XLOG_FROM_ARCHIVE ;
}
if ( currentSource ! = oldSource )
elog ( DEBUG2 , " switched WAL source from %s to %s after %s " ,
xlogSourceNames [ oldSource ] , xlogSourceNames [ currentSource ] ,
lastSourceFailed ? " failure " : " success " ) ;
/*
* We ' ve now handled possible failure . Try to read from the chosen
* source .
*/
lastSourceFailed = false ;
switch ( currentSource )
{
case XLOG_FROM_ARCHIVE :
case XLOG_FROM_PG_WAL :
/*
* WAL receiver must not be running when reading WAL from
* archive or pg_wal .
*/
Assert ( ! WalRcvStreaming ( ) ) ;
/* Close any old file we might have open. */
if ( readFile > = 0 )
{
close ( readFile ) ;
readFile = - 1 ;
}
/* Reset curFileTLI if random fetch. */
if ( randAccess )
curFileTLI = 0 ;
/*
* Try to restore the file from archive , or read an existing
* file from pg_wal .
*/
readFile = XLogFileReadAnyTLI ( readSegNo , DEBUG2 ,
currentSource = = XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
currentSource ) ;
if ( readFile > = 0 )
2022-04-07 09:28:40 +02:00
return XLREAD_SUCCESS ; /* success! */
2022-02-16 08:30:38 +01:00
/*
* Nope , not found in archive or pg_wal .
*/
lastSourceFailed = true ;
break ;
case XLOG_FROM_STREAM :
{
bool havedata ;
/*
* We should be able to move to XLOG_FROM_STREAM only in
* standby mode .
*/
Assert ( StandbyMode ) ;
/*
* First , shutdown walreceiver if its restart has been
* requested - - but no point if we ' re already slated for
* starting it .
*/
if ( pendingWalRcvRestart & & ! startWalReceiver )
{
XLogShutdownWalRcv ( ) ;
/*
* Re - scan for possible new timelines if we were
* requested to recover to the latest timeline .
*/
if ( recoveryTargetTimeLineGoal = =
RECOVERY_TARGET_TIMELINE_LATEST )
rescanLatestTimeLine ( replayTLI , replayLSN ) ;
startWalReceiver = true ;
}
pendingWalRcvRestart = false ;
/*
* Launch walreceiver if needed .
*
* If fetching_ckpt is true , RecPtr points to the initial
* checkpoint location . In that case , we use RedoStartLSN
* as the streaming start position instead of RecPtr , so
* that when we later jump backwards to start redo at
* RedoStartLSN , we will have the logs streamed already .
*/
if ( startWalReceiver & &
PrimaryConnInfo & & strcmp ( PrimaryConnInfo , " " ) ! = 0 )
{
XLogRecPtr ptr ;
TimeLineID tli ;
if ( fetching_ckpt )
{
ptr = RedoStartLSN ;
tli = RedoStartTLI ;
}
else
{
ptr = RecPtr ;
/*
* Use the record begin position to determine the
* TLI , rather than the position we ' re reading .
*/
tli = tliOfPointInHistory ( tliRecPtr , expectedTLEs ) ;
if ( curFileTLI > 0 & & tli < curFileTLI )
elog ( ERROR , " according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u " ,
LSN_FORMAT_ARGS ( tliRecPtr ) ,
tli , curFileTLI ) ;
}
curFileTLI = tli ;
SetInstallXLogFileSegmentActive ( ) ;
RequestXLogStreaming ( tli , ptr , PrimaryConnInfo ,
PrimarySlotName ,
wal_receiver_create_temp_slot ) ;
flushedUpto = 0 ;
}
/*
* Check if WAL receiver is active or wait to start up .
*/
if ( ! WalRcvStreaming ( ) )
{
lastSourceFailed = true ;
break ;
}
/*
* Walreceiver is active , so see if new data has arrived .
*
* We only advance XLogReceiptTime when we obtain fresh
* WAL from walreceiver and observe that we had already
* processed everything before the most recent " chunk "
* that it flushed to disk . In steady state where we are
* keeping up with the incoming data , XLogReceiptTime will
* be updated on each cycle . When we are behind ,
* XLogReceiptTime will not advance , so the grace time
* allotted to conflicting queries will decrease .
*/
if ( RecPtr < flushedUpto )
havedata = true ;
else
{
XLogRecPtr latestChunkStart ;
flushedUpto = GetWalRcvFlushRecPtr ( & latestChunkStart , & receiveTLI ) ;
if ( RecPtr < flushedUpto & & receiveTLI = = curFileTLI )
{
havedata = true ;
if ( latestChunkStart < = RecPtr )
{
XLogReceiptTime = GetCurrentTimestamp ( ) ;
SetCurrentChunkStartTime ( XLogReceiptTime ) ;
}
}
else
havedata = false ;
}
if ( havedata )
{
/*
* Great , streamed far enough . Open the file if it ' s
* not open already . Also read the timeline history
* file if we haven ' t initialized timeline history
* yet ; it should be streamed over and present in
* pg_wal by now . Use XLOG_FROM_STREAM so that source
* info is set correctly and XLogReceiptTime isn ' t
* changed .
*
* NB : We must set readTimeLineHistory based on
* recoveryTargetTLI , not receiveTLI . Normally they ' ll
* be the same , but if recovery_target_timeline is
* ' latest ' and archiving is configured , then it ' s
* possible that we managed to retrieve one or more
* new timeline history files from the archive ,
* updating recoveryTargetTLI .
*/
if ( readFile < 0 )
{
if ( ! expectedTLEs )
expectedTLEs = readTimeLineHistory ( recoveryTargetTLI ) ;
readFile = XLogFileRead ( readSegNo , PANIC ,
receiveTLI ,
XLOG_FROM_STREAM , false ) ;
Assert ( readFile > = 0 ) ;
}
else
{
/* just make sure source info is correct... */
readSource = XLOG_FROM_STREAM ;
XLogReceiptSource = XLOG_FROM_STREAM ;
2022-04-07 09:28:40 +02:00
return XLREAD_SUCCESS ;
2022-02-16 08:30:38 +01:00
}
break ;
}
2022-04-07 09:28:40 +02:00
/* In nonblocking mode, return rather than sleeping. */
if ( nonblocking )
return XLREAD_WOULDBLOCK ;
2022-02-16 08:30:38 +01:00
/*
* Data not here yet . Check for trigger , then wait for
* walreceiver to wake us up when new WAL arrives .
*/
if ( CheckForStandbyTrigger ( ) )
{
/*
2022-04-07 09:28:40 +02:00
* Note that we don ' t return XLREAD_FAIL immediately
* here . After being triggered , we still want to
* replay all the WAL that was already streamed . It ' s
* in pg_wal now , so we just treat this as a failure ,
* and the state machine will move on to replay the
* streamed WAL from pg_wal , and then recheck the
* trigger and exit replay .
2022-02-16 08:30:38 +01:00
*/
lastSourceFailed = true ;
break ;
}
/*
* Since we have replayed everything we have received so
* far and are about to start waiting for more WAL , let ' s
* tell the upstream server our replay location now so
* that pg_stat_replication doesn ' t show stale
* information .
*/
if ( ! streaming_reply_sent )
{
WalRcvForceReply ( ) ;
streaming_reply_sent = true ;
}
2022-04-07 09:28:40 +02:00
/* Update pg_stat_recovery_prefetch before sleeping. */
XLogPrefetcherComputeStats ( xlogprefetcher ) ;
2022-02-16 08:30:38 +01:00
/*
* Wait for more WAL to arrive . Time out after 5 seconds
* to react to a trigger file promptly and to check if the
* WAL receiver is still active .
*/
( void ) WaitLatch ( & XLogRecoveryCtl - > recoveryWakeupLatch ,
WL_LATCH_SET | WL_TIMEOUT |
WL_EXIT_ON_PM_DEATH ,
5000L , WAIT_EVENT_RECOVERY_WAL_STREAM ) ;
ResetLatch ( & XLogRecoveryCtl - > recoveryWakeupLatch ) ;
break ;
}
default :
elog ( ERROR , " unexpected WAL source %d " , currentSource ) ;
}
/*
* Check for recovery pause here so that we can confirm more quickly
* that a requested pause has actually taken effect .
*/
if ( ( ( volatile XLogRecoveryCtlData * ) XLogRecoveryCtl ) - > recoveryPauseState ! =
RECOVERY_NOT_PAUSED )
recoveryPausesHere ( false ) ;
/*
* This possibly - long loop needs to handle interrupts of startup
* process .
*/
HandleStartupProcInterrupts ( ) ;
}
2022-04-07 09:28:40 +02:00
return XLREAD_FAIL ; /* not reached */
2022-02-16 08:30:38 +01:00
}
/*
 * Determine what log level should be used to report a corrupt WAL record
 * in the current WAL page, previously read by XLogPageRead().
 *
 * 'emode' is the error mode that would be used to report a file-not-found
 * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
 * we're retrying the exact same record that we've tried previously, only
 * complain the first time to keep the noise down.  However, we only do so
 * when reading from pg_wal, because we don't expect any invalid records in
 * archive or in records streamed from the primary. Files in the archive
 * should be complete, and we should never hit the end of WAL because we stop
 * and wait for more WAL to arrive before replaying it.
 *
 * Concretely: when readSource is XLOG_FROM_PG_WAL and 'emode' is LOG, a
 * repeated call with the same 'RecPtr' returns DEBUG1; in every other case
 * 'emode' is returned unchanged.
 *
 * NOTE: This function remembers the RecPtr value it was last called with,
 * to suppress repeated messages about the same record. Only call this when
 * you are about to ereport(), or you might cause a later message to be
 * erroneously suppressed.
 */
static int
emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
{
	/* RecPtr of the most recent complaint issued at LOG for pg_wal */
	static XLogRecPtr lastComplaint = 0;

	if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
	{
		if (RecPtr == lastComplaint)
			emode = DEBUG1;
		else
			lastComplaint = RecPtr;
	}
	return emode;
}
/*
* Subroutine to try to fetch and validate a prior checkpoint record .
*
* whichChkpt identifies the checkpoint ( merely for reporting purposes ) .
* 1 for " primary " , 0 for " other " ( backup_label )
*/
static XLogRecord *
2022-04-07 09:28:40 +02:00
ReadCheckpointRecord ( XLogPrefetcher * xlogprefetcher , XLogRecPtr RecPtr ,
2022-02-16 08:30:38 +01:00
int whichChkpt , bool report , TimeLineID replayTLI )
{
XLogRecord * record ;
uint8 info ;
Assert ( xlogreader ! = NULL ) ;
if ( ! XRecOffIsValid ( RecPtr ) )
{
if ( ! report )
return NULL ;
switch ( whichChkpt )
{
case 1 :
ereport ( LOG ,
( errmsg ( " invalid primary checkpoint link in control file " ) ) ) ;
break ;
default :
ereport ( LOG ,
( errmsg ( " invalid checkpoint link in backup_label file " ) ) ) ;
break ;
}
return NULL ;
}
2022-04-07 09:28:40 +02:00
XLogPrefetcherBeginRead ( xlogprefetcher , RecPtr ) ;
record = ReadRecord ( xlogprefetcher , LOG , true , replayTLI ) ;
2022-02-16 08:30:38 +01:00
if ( record = = NULL )
{
if ( ! report )
return NULL ;
switch ( whichChkpt )
{
case 1 :
ereport ( LOG ,
( errmsg ( " invalid primary checkpoint record " ) ) ) ;
break ;
default :
ereport ( LOG ,
( errmsg ( " invalid checkpoint record " ) ) ) ;
break ;
}
return NULL ;
}
if ( record - > xl_rmid ! = RM_XLOG_ID )
{
switch ( whichChkpt )
{
case 1 :
ereport ( LOG ,
( errmsg ( " invalid resource manager ID in primary checkpoint record " ) ) ) ;
break ;
default :
ereport ( LOG ,
( errmsg ( " invalid resource manager ID in checkpoint record " ) ) ) ;
break ;
}
return NULL ;
}
info = record - > xl_info & ~ XLR_INFO_MASK ;
if ( info ! = XLOG_CHECKPOINT_SHUTDOWN & &
info ! = XLOG_CHECKPOINT_ONLINE )
{
switch ( whichChkpt )
{
case 1 :
ereport ( LOG ,
( errmsg ( " invalid xl_info in primary checkpoint record " ) ) ) ;
break ;
default :
ereport ( LOG ,
( errmsg ( " invalid xl_info in checkpoint record " ) ) ) ;
break ;
}
return NULL ;
}
if ( record - > xl_tot_len ! = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof ( CheckPoint ) )
{
switch ( whichChkpt )
{
case 1 :
ereport ( LOG ,
( errmsg ( " invalid length of primary checkpoint record " ) ) ) ;
break ;
default :
ereport ( LOG ,
( errmsg ( " invalid length of checkpoint record " ) ) ) ;
break ;
}
return NULL ;
}
return record ;
}
/*
 * Scan for new timelines that might have appeared in the archive since we
 * started recovery.
 *
 * If there are any, the function changes recovery target TLI to the latest
 * one and returns 'true'.
 *
 * Returns 'false', leaving recoveryTargetTLI and expectedTLEs untouched, if
 * no newer timeline is found, if the current recovery target timeline does
 * not appear in the new timeline's history, or if the history entry for it
 * ends before 'replayLSN'.  'replayTLI' is used only in the log messages.
 */
static bool
rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
{
	List	   *newExpectedTLEs;
	bool		found;
	ListCell   *cell;
	TimeLineID	newtarget;
	TimeLineID	oldtarget = recoveryTargetTLI;
	TimeLineHistoryEntry *currentTle = NULL;

	newtarget = findNewestTimeLine(recoveryTargetTLI);
	if (newtarget == recoveryTargetTLI)
	{
		/* No new timelines found */
		return false;
	}

	/*
	 * Determine the list of expected TLIs for the new TLI
	 */

	newExpectedTLEs = readTimeLineHistory(newtarget);

	/*
	 * If the current timeline is not part of the history of the new timeline,
	 * we cannot proceed to it.
	 */
	found = false;
	foreach(cell, newExpectedTLEs)
	{
		currentTle = (TimeLineHistoryEntry *) lfirst(cell);

		if (currentTle->tli == recoveryTargetTLI)
		{
			found = true;
			break;
		}
	}
	if (!found)
	{
		ereport(LOG,
				(errmsg("new timeline %u is not a child of database system timeline %u",
						newtarget,
						replayTLI)));
		return false;
	}

	/*
	 * The current timeline was found in the history file, but check that the
	 * next timeline was forked off from it *after* the current recovery
	 * location.
	 */
	if (currentTle->end < replayLSN)
	{
		ereport(LOG,
				(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
						newtarget,
						replayTLI,
						LSN_FORMAT_ARGS(replayLSN))));
		return false;
	}

	/* The new timeline history seems valid. Switch target */
	recoveryTargetTLI = newtarget;
	list_free_deep(expectedTLEs);
	expectedTLEs = newExpectedTLEs;

	/*
	 * As in StartupXLOG(), try to ensure we have all the history files
	 * between the old target and new target in pg_wal.
	 */
	restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);

	ereport(LOG,
			(errmsg("new target timeline is %u",
					recoveryTargetTLI)));

	return true;
}
/*
 * Open a logfile segment for reading (during recovery).
 *
 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
 * Otherwise, it's assumed to be already available in pg_wal.
 *
 * On success, returns the open file descriptor and records 'tli' and
 * 'source' in curFileTLI, readSource and XLogReceiptSource; XLogReceiptTime
 * is also updated unless source is XLOG_FROM_STREAM.
 *
 * Returns -1 if RestoreArchivedFile() fails, or if the file does not exist
 * (ENOENT) and 'notfoundOk' is true.  Any other failure to open the file is
 * reported at PANIC.
 *
 * NOTE: 'emode' is not referenced in this function body; open failures are
 * always reported at PANIC regardless of its value.
 */
static int
XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
			 XLogSource source, bool notfoundOk)
{
	char		xlogfname[MAXFNAMELEN];
	char		activitymsg[MAXFNAMELEN + 16];
	char		path[MAXPGPATH];
	int			fd;

	XLogFileName(xlogfname, tli, segno, wal_segment_size);

	switch (source)
	{
		case XLOG_FROM_ARCHIVE:
			/* Report recovery progress in PS display */
			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
					 xlogfname);
			set_ps_display(activitymsg);

			if (!RestoreArchivedFile(path, xlogfname,
									 "RECOVERYXLOG",
									 wal_segment_size,
									 InRedo))
				return -1;
			break;

		case XLOG_FROM_PG_WAL:
		case XLOG_FROM_STREAM:
			XLogFilePath(path, tli, segno, wal_segment_size);
			break;

		default:
			elog(ERROR, "invalid XLogFileRead source %d", source);
	}

	/*
	 * If the segment was fetched from archival storage, replace the existing
	 * xlog segment (if any) with the archival version.
	 */
	if (source == XLOG_FROM_ARCHIVE)
	{
		Assert(!IsInstallXLogFileSegmentActive());
		KeepFileRestoredFromArchive(path, xlogfname);

		/*
		 * Set path to point at the new file in pg_wal.
		 */
		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
	}

	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
	if (fd >= 0)
	{
		/* Success! */
		curFileTLI = tli;

		/* Report recovery progress in PS display */
		snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
				 xlogfname);
		set_ps_display(activitymsg);

		/* Track source of data in assorted state variables */
		readSource = source;
		XLogReceiptSource = source;
		/* In FROM_STREAM case, caller tracks receipt time, not me */
		if (source != XLOG_FROM_STREAM)
			XLogReceiptTime = GetCurrentTimestamp();

		return fd;
	}
	if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
	return -1;
}
/*
* Open a logfile segment for reading ( during recovery ) .
*
* This version searches for the segment with any TLI listed in expectedTLEs .
*/
static int
XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
{
	char		path[MAXPGPATH];
	ListCell   *lc;
	List	   *candidates;

	/*
	 * Try each timeline in the history list in turn, looking for segment
	 * "segno".
	 *
	 * curFileTLI is never allowed to move backwards: once an entry with a TLI
	 * lower than curFileTLI is reached, the scan stops.
	 *
	 * When expectedTLEs has not been set yet, the history is read here but
	 * only installed into expectedTLEs once a segment has actually been
	 * opened.
	 */
	candidates = expectedTLEs ? expectedTLEs
		: readTimeLineHistory(recoveryTargetTLI);

	foreach(lc, candidates)
	{
		TimeLineHistoryEntry *entry = (TimeLineHistoryEntry *) lfirst(lc);
		TimeLineID	tli = entry->tli;
		int			fd = -1;

		/* Entries from here on are too old to be of interest. */
		if (tli < curFileTLI)
			break;

		/*
		 * A timeline cannot contain a segment that precedes the segment in
		 * which the timeline began, so skip such entries.  Only the starting
		 * segment is checked; the ending segment is not.
		 */
		if (entry->begin != InvalidXLogRecPtr)
		{
			XLogSegNo	firstseg = 0;

			XLByteToSeg(entry->begin, firstseg, wal_segment_size);
			if (segno < firstseg)
				continue;
		}

		/* First choice: the archive, if the caller allows it. */
		if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
		{
			fd = XLogFileRead(segno, emode, tli, XLOG_FROM_ARCHIVE, true);
			if (fd != -1)
				elog(DEBUG1, " got WAL segment from archive ");
		}

		/* Second choice: pg_wal, only if the archive did not deliver. */
		if (fd == -1 &&
			(source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL))
			fd = XLogFileRead(segno, emode, tli, XLOG_FROM_PG_WAL, true);

		if (fd != -1)
		{
			/* A segment was found, so the history list can be adopted. */
			if (!expectedTLEs)
				expectedTLEs = candidates;
			return fd;
		}
	}

	/*
	 * Nothing was found on any timeline.  The report names the file as it
	 * would be called on recoveryTargetTLI, with errno forced to ENOENT so
	 * that %m gives a sensible message.
	 *
	 * NOTE(review): when expectedTLEs was unset, the list obtained from
	 * readTimeLineHistory() is not stored anywhere on this path -- confirm
	 * whether it needs to be released.
	 */
	XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
	errno = ENOENT;
	ereport(emode,
			(errcode_for_file_access(),
			 errmsg(" could not open file \" %s \" : %m ", path)));
	return -1;
}
/*
* Set flag to signal the walreceiver to restart . ( The startup process calls
* this on noticing a relevant configuration change . )
*/
void
StartupRequestWalReceiverRestart(void)
{
	/* Only relevant while streaming with a walreceiver actually running. */
	if (currentSource != XLOG_FROM_STREAM || !WalRcvRunning())
		return;

	ereport(LOG,
			(errmsg(" WAL receiver process shutdown requested ")));

	/* Record the request in pendingWalRcvRestart. */
	pendingWalRcvRestart = true;
}
/*
* Has a standby promotion already been triggered ?
*
* Unlike CheckForStandbyTrigger ( ) , this works in any process
* that ' s connected to shared memory .
*/
bool
PromoteIsTriggered(void)
{
	/*
	 * The shared flag is consulted only until it has been observed true.
	 * After that the cached local copy is returned without taking the
	 * spinlock again.
	 */
	if (!LocalPromoteIsTriggered)
	{
		SpinLockAcquire(&XLogRecoveryCtl->info_lck);
		LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
		SpinLockRelease(&XLogRecoveryCtl->info_lck);
	}

	return LocalPromoteIsTriggered;
}
static void
SetPromoteIsTriggered(void)
{
	/* Publish the promotion request in shared memory. */
	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
	XLogRecoveryCtl->SharedPromoteIsTriggered = true;
	SpinLockRelease(&XLogRecoveryCtl->info_lck);

	/*
	 * A promotion ends any recovery pause, so clear the pause state here.
	 * Leaving it set would let pg_get_wal_replay_pause_state() report
	 * 'paused' while the promotion is already under way.
	 */
	SetRecoveryPause(false);

	/* Cache the result locally as well. */
	LocalPromoteIsTriggered = true;
}
/*
* Check to see whether the user - specified trigger file exists and whether a
* promote request has arrived . If either condition holds , return true .
*/
static bool
CheckForStandbyTrigger ( void )
{
struct stat stat_buf ;
if ( LocalPromoteIsTriggered )
return true ;
if ( IsPromoteSignaled ( ) & & CheckPromoteSignal ( ) )
{
ereport ( LOG , ( errmsg ( " received promote request " ) ) ) ;
RemovePromoteSignalFiles ( ) ;
ResetPromoteSignaled ( ) ;
SetPromoteIsTriggered ( ) ;
return true ;
}
if ( PromoteTriggerFile = = NULL | | strcmp ( PromoteTriggerFile , " " ) = = 0 )
return false ;
if ( stat ( PromoteTriggerFile , & stat_buf ) = = 0 )
{
ereport ( LOG ,
( errmsg ( " promote trigger file found: %s " , PromoteTriggerFile ) ) ) ;
unlink ( PromoteTriggerFile ) ;
SetPromoteIsTriggered ( ) ;
return true ;
}
else if ( errno ! = ENOENT )
ereport ( ERROR ,
( errcode_for_file_access ( ) ,
errmsg ( " could not stat promote trigger file \" %s \" : %m " ,
PromoteTriggerFile ) ) ) ;
return false ;
}
/*
* Remove the files signaling a standby promotion request .
*/
void
RemovePromoteSignalFiles(void)
{
	/* The result of unlink() is not checked. */
	unlink(PROMOTE_SIGNAL_FILE);
}
/*
* Check to see if a promote request has arrived .
*/
bool
CheckPromoteSignal(void)
{
	struct stat st;

	/* The request is present exactly when the signal file can be stat'ed. */
	return stat(PROMOTE_SIGNAL_FILE, &st) == 0;
}
/*
* Wake up startup process to replay newly arrived WAL , or to notice that
* failover has been requested .
*/
void
WakeupRecovery(void)
{
	/* Set the recovery wakeup latch kept in shared recovery state. */
	SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
}
/*
* Schedule a walreceiver wakeup in the main recovery loop .
*/
void
XLogRequestWalReceiverReply(void)
{
	/* Only raises the flag; no further action is taken here. */
	doRequestWalReceiverReply = true;
}
/*
* Is HotStandby active yet ? This is only important in special backends
* since normal backends won ' t ever be able to connect until this returns
* true . Postmaster knows this by way of signal , not via shared memory .
*
* Unlike testing standbyState , this works in any process that ' s connected to
* shared memory . ( And note that standbyState alone doesn ' t tell the truth
* anyway . )
*/
bool
HotStandbyActive(void)
{
	/*
	 * Shared state is consulted only until Hot Standby has been seen
	 * active.  After that the cached local copy is returned directly.
	 */
	if (!LocalHotStandbyActive)
	{
		/* The spinlock matters on machines with weak memory ordering. */
		SpinLockAcquire(&XLogRecoveryCtl->info_lck);
		LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
		SpinLockRelease(&XLogRecoveryCtl->info_lck);
	}

	return LocalHotStandbyActive;
}
/*
* Like HotStandbyActive ( ) , but to be used only in WAL replay code ,
* where we don ' t need to ask any other process what the state is .
*/
static bool
HotStandbyActiveInReplay ( void )
{
Assert ( AmStartupProcess ( ) | | ! IsPostmasterEnvironment ) ;
return LocalHotStandbyActive ;
}
/*
* Get latest redo apply position .
*
* Exported to allow WALReceiver to read the pointer directly .
*/
XLogRecPtr
GetXLogReplayRecPtr(TimeLineID *replayTLI)
{
	XLogRecPtr	lastReplayedEnd;
	TimeLineID	lastTLI;

	/* Read position and timeline under one lock hold. */
	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
	lastTLI = XLogRecoveryCtl->lastReplayedTLI;
	lastReplayedEnd = XLogRecoveryCtl->lastReplayedEndRecPtr;
	SpinLockRelease(&XLogRecoveryCtl->info_lck);

	/* The timeline is an optional output; NULL means "not wanted". */
	if (replayTLI != NULL)
		*replayTLI = lastTLI;

	return lastReplayedEnd;
}
/*
 * Get position of last applied, or the record being applied.
 *
 * This is different from GetXLogReplayRecPtr() in that if a WAL
 * record is currently being applied, this includes that record.
 */
XLogRecPtr
GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
{
	XLogRecPtr	replayEnd;
	TimeLineID	endTLI;

	/* Read position and timeline under one lock hold. */
	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
	endTLI = XLogRecoveryCtl->replayEndTLI;
	replayEnd = XLogRecoveryCtl->replayEndRecPtr;
	SpinLockRelease(&XLogRecoveryCtl->info_lck);

	/* The timeline is an optional output; NULL means "not wanted". */
	if (replayEndTLI != NULL)
		*replayEndTLI = endTLI;

	return replayEnd;
}
/*
* Save timestamp of latest processed commit / abort record .
*
* We keep this in XLogRecoveryCtl , not a simple static variable , so that it can be
* seen by processes other than the startup process . Note in particular
* that CreateRestartPoint is executed in the checkpointer .
*/
static void
SetLatestXTime(TimestampTz xtime)
{
	/* Store the timestamp in shared recovery state under the info spinlock. */
	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
	XLogRecoveryCtl->recoveryLastXTime = xtime;
	SpinLockRelease(&XLogRecoveryCtl->info_lck);
}
/*
* Fetch timestamp of latest processed commit / abort record .
*/
TimestampTz
GetLatestXTime(void)
{
	TimestampTz lastXTime;

	/* Copy the shared value out while holding the info spinlock. */
	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
	lastXTime = XLogRecoveryCtl->recoveryLastXTime;
	SpinLockRelease(&XLogRecoveryCtl->info_lck);

	return lastXTime;
}
/*
* Save timestamp of the next chunk of WAL records to apply .
*
* We keep this in XLogRecoveryCtl , not a simple static variable , so that it can be
* seen by all backends .
*/
static void
SetCurrentChunkStartTime(TimestampTz xtime)
{
	/* Store the chunk start time in shared recovery state under the spinlock. */
	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
	XLogRecoveryCtl->currentChunkStartTime = xtime;
	SpinLockRelease(&XLogRecoveryCtl->info_lck);
}
/*
 * Fetch the start time of the current chunk of WAL records being applied,
 * as last saved by SetCurrentChunkStartTime().
 * Startup process maintains an accurate local copy in XLogReceiptTime
 */
TimestampTz
GetCurrentChunkReplayStartTime(void)
{
	TimestampTz chunkStart;

	/* Copy the shared chunk start time out while holding the info spinlock. */
	SpinLockAcquire(&XLogRecoveryCtl->info_lck);
	chunkStart = XLogRecoveryCtl->currentChunkStartTime;
	SpinLockRelease(&XLogRecoveryCtl->info_lck);

	return chunkStart;
}
/*
* Returns time of receipt of current chunk of XLOG data , as well as
* whether it was received from streaming replication or from archives .
*/
void
GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
{
	/*
	 * The receipt time and source are kept in process-local variables and
	 * are not exported to shared memory, so this is only meaningful in the
	 * startup process.
	 */
	Assert(InRecovery);

	/* Both outputs are mandatory; neither pointer is checked for NULL. */
	*rtime = XLogReceiptTime;
	*fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
}
/*
* Note that text field supplied is a parameter name and does not require
* translation
*/
void
RecoveryRequiresIntParameter ( const char * param_name , int currValue , int minValue )
{
if ( currValue < minValue )
{
if ( HotStandbyActiveInReplay ( ) )
{
bool warned_for_promote = false ;
ereport ( WARNING ,
( errcode ( ERRCODE_INVALID_PARAMETER_VALUE ) ,
errmsg ( " hot standby is not possible because of insufficient parameter settings " ) ,
errdetail ( " %s = %d is a lower setting than on the primary server, where its value was %d. " ,
param_name ,
currValue ,
minValue ) ) ) ;
SetRecoveryPause ( true ) ;
ereport ( LOG ,
( errmsg ( " recovery has paused " ) ,
errdetail ( " If recovery is unpaused, the server will shut down. " ) ,
errhint ( " You can then restart the server after making the necessary configuration changes. " ) ) ) ;
while ( GetRecoveryPauseState ( ) ! = RECOVERY_NOT_PAUSED )
{
HandleStartupProcInterrupts ( ) ;
if ( CheckForStandbyTrigger ( ) )
{
if ( ! warned_for_promote )
ereport ( WARNING ,
( errcode ( ERRCODE_INVALID_PARAMETER_VALUE ) ,
errmsg ( " promotion is not possible because of insufficient parameter settings " ) ,
/*
* Repeat the detail from above so it ' s easy to find
* in the log .
*/
errdetail ( " %s = %d is a lower setting than on the primary server, where its value was %d. " ,
param_name ,
currValue ,
minValue ) ,
errhint ( " Restart the server after making the necessary configuration changes. " ) ) ) ;
warned_for_promote = true ;
}
/*
* If recovery pause is requested then set it paused . While
* we are in the loop , user might resume and pause again so
* set this every time .
*/
ConfirmRecoveryPaused ( ) ;
/*
* We wait on a condition variable that will wake us as soon
* as the pause ends , but we use a timeout so we can check the
* above conditions periodically too .
*/
ConditionVariableTimedSleep ( & XLogRecoveryCtl - > recoveryNotPausedCV , 1000 ,
WAIT_EVENT_RECOVERY_PAUSE ) ;
}
ConditionVariableCancelSleep ( ) ;
}
ereport ( FATAL ,
( errcode ( ERRCODE_INVALID_PARAMETER_VALUE ) ,
errmsg ( " recovery aborted because of insufficient parameter settings " ) ,
/* Repeat the detail from above so it's easy to find in the log. */
errdetail ( " %s = %d is a lower setting than on the primary server, where its value was %d. " ,
param_name ,
currValue ,
minValue ) ,
errhint ( " You can restart the server after making the necessary configuration changes. " ) ) ) ;
}
}