If recovery.conf is created after "pg_ctl stop -m i", do crash recovery.

If you create a base backup using an atomic filesystem snapshot, and try to
perform PITR starting from that base backup, or if you just kill a master
server and create recovery.conf to put it into standby mode, we don't know
how far we need to recover before reaching consistency. Normally in crash
recovery, we replay all the WAL present in pg_xlog, and assume that we're
consistent after that. And normally in archive recovery, minRecoveryPoint,
backupEndRequired, or backupEndPoint is set in the control file, indicating
how far we need to replay to reach consistency. But if the server was
previously up and running normally, and you kill -9 it or take an atomic
filesystem snapshot, none of those fields are set in the control file.

The solution is to perform crash recovery first, replaying all the WAL in
pg_xlog. Once that is done, we assume that the system is consistent, as in
normal crash recovery, and switch to archive recovery mode.

Per report from Kyotaro HORIGUCHI. In his scenario, recovery.conf was
created after "pg_ctl stop -m i". I'm not sure we need to support that exact
scenario, but we should support backing up using a filesystem snapshot,
which looks identical.

This issue goes back to at least 9.0, where hot standby was introduced and
we started to track when consistency is reached. In 9.1 and 9.2, we would
open up for hot standby too early, and queries could briefly see an
inconsistent state. But 9.2 made it more visible, as we started to PANIC if
we see a reference to a non-existing page during recovery, if we've already
reached consistency. This is a fairly big patch, so back-patch to 9.2 only,
where the issue is more visible. We can consider back-patching further after
this has received some more testing in 9.2 and master.
This commit is contained in:
Heikki Linnakangas 2013-02-22 11:43:04 +02:00
parent a730183926
commit abf5c5c9a4
1 changed files with 179 additions and 78 deletions

View File

@ -189,7 +189,18 @@ static bool LocalHotStandbyActive = false;
*/
static int LocalXLogInsertAllowed = -1;
/* Are we recovering using offline XLOG archives? (only valid in the startup process) */
/*
* When ArchiveRecoveryRequested is set, archive recovery was requested,
* ie. recovery.conf file was present. When InArchiveRecovery is set, we are
* currently recovering using offline XLOG archives. These variables are only
* valid in the startup process.
*
* When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
* currently performing crash recovery using only XLOG files in pg_xlog, but
* will switch to using offline XLOG archives as soon as we reach the end of
* WAL in pg_xlog.
*/
static bool ArchiveRecoveryRequested = false;
bool InArchiveRecovery = false;
/* Was the last xlog file restored from archive, or local? */
@ -207,10 +218,13 @@ static TimestampTz recoveryTargetTime;
static char *recoveryTargetName;
/* options taken from recovery.conf for XLOG streaming */
bool StandbyMode = false;
static bool StandbyModeRequested = false;
static char *PrimaryConnInfo = NULL;
static char *TriggerFile = NULL;
/* are we currently in standby mode? */
bool StandbyMode = false;
/* whether request for fast promotion has been made yet */
static bool fast_promote = false;
@ -3217,10 +3231,10 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
private->emode = emode;
private->randAccess = (RecPtr != InvalidXLogRecPtr);
/* This is the first try to read this page. */
/* This is the first attempt to read this page. */
lastSourceFailed = false;
do
for (;;)
{
char *errormsg;
@ -3229,8 +3243,6 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
EndRecPtr = xlogreader->EndRecPtr;
if (record == NULL)
{
lastSourceFailed = true;
if (readFile >= 0)
{
close(readFile);
@ -3247,22 +3259,16 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
ereport(emode_for_corrupt_record(emode,
RecPtr ? RecPtr : EndRecPtr),
(errmsg_internal("%s", errormsg) /* already translated */));
/* Give up, or retry if we're in standby mode. */
continue;
}
/*
* Check page TLI is one of the expected values.
*/
if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
{
char fname[MAXFNAMELEN];
XLogSegNo segno;
int32 offset;
lastSourceFailed = true;
XLByteToSeg(xlogreader->latestPagePtr, segno);
offset = xlogreader->latestPagePtr % XLogSegSize;
XLogFileName(fname, xlogreader->readPageTLI, segno);
@ -3273,11 +3279,73 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
fname,
offset)));
record = NULL;
continue;
}
} while (StandbyMode && record == NULL && !CheckForStandbyTrigger());
return record;
if (record)
{
/* Great, got a record */
return record;
}
else
{
/* No valid record available from this source */
lastSourceFailed = true;
/*
* If archive recovery was requested, but we were still doing crash
* recovery, switch to archive recovery and retry using the offline
* archive. We have now replayed all the valid WAL in pg_xlog, so
* we are presumably now consistent.
*
* We require that there's at least some valid WAL present in
* pg_xlog, however (!fetching_ckpt). We could recover using the WAL
* from the archive, even if pg_xlog is completely empty, but we'd
* have no idea how far we'd have to replay to reach consistency.
* So err on the safe side and give up.
*/
if (!InArchiveRecovery && ArchiveRecoveryRequested &&
!fetching_ckpt)
{
ereport(DEBUG1,
(errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
InArchiveRecovery = true;
if (StandbyModeRequested)
StandbyMode = true;
/* initialize minRecoveryPoint to this record */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
if (ControlFile->minRecoveryPoint < EndRecPtr)
{
ControlFile->minRecoveryPoint = EndRecPtr;
ControlFile->minRecoveryPointTLI = ThisTimeLineID;
}
/* update local copy */
minRecoveryPoint = ControlFile->minRecoveryPoint;
minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
UpdateControlFile();
LWLockRelease(ControlFileLock);
CheckRecoveryConsistency();
/*
* Before we retry, reset lastSourceFailed and currentSource
* so that we will check the archive next.
*/
lastSourceFailed = false;
currentSource = 0;
continue;
}
/* In standby mode, loop back to retry. Otherwise, give up. */
if (StandbyMode && !CheckForStandbyTrigger())
continue;
else
return NULL;
}
}
}
/*
@ -4213,7 +4281,7 @@ readRecoveryCommandFile(void)
}
else if (strcmp(item->name, "standby_mode") == 0)
{
if (!parse_bool(item->value, &StandbyMode))
if (!parse_bool(item->value, &StandbyModeRequested))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("parameter \"%s\" requires a Boolean value",
@ -4244,7 +4312,7 @@ readRecoveryCommandFile(void)
/*
* Check for compulsory parameters
*/
if (StandbyMode)
if (StandbyModeRequested)
{
if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
ereport(WARNING,
@ -4261,7 +4329,7 @@ readRecoveryCommandFile(void)
}
/* Enable fetching from archive recovery area */
InArchiveRecovery = true;
ArchiveRecoveryRequested = true;
/*
* If user specified recovery_target_timeline, validate it or compute the
@ -4271,6 +4339,11 @@ readRecoveryCommandFile(void)
*/
if (rtliGiven)
{
/*
* Temporarily set InArchiveRecovery, so that existsTimeLineHistory
* or findNewestTimeLine below will check the archive.
*/
InArchiveRecovery = true;
if (rtli)
{
/* Timeline 1 does not have a history file, all else should */
@ -4287,6 +4360,7 @@ readRecoveryCommandFile(void)
recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
recoveryTargetIsLatest = true;
}
InArchiveRecovery = false;
}
FreeConfigVariables(head);
@ -4850,9 +4924,9 @@ StartupXLOG(void)
archiveCleanupCommand ? archiveCleanupCommand : "",
sizeof(XLogCtl->archiveCleanupCommand));
if (InArchiveRecovery)
if (ArchiveRecoveryRequested)
{
if (StandbyMode)
if (StandbyModeRequested)
ereport(LOG,
(errmsg("entering standby mode")));
else if (recoveryTarget == RECOVERY_TARGET_XID)
@ -4892,7 +4966,7 @@ StartupXLOG(void)
* Take ownership of the wakeup latch if we're going to sleep during
* recovery.
*/
if (StandbyMode)
if (StandbyModeRequested)
OwnLatch(&XLogCtl->recoveryWakeupLatch);
/* Set up XLOG reader facility */
@ -4908,6 +4982,15 @@ StartupXLOG(void)
if (read_backup_label(&checkPointLoc, &backupEndRequired,
&backupFromStandby))
{
/*
* Archive recovery was requested, and thanks to the backup label file,
* we know how far we need to replay to reach consistency. Enter
* archive recovery directly.
*/
InArchiveRecovery = true;
if (StandbyModeRequested)
StandbyMode = true;
/*
* When a backup_label file is present, we want to roll forward from
* the checkpoint it identifies, rather than using pg_control.
@ -4948,6 +5031,33 @@ StartupXLOG(void)
}
else
{
/*
* It's possible that archive recovery was requested, but we don't
* know how far we need to replay the WAL before we reach consistency.
* This can happen for example if a base backup is taken from a running
* server using an atomic filesystem snapshot, without calling
* pg_start/stop_backup. Or if you just kill a running master server
* and put it into archive recovery by creating a recovery.conf file.
*
* Our strategy in that case is to perform crash recovery first,
* replaying all the WAL present in pg_xlog, and only enter archive
* recovery after that.
*
* But usually we already know how far we need to replay the WAL (up to
* minRecoveryPoint, up to backupEndPoint, or until we see an
* end-of-backup record), and we can enter archive recovery directly.
*/
if (ArchiveRecoveryRequested &&
(ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
ControlFile->backupEndRequired ||
ControlFile->backupEndPoint != InvalidXLogRecPtr ||
ControlFile->state == DB_SHUTDOWNED))
{
InArchiveRecovery = true;
if (StandbyModeRequested)
StandbyMode = true;
}
/*
* Get the last valid checkpoint record. If the latest one according
* to pg_control is broken, try the next-to-last one.
@ -5116,7 +5226,7 @@ StartupXLOG(void)
}
else if (ControlFile->state != DB_SHUTDOWNED)
InRecovery = true;
else if (InArchiveRecovery)
else if (ArchiveRecoveryRequested)
{
/* force recovery due to presence of recovery.conf */
InRecovery = true;
@ -5155,15 +5265,6 @@ StartupXLOG(void)
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = checkPointLoc;
ControlFile->checkPointCopy = checkPoint;
if (InArchiveRecovery)
{
/* initialize minRecoveryPoint if not set yet */
if (ControlFile->minRecoveryPoint < checkPoint.redo)
{
ControlFile->minRecoveryPoint = checkPoint.redo;
ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
}
}
/*
* Set backupStartPoint if we're starting recovery from a base backup.
@ -5243,7 +5344,7 @@ StartupXLOG(void)
* control file and we've established a recovery snapshot from a
* running-xacts WAL record.
*/
if (InArchiveRecovery && EnableHotStandby)
if (ArchiveRecoveryRequested && EnableHotStandby)
{
TransactionId *xids;
int nxids;
@ -5344,7 +5445,7 @@ StartupXLOG(void)
* process in addition to postmaster! Also, fsync requests are
* subsequently to be handled by the checkpointer, not locally.
*/
if (InArchiveRecovery && IsUnderPostmaster)
if (ArchiveRecoveryRequested && IsUnderPostmaster)
{
PublishStartupProcessInformation();
SetForwardFsyncRequests();
@ -5601,7 +5702,7 @@ StartupXLOG(void)
* We don't need the latch anymore. It's not strictly necessary to disown
* it, but let's do it for the sake of tidiness.
*/
if (StandbyMode)
if (StandbyModeRequested)
DisownLatch(&XLogCtl->recoveryWakeupLatch);
/*
@ -5646,7 +5747,7 @@ StartupXLOG(void)
* crashes while an online backup is in progress. We must not treat
* that as an error, or the database will refuse to start up.
*/
if (InArchiveRecovery || ControlFile->backupEndRequired)
if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
{
if (ControlFile->backupEndRequired)
ereport(FATAL,
@ -5677,10 +5778,12 @@ StartupXLOG(void)
* In a normal crash recovery, we can just extend the timeline we were in.
*/
PrevTimeLineID = ThisTimeLineID;
if (InArchiveRecovery)
if (ArchiveRecoveryRequested)
{
char reason[200];
Assert(InArchiveRecovery);
ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
ereport(LOG,
(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
@ -5720,7 +5823,7 @@ StartupXLOG(void)
* that we also have a copy of the last block of the old WAL in readBuf;
* we will use that below.)
*/
if (InArchiveRecovery)
if (ArchiveRecoveryRequested)
exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
/*
@ -7706,7 +7809,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
* record, the backup was canceled and the end-of-backup record will
* never arrive.
*/
if (InArchiveRecovery &&
if (ArchiveRecoveryRequested &&
!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
ereport(PANIC,
@ -9118,7 +9221,7 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
* Request a restartpoint if we've replayed too much xlog since the
* last one.
*/
if (StandbyMode && bgwriterLaunched)
if (StandbyModeRequested && bgwriterLaunched)
{
if (XLogCheckpointNeeded(readSegNo))
{
@ -9141,27 +9244,18 @@ retry:
(readSource == XLOG_FROM_STREAM &&
receivedUpto < targetPagePtr + reqLen))
{
if (StandbyMode)
if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
private->randAccess,
private->fetching_ckpt,
targetRecPtr))
{
if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
private->randAccess,
private->fetching_ckpt,
targetRecPtr))
goto triggered;
}
/* In archive or crash recovery. */
else if (readFile < 0)
{
int source;
if (readFile >= 0)
close(readFile);
readFile = -1;
readLen = 0;
readSource = 0;
if (InArchiveRecovery)
source = XLOG_FROM_ANY;
else
source = XLOG_FROM_PG_XLOG;
readFile = XLogFileReadAnyTLI(readSegNo, emode, source);
if (readFile < 0)
return -1;
return -1;
}
}
@ -9234,22 +9328,16 @@ next_record_is_invalid:
goto retry;
else
return -1;
triggered:
if (readFile >= 0)
close(readFile);
readFile = -1;
readLen = 0;
readSource = 0;
return -1;
}
/*
* In standby mode, wait for WAL at position 'RecPtr' to become available, either
* via restore_command succeeding to restore the segment, or via walreceiver
* having streamed the record (or via someone copying the segment directly to
* pg_xlog, but that is not documented or recommended).
* Open the WAL segment containing WAL position 'RecPtr'.
*
* The segment can be fetched via restore_command, or via walreceiver having
* streamed the record, or it can already be present in pg_xlog. Checking
* pg_xlog is mainly for crash recovery, but it will be polled in standby mode
* too, in case someone copies a new segment directly to pg_xlog. That is not
* documented or recommended, though.
*
* If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
* prepare to read WAL starting from RedoStartLSN after this.
@ -9259,6 +9347,10 @@ triggered:
* 'tliRecPtr' is the position of the WAL record we're interested in. It is
* used to decide which timeline to stream the requested WAL from.
*
* If the record is not immediately available, the function returns false
* if we're not in standby mode. In standby mode, waits for it to become
* available.
*
* When the requested record becomes available, the function opens the file
* containing it (if not open already), and returns true. When end of standby
* mode is triggered by the user, and there is no more WAL available, returns
@ -9292,7 +9384,9 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
* part of advancing to the next state.
*-------
*/
if (currentSource == 0)
if (!InArchiveRecovery)
currentSource = XLOG_FROM_PG_XLOG;
else if (currentSource == 0)
currentSource = XLOG_FROM_ARCHIVE;
for (;;)
@ -9307,7 +9401,6 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
*/
if (lastSourceFailed)
{
switch (currentSource)
{
case XLOG_FROM_ARCHIVE:
@ -9321,12 +9414,19 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
* file, we still finish replaying as much as we can from
* archive and pg_xlog before failover.
*/
if (CheckForStandbyTrigger())
if (StandbyMode && CheckForStandbyTrigger())
{
ShutdownWalRcv();
return false;
}
/*
* Not in standby mode, and we've now tried the archive and
* pg_xlog.
*/
if (!StandbyMode)
return false;
/*
* If primary_conninfo is set, launch walreceiver to try to
* stream the missing WAL.
@ -9431,7 +9531,8 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
* in the archive over ones in pg_xlog, so try the next file
* again from the archive first.
*/
currentSource = XLOG_FROM_ARCHIVE;
if (InArchiveRecovery)
currentSource = XLOG_FROM_ARCHIVE;
}
if (currentSource != oldSource)
@ -9584,9 +9685,9 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
* process.
*/
HandleStartupProcInterrupts();
}
} while (StandbyMode);
return false; /* not reached */
return false;
}
/*