Fix corner-case failure to detect improper timeline switch.

rescanLatestTimeLine() contains a guard against switching to
a timeline that forked off from the current one prior to the
current recovery point, but that guard does not work if the
timeline switch occurs before the first WAL record (which must
be the checkpoint record) is read. Without this patch, an
improper timeline switch is therefore possible in such cases.

This happens because rescanLatestTimeLine() relies on the global
variable EndRecPtr to understand the current position of WAL
replay. However, EndRecPtr at this point in the code contains
the endpoint of the last-replayed record, not the startpoint or
endpoint of the record being replayed now. Thus, before any
records have been replayed, it's zero, which causes the sanity
check to always pass.

To fix, pass down the correct replay LSN explicitly. The
EndRecPtr value we want is the one from the xlogreader, which
will be the starting position of the record we're about to
try to read, rather than the global variable, which is the
ending position of the last record we successfully read.
They're usually the same, but not in the corner case described
here.

No back-patch, because in v14 and earlier branches, we were using
the wrong TLI here as well as the wrong LSN. In master, that was
fixed by commit 4a92a1c3d1, but
that and its prerequisite patches are too invasive to
back-patch for such a minor issue.

Patch by me, reviewed by Amul Sul.

Discussion: http://postgr.es/m/CA+Tgmoao96EuNeSPd+hspRKcsCddu=b1h-QNRuKfY8VmfNQdfg@mail.gmail.com
This commit is contained in:
Robert Haas 2021-11-24 08:13:10 -05:00
parent f79962d826
commit e7ea2fa342
1 changed files with 16 additions and 9 deletions

View File

@ -924,7 +924,8 @@ static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
bool fetching_ckpt, XLogRecPtr tliRecPtr,
TimeLineID replayTLI);
TimeLineID replayTLI,
XLogRecPtr replayLSN);
static void XLogShutdownWalRcv(void);
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
static void XLogFileClose(void);
@ -946,7 +947,8 @@ static bool PerformRecoveryXLogAction(void);
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
XLogRecPtr RecPtr, int whichChkpt, bool report,
TimeLineID replayTLI);
static bool rescanLatestTimeLine(TimeLineID replayTLI);
static bool rescanLatestTimeLine(TimeLineID replayTLI,
XLogRecPtr replayLSN);
static void InitControlFile(uint64 sysidentifier);
static void WriteControlFile(void);
static void ReadControlFile(void);
@ -4620,7 +4622,7 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
* one and returns 'true'.
*/
static bool
rescanLatestTimeLine(TimeLineID replayTLI)
rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
{
List *newExpectedTLEs;
bool found;
@ -4671,13 +4673,13 @@ rescanLatestTimeLine(TimeLineID replayTLI)
* next timeline was forked off from it *after* the current recovery
* location.
*/
if (currentTle->end < EndRecPtr)
if (currentTle->end < replayLSN)
{
ereport(LOG,
(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
newtarget,
replayTLI,
LSN_FORMAT_ARGS(EndRecPtr))));
LSN_FORMAT_ARGS(replayLSN))));
return false;
}
@ -12473,7 +12475,8 @@ retry:
private->randAccess,
private->fetching_ckpt,
targetRecPtr,
private->replayTLI))
private->replayTLI,
xlogreader->EndRecPtr))
{
if (readFile >= 0)
close(readFile);
@ -12626,6 +12629,10 @@ next_record_is_invalid:
* 'tliRecPtr' is the position of the WAL record we're interested in. It is
* used to decide which timeline to stream the requested WAL from.
*
* 'replayLSN' is the current replay LSN, so that if we scan for new
* timelines, we can reject a switch to a timeline that branched off before
* this point.
*
* If the record is not immediately available, the function returns false
* if we're not in standby mode. In standby mode, waits for it to become
* available.
@ -12638,7 +12645,7 @@ next_record_is_invalid:
static bool
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
bool fetching_ckpt, XLogRecPtr tliRecPtr,
TimeLineID replayTLI)
TimeLineID replayTLI, XLogRecPtr replayLSN)
{
static TimestampTz last_fail_time = 0;
TimestampTz now;
@ -12761,7 +12768,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
*/
if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
{
if (rescanLatestTimeLine(replayTLI))
if (rescanLatestTimeLine(replayTLI, replayLSN))
{
currentSource = XLOG_FROM_ARCHIVE;
break;
@ -12888,7 +12895,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
*/
if (recoveryTargetTimeLineGoal ==
RECOVERY_TARGET_TIMELINE_LATEST)
rescanLatestTimeLine(replayTLI);
rescanLatestTimeLine(replayTLI, replayLSN);
startWalReceiver = true;
}