Fix bugs in the hot standby known-assigned-xids tracking logic. If there's

an old transaction running in the master, and a lot of transactions have
started and finished since, and a WAL-record is written in the gap between
the creating the running-xacts snapshot and WAL-logging it, recovery will fail
with "too many KnownAssignedXids" error. This bug was reported by
Joachim Wieland on Nov 19th.

In the same scenario, when fewer transactions have started so that all the
xids fit in KnownAssignedXids despite the first bug, a more serious bug
arises. We incorrectly initialize the clog code with the oldest still running
transaction, and when we see the WAL record belonging to a transaction with
an XID larger than one that committed already before the checkpoint we're
recovering from, we zero the clog page containing the already committed
transaction, leading to data loss.

In hindsight, trying to track xids in the known-assigned-xids array before
seeing the running-xacts record was too complicated. To fix that, hold
XidGenLock while the running-xacts snapshot is taken and WAL-logged. That
ensures that no transaction can begin or end in that gap, so that in recvoery
we know that the snapshot contains all transactions running at that point in
WAL.
This commit is contained in:
Heikki Linnakangas 2010-12-07 09:23:30 +01:00
parent 8b56928097
commit 5a031a5556
4 changed files with 47 additions and 101 deletions

View File

@ -5987,8 +5987,6 @@ StartupXLOG(void)
StartupSUBTRANS(oldestActiveXID); StartupSUBTRANS(oldestActiveXID);
StartupMultiXact(); StartupMultiXact();
ProcArrayInitRecoveryInfo(oldestActiveXID);
/* /*
* If we're beginning at a shutdown checkpoint, we know that * If we're beginning at a shutdown checkpoint, we know that
* nothing was running on the master at this point. So fake-up an * nothing was running on the master at this point. So fake-up an

View File

@ -434,19 +434,6 @@ ProcArrayClearTransaction(PGPROC *proc)
proc->subxids.overflowed = false; proc->subxids.overflowed = false;
} }
/*
* ProcArrayInitRecoveryInfo
*
* When trying to assemble our snapshot we only care about xids after this value.
* See comments for LogStandbySnapshot().
*/
void
ProcArrayInitRecoveryInfo(TransactionId oldestActiveXid)
{
latestObservedXid = oldestActiveXid;
TransactionIdRetreat(latestObservedXid);
}
/* /*
* ProcArrayApplyRecoveryInfo -- apply recovery info about xids * ProcArrayApplyRecoveryInfo -- apply recovery info about xids
* *
@ -523,11 +510,9 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
*/ */
/* /*
* Remove all xids except xids later than the snapshot. We don't know * Release any locks belonging to old transactions that are not
* exactly which ones that is until precisely now, so that is why we allow * running according to the running-xacts record.
* xids to be added only to remove most of them again here.
*/ */
ExpireOldKnownAssignedTransactionIds(running->nextXid);
StandbyReleaseOldLocks(running->nextXid); StandbyReleaseOldLocks(running->nextXid);
/* /*
@ -536,9 +521,8 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
/* /*
* Combine the running xact data with already known xids, if any exist. * KnownAssignedXids is sorted so we cannot just add the xids, we have to
* KnownAssignedXids is sorted so we cannot just add new xids, we have to * sort them first.
* combine them first, sort them and then re-add to KnownAssignedXids.
* *
* Some of the new xids are top-level xids and some are subtransactions. * Some of the new xids are top-level xids and some are subtransactions.
* We don't call SubtransSetParent because it doesn't matter yet. If we * We don't call SubtransSetParent because it doesn't matter yet. If we
@ -547,51 +531,32 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
* xids to subtrans. If RunningXacts is overflowed then we don't have * xids to subtrans. If RunningXacts is overflowed then we don't have
* enough information to correctly update subtrans anyway. * enough information to correctly update subtrans anyway.
*/ */
Assert(procArray->numKnownAssignedXids == 0);
/* /*
* Allocate a temporary array so we can combine xids. The total of both * Allocate a temporary array to avoid modifying the array passed as
* arrays should never normally exceed TOTAL_MAX_CACHED_SUBXIDS. * argument.
*/ */
xids = palloc(sizeof(TransactionId) * TOTAL_MAX_CACHED_SUBXIDS); xids = palloc(sizeof(TransactionId) * running->xcnt);
/* /*
* Get the remaining KnownAssignedXids. In most cases there won't be any * Add to the temp array any xids which have not already completed.
* at all since this exists only to catch a theoretical race condition.
*/
nxids = KnownAssignedXidsGet(xids, InvalidTransactionId);
if (nxids > 0)
KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
/*
* Now we have a copy of any KnownAssignedXids we can zero the array
* before we re-insert combined snapshot.
*/
KnownAssignedXidsRemovePreceding(InvalidTransactionId);
/*
* Add to the temp array any xids which have not already completed, taking
* care not to overflow in extreme cases.
*/ */
nxids = 0;
for (i = 0; i < running->xcnt; i++) for (i = 0; i < running->xcnt; i++)
{ {
TransactionId xid = running->xids[i]; TransactionId xid = running->xids[i];
/* /*
* The running-xacts snapshot can contain xids that were running at * The running-xacts snapshot can contain xids that were still visible
* the time of the snapshot, yet complete before the snapshot was * in the procarray when the snapshot was taken, but were already
* written to WAL. They're running now, so ignore them. * WAL-logged as completed. They're not running anymore, so ignore
* them.
*/ */
if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
continue; continue;
xids[nxids++] = xid; xids[nxids++] = xid;
/*
* Test for overflow only after we have filtered out already complete
* transactions.
*/
if (nxids > TOTAL_MAX_CACHED_SUBXIDS)
elog(ERROR, "too many xids to add into KnownAssignedXids");
} }
if (nxids > 0) if (nxids > 0)
@ -602,20 +567,11 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
*/ */
qsort(xids, nxids, sizeof(TransactionId), xidComparator); qsort(xids, nxids, sizeof(TransactionId), xidComparator);
/*
* Re-initialise latestObservedXid to the highest xid we've seen.
*/
latestObservedXid = xids[nxids - 1];
/* /*
* Add the sorted snapshot into KnownAssignedXids * Add the sorted snapshot into KnownAssignedXids
*/ */
for (i = 0; i < nxids; i++) for (i = 0; i < nxids; i++)
{ KnownAssignedXidsAdd(xids[i], xids[i], true);
TransactionId xid = xids[i];
KnownAssignedXidsAdd(xid, xid, true);
}
KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
} }
@ -623,52 +579,41 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running)
pfree(xids); pfree(xids);
/* /*
* Now we've got the running xids we need to set the global values thare * Now we've got the running xids we need to set the global values that
* used to track snapshots as they evolve further * are used to track snapshots as they evolve further.
* *
* * latestCompletedXid which will be the xmax for snapshots * * - latestCompletedXid which will be the xmax for snapshots
* lastOverflowedXid which shows whether snapshots overflow * nextXid * - lastOverflowedXid which shows whether snapshots overflow
* - nextXid
* *
* If the snapshot overflowed, then we still initialise with what we know, * If the snapshot overflowed, then we still initialise with what we know,
* but the recovery snapshot isn't fully valid yet because we know there * but the recovery snapshot isn't fully valid yet because we know there
* are some subxids missing. We don't know the specific subxids that are * are some subxids missing. We don't know the specific subxids that are
* missing, so conservatively assume the last one is latestObservedXid. * missing, so conservatively assume the last one is latestObservedXid.
* If no missing subxids, try to clear lastOverflowedXid.
*
* If the snapshot didn't overflow it's still possible that an overflow
* occurred in the gap between taking snapshot and logging record, so we
* also need to check if lastOverflowedXid is already ahead of us.
*/ */
latestObservedXid = running->nextXid;
TransactionIdRetreat(latestObservedXid);
if (running->subxid_overflow) if (running->subxid_overflow)
{ {
standbyState = STANDBY_SNAPSHOT_PENDING; standbyState = STANDBY_SNAPSHOT_PENDING;
standbySnapshotPendingXmin = latestObservedXid; standbySnapshotPendingXmin = latestObservedXid;
if (TransactionIdFollows(latestObservedXid,
procArray->lastOverflowedXid))
procArray->lastOverflowedXid = latestObservedXid; procArray->lastOverflowedXid = latestObservedXid;
} }
else if (TransactionIdFollows(procArray->lastOverflowedXid,
latestObservedXid))
{
standbyState = STANDBY_SNAPSHOT_PENDING;
standbySnapshotPendingXmin = procArray->lastOverflowedXid;
}
else else
{ {
standbyState = STANDBY_SNAPSHOT_READY; standbyState = STANDBY_SNAPSHOT_READY;
standbySnapshotPendingXmin = InvalidTransactionId; standbySnapshotPendingXmin = InvalidTransactionId;
if (TransactionIdFollows(running->oldestRunningXid,
procArray->lastOverflowedXid))
procArray->lastOverflowedXid = InvalidTransactionId; procArray->lastOverflowedXid = InvalidTransactionId;
} }
/* /*
* If a transaction completed in the gap between taking and logging the * If a transaction wrote a commit record in the gap between taking and
* snapshot then latestCompletedXid may already be higher than the value * logging the snapshot then latestCompletedXid may already be higher
* from the snapshot, so check before we use the incoming value. * than the value from the snapshot, so check before we use the incoming
* value.
*/ */
if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
running->latestCompletedXid)) running->latestCompletedXid))
@ -1407,6 +1352,10 @@ GetSnapshotData(Snapshot snapshot)
* Similar to GetSnapshotData but returns more information. We include * Similar to GetSnapshotData but returns more information. We include
* all PGPROCs with an assigned TransactionId, even VACUUM processes. * all PGPROCs with an assigned TransactionId, even VACUUM processes.
* *
* We acquire XidGenLock, but the caller is responsible for releasing it.
* This ensures that no new XIDs enter the proc array until the caller has
* WAL-logged this snapshot, and releases the lock.
*
* The returned data structure is statically allocated; caller should not * The returned data structure is statically allocated; caller should not
* modify it, and must not assume it is valid past the next call. * modify it, and must not assume it is valid past the next call.
* *
@ -1526,7 +1475,7 @@ GetRunningTransactionData(void)
CurrentRunningXacts->oldestRunningXid = oldestRunningXid; CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
CurrentRunningXacts->latestCompletedXid = latestCompletedXid; CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
LWLockRelease(XidGenLock); /* We don't release XidGenLock here, the caller is responsible for that */
LWLockRelease(ProcArrayLock); LWLockRelease(ProcArrayLock);
Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
@ -2337,10 +2286,8 @@ DisplayXidCache(void)
* unobserved XIDs. * unobserved XIDs.
* *
* RecordKnownAssignedTransactionIds() should be run for *every* WAL record * RecordKnownAssignedTransactionIds() should be run for *every* WAL record
* type apart from XLOG_RUNNING_XACTS (since that initialises the first * associated with a transaction. Must be called for each record after we
* snapshot so that RecordKnownAssignedTransactionIds() can be called). Must * have executed StartupCLOG() et al, since we must ExtendCLOG() etc..
* be called for each record after we have executed StartupCLOG() et al,
* since we must ExtendCLOG() etc..
* *
* Called during recovery in analogy with and in place of GetNewTransactionId() * Called during recovery in analogy with and in place of GetNewTransactionId()
*/ */
@ -2348,12 +2295,19 @@ void
RecordKnownAssignedTransactionIds(TransactionId xid) RecordKnownAssignedTransactionIds(TransactionId xid)
{ {
Assert(standbyState >= STANDBY_INITIALIZED); Assert(standbyState >= STANDBY_INITIALIZED);
Assert(TransactionIdIsValid(latestObservedXid));
Assert(TransactionIdIsValid(xid)); Assert(TransactionIdIsValid(xid));
elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u", elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u",
xid, latestObservedXid); xid, latestObservedXid);
/*
* If the KnownAssignedXids machinery isn't up yet, do nothing.
*/
if (standbyState <= STANDBY_INITIALIZED)
return;
Assert(TransactionIdIsValid(latestObservedXid));
/* /*
* When a newly observed xid arrives, it is frequently the case that it is * When a newly observed xid arrives, it is frequently the case that it is
* *not* the next xid in sequence. When this occurs, we must treat the * *not* the next xid in sequence. When this occurs, we must treat the

View File

@ -671,7 +671,7 @@ StandbyReleaseAllLocks(void)
/* /*
* StandbyReleaseOldLocks * StandbyReleaseOldLocks
* Release standby locks held by XIDs < removeXid, as long * Release standby locks held by XIDs < removeXid, as long
* as their not prepared transactions. * as they're not prepared transactions.
*/ */
void void
StandbyReleaseOldLocks(TransactionId removeXid) StandbyReleaseOldLocks(TransactionId removeXid)
@ -848,14 +848,9 @@ LogStandbySnapshot(TransactionId *oldestActiveXid, TransactionId *nextXid)
* record we write, because standby will open up when it sees this. * record we write, because standby will open up when it sees this.
*/ */
running = GetRunningTransactionData(); running = GetRunningTransactionData();
/*
* The gap between GetRunningTransactionData() and
* LogCurrentRunningXacts() is what most of the fuss is about here, so
* artifically extending this interval is a great way to test the little
* used parts of the code.
*/
LogCurrentRunningXacts(running); LogCurrentRunningXacts(running);
/* GetRunningTransactionData() acquired XidGenLock, we must release it */
LWLockRelease(XidGenLock);
*oldestActiveXid = running->oldestRunningXid; *oldestActiveXid = running->oldestRunningXid;
*nextXid = running->nextXid; *nextXid = running->nextXid;

View File

@ -28,7 +28,6 @@ extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid); extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid);
extern void ProcArrayClearTransaction(PGPROC *proc); extern void ProcArrayClearTransaction(PGPROC *proc);
extern void ProcArrayInitRecoveryInfo(TransactionId oldestActiveXid);
extern void ProcArrayApplyRecoveryInfo(RunningTransactions running); extern void ProcArrayApplyRecoveryInfo(RunningTransactions running);
extern void ProcArrayApplyXidAssignment(TransactionId topxid, extern void ProcArrayApplyXidAssignment(TransactionId topxid,
int nsubxids, TransactionId *subxids); int nsubxids, TransactionId *subxids);