From bc0581f8fb8b52a497e7ea06454f276b99abb72b Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 18 Jul 2023 13:44:35 +0900 Subject: [PATCH] Fix recovery of 2PC transaction during crash recovery A crash in the middle of a checkpoint with some two-phase state data already flushed to disk by this checkpoint could cause a follow-up crash recovery to recover twice the same transaction, once from what has been found in pg_twophase/ at the beginning of recovery and a second time when replaying its corresponding record. This would lead to FATAL failures in the startup process during recovery, where the same transaction would have a state recovered twice instead of once: LOG: recovering prepared transaction 731 from shared memory LOG: recovering prepared transaction 731 from shared memory FATAL: lock ExclusiveLock on object 731/0/0 is already held This issue is fixed by skipping the addition of any 2PC state coming from a record whose equivalent 2PC state file has already been loaded in TwoPhaseState at the beginning of recovery by restoreTwoPhaseData(), which is OK as long as the system has not reached a consistent state. The timing to get a messed up recovery processing is very racy, and would very unlikely happen. The thread that has reported the issue has demonstrated the bug using injection points to force a PANIC in the middle of a checkpoint. Issue introduced in 728bd99, so backpatch all the way down. Reported-by: "suyu.cmj" Author: "suyu.cmj" Author: Michael Paquier Discussion: https://postgr.es/m/109e6994-b971-48cb-84f6-829646f18b4c.mengjuan.cmj@alibaba-inc.com Backpatch-through: 11 --- src/backend/access/transam/twophase.c | 33 +++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index d145836a79..9e8c1f8415 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -2509,6 +2509,39 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, * that it got added in the redo phase */ + /* + * In the event of a crash while a checkpoint was running, it may be + * possible that some two-phase data found its way to disk while its + * corresponding record needs to be replayed in the follow-up recovery. + * As the 2PC data was on disk, it has already been restored at the + * beginning of recovery with restoreTwoPhaseData(), so skip this record + * to avoid duplicates in TwoPhaseState. If a consistent state has been + * reached, the record is added to TwoPhaseState and it should have no + * corresponding file in pg_twophase. + */ + if (!XLogRecPtrIsInvalid(start_lsn)) + { + char path[MAXPGPATH]; + + TwoPhaseFilePath(path, hdr->xid); + + if (access(path, F_OK) == 0) + { + ereport(reachedConsistency ? ERROR : WARNING, + (errmsg("could not recover two-phase state file for transaction %u", + hdr->xid), + errdetail("Two-phase state file has been found in WAL record %X/%X, but this transaction has already been restored from disk.", + (uint32) (start_lsn >> 32), + (uint32) start_lsn))); + return; + } + + if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access file \"%s\": %m", path))); + } + /* Get a free gxact from the freelist */ if (TwoPhaseState->freeGXacts == NULL) ereport(ERROR,