From 9c9b619473cd9d2f3f3181bd9cd6862c64abb4d3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 3 Apr 2007 16:34:36 +0000 Subject: [PATCH] Remove the CheckpointStartLock in favor of having backends show whether they are in their commit critical sections via flags in the ProcArray. Checkpoint can watch the ProcArray to determine when it's safe to proceed. This is a considerably better solution to the original problem of race conditions between checkpoint and transaction commit: it speeds up commit, since there's one less lock to fool with, and it prevents the problem of checkpoint being delayed indefinitely when there's a constant flow of commits. Heikki, with some kibitzing from Tom. --- src/backend/access/transam/twophase.c | 24 +++---- src/backend/access/transam/xact.c | 48 +++++++------- src/backend/access/transam/xlog.c | 53 +++++++++++---- src/backend/storage/ipc/procarray.c | 94 ++++++++++++++++++++++++++- src/backend/storage/lmgr/proc.c | 4 +- src/include/storage/lwlock.h | 3 +- src/include/storage/proc.h | 4 +- src/include/storage/procarray.h | 5 +- 8 files changed, 182 insertions(+), 53 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 55f6b88bcd..7d6680485b 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.28 2007/02/13 19:39:42 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.29 2007/04/03 16:34:35 tgl Exp $ * * NOTES * Each global transaction is associated with a global transaction @@ -279,6 +279,7 @@ MarkAsPreparing(TransactionId xid, const char *gid, gxact->proc.pid = 0; gxact->proc.databaseId = databaseid; gxact->proc.roleId = owner; + gxact->proc.inCommit = false; gxact->proc.inVacuum = false; gxact->proc.isAutovacuum = false; gxact->proc.lwWaiting = false; @@ -934,18 +935,18 @@ EndPrepare(GlobalTransaction gxact) * odds of a PANIC actually occurring should be very tiny given that we * were able to write the bogus CRC above. * - * We have to lock out checkpoint start here, too; otherwise a checkpoint + * We have to set inCommit here, too; otherwise a checkpoint * starting immediately after the WAL record is inserted could complete * without fsync'ing our state file. (This is essentially the same kind * of race condition as the COMMIT-to-clog-write case that - * RecordTransactionCommit uses CheckpointStartLock for; see notes there.) + * RecordTransactionCommit uses inCommit for; see notes there.) * * We save the PREPARE record's location in the gxact for later use by * CheckPointTwoPhase. */ START_CRIT_SECTION(); - LWLockAcquire(CheckpointStartLock, LW_SHARED); + MyProc->inCommit = true; gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE, records.head); @@ -982,10 +983,11 @@ EndPrepare(GlobalTransaction gxact) MarkAsPrepared(gxact); /* - * Now we can release the checkpoint start lock: a checkpoint starting - * after this will certainly see the gxact as a candidate for fsyncing. + * Now we can mark ourselves as out of the commit critical section: + * a checkpoint starting after this will certainly see the gxact as a + * candidate for fsyncing. */ - LWLockRelease(CheckpointStartLock); + MyProc->inCommit = false; END_CRIT_SECTION(); @@ -1649,7 +1651,7 @@ RecoverPreparedTransactions(void) * RecordTransactionCommitPrepared * * This is basically the same as RecordTransactionCommit: in particular, - * we must take the CheckpointStartLock to avoid a race condition. + * we must set the inCommit flag to avoid a race condition. * * We know the transaction made at least one XLOG entry (its PREPARE), * so it is never possible to optimize out the commit record. @@ -1669,7 +1671,7 @@ RecordTransactionCommitPrepared(TransactionId xid, START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ - LWLockAcquire(CheckpointStartLock, LW_SHARED); + MyProc->inCommit = true; /* Emit the XLOG commit record */ xlrec.xid = xid; @@ -1713,8 +1715,8 @@ RecordTransactionCommitPrepared(TransactionId xid, /* to avoid race conditions, the parent must commit first */ TransactionIdCommitTree(nchildren, children); - /* Checkpoint is allowed again */ - LWLockRelease(CheckpointStartLock); + /* Checkpoint can proceed now */ + MyProc->inCommit = false; END_CRIT_SECTION(); } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index f8058a8f5e..d47dae0c41 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.238 2007/03/22 19:55:04 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.239 2007/04/03 16:34:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -716,35 +716,37 @@ RecordTransactionCommit(void) START_CRIT_SECTION(); - /* - * If our transaction made any transaction-controlled XLOG entries, we - * need to lock out checkpoint start between writing our XLOG record - * and updating pg_clog. Otherwise it is possible for the checkpoint - * to set REDO after the XLOG record but fail to flush the pg_clog - * update to disk, leading to loss of the transaction commit if we - * crash a little later. Slightly klugy fix for problem discovered - * 2004-08-10. - * - * (If it made no transaction-controlled XLOG entries, its XID appears - * nowhere in permanent storage, so no one else will ever care if it - * committed; so it doesn't matter if we lose the commit flag.) - * - * Note we only need a shared lock. - */ - madeTCentries = (MyLastRecPtr.xrecoff != 0); - if (madeTCentries) - LWLockAcquire(CheckpointStartLock, LW_SHARED); - /* * We only need to log the commit in XLOG if the transaction made any * transaction-controlled XLOG entries or will delete files. */ + madeTCentries = (MyLastRecPtr.xrecoff != 0); if (madeTCentries || nrels > 0) { XLogRecData rdata[3]; int lastrdata = 0; xl_xact_commit xlrec; + /* + * Mark ourselves as within our "commit critical section". This + * forces any concurrent checkpoint to wait until we've updated + * pg_clog. Without this, it is possible for the checkpoint to + * set REDO after the XLOG record but fail to flush the pg_clog + * update to disk, leading to loss of the transaction commit if + * the system crashes a little later. + * + * Note: we could, but don't bother to, set this flag in + * RecordTransactionAbort. That's because loss of a transaction + * abort is noncritical; the presumption would be that it aborted, + * anyway. + * + * It's safe to change the inCommit flag of our own backend + * without holding the ProcArrayLock, since we're the only one + * modifying it. This makes checkpoint's determination of which + * xacts are inCommit a bit fuzzy, but it doesn't matter. + */ + MyProc->inCommit = true; + xlrec.xtime = time(NULL); xlrec.nrels = nrels; xlrec.nsubxacts = nchildren; @@ -825,9 +827,8 @@ RecordTransactionCommit(void) TransactionIdCommitTree(nchildren, children); } - /* Unlock checkpoint lock if we acquired it */ - if (madeTCentries) - LWLockRelease(CheckpointStartLock); + /* Checkpoint can proceed now */ + MyProc->inCommit = false; END_CRIT_SECTION(); } @@ -1961,6 +1962,7 @@ AbortTransaction(void) MyProc->xid = InvalidTransactionId; MyProc->xmin = InvalidTransactionId; MyProc->inVacuum = false; /* must be cleared with xid/xmin */ + MyProc->inCommit = false; /* be sure this gets cleared */ /* Clear the subtransaction-XID cache too while holding the lock */ MyProc->subxids.nxids = 0; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f3e02dd631..f70a3d5dd0 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.266 2007/04/03 04:14:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.267 2007/04/03 16:34:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -5366,6 +5366,8 @@ CreateCheckPoint(bool shutdown, bool force) int nsegsadded = 0; int nsegsremoved = 0; int nsegsrecycled = 0; + TransactionId *inCommitXids; + int nInCommit; /* * Acquire CheckpointLock to ensure only one checkpoint happens at a time. @@ -5392,14 +5394,9 @@ CreateCheckPoint(bool shutdown, bool force) checkPoint.time = time(NULL); /* - * We must hold CheckpointStartLock while determining the checkpoint REDO - * pointer. This ensures that any concurrent transaction commits will be - * either not yet logged, or logged and recorded in pg_clog. See notes in - * RecordTransactionCommit(). + * We must hold WALInsertLock while examining insert state to determine + * the checkpoint REDO pointer. */ - LWLockAcquire(CheckpointStartLock, LW_EXCLUSIVE); - - /* And we need WALInsertLock too */ LWLockAcquire(WALInsertLock, LW_EXCLUSIVE); /* @@ -5431,7 +5428,6 @@ CreateCheckPoint(bool shutdown, bool force) ControlFile->checkPointCopy.redo.xrecoff) { LWLockRelease(WALInsertLock); - LWLockRelease(CheckpointStartLock); LWLockRelease(CheckpointLock); END_CRIT_SECTION(); return; @@ -5476,17 +5472,48 @@ CreateCheckPoint(bool shutdown, bool force) } /* - * Now we can release insert lock and checkpoint start lock, allowing - * other xacts to proceed even while we are flushing disk buffers. + * Now we can release WAL insert lock, allowing other xacts to proceed + * while we are flushing disk buffers. */ LWLockRelease(WALInsertLock); - LWLockRelease(CheckpointStartLock); - if (!shutdown) ereport(DEBUG2, (errmsg("checkpoint starting"))); + /* + * Before flushing data, we must wait for any transactions that are + * currently in their commit critical sections. If an xact inserted its + * commit record into XLOG just before the REDO point, then a crash + * restart from the REDO point would not replay that record, which means + * that our flushing had better include the xact's update of pg_clog. So + * we wait till he's out of his commit critical section before proceeding. + * See notes in RecordTransactionCommit(). + * + * Because we've already released WALInsertLock, this test is a bit fuzzy: + * it is possible that we will wait for xacts we didn't really need to + * wait for. But the delay should be short and it seems better to make + * checkpoint take a bit longer than to hold locks longer than necessary. + * (In fact, the whole reason we have this issue is that xact.c does + * commit record XLOG insertion and clog update as two separate steps + * protected by different locks, but again that seems best on grounds + * of minimizing lock contention.) + * + * A transaction that has not yet set inCommit when we look cannot be + * at risk, since he's not inserted his commit record yet; and one that's + * already cleared it is not at risk either, since he's done fixing clog + * and we will correctly flush the update below. So we cannot miss any + * xacts we need to wait for. + */ + nInCommit = GetTransactionsInCommit(&inCommitXids); + if (nInCommit > 0) + { + do { + pg_usleep(10000L); /* wait for 10 msec */ + } while (HaveTransactionsInCommit(inCommitXids, nInCommit)); + } + pfree(inCommitXids); + /* * Get the other info we need for the checkpoint record. */ diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 12e61c8926..22d031f9eb 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -23,7 +23,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.23 2007/03/25 19:45:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.24 2007/04/03 16:34:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -739,6 +739,98 @@ restart: return (num != 0); } +/* + * GetTransactionsInCommit -- Get the XIDs of transactions that are committing + * + * Constructs an array of XIDs of transactions that are currently in commit + * critical sections, as shown by having inCommit set in their PGPROC entries. + * + * *xids_p is set to a palloc'd array that should be freed by the caller. + * The return value is the number of valid entries. + * + * Note that because backends set or clear inCommit without holding any lock, + * the result is somewhat indeterminate, but we don't really care. Even in + * a multiprocessor with delayed writes to shared memory, it should be certain + * that setting of inCommit will propagate to shared memory when the backend + * takes the WALInsertLock, so we cannot fail to see an xact as inCommit if + * it's already inserted its commit record. Whether it takes a little while + * for clearing of inCommit to propagate is unimportant for correctness. + */ +int +GetTransactionsInCommit(TransactionId **xids_p) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId *xids; + int nxids; + int index; + + xids = (TransactionId *) palloc(arrayP->maxProcs * sizeof(TransactionId)); + nxids = 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + PGPROC *proc = arrayP->procs[index]; + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId pxid = proc->xid; + + if (proc->inCommit && TransactionIdIsValid(pxid)) + xids[nxids++] = pxid; + } + + LWLockRelease(ProcArrayLock); + + *xids_p = xids; + return nxids; +} + +/* + * HaveTransactionsInCommit -- Are any of the specified XIDs in commit? + * + * This is used with the results of GetTransactionsInCommit to see if any + * of the specified XIDs are still in their commit critical sections. + * + * Note: this is O(N^2) in the number of xacts that are/were in commit, but + * those numbers should be small enough for it not to be a problem. + */ +bool +HaveTransactionsInCommit(TransactionId *xids, int nxids) +{ + bool result = false; + ProcArrayStruct *arrayP = procArray; + int index; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + PGPROC *proc = arrayP->procs[index]; + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId pxid = proc->xid; + + if (proc->inCommit && TransactionIdIsValid(pxid)) + { + int i; + + for (i = 0; i < nxids; i++) + { + if (xids[i] == pxid) + { + result = true; + break; + } + } + if (result) + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; +} + /* * BackendPidGetProc -- get a backend's PGPROC given its PID * diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 87a310aabe..2691a50582 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.186 2007/03/07 13:35:03 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.187 2007/04/03 16:34:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -259,6 +259,7 @@ InitProcess(void) /* databaseId and roleId will be filled in later */ MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; + MyProc->inCommit = false; MyProc->inVacuum = false; MyProc->isAutovacuum = IsAutoVacuumWorkerProcess(); MyProc->lwWaiting = false; @@ -392,6 +393,7 @@ InitAuxiliaryProcess(void) MyProc->xmin = InvalidTransactionId; MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; + MyProc->inCommit = false; MyProc->inVacuum = false; MyProc->isAutovacuum = IsAutoVacuumLauncherProcess(); /* is this needed? */ MyProc->lwWaiting = false; diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index b4503f9213..c47256a159 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.34 2007/02/15 23:23:23 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.35 2007/04/03 16:34:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -49,7 +49,6 @@ typedef enum LWLockId WALWriteLock, ControlFileLock, CheckpointLock, - CheckpointStartLock, CLogControlLock, SubtransControlLock, MultiXactGenLock, diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 2b20eda828..772cf52cdf 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.96 2007/03/07 13:35:03 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.97 2007/04/03 16:34:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -74,6 +74,8 @@ struct PGPROC Oid databaseId; /* OID of database this backend is using */ Oid roleId; /* OID of role using this backend */ + bool inCommit; /* true if within commit critical section */ + bool inVacuum; /* true if current xact is a LAZY VACUUM */ bool isAutovacuum; /* true if it's autovacuum */ diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 93e82e4368..cd2df66380 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.12 2007/01/16 13:28:57 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.13 2007/04/03 16:34:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -26,6 +26,9 @@ extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsActive(TransactionId xid); extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum); +extern int GetTransactionsInCommit(TransactionId **xids_p); +extern bool HaveTransactionsInCommit(TransactionId *xids, int nxids); + extern PGPROC *BackendPidGetProc(int pid); extern int BackendXidGetPid(TransactionId xid); extern bool IsBackendPid(int pid);