diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 55f6b88bcd..7d6680485b 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.28 2007/02/13 19:39:42 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.29 2007/04/03 16:34:35 tgl Exp $ * * NOTES * Each global transaction is associated with a global transaction @@ -279,6 +279,7 @@ MarkAsPreparing(TransactionId xid, const char *gid, gxact->proc.pid = 0; gxact->proc.databaseId = databaseid; gxact->proc.roleId = owner; + gxact->proc.inCommit = false; gxact->proc.inVacuum = false; gxact->proc.isAutovacuum = false; gxact->proc.lwWaiting = false; @@ -934,18 +935,18 @@ EndPrepare(GlobalTransaction gxact) * odds of a PANIC actually occurring should be very tiny given that we * were able to write the bogus CRC above. * - * We have to lock out checkpoint start here, too; otherwise a checkpoint + * We have to set inCommit here, too; otherwise a checkpoint * starting immediately after the WAL record is inserted could complete * without fsync'ing our state file. (This is essentially the same kind * of race condition as the COMMIT-to-clog-write case that - * RecordTransactionCommit uses CheckpointStartLock for; see notes there.) + * RecordTransactionCommit uses inCommit for; see notes there.) * * We save the PREPARE record's location in the gxact for later use by * CheckPointTwoPhase. */ START_CRIT_SECTION(); - LWLockAcquire(CheckpointStartLock, LW_SHARED); + MyProc->inCommit = true; gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE, records.head); @@ -982,10 +983,11 @@ EndPrepare(GlobalTransaction gxact) MarkAsPrepared(gxact); /* - * Now we can release the checkpoint start lock: a checkpoint starting - * after this will certainly see the gxact as a candidate for fsyncing. + * Now we can mark ourselves as out of the commit critical section: + * a checkpoint starting after this will certainly see the gxact as a + * candidate for fsyncing. */ - LWLockRelease(CheckpointStartLock); + MyProc->inCommit = false; END_CRIT_SECTION(); @@ -1649,7 +1651,7 @@ RecoverPreparedTransactions(void) * RecordTransactionCommitPrepared * * This is basically the same as RecordTransactionCommit: in particular, - * we must take the CheckpointStartLock to avoid a race condition. + * we must set the inCommit flag to avoid a race condition. * * We know the transaction made at least one XLOG entry (its PREPARE), * so it is never possible to optimize out the commit record. @@ -1669,7 +1671,7 @@ RecordTransactionCommitPrepared(TransactionId xid, START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ - LWLockAcquire(CheckpointStartLock, LW_SHARED); + MyProc->inCommit = true; /* Emit the XLOG commit record */ xlrec.xid = xid; @@ -1713,8 +1715,8 @@ RecordTransactionCommitPrepared(TransactionId xid, /* to avoid race conditions, the parent must commit first */ TransactionIdCommitTree(nchildren, children); - /* Checkpoint is allowed again */ - LWLockRelease(CheckpointStartLock); + /* Checkpoint can proceed now */ + MyProc->inCommit = false; END_CRIT_SECTION(); } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index f8058a8f5e..d47dae0c41 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.238 2007/03/22 19:55:04 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.239 2007/04/03 16:34:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -716,35 +716,37 @@ RecordTransactionCommit(void) START_CRIT_SECTION(); - /* - * If our transaction made any transaction-controlled XLOG entries, we - * need to lock out checkpoint start between writing our XLOG record - * and updating pg_clog. Otherwise it is possible for the checkpoint - * to set REDO after the XLOG record but fail to flush the pg_clog - * update to disk, leading to loss of the transaction commit if we - * crash a little later. Slightly klugy fix for problem discovered - * 2004-08-10. - * - * (If it made no transaction-controlled XLOG entries, its XID appears - * nowhere in permanent storage, so no one else will ever care if it - * committed; so it doesn't matter if we lose the commit flag.) - * - * Note we only need a shared lock. - */ - madeTCentries = (MyLastRecPtr.xrecoff != 0); - if (madeTCentries) - LWLockAcquire(CheckpointStartLock, LW_SHARED); - /* * We only need to log the commit in XLOG if the transaction made any * transaction-controlled XLOG entries or will delete files. */ + madeTCentries = (MyLastRecPtr.xrecoff != 0); if (madeTCentries || nrels > 0) { XLogRecData rdata[3]; int lastrdata = 0; xl_xact_commit xlrec; + /* + * Mark ourselves as within our "commit critical section". This + * forces any concurrent checkpoint to wait until we've updated + * pg_clog. Without this, it is possible for the checkpoint to + * set REDO after the XLOG record but fail to flush the pg_clog + * update to disk, leading to loss of the transaction commit if + * the system crashes a little later. + * + * Note: we could, but don't bother to, set this flag in + * RecordTransactionAbort. That's because loss of a transaction + * abort is noncritical; the presumption would be that it aborted, + * anyway. + * + * It's safe to change the inCommit flag of our own backend + * without holding the ProcArrayLock, since we're the only one + * modifying it. This makes checkpoint's determination of which + * xacts are inCommit a bit fuzzy, but it doesn't matter. + */ + MyProc->inCommit = true; + xlrec.xtime = time(NULL); xlrec.nrels = nrels; xlrec.nsubxacts = nchildren; @@ -825,9 +827,8 @@ RecordTransactionCommit(void) TransactionIdCommitTree(nchildren, children); } - /* Unlock checkpoint lock if we acquired it */ - if (madeTCentries) - LWLockRelease(CheckpointStartLock); + /* Checkpoint can proceed now */ + MyProc->inCommit = false; END_CRIT_SECTION(); } @@ -1961,6 +1962,7 @@ AbortTransaction(void) MyProc->xid = InvalidTransactionId; MyProc->xmin = InvalidTransactionId; MyProc->inVacuum = false; /* must be cleared with xid/xmin */ + MyProc->inCommit = false; /* be sure this gets cleared */ /* Clear the subtransaction-XID cache too while holding the lock */ MyProc->subxids.nxids = 0; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f3e02dd631..f70a3d5dd0 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.266 2007/04/03 04:14:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.267 2007/04/03 16:34:35 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -5366,6 +5366,8 @@ CreateCheckPoint(bool shutdown, bool force) int nsegsadded = 0; int nsegsremoved = 0; int nsegsrecycled = 0; + TransactionId *inCommitXids; + int nInCommit; /* * Acquire CheckpointLock to ensure only one checkpoint happens at a time. @@ -5392,14 +5394,9 @@ CreateCheckPoint(bool shutdown, bool force) checkPoint.time = time(NULL); /* - * We must hold CheckpointStartLock while determining the checkpoint REDO - * pointer. This ensures that any concurrent transaction commits will be - * either not yet logged, or logged and recorded in pg_clog. See notes in - * RecordTransactionCommit(). + * We must hold WALInsertLock while examining insert state to determine + * the checkpoint REDO pointer. */ - LWLockAcquire(CheckpointStartLock, LW_EXCLUSIVE); - - /* And we need WALInsertLock too */ LWLockAcquire(WALInsertLock, LW_EXCLUSIVE); /* @@ -5431,7 +5428,6 @@ CreateCheckPoint(bool shutdown, bool force) ControlFile->checkPointCopy.redo.xrecoff) { LWLockRelease(WALInsertLock); - LWLockRelease(CheckpointStartLock); LWLockRelease(CheckpointLock); END_CRIT_SECTION(); return; @@ -5476,17 +5472,48 @@ CreateCheckPoint(bool shutdown, bool force) } /* - * Now we can release insert lock and checkpoint start lock, allowing - * other xacts to proceed even while we are flushing disk buffers. + * Now we can release WAL insert lock, allowing other xacts to proceed + * while we are flushing disk buffers. */ LWLockRelease(WALInsertLock); - LWLockRelease(CheckpointStartLock); - if (!shutdown) ereport(DEBUG2, (errmsg("checkpoint starting"))); + /* + * Before flushing data, we must wait for any transactions that are + * currently in their commit critical sections. If an xact inserted its + * commit record into XLOG just before the REDO point, then a crash + * restart from the REDO point would not replay that record, which means + * that our flushing had better include the xact's update of pg_clog. So + * we wait till he's out of his commit critical section before proceeding. + * See notes in RecordTransactionCommit(). + * + * Because we've already released WALInsertLock, this test is a bit fuzzy: + * it is possible that we will wait for xacts we didn't really need to + * wait for. But the delay should be short and it seems better to make + * checkpoint take a bit longer than to hold locks longer than necessary. + * (In fact, the whole reason we have this issue is that xact.c does + * commit record XLOG insertion and clog update as two separate steps + * protected by different locks, but again that seems best on grounds + * of minimizing lock contention.) + * + * A transaction that has not yet set inCommit when we look cannot be + * at risk, since he's not inserted his commit record yet; and one that's + * already cleared it is not at risk either, since he's done fixing clog + * and we will correctly flush the update below. So we cannot miss any + * xacts we need to wait for. + */ + nInCommit = GetTransactionsInCommit(&inCommitXids); + if (nInCommit > 0) + { + do { + pg_usleep(10000L); /* wait for 10 msec */ + } while (HaveTransactionsInCommit(inCommitXids, nInCommit)); + } + pfree(inCommitXids); + /* * Get the other info we need for the checkpoint record. */ diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 12e61c8926..22d031f9eb 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -23,7 +23,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.23 2007/03/25 19:45:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.24 2007/04/03 16:34:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -739,6 +739,98 @@ restart: return (num != 0); } +/* + * GetTransactionsInCommit -- Get the XIDs of transactions that are committing + * + * Constructs an array of XIDs of transactions that are currently in commit + * critical sections, as shown by having inCommit set in their PGPROC entries. + * + * *xids_p is set to a palloc'd array that should be freed by the caller. + * The return value is the number of valid entries. + * + * Note that because backends set or clear inCommit without holding any lock, + * the result is somewhat indeterminate, but we don't really care. Even in + * a multiprocessor with delayed writes to shared memory, it should be certain + * that setting of inCommit will propagate to shared memory when the backend + * takes the WALInsertLock, so we cannot fail to see an xact as inCommit if + * it's already inserted its commit record. Whether it takes a little while + * for clearing of inCommit to propagate is unimportant for correctness. + */ +int +GetTransactionsInCommit(TransactionId **xids_p) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId *xids; + int nxids; + int index; + + xids = (TransactionId *) palloc(arrayP->maxProcs * sizeof(TransactionId)); + nxids = 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + PGPROC *proc = arrayP->procs[index]; + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId pxid = proc->xid; + + if (proc->inCommit && TransactionIdIsValid(pxid)) + xids[nxids++] = pxid; + } + + LWLockRelease(ProcArrayLock); + + *xids_p = xids; + return nxids; +} + +/* + * HaveTransactionsInCommit -- Are any of the specified XIDs in commit? + * + * This is used with the results of GetTransactionsInCommit to see if any + * of the specified XIDs are still in their commit critical sections. + * + * Note: this is O(N^2) in the number of xacts that are/were in commit, but + * those numbers should be small enough for it not to be a problem. + */ +bool +HaveTransactionsInCommit(TransactionId *xids, int nxids) +{ + bool result = false; + ProcArrayStruct *arrayP = procArray; + int index; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + PGPROC *proc = arrayP->procs[index]; + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId pxid = proc->xid; + + if (proc->inCommit && TransactionIdIsValid(pxid)) + { + int i; + + for (i = 0; i < nxids; i++) + { + if (xids[i] == pxid) + { + result = true; + break; + } + } + if (result) + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; +} + /* * BackendPidGetProc -- get a backend's PGPROC given its PID * diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 87a310aabe..2691a50582 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.186 2007/03/07 13:35:03 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.187 2007/04/03 16:34:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -259,6 +259,7 @@ InitProcess(void) /* databaseId and roleId will be filled in later */ MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; + MyProc->inCommit = false; MyProc->inVacuum = false; MyProc->isAutovacuum = IsAutoVacuumWorkerProcess(); MyProc->lwWaiting = false; @@ -392,6 +393,7 @@ InitAuxiliaryProcess(void) MyProc->xmin = InvalidTransactionId; MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; + MyProc->inCommit = false; MyProc->inVacuum = false; MyProc->isAutovacuum = IsAutoVacuumLauncherProcess(); /* is this needed? */ MyProc->lwWaiting = false; diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index b4503f9213..c47256a159 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.34 2007/02/15 23:23:23 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.35 2007/04/03 16:34:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -49,7 +49,6 @@ typedef enum LWLockId WALWriteLock, ControlFileLock, CheckpointLock, - CheckpointStartLock, CLogControlLock, SubtransControlLock, MultiXactGenLock, diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 2b20eda828..772cf52cdf 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.96 2007/03/07 13:35:03 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.97 2007/04/03 16:34:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -74,6 +74,8 @@ struct PGPROC Oid databaseId; /* OID of database this backend is using */ Oid roleId; /* OID of role using this backend */ + bool inCommit; /* true if within commit critical section */ + bool inVacuum; /* true if current xact is a LAZY VACUUM */ bool isAutovacuum; /* true if it's autovacuum */ diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 93e82e4368..cd2df66380 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.12 2007/01/16 13:28:57 alvherre Exp $ + * $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.13 2007/04/03 16:34:36 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -26,6 +26,9 @@ extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsActive(TransactionId xid); extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum); +extern int GetTransactionsInCommit(TransactionId **xids_p); +extern bool HaveTransactionsInCommit(TransactionId *xids, int nxids); + extern PGPROC *BackendPidGetProc(int pid); extern int BackendXidGetPid(TransactionId xid); extern bool IsBackendPid(int pid);