467 lines
18 KiB
C
467 lines
18 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* proc.h
|
|
* per-process shared memory data structures
|
|
*
|
|
*
|
|
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
*
|
|
* src/include/storage/proc.h
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#ifndef _PROC_H_
|
|
#define _PROC_H_
|
|
|
|
#include "access/clog.h"
|
|
#include "access/xlogdefs.h"
|
|
#include "lib/ilist.h"
|
|
#include "storage/latch.h"
|
|
#include "storage/lock.h"
|
|
#include "storage/pg_sema.h"
|
|
#include "storage/proclist_types.h"
|
|
|
|
/*
|
|
* Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
|
|
* for non-aborted subtransactions of its current top transaction. These
|
|
* have to be treated as running XIDs by other backends.
|
|
*
|
|
* We also keep track of whether the cache overflowed (ie, the transaction has
|
|
* generated at least one subtransaction that didn't fit in the cache).
|
|
* If none of the caches have overflowed, we can assume that an XID that's not
|
|
* listed anywhere in the PGPROC array is not a running transaction. Else we
|
|
* have to look at pg_subtrans.
|
|
*
|
|
* See src/test/isolation/specs/subxid-overflow.spec if you change this.
|
|
*/
|
|
#define PGPROC_MAX_CACHED_SUBXIDS 64 /* XXX guessed-at value */
|
|
|
|
typedef struct XidCacheStatus
|
|
{
|
|
/* number of cached subxids, never more than PGPROC_MAX_CACHED_SUBXIDS */
|
|
uint8 count;
|
|
/* has PGPROC->subxids overflowed */
|
|
bool overflowed;
|
|
} XidCacheStatus;
|
|
|
|
struct XidCache
|
|
{
|
|
TransactionId xids[PGPROC_MAX_CACHED_SUBXIDS];
|
|
};
|
|
|
|
/*
|
|
* Flags for PGPROC->statusFlags and PROC_HDR->statusFlags[]
|
|
*/
|
|
#define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? */
|
|
#define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */
|
|
#define PROC_IN_SAFE_IC 0x04 /* currently running CREATE INDEX
|
|
* CONCURRENTLY or REINDEX
|
|
* CONCURRENTLY on non-expressional,
|
|
* non-partial index */
|
|
#define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */
|
|
#define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical
|
|
* decoding outside xact */
|
|
#define PROC_AFFECTS_ALL_HORIZONS 0x20 /* this proc's xmin must be
|
|
* included in vacuum horizons
|
|
* in all databases */
|
|
|
|
/* flags reset at EOXact */
|
|
#define PROC_VACUUM_STATE_MASK \
|
|
(PROC_IN_VACUUM | PROC_IN_SAFE_IC | PROC_VACUUM_FOR_WRAPAROUND)
|
|
|
|
/*
|
|
* Xmin-related flags. Make sure any flags that affect how the process' Xmin
|
|
* value is interpreted by VACUUM are included here.
|
|
*/
|
|
#define PROC_XMIN_FLAGS (PROC_IN_VACUUM | PROC_IN_SAFE_IC)
|
|
|
|
/*
|
|
* We allow a small number of "weak" relation locks (AccessShareLock,
|
|
* RowShareLock, RowExclusiveLock) to be recorded in the PGPROC structure
|
|
* rather than the main lock table. This eases contention on the lock
|
|
* manager LWLocks. See storage/lmgr/README for additional details.
|
|
*/
|
|
#define FP_LOCK_SLOTS_PER_BACKEND 16
|
|
|
|
/*
|
|
* An invalid pgprocno. Must be larger than the maximum number of PGPROC
|
|
* structures we could possibly have. See comments for MAX_BACKENDS.
|
|
*/
|
|
#define INVALID_PGPROCNO PG_INT32_MAX
|
|
|
|
/*
|
|
* Flags for PGPROC.delayChkptFlags
|
|
*
|
|
* These flags can be used to delay the start or completion of a checkpoint
|
|
* for short periods. A flag is in effect if the corresponding bit is set in
|
|
* the PGPROC of any backend.
|
|
*
|
|
* For our purposes here, a checkpoint has three phases: (1) determine the
|
|
* location to which the redo pointer will be moved, (2) write all the
|
|
* data durably to disk, and (3) WAL-log the checkpoint.
|
|
*
|
|
* Setting DELAY_CHKPT_START prevents the system from moving from phase 1
|
|
* to phase 2. This is useful when we are performing a WAL-logged modification
|
|
* of data that will be flushed to disk in phase 2. By setting this flag
|
|
* before writing WAL and clearing it after we've both written WAL and
|
|
* performed the corresponding modification, we ensure that if the WAL record
|
|
* is inserted prior to the new redo point, the corresponding data changes will
|
|
* also be flushed to disk before the checkpoint can complete. (In the
|
|
* extremely common case where the data being modified is in shared buffers
|
|
* and we acquire an exclusive content lock on the relevant buffers before
|
|
* writing WAL, this mechanism is not needed, because phase 2 will block
|
|
* until we release the content lock and then flush the modified data to
|
|
* disk.)
|
|
*
|
|
* Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
|
|
* to phase 3. This is useful if we are performing a WAL-logged operation that
|
|
* might invalidate buffers, such as relation truncation. In this case, we need
|
|
* to ensure that any buffers which were invalidated and thus not flushed by
|
|
* the checkpoint are actually destroyed on disk. Replay can cope with a file
|
|
* or block that doesn't exist, but not with a block that has the wrong
|
|
* contents.
|
|
*/
|
|
#define DELAY_CHKPT_START (1<<0)
|
|
#define DELAY_CHKPT_COMPLETE (1<<1)
|
|
|
|
typedef enum
|
|
{
|
|
PROC_WAIT_STATUS_OK,
|
|
PROC_WAIT_STATUS_WAITING,
|
|
PROC_WAIT_STATUS_ERROR,
|
|
} ProcWaitStatus;
|
|
|
|
/*
|
|
* Each backend has a PGPROC struct in shared memory. There is also a list of
|
|
* currently-unused PGPROC structs that will be reallocated to new backends.
|
|
*
|
|
* links: list link for any list the PGPROC is in. When waiting for a lock,
|
|
* the PGPROC is linked into that lock's waitProcs queue. A recycled PGPROC
|
|
* is linked into ProcGlobal's freeProcs list.
|
|
*
|
|
* Note: twophase.c also sets up a dummy PGPROC struct for each currently
|
|
* prepared transaction. These PGPROCs appear in the ProcArray data structure
|
|
* so that the prepared transactions appear to be still running and are
|
|
* correctly shown as holding locks. A prepared transaction PGPROC can be
|
|
* distinguished from a real one at need by the fact that it has pid == 0.
|
|
* The semaphore and lock-activity fields in a prepared-xact PGPROC are unused,
|
|
* but its myProcLocks[] lists are valid.
|
|
*
|
|
* We allow many fields of this struct to be accessed without locks, such as
|
|
* delayChkptFlags and isBackgroundWorker. However, keep in mind that writing
|
|
* mirrored ones (see below) requires holding ProcArrayLock or XidGenLock in
|
|
* at least shared mode, so that pgxactoff does not change concurrently.
|
|
*
|
|
* Mirrored fields:
|
|
*
|
|
* Some fields in PGPROC (see "mirrored in ..." comment) are mirrored into an
|
|
* element of more densely packed ProcGlobal arrays. These arrays are indexed
|
|
* by PGPROC->pgxactoff. Both copies need to be maintained coherently.
|
|
*
|
|
* NB: The pgxactoff indexed value can *never* be accessed without holding
|
|
* locks.
|
|
*
|
|
* See PROC_HDR for details.
|
|
*/
|
|
struct PGPROC
|
|
{
|
|
/* proc->links MUST BE FIRST IN STRUCT (see ProcSleep,ProcWakeup,etc) */
|
|
dlist_node links; /* list link if process is in a list */
|
|
dlist_head *procgloballist; /* procglobal list that owns this PGPROC */
|
|
|
|
PGSemaphore sem; /* ONE semaphore to sleep on */
|
|
ProcWaitStatus waitStatus;
|
|
|
|
Latch procLatch; /* generic latch for process */
|
|
|
|
|
|
TransactionId xid; /* id of top-level transaction currently being
|
|
* executed by this proc, if running and XID
|
|
* is assigned; else InvalidTransactionId.
|
|
* mirrored in ProcGlobal->xids[pgxactoff] */
|
|
|
|
TransactionId xmin; /* minimal running XID as it was when we were
|
|
* starting our xact, excluding LAZY VACUUM:
|
|
* vacuum must not remove tuples deleted by
|
|
* xid >= xmin ! */
|
|
|
|
LocalTransactionId lxid; /* local id of top-level transaction currently
|
|
* being executed by this proc, if running;
|
|
* else InvalidLocalTransactionId */
|
|
int pid; /* Backend's process ID; 0 if prepared xact */
|
|
|
|
int pgxactoff; /* offset into various ProcGlobal->arrays with
|
|
* data mirrored from this PGPROC */
|
|
|
|
int pgprocno; /* Number of this PGPROC in
|
|
* ProcGlobal->allProcs array. This is set
|
|
* once by InitProcGlobal().
|
|
* ProcGlobal->allProcs[n].pgprocno == n */
|
|
|
|
/* These fields are zero while a backend is still starting up: */
|
|
BackendId backendId; /* This backend's backend ID (if assigned) */
|
|
Oid databaseId; /* OID of database this backend is using */
|
|
Oid roleId; /* OID of role using this backend */
|
|
|
|
Oid tempNamespaceId; /* OID of temp schema this backend is
|
|
* using */
|
|
|
|
bool isBackgroundWorker; /* true if background worker. */
|
|
|
|
/*
|
|
* While in hot standby mode, shows that a conflict signal has been sent
|
|
* for the current transaction. Set/cleared while holding ProcArrayLock,
|
|
* though not required. Accessed without lock, if needed.
|
|
*/
|
|
bool recoveryConflictPending;
|
|
|
|
/* Info about LWLock the process is currently waiting for, if any. */
|
|
uint8 lwWaiting; /* see LWLockWaitState */
|
|
uint8 lwWaitMode; /* lwlock mode being waited for */
|
|
proclist_node lwWaitLink; /* position in LW lock wait list */
|
|
|
|
/* Support for condition variables. */
|
|
proclist_node cvWaitLink; /* position in CV wait list */
|
|
|
|
/* Info about lock the process is currently waiting for, if any. */
|
|
/* waitLock and waitProcLock are NULL if not currently waiting. */
|
|
LOCK *waitLock; /* Lock object we're sleeping on ... */
|
|
PROCLOCK *waitProcLock; /* Per-holder info for awaited lock */
|
|
LOCKMODE waitLockMode; /* type of lock we're waiting for */
|
|
LOCKMASK heldLocks; /* bitmask for lock types already held on this
|
|
* lock object by this backend */
|
|
pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition
|
|
* started */
|
|
|
|
int delayChkptFlags; /* for DELAY_CHKPT_* flags */
|
|
|
|
uint8 statusFlags; /* this backend's status flags, see PROC_*
|
|
* above. mirrored in
|
|
* ProcGlobal->statusFlags[pgxactoff] */
|
|
|
|
/*
|
|
* Info to allow us to wait for synchronous replication, if needed.
|
|
* waitLSN is InvalidXLogRecPtr if not waiting; set only by user backend.
|
|
* syncRepState must not be touched except by owning process or WALSender.
|
|
* syncRepLinks used only while holding SyncRepLock.
|
|
*/
|
|
XLogRecPtr waitLSN; /* waiting for this LSN or higher */
|
|
int syncRepState; /* wait state for sync rep */
|
|
dlist_node syncRepLinks; /* list link if process is in syncrep queue */
|
|
|
|
/*
|
|
* All PROCLOCK objects for locks held or awaited by this backend are
|
|
* linked into one of these lists, according to the partition number of
|
|
* their lock.
|
|
*/
|
|
dlist_head myProcLocks[NUM_LOCK_PARTITIONS];
|
|
|
|
XidCacheStatus subxidStatus; /* mirrored with
|
|
* ProcGlobal->subxidStates[i] */
|
|
struct XidCache subxids; /* cache for subtransaction XIDs */
|
|
|
|
/* Support for group XID clearing. */
|
|
/* true, if member of ProcArray group waiting for XID clear */
|
|
bool procArrayGroupMember;
|
|
/* next ProcArray group member waiting for XID clear */
|
|
pg_atomic_uint32 procArrayGroupNext;
|
|
|
|
/*
|
|
* latest transaction id among the transaction's main XID and
|
|
* subtransactions
|
|
*/
|
|
TransactionId procArrayGroupMemberXid;
|
|
|
|
uint32 wait_event_info; /* proc's wait information */
|
|
|
|
/* Support for group transaction status update. */
|
|
bool clogGroupMember; /* true, if member of clog group */
|
|
pg_atomic_uint32 clogGroupNext; /* next clog group member */
|
|
TransactionId clogGroupMemberXid; /* transaction id of clog group member */
|
|
XidStatus clogGroupMemberXidStatus; /* transaction status of clog
|
|
* group member */
|
|
int64 clogGroupMemberPage; /* clog page corresponding to
|
|
* transaction id of clog group member */
|
|
XLogRecPtr clogGroupMemberLsn; /* WAL location of commit record for clog
|
|
* group member */
|
|
|
|
/* Lock manager data, recording fast-path locks taken by this backend. */
|
|
LWLock fpInfoLock; /* protects per-backend fast-path state */
|
|
uint64 fpLockBits; /* lock modes held for each fast-path slot */
|
|
Oid fpRelId[FP_LOCK_SLOTS_PER_BACKEND]; /* slots for rel oids */
|
|
bool fpVXIDLock; /* are we holding a fast-path VXID lock? */
|
|
LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID
|
|
* lock */
|
|
|
|
/*
|
|
* Support for lock groups. Use LockHashPartitionLockByProc on the group
|
|
* leader to get the LWLock protecting these fields.
|
|
*/
|
|
PGPROC *lockGroupLeader; /* lock group leader, if I'm a member */
|
|
dlist_head lockGroupMembers; /* list of members, if I'm a leader */
|
|
dlist_node lockGroupLink; /* my member link, if I'm a member */
|
|
};
|
|
|
|
/* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
|
|
|
|
|
|
extern PGDLLIMPORT PGPROC *MyProc;
|
|
|
|
/*
|
|
* There is one ProcGlobal struct for the whole database cluster.
|
|
*
|
|
* Adding/Removing an entry into the procarray requires holding *both*
|
|
* ProcArrayLock and XidGenLock in exclusive mode (in that order). Both are
|
|
* needed because the dense arrays (see below) are accessed from
|
|
* GetNewTransactionId() and GetSnapshotData(), and we don't want to add
|
|
* further contention by both using the same lock. Adding/Removing a procarray
|
|
* entry is much less frequent.
|
|
*
|
|
* Some fields in PGPROC are mirrored into more densely packed arrays (e.g.
|
|
* xids), with one entry for each backend. These arrays only contain entries
|
|
* for PGPROCs that have been added to the shared array with ProcArrayAdd()
|
|
* (in contrast to PGPROC array which has unused PGPROCs interspersed).
|
|
*
|
|
* The dense arrays are indexed by PGPROC->pgxactoff. Any concurrent
|
|
* ProcArrayAdd() / ProcArrayRemove() can lead to pgxactoff of a procarray
|
|
* member to change. Therefore it is only safe to use PGPROC->pgxactoff to
|
|
* access the dense array while holding either ProcArrayLock or XidGenLock.
|
|
*
|
|
* As long as a PGPROC is in the procarray, the mirrored values need to be
|
|
* maintained in both places in a coherent manner.
|
|
*
|
|
* The denser separate arrays are beneficial for three main reasons: First, to
|
|
* allow for as tight loops accessing the data as possible. Second, to prevent
|
|
* updates of frequently changing data (e.g. xmin) from invalidating
|
|
* cachelines also containing less frequently changing data (e.g. xid,
|
|
* statusFlags). Third to condense frequently accessed data into as few
|
|
* cachelines as possible.
|
|
*
|
|
* There are two main reasons to have the data mirrored between these dense
|
|
* arrays and PGPROC. First, as explained above, a PGPROC's array entries can
|
|
* only be accessed with either ProcArrayLock or XidGenLock held, whereas the
|
|
* PGPROC entries do not require that (obviously there may still be locking
|
|
* requirements around the individual field, separate from the concerns
|
|
* here). That is particularly important for a backend to efficiently checks
|
|
* it own values, which it often can safely do without locking. Second, the
|
|
* PGPROC fields allow to avoid unnecessary accesses and modification to the
|
|
* dense arrays. A backend's own PGPROC is more likely to be in a local cache,
|
|
* whereas the cachelines for the dense array will be modified by other
|
|
* backends (often removing it from the cache for other cores/sockets). At
|
|
* commit/abort time a check of the PGPROC value can avoid accessing/dirtying
|
|
* the corresponding array value.
|
|
*
|
|
* Basically it makes sense to access the PGPROC variable when checking a
|
|
* single backend's data, especially when already looking at the PGPROC for
|
|
* other reasons already. It makes sense to look at the "dense" arrays if we
|
|
* need to look at many / most entries, because we then benefit from the
|
|
* reduced indirection and better cross-process cache-ability.
|
|
*
|
|
* When entering a PGPROC for 2PC transactions with ProcArrayAdd(), the data
|
|
* in the dense arrays is initialized from the PGPROC while it already holds
|
|
* ProcArrayLock.
|
|
*/
|
|
typedef struct PROC_HDR
|
|
{
|
|
/* Array of PGPROC structures (not including dummies for prepared txns) */
|
|
PGPROC *allProcs;
|
|
|
|
/* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */
|
|
TransactionId *xids;
|
|
|
|
/*
|
|
* Array mirroring PGPROC.subxidStatus for each PGPROC currently in the
|
|
* procarray.
|
|
*/
|
|
XidCacheStatus *subxidStates;
|
|
|
|
/*
|
|
* Array mirroring PGPROC.statusFlags for each PGPROC currently in the
|
|
* procarray.
|
|
*/
|
|
uint8 *statusFlags;
|
|
|
|
/* Length of allProcs array */
|
|
uint32 allProcCount;
|
|
/* Head of list of free PGPROC structures */
|
|
dlist_head freeProcs;
|
|
/* Head of list of autovacuum's free PGPROC structures */
|
|
dlist_head autovacFreeProcs;
|
|
/* Head of list of bgworker free PGPROC structures */
|
|
dlist_head bgworkerFreeProcs;
|
|
/* Head of list of walsender free PGPROC structures */
|
|
dlist_head walsenderFreeProcs;
|
|
/* First pgproc waiting for group XID clear */
|
|
pg_atomic_uint32 procArrayGroupFirst;
|
|
/* First pgproc waiting for group transaction status update */
|
|
pg_atomic_uint32 clogGroupFirst;
|
|
/* WALWriter process's latch */
|
|
Latch *walwriterLatch;
|
|
/* Checkpointer process's latch */
|
|
Latch *checkpointerLatch;
|
|
/* Current shared estimate of appropriate spins_per_delay value */
|
|
int spins_per_delay;
|
|
/* Buffer id of the buffer that Startup process waits for pin on, or -1 */
|
|
int startupBufferPinWaitBufId;
|
|
} PROC_HDR;
|
|
|
|
extern PGDLLIMPORT PROC_HDR *ProcGlobal;
|
|
|
|
extern PGDLLIMPORT PGPROC *PreparedXactProcs;
|
|
|
|
/* Accessor for PGPROC given a pgprocno. */
|
|
#define GetPGProcByNumber(n) (&ProcGlobal->allProcs[(n)])
|
|
|
|
/*
|
|
* We set aside some extra PGPROC structures for auxiliary processes,
|
|
* ie things that aren't full-fledged backends but need shmem access.
|
|
*
|
|
* Background writer, checkpointer, WAL writer and archiver run during normal
|
|
* operation. Startup process and WAL receiver also consume 2 slots, but WAL
|
|
* writer is launched only after startup has exited, so we only need 5 slots.
|
|
*/
|
|
#define NUM_AUXILIARY_PROCS 5
|
|
|
|
/* configurable options */
|
|
extern PGDLLIMPORT int DeadlockTimeout;
|
|
extern PGDLLIMPORT int StatementTimeout;
|
|
extern PGDLLIMPORT int LockTimeout;
|
|
extern PGDLLIMPORT int IdleInTransactionSessionTimeout;
|
|
extern PGDLLIMPORT int IdleSessionTimeout;
|
|
extern PGDLLIMPORT bool log_lock_waits;
|
|
|
|
|
|
/*
|
|
* Function Prototypes
|
|
*/
|
|
extern int ProcGlobalSemas(void);
|
|
extern Size ProcGlobalShmemSize(void);
|
|
extern void InitProcGlobal(void);
|
|
extern void InitProcess(void);
|
|
extern void InitProcessPhase2(void);
|
|
extern void InitAuxiliaryProcess(void);
|
|
|
|
extern void SetStartupBufferPinWaitBufId(int bufid);
|
|
extern int GetStartupBufferPinWaitBufId(void);
|
|
|
|
extern bool HaveNFreeProcs(int n, int *nfree);
|
|
extern void ProcReleaseLocks(bool isCommit);
|
|
|
|
extern ProcWaitStatus ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable);
|
|
extern void ProcWakeup(PGPROC *proc, ProcWaitStatus waitStatus);
|
|
extern void ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock);
|
|
extern void CheckDeadLockAlert(void);
|
|
extern bool IsWaitingForLock(void);
|
|
extern void LockErrorCleanup(void);
|
|
|
|
extern void ProcWaitForSignal(uint32 wait_event_info);
|
|
extern void ProcSendSignal(int pgprocno);
|
|
|
|
extern PGPROC *AuxiliaryPidGetProc(int pid);
|
|
|
|
extern void BecomeLockGroupLeader(void);
|
|
extern bool BecomeLockGroupMember(PGPROC *leader, int pid);
|
|
|
|
#endif /* _PROC_H_ */
|