diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index e5640793eb..9f417dee29 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -60,6 +60,7 @@ #include "storage/spin.h" #include "utils/builtins.h" #include "utils/guc.h" +#include "utils/memutils.h" #include "utils/ps_status.h" #include "utils/relmapper.h" #include "utils/snapmgr.h" @@ -736,6 +737,10 @@ static bool bgwriterLaunched = false; static int MyLockNo = 0; static bool holdingAllLocks = false; +#ifdef WAL_DEBUG +static MemoryContext walDebugCxt = NULL; +#endif + static void readRecoveryCommandFile(void); static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo); static bool recoveryStopsBefore(XLogRecord *record); @@ -1258,6 +1263,7 @@ begin:; if (XLOG_DEBUG) { StringInfoData buf; + MemoryContext oldCxt = MemoryContextSwitchTo(walDebugCxt); initStringInfo(&buf); appendStringInfo(&buf, "INSERT @ %X/%X: ", @@ -1282,10 +1288,11 @@ begin:; appendStringInfoString(&buf, " - "); RmgrTable[rechdr->xl_rmid].rm_desc(&buf, (XLogRecord *) recordbuf.data); - pfree(recordbuf.data); } elog(LOG, "%s", buf.data); - pfree(buf.data); + + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(walDebugCxt); } #endif @@ -4807,6 +4814,24 @@ XLOGShmemInit(void) char *allocptr; int i; +#ifdef WAL_DEBUG + /* + * Create a memory context for WAL debugging that's exempt from the + * normal "no pallocs in critical section" rule. Yes, that can lead to a + * PANIC if an allocation fails, but wal_debug is not for production use + * anyway. + */ + if (walDebugCxt == NULL) + { + walDebugCxt = AllocSetContextCreate(TopMemoryContext, + "WAL Debug", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + MemoryContextAllowInCriticalSection(walDebugCxt, true); + } +#endif + ControlFile = (ControlFileData *) ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile); XLogCtl = (XLogCtlData *) diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 2ac3061d97..6c814ba0be 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -1305,19 +1305,6 @@ AbsorbFsyncRequests(void) if (!AmCheckpointerProcess()) return; - /* - * We have to PANIC if we fail to absorb all the pending requests (eg, - * because our hashtable runs out of memory). This is because the system - * cannot run safely if we are unable to fsync what we have been told to - * fsync. Fortunately, the hashtable is so small that the problem is - * quite unlikely to arise in practice. - */ - START_CRIT_SECTION(); - - /* - * We try to avoid holding the lock for a long time by copying the request - * array. - */ LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); /* Transfer stats counts into pending pgstats message */ @@ -1327,12 +1314,25 @@ AbsorbFsyncRequests(void) CheckpointerShmem->num_backend_writes = 0; CheckpointerShmem->num_backend_fsync = 0; + /* + * We try to avoid holding the lock for a long time by copying the request + * array, and processing the requests after releasing the lock. + * + * Once we have cleared the requests from shared memory, we have to PANIC + * if we then fail to absorb them (eg, because our hashtable runs out of + * memory). This is because the system cannot run safely if we are unable + * to fsync what we have been told to fsync. Fortunately, the hashtable + * is so small that the problem is quite unlikely to arise in practice. + */ n = CheckpointerShmem->num_requests; if (n > 0) { requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest)); } + + START_CRIT_SECTION(); + CheckpointerShmem->num_requests = 0; LWLockRelease(CheckpointerCommLock); @@ -1340,10 +1340,10 @@ AbsorbFsyncRequests(void) for (request = requests; n > 0; request++, n--) RememberFsyncRequest(request->rnode, request->forknum, request->segno); + END_CRIT_SECTION(); + if (requests) pfree(requests); - - END_CRIT_SECTION(); } /* diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index d23ac62bf8..a62af27d22 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -104,8 +104,8 @@ typedef struct lwlock_stats int spin_delay_count; } lwlock_stats; -static int counts_for_pid = 0; static HTAB *lwlock_stats_htab; +static lwlock_stats lwlock_stats_dummy; #endif #ifdef LOCK_DEBUG @@ -142,21 +142,39 @@ static void init_lwlock_stats(void) { HASHCTL ctl; + static MemoryContext lwlock_stats_cxt = NULL; + static bool exit_registered = false; - if (lwlock_stats_htab != NULL) - { - hash_destroy(lwlock_stats_htab); - lwlock_stats_htab = NULL; - } + if (lwlock_stats_cxt != NULL) + MemoryContextDelete(lwlock_stats_cxt); + + /* + * The LWLock stats will be updated within a critical section, which + * requires allocating new hash entries. Allocations within a critical + * section are normally not allowed because running out of memory would + * lead to a PANIC, but LWLOCK_STATS is debugging code that's not normally + * turned on in production, so that's an acceptable risk. The hash entries + * are small, so the risk of running out of memory is minimal in practice. + */ + lwlock_stats_cxt = AllocSetContextCreate(TopMemoryContext, + "LWLock stats", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + MemoryContextAllowInCriticalSection(lwlock_stats_cxt, true); MemSet(&ctl, 0, sizeof(ctl)); ctl.keysize = sizeof(lwlock_stats_key); ctl.entrysize = sizeof(lwlock_stats); ctl.hash = tag_hash; + ctl.hcxt = lwlock_stats_cxt; lwlock_stats_htab = hash_create("lwlock stats", 16384, &ctl, - HASH_ELEM | HASH_FUNCTION); - counts_for_pid = MyProcPid; - on_shmem_exit(print_lwlock_stats, 0); + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + if (!exit_registered) + { + on_shmem_exit(print_lwlock_stats, 0); + exit_registered = true; + } } static void @@ -190,9 +208,13 @@ get_lwlock_stats_entry(LWLock *lock) lwlock_stats *lwstats; bool found; - /* Set up local count state first time through in a given process */ - if (counts_for_pid != MyProcPid) - init_lwlock_stats(); + /* + * During shared memory initialization, the hash table doesn't exist yet. + * Stats of that phase aren't very interesting, so just collect operations + * on all locks in a single dummy entry. + */ + if (lwlock_stats_htab == NULL) + return &lwlock_stats_dummy; /* Fetch or create the entry. */ key.tranche = lock->tranche; @@ -361,6 +383,16 @@ CreateLWLocks(void) LWLockRegisterTranche(0, &MainLWLockTranche); } +/* + * InitLWLockAccess - initialize backend-local state needed to hold LWLocks + */ +void +InitLWLockAccess(void) +{ +#ifdef LWLOCK_STATS + init_lwlock_stats(); +#endif +} /* * LWLockAssign - assign a dynamically-allocated LWLock number diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index dfaf10e4b5..ea88a24144 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -411,8 +411,9 @@ InitProcess(void) /* * Now that we have a PGPROC, we could try to acquire locks, so initialize - * the deadlock checker. + * local state needed for LWLocks, and the deadlock checker. */ + InitLWLockAccess(); InitDeadLockChecking(); } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 3c1c81a729..167d61c1af 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -115,7 +115,7 @@ typedef struct _MdfdVec struct _MdfdVec *mdfd_chain; /* next segment, or NULL */ } MdfdVec; -static MemoryContext MdCxt; /* context for all md.c allocations */ +static MemoryContext MdCxt; /* context for all MdfdVec objects */ /* @@ -157,6 +157,7 @@ typedef struct static HTAB *pendingOpsTable = NULL; static List *pendingUnlinks = NIL; +static MemoryContext pendingOpsCxt; /* context for the above */ static CycleCtr mdsync_cycle_ctr = 0; static CycleCtr mdckpt_cycle_ctr = 0; @@ -209,11 +210,27 @@ mdinit(void) { HASHCTL hash_ctl; + /* + * XXX: The checkpointer needs to add entries to the pending ops table + * when absorbing fsync requests. That is done within a critical + * section, which isn't usually allowed, but we make an exception. + * It means that there's a theoretical possibility that you run out of + * memory while absorbing fsync requests, which leads to a PANIC. + * Fortunately the hash table is small so that's unlikely to happen in + * practice. + */ + pendingOpsCxt = AllocSetContextCreate(MdCxt, + "Pending Ops Context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + MemoryContextAllowInCriticalSection(pendingOpsCxt, true); + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(RelFileNode); hash_ctl.entrysize = sizeof(PendingOperationEntry); hash_ctl.hash = tag_hash; - hash_ctl.hcxt = MdCxt; + hash_ctl.hcxt = pendingOpsCxt; pendingOpsTable = hash_create("Pending Ops Table", 100L, &hash_ctl, @@ -1516,7 +1533,7 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) else if (segno == UNLINK_RELATION_REQUEST) { /* Unlink request: put it in the linked list */ - MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt); + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); PendingUnlinkEntry *entry; /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */ @@ -1533,7 +1550,7 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) else { /* Normal case: enter a request to fsync this segment */ - MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt); + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); PendingOperationEntry *entry; bool found; diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index e83e76dc0f..4185a03e9f 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -60,15 +60,9 @@ static void MemoryContextStatsInternal(MemoryContext context, int level); * You should not do memory allocations within a critical section, because * an out-of-memory error will be escalated to a PANIC. To enforce that * rule, the allocation functions Assert that. - * - * There are a two exceptions: 1) error recovery uses ErrorContext, which - * has some memory set aside so that you don't run out. And 2) checkpointer - * currently just hopes for the best, which is wrong and ought to be fixed, - * but it's a known issue so let's not complain about in the meanwhile. */ #define AssertNotInCriticalSection(context) \ - Assert(CritSectionCount == 0 || (context) == ErrorContext || \ - AmCheckpointerProcess()) + Assert(CritSectionCount == 0 || (context)->allowInCritSection) /***************************************************************************** * EXPORTED ROUTINES * @@ -120,7 +114,10 @@ MemoryContextInit(void) * require it to contain at least 8K at all times. This is the only case * where retained memory in a context is *essential* --- we want to be * sure ErrorContext still has some memory even if we've run out - * elsewhere! + * elsewhere! Also, allow allocations in ErrorContext within a critical + * section. Otherwise a PANIC will cause an assertion failure in the + * error reporting code, before printing out the real cause of the + * failure. * * This should be the last step in this function, as elog.c assumes memory * management works once ErrorContext is non-null. @@ -130,6 +127,7 @@ MemoryContextInit(void) 8 * 1024, 8 * 1024, 8 * 1024); + MemoryContextAllowInCriticalSection(ErrorContext, true); } /* @@ -305,6 +303,26 @@ MemoryContextSetParent(MemoryContext context, MemoryContext new_parent) } } +/* + * MemoryContextAllowInCriticalSection + * Allow/disallow allocations in this memory context within a critical + * section. + * + * Normally, memory allocations are not allowed within a critical section, + * because a failure would lead to PANIC. There are a few exceptions to + * that, like allocations related to debugging code that is not supposed to + * be enabled in production. This function can be used to exempt specific + * memory contexts from the assertion in palloc(). + */ +void +MemoryContextAllowInCriticalSection(MemoryContext context, bool allow) +{ + AssertArg(MemoryContextIsValid(context)); +#ifdef USE_ASSERT_CHECKING + context->allowInCritSection = allow; +#endif +} + /* * GetMemoryChunkSpace * Given a currently-allocated chunk, determine the total space @@ -533,6 +551,7 @@ MemoryContextCreate(NodeTag tag, Size size, MemoryContext node; Size needed = size + strlen(name) + 1; + /* creating new memory contexts is not allowed in a critical section */ Assert(CritSectionCount == 0); /* Get space for node and name */ @@ -570,6 +589,11 @@ MemoryContextCreate(NodeTag tag, Size size, node->parent = parent; node->nextchild = parent->firstchild; parent->firstchild = node; + +#ifdef USE_ASSERT_CHECKING + /* inherit allowInCritSection flag from parent */ + node->allowInCritSection = parent->allowInCritSection; +#endif } VALGRIND_CREATE_MEMPOOL(node, 0, false); diff --git a/src/include/nodes/memnodes.h b/src/include/nodes/memnodes.h index f79ebd423f..ad77509b0c 100644 --- a/src/include/nodes/memnodes.h +++ b/src/include/nodes/memnodes.h @@ -60,6 +60,9 @@ typedef struct MemoryContextData MemoryContext nextchild; /* next child of same parent */ char *name; /* context name (just for debugging) */ bool isReset; /* T = no space alloced since last reset */ +#ifdef USE_ASSERT_CHECKING + bool allowInCritSection; /* allow palloc in critical section */ +#endif } MemoryContextData; /* utils/palloc.h contains typedef struct MemoryContextData *MemoryContext */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index d588b1434e..1d90b9f858 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -182,6 +182,7 @@ extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value); extern Size LWLockShmemSize(void); extern void CreateLWLocks(void); +extern void InitLWLockAccess(void); /* * The traditional method for obtaining an lwlock for use by an extension is diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h index 59d0aecfbb..2fede86027 100644 --- a/src/include/utils/memutils.h +++ b/src/include/utils/memutils.h @@ -101,6 +101,8 @@ extern MemoryContext GetMemoryChunkContext(void *pointer); extern MemoryContext MemoryContextGetParent(MemoryContext context); extern bool MemoryContextIsEmpty(MemoryContext context); extern void MemoryContextStats(MemoryContext context); +extern void MemoryContextAllowInCriticalSection(MemoryContext context, + bool allow); #ifdef MEMORY_CONTEXT_CHECKING extern void MemoryContextCheck(MemoryContext context);