diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 487331c115..24924647b5 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1110,10 +1110,6 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser LogicalLauncherMain Waiting in main loop of logical replication launcher process. - - PgStatMain - Waiting in main loop of statistics collector process. - RecoveryWalStream Waiting in main loop of startup process for WAL to arrive, during @@ -2115,6 +2111,18 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser Waiting to access the list of predicate locks held by serializable transactions. + + PgStatsDSA + Waiting for stats dynamic shared memory allocator access + + + PgStatsHash + Waiting for stats shared memory hash table access + + + PgStatsData + Waiting for shared memory stats data access + SerializableXactHash Waiting to read or update information about serializable @@ -5142,7 +5150,8 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i timestamp with time zone - Returns the timestamp of the current statistics snapshot. + Returns the timestamp of the current statistics snapshot, or NULL if + no statistics snapshot has been taken. diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8ae0a0ba53..c076e48445 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1842,7 +1842,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) WriteRqst.Flush = 0; XLogWrite(WriteRqst, tli, false); LWLockRelease(WALWriteLock); - WalStats.m_wal_buffers_full++; + PendingWalStats.wal_buffers_full++; TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); } /* Re-acquire WALBufMappingLock and retry */ @@ -2200,10 +2200,10 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); - WalStats.m_wal_write_time += INSTR_TIME_GET_MICROSEC(duration); + PendingWalStats.wal_write_time += INSTR_TIME_GET_MICROSEC(duration); } - WalStats.m_wal_write++; + PendingWalStats.wal_write++; if (written <= 0) { @@ -4877,6 +4877,7 @@ StartupXLOG(void) XLogCtlInsert *Insert; CheckPoint checkPoint; bool wasShutdown; + bool didCrash; bool haveTblspcMap; bool haveBackupLabel; XLogRecPtr EndOfLog; @@ -4994,7 +4995,10 @@ StartupXLOG(void) { RemoveTempXlogFiles(); SyncDataDirectory(); + didCrash = true; } + else + didCrash = false; /* * Prepare for WAL recovery if needed. @@ -5106,6 +5110,22 @@ StartupXLOG(void) */ restoreTwoPhaseData(); + /* + * When starting with crash recovery, reset pgstat data - it might not be + * valid. Otherwise restore pgstat data. It's safe to do this here, + * because postmaster will not yet have started any other processes. + * + * NB: Restoring replication slot stats relies on slot state to have + * already been restored from disk. + * + * TODO: With a bit of extra work we could just start with a pgstat file + * associated with the checkpoint redo location we're starting from. + */ + if (didCrash) + pgstat_discard_stats(); + else + pgstat_restore_stats(); + lastFullPageWrites = checkPoint.fullPageWrites; RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; @@ -5180,11 +5200,6 @@ StartupXLOG(void) LocalMinRecoveryPointTLI = 0; } - /* - * Reset pgstat data, because it may be invalid after recovery. 
- */ - pgstat_reset_all(); - /* Check that the GUCs used to generate the WAL allow recovery */ CheckRequiredParameterValues(); @@ -6081,8 +6096,8 @@ LogCheckpointEnd(bool restartpoint) CheckpointStats.ckpt_sync_end_t); /* Accumulate checkpoint timing summary data, in milliseconds. */ - PendingCheckpointerStats.m_checkpoint_write_time += write_msecs; - PendingCheckpointerStats.m_checkpoint_sync_time += sync_msecs; + PendingCheckpointerStats.checkpoint_write_time += write_msecs; + PendingCheckpointerStats.checkpoint_sync_time += sync_msecs; /* * All of the published timing statistics are accounted for. Only @@ -8009,10 +8024,10 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); - WalStats.m_wal_sync_time += INSTR_TIME_GET_MICROSEC(duration); + PendingWalStats.wal_sync_time += INSTR_TIME_GET_MICROSEC(duration); } - WalStats.m_wal_sync++; + PendingWalStats.wal_sync++; } /* diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 04dbbe5530..e0fc7e8d79 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -351,13 +351,6 @@ vacuum(List *relations, VacuumParams *params, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("PROCESS_TOAST required with VACUUM FULL"))); - /* - * Send info about dead objects to the cumulative stats system, unless - * we are in autovacuum --- autovacuum.c does this for itself. - */ - if ((params->options & VACOPT_VACUUM) && !IsAutoVacuumWorkerProcess()) - pgstat_vacuum_stat(); - /* * Create special memory context for cross-transaction storage. * diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 974a29e7a9..6b4f742578 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -28,6 +28,7 @@ #include "access/amapi.h" #include "access/table.h" +#include "access/xact.h" #include "catalog/index.h" #include "commands/vacuum.h" #include "optimizer/paths.h" @@ -35,6 +36,7 @@ #include "storage/bufmgr.h" #include "tcop/tcopprot.h" #include "utils/lsyscache.h" +#include "utils/rel.h" /* * DSM keys for parallel vacuum. Unlike other parallel execution code, since diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index c6d30fa527..f36c40e852 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -44,11 +44,12 @@ * Note that there can be more than one worker in a database concurrently. * They will store the table they are currently vacuuming in shared memory, so * that other workers avoid being blocked waiting for the vacuum lock for that - * table. They will also reload the pgstats data just before vacuuming each - * table, to avoid vacuuming a table that was just finished being vacuumed by - * another worker and thus is no longer noted in shared memory. However, - * there is a window (caused by pgstat delay) on which a worker may choose a - * table that was already vacuumed; this is a bug in the current design. + * table. They will also fetch the last time the table was vacuumed from + * pgstats just before vacuuming each table, to avoid vacuuming a table that + * was just finished being vacuumed by another worker and thus is no longer + * noted in shared memory. However, there is a small window (due to not yet + * holding the relation lock) during which a worker may choose a table that was + * already vacuumed; this is a bug in the current design. 
* * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -129,9 +130,6 @@ int autovacuum_vac_cost_limit; int Log_autovacuum_min_duration = 600000; -/* how long to keep pgstat data in the launcher, in milliseconds */ -#define STATS_READ_DELAY 1000 - /* the minimum allowed time between two awakenings of the launcher */ #define MIN_AUTOVAC_SLEEPTIME 100.0 /* milliseconds */ #define MAX_AUTOVAC_SLEEPTIME 300 /* seconds */ @@ -342,15 +340,11 @@ static void autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy); static AutoVacOpts *extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc); -static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared, - PgStat_StatDBEntry *shared, - PgStat_StatDBEntry *dbentry); static void perform_work_item(AutoVacuumWorkItem *workitem); static void autovac_report_activity(autovac_table *tab); static void autovac_report_workitem(AutoVacuumWorkItem *workitem, const char *nspname, const char *relname); static void avl_sigusr2_handler(SIGNAL_ARGS); -static void autovac_refresh_stats(void); @@ -555,12 +549,6 @@ AutoVacLauncherMain(int argc, char *argv[]) DatabaseListCxt = NULL; dlist_init(&DatabaseList); - /* - * Make sure pgstat also considers our stat data as gone. Note: we - * mustn't use autovac_refresh_stats here. - */ - pgstat_clear_snapshot(); - /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); @@ -611,6 +599,12 @@ AutoVacLauncherMain(int argc, char *argv[]) SetConfigOption("default_transaction_isolation", "read committed", PGC_SUSET, PGC_S_OVERRIDE); + /* + * Even when system is configured to use a different fetch consistency, + * for autovac we always want fresh stats. + */ + SetConfigOption("stats_fetch_consistency", "none", PGC_SUSET, PGC_S_OVERRIDE); + /* * In emergency mode, just start a worker (unless shutdown was requested) * and go away. @@ -963,9 +957,6 @@ rebuild_database_list(Oid newdb) HTAB *dbhash; dlist_iter iter; - /* use fresh stats */ - autovac_refresh_stats(); - newcxt = AllocSetContextCreate(AutovacMemCxt, "Autovacuum database list", ALLOCSET_DEFAULT_SIZES); @@ -1184,9 +1175,6 @@ do_start_worker(void) ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(tmpcxt); - /* use fresh stats */ - autovac_refresh_stats(); - /* Get a list of databases */ dblist = get_database_list(); @@ -1642,6 +1630,12 @@ AutoVacWorkerMain(int argc, char *argv[]) SetConfigOption("synchronous_commit", "local", PGC_SUSET, PGC_S_OVERRIDE); + /* + * Even when system is configured to use a different fetch consistency, + * for autovac we always want fresh stats. + */ + SetConfigOption("stats_fetch_consistency", "none", PGC_SUSET, PGC_S_OVERRIDE); + /* * Get the info about the database we're going to work on. */ @@ -1966,8 +1960,6 @@ do_autovacuum(void) HASHCTL ctl; HTAB *table_toast_map; ListCell *volatile cell; - PgStat_StatDBEntry *shared; - PgStat_StatDBEntry *dbentry; BufferAccessStrategy bstrategy; ScanKeyData key; TupleDesc pg_class_desc; @@ -1986,22 +1978,9 @@ do_autovacuum(void) ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(AutovacMemCxt); - /* - * may be NULL if we couldn't find an entry (only happens if we are - * forcing a vacuum for anti-wrap purposes). - */ - dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); - /* Start a transaction so our commands have one to play into. */ StartTransactionCommand(); - /* - * Clean up any dead statistics entries for this DB. 
We always want to do - * this exactly once per DB-processing cycle, even if we find nothing - * worth vacuuming in the database. - */ - pgstat_vacuum_stat(); - /* * Compute the multixact age for which freezing is urgent. This is * normally autovacuum_multixact_freeze_max_age, but may be less if we are @@ -2039,9 +2018,6 @@ do_autovacuum(void) /* StartTransactionCommand changed elsewhere */ MemoryContextSwitchTo(AutovacMemCxt); - /* The database hash where pgstat keeps shared relations */ - shared = pgstat_fetch_stat_dbentry(InvalidOid); - classRel = table_open(RelationRelationId, AccessShareLock); /* create a copy so we can use it after closing pg_class */ @@ -2119,8 +2095,8 @@ do_autovacuum(void) /* Fetch reloptions and the pgstat entry for this table */ relopts = extract_autovac_opts(tuple, pg_class_desc); - tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared, - shared, dbentry); + tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, + relid); /* Check if it needs vacuum or analyze */ relation_needs_vacanalyze(relid, relopts, classForm, tabentry, @@ -2203,8 +2179,8 @@ do_autovacuum(void) } /* Fetch the pgstat entry for this table */ - tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared, - shared, dbentry); + tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, + relid); relation_needs_vacanalyze(relid, relopts, classForm, tabentry, effective_multixact_freeze_max_age, @@ -2418,12 +2394,8 @@ do_autovacuum(void) /* * Check whether pgstat data still says we need to vacuum this table. * It could have changed if something else processed the table while - * we weren't looking. - * - * Note: we have a special case in pgstat code to ensure that the - * stats we read are as up-to-date as possible, to avoid the problem - * that somebody just finished vacuuming this table. The window to - * the race condition is not closed but it is very small. + * we weren't looking. This doesn't entirely close the race condition, + * but it is very small. */ MemoryContextSwitchTo(AutovacMemCxt); tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc, @@ -2768,29 +2740,6 @@ extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc) return av; } -/* - * get_pgstat_tabentry_relid - * - * Fetch the pgstat entry of a table, either local to a database or shared. - */ -static PgStat_StatTabEntry * -get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared, - PgStat_StatDBEntry *dbentry) -{ - PgStat_StatTabEntry *tabentry = NULL; - - if (isshared) - { - if (PointerIsValid(shared)) - tabentry = hash_search(shared->tables, &relid, - HASH_FIND, NULL); - } - else if (PointerIsValid(dbentry)) - tabentry = hash_search(dbentry->tables, &relid, - HASH_FIND, NULL); - - return tabentry; -} /* * table_recheck_autovac @@ -2812,7 +2761,6 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, autovac_table *tab = NULL; bool wraparound; AutoVacOpts *avopts; - static bool reuse_stats = false; /* fetch the relation's relcache entry */ classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid)); @@ -2836,35 +2784,6 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, avopts = &hentry->ar_reloptions; } - /* - * Reuse the stats to recheck whether a relation needs to be vacuumed or - * analyzed if it was reloaded before and has not been cleared yet. This - * is necessary to avoid frequent refresh of stats, especially when there - * are very large number of relations and the refresh can cause lots of - * overhead. 
- * - * If we determined that a relation needs to be vacuumed or analyzed, - * based on the old stats, we refresh stats and recheck the necessity - * again. Because a relation may have already been vacuumed or analyzed by - * someone since the last reload of stats. - */ - if (reuse_stats) - { - recheck_relation_needs_vacanalyze(relid, avopts, classForm, - effective_multixact_freeze_max_age, - &dovacuum, &doanalyze, &wraparound); - - /* Quick exit if a relation doesn't need to be vacuumed or analyzed */ - if (!doanalyze && !dovacuum) - { - heap_freetuple(classTup); - return NULL; - } - } - - /* Use fresh stats and recheck again */ - autovac_refresh_stats(); - recheck_relation_needs_vacanalyze(relid, avopts, classForm, effective_multixact_freeze_max_age, &dovacuum, &doanalyze, &wraparound); @@ -2962,21 +2881,6 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, tab->at_dobalance = !(avopts && (avopts->vacuum_cost_limit > 0 || avopts->vacuum_cost_delay > 0)); - - /* - * When we decide to do vacuum or analyze, the existing stats cannot - * be reused in the next cycle because it's cleared at the end of - * vacuum or analyze (by AtEOXact_PgStat()). - */ - reuse_stats = false; - } - else - { - /* - * If neither vacuum nor analyze is necessary, the existing stats is - * not cleared and can be reused in the next cycle. - */ - reuse_stats = true; } heap_freetuple(classTup); @@ -3001,17 +2905,10 @@ recheck_relation_needs_vacanalyze(Oid relid, bool *wraparound) { PgStat_StatTabEntry *tabentry; - PgStat_StatDBEntry *shared = NULL; - PgStat_StatDBEntry *dbentry = NULL; - - if (classForm->relisshared) - shared = pgstat_fetch_stat_dbentry(InvalidOid); - else - dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); /* fetch the pgstat table entry */ - tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared, - shared, dbentry); + tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, + relid); relation_needs_vacanalyze(relid, avopts, classForm, tabentry, effective_multixact_freeze_max_age, @@ -3169,11 +3066,11 @@ relation_needs_vacanalyze(Oid relid, } /* - * If we found the table in the stats hash, and autovacuum is currently - * enabled, make a threshold-based decision whether to vacuum and/or - * analyze. If autovacuum is currently disabled, we must be here for - * anti-wraparound vacuuming only, so don't vacuum (or analyze) anything - * that's not being forced. + * If we found stats for the table, and autovacuum is currently enabled, + * make a threshold-based decision whether to vacuum and/or analyze. If + * autovacuum is currently disabled, we must be here for anti-wraparound + * vacuuming only, so don't vacuum (or analyze) anything that's not being + * forced. */ if (PointerIsValid(tabentry) && AutoVacuumingActive()) { @@ -3472,35 +3369,3 @@ AutoVacuumShmemInit(void) else Assert(found); } - -/* - * autovac_refresh_stats - * Refresh pgstats data for an autovacuum process - * - * Cause the next pgstats read operation to obtain fresh data, but throttle - * such refreshing in the autovacuum launcher. This is mostly to avoid - * rereading the pgstats files too many times in quick succession when there - * are many databases. - * - * Note: we avoid throttling in the autovac worker, as it would be - * counterproductive in the recheck logic. 
- */ -static void -autovac_refresh_stats(void) -{ - if (IsAutoVacuumLauncherProcess()) - { - static TimestampTz last_read = 0; - TimestampTz current_time; - - current_time = GetCurrentTimestamp(); - - if (!TimestampDifferenceExceeds(last_read, current_time, - STATS_READ_DELAY)) - return; - - last_read = current_time; - } - - pgstat_clear_snapshot(); -} diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index e733c70368..c937c39f50 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -212,6 +212,16 @@ CheckpointerMain(void) */ last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); + /* + * Write out stats after shutdown. This needs to be called by exactly one + * process during a normal shutdown, and since checkpointer is shut down + * very late... + * + * Walsenders are shut down after the checkpointer, but currently don't + * report stats. If that changes, we need a more complicated solution. + */ + before_shmem_exit(pgstat_before_server_shutdown, 0); + /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid @@ -358,7 +368,7 @@ CheckpointerMain(void) if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) { do_checkpoint = true; - PendingCheckpointerStats.m_requested_checkpoints++; + PendingCheckpointerStats.requested_checkpoints++; } /* @@ -372,7 +382,7 @@ CheckpointerMain(void) if (elapsed_secs >= CheckPointTimeout) { if (!do_checkpoint) - PendingCheckpointerStats.m_timed_checkpoints++; + PendingCheckpointerStats.timed_checkpoints++; do_checkpoint = true; flags |= CHECKPOINT_CAUSE_TIME; } @@ -569,7 +579,7 @@ HandleCheckpointerInterrupts(void) * updates the statistics, increment the checkpoint request and flush * out pending statistic. */ - PendingCheckpointerStats.m_requested_checkpoints++; + PendingCheckpointerStats.requested_checkpoints++; ShutdownXLOG(0, 0); pgstat_report_checkpointer(); pgstat_report_wal(true); @@ -1262,9 +1272,9 @@ AbsorbSyncRequests(void) LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); /* Transfer stats counts into pending pgstats message */ - PendingCheckpointerStats.m_buf_written_backend + PendingCheckpointerStats.buf_written_backend += CheckpointerShmem->num_backend_writes; - PendingCheckpointerStats.m_buf_fsync_backend + PendingCheckpointerStats.buf_fsync_backend += CheckpointerShmem->num_backend_fsync; CheckpointerShmem->num_backend_writes = 0; diff --git a/src/backend/postmaster/interrupt.c b/src/backend/postmaster/interrupt.c index 3f412dad2e..1aed2e2e99 100644 --- a/src/backend/postmaster/interrupt.c +++ b/src/backend/postmaster/interrupt.c @@ -98,9 +98,8 @@ SignalHandlerForCrashExit(SIGNAL_ARGS) * shut down and exit. * * Typically, this handler would be used for SIGTERM, but some processes use - * other signals. In particular, the checkpointer exits on SIGUSR2, the - * stats collector on SIGQUIT, and the WAL writer exits on either SIGINT - * or SIGTERM. + * other signals. In particular, the checkpointer exits on SIGUSR2, and the + * WAL writer exits on either SIGINT or SIGTERM. * * ShutdownRequestPending should be checked at a convenient place within the * main loop, or else the main loop should call HandleMainLoopInterrupts. 
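The checkpointer hunks above follow the usage pattern of the new cumulative stats system: counters are accumulated in a process-local pending struct and flushed explicitly, and the stats file write-out at shutdown is hooked via before_shmem_exit(). A condensed sketch of that pattern follows; the wrapper function and its name are illustrative only, not part of the patch.

#include "pgstat.h"
#include "storage/ipc.h"

/* Illustrative sketch: how an auxiliary process uses the new stats API. */
static void
example_stats_usage(void)
{
	/*
	 * Registered by exactly one process (the checkpointer in this patch),
	 * so the stats file is written out during a normal shutdown.
	 */
	before_shmem_exit(pgstat_before_server_shutdown, 0);

	/*
	 * Accumulate into the process-local pending struct; nothing is sent
	 * over a socket anymore.
	 */
	PendingCheckpointerStats.requested_checkpoints++;

	/* flush the pending counters into shared memory when convenient */
	pgstat_report_checkpointer();
	pgstat_report_wal(true);
}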
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 20c4629e55..a9f3a7ef49 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -1,100 +1,161 @@
 /* ----------
  * pgstat.c
+ *	  Infrastructure for the cumulative statistics system.
  *
- *	All the statistics collector stuff hacked up in one big, ugly file.
+ * The cumulative statistics system accumulates statistics for different kinds
+ * of objects. Some kinds of statistics are collected for a fixed number of
+ * objects (most commonly 1), e.g., checkpointer statistics. Other kinds of
+ * statistics are collected for a varying number of objects
+ * (e.g. relations). See PgStat_KindInfo for a list of currently handled
+ * statistics.
  *
- *	TODO:	- Separate collector, postmaster and backend stuff
- *			  into different files.
+ * Statistics are loaded from the filesystem during startup (by the startup
+ * process), unless preceded by a crash, in which case all stats are
+ * discarded. They are written out by the checkpointer process just before
+ * shutting down, except when shutting down in immediate mode.
  *
- *			- Add some automatic call for pgstat vacuuming.
+ * Fixed-numbered stats are stored in plain (non-dynamic) shared memory.
  *
- *			- Add a pgstat config column to pg_database, so this
- *			  entire thing can be enabled/disabled on a per db basis.
+ * Statistics for variable-numbered objects are stored in dynamic shared
+ * memory and can be found via a dshash hashtable. The statistics counters are
+ * not part of the dshash entry (PgStatShared_HashEntry) directly, but are
+ * separately allocated (PgStatShared_HashEntry->body). The separate
+ * allocation allows different kinds of statistics to be stored in the same
+ * hashtable without wasting space in PgStatShared_HashEntry.
  *
- *	Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ * Variable-numbered stats are addressed by PgStat_HashKey while running. It
+ * is not possible to have statistics for an object that cannot be addressed
+ * that way at runtime. A wider identifier can be used when serializing to
+ * disk (used for replication slot stats).
  *
- *	src/backend/postmaster/pgstat.c
+ * To avoid contention on the shared hashtable, each backend has a
+ * backend-local hashtable (pgStatEntryRefHash) in front of the shared
+ * hashtable, containing references (PgStat_EntryRef) to shared hashtable
+ * entries. The shared hashtable only needs to be accessed when no prior
+ * reference is found in the local hashtable. Besides pointing to the
+ * shared hashtable entry (PgStatShared_HashEntry), PgStat_EntryRef also
+ * contains a pointer to the shared statistics data, as a process-local
+ * address, to reduce access costs.
+ *
+ * The names for structs stored in shared memory are prefixed with
+ * PgStatShared instead of PgStat. Each stats entry in shared memory is
+ * protected by a dedicated lwlock.
+ *
+ * Most stats updates are first accumulated locally in each process as pending
+ * entries, then later flushed to shared memory (just after commit, or by
+ * idle-timeout). This practically eliminates contention on individual stats
+ * entries. For most kinds of variable-numbered statistics, pending data is
+ * stored in PgStat_EntryRef->pending. All entries with pending data are in
+ * the pgStatPending list. Pending statistics updates are flushed out by
+ * pgstat_report_stat().
+ * + * The behavior of different kinds of statistics is determined by the kind's + * entry in pgstat_kind_infos, see PgStat_KindInfo for details. + * + * The consistency of read accesses to statistics can be configured using the + * stats_fetch_consistency GUC (see config.sgml and monitoring.sgml for the + * settings). When using PGSTAT_FETCH_CONSISTENCY_CACHE or + * PGSTAT_FETCH_CONSISTENCY_SNAPSHOT statistics are stored in + * pgStatLocal.snapshot. + * + * To keep things manageable, stats handling is split across several + * files. Infrastructure pieces are in: + * - pgstat.c - this file, to tie it all together + * - pgstat_shmem.c - nearly everything dealing with shared memory, including + * the maintenance of hashtable entries + * - pgstat_xact.c - transactional integration, including the transactional + * creation and dropping of stats entries + * + * Each statistics kind is handled in a dedicated file: + * - pgstat_archiver.c + * - pgstat_bgwriter.c + * - pgstat_checkpointer.c + * - pgstat_database.c + * - pgstat_function.c + * - pgstat_relation.c + * - pgstat_slru.c + * - pgstat_subscription.c + * - pgstat_wal.c + * + * Whenever possible infrastructure files should not contain code related to + * specific kinds of stats. + * + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/postmaster/pgstat.c * ---------- */ #include "postgres.h" #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_SYS_SELECT_H -#include -#endif -#include "access/heapam.h" -#include "access/htup_details.h" -#include "access/tableam.h" #include "access/transam.h" #include "access/xact.h" -#include "catalog/catalog.h" -#include "catalog/pg_database.h" -#include "catalog/pg_proc.h" -#include "catalog/pg_subscription.h" -#include "common/ip.h" -#include "libpq/libpq.h" -#include "libpq/pqsignal.h" -#include "mb/pg_wchar.h" -#include "miscadmin.h" +#include "lib/dshash.h" #include "pgstat.h" -#include "postmaster/autovacuum.h" -#include "postmaster/fork_process.h" -#include "postmaster/interrupt.h" -#include "postmaster/postmaster.h" -#include "replication/slot.h" -#include "replication/walsender.h" -#include "storage/backendid.h" -#include "storage/dsm.h" +#include "port/atomics.h" #include "storage/fd.h" #include "storage/ipc.h" -#include "storage/latch.h" -#include "storage/lmgr.h" +#include "storage/lwlock.h" #include "storage/pg_shmem.h" -#include "storage/proc.h" -#include "storage/procsignal.h" -#include "utils/builtins.h" +#include "storage/shmem.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/pgstat_internal.h" -#include "utils/ps_status.h" -#include "utils/rel.h" -#include "utils/snapmgr.h" #include "utils/timestamp.h" /* ---------- * Timer definitions. + * + * In milliseconds. * ---------- */ -#define PGSTAT_RETRY_DELAY 10 /* How long to wait between checks for a - * new file; in milliseconds. */ +/* minimum interval non-forced stats flushes.*/ +#define PGSTAT_MIN_INTERVAL 1000 +/* how long until to block flushing pending stats updates */ +#define PGSTAT_MAX_INTERVAL 60000 +/* when to call pgstat_report_stat() again, even when idle */ +#define PGSTAT_IDLE_INTERVAL 10000 -#define PGSTAT_MAX_WAIT_TIME 10000 /* Maximum time to wait for a stats - * file update; in milliseconds. */ +/* ---------- + * Initial size hints for the hash tables used in statistics. 
+ * ---------- + */ -#define PGSTAT_INQ_INTERVAL 640 /* How often to ping the collector for a - * new file; in milliseconds. */ +#define PGSTAT_SNAPSHOT_HASH_SIZE 512 -#define PGSTAT_RESTART_INTERVAL 60 /* How often to attempt to restart a - * failed statistics collector; in - * seconds. */ -#define PGSTAT_POLL_LOOP_COUNT (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY) -#define PGSTAT_INQ_LOOP_COUNT (PGSTAT_INQ_INTERVAL / PGSTAT_RETRY_DELAY) +/* hash table for statistics snapshots entry */ +typedef struct PgStat_SnapshotEntry +{ + PgStat_HashKey key; + char status; /* for simplehash use */ + void *data; /* the stats data itself */ +} PgStat_SnapshotEntry; -/* Minimum receive buffer size for the collector's socket. */ -#define PGSTAT_MIN_RCVBUF (100 * 1024) + +/* ---------- + * Backend-local Hash Table Definitions + * ---------- + */ + +/* for stats snapshot entries */ +#define SH_PREFIX pgstat_snapshot +#define SH_ELEMENT_TYPE PgStat_SnapshotEntry +#define SH_KEY_TYPE PgStat_HashKey +#define SH_KEY key +#define SH_HASH_KEY(tb, key) \ + pgstat_hash_hash_key(&key, sizeof(PgStat_HashKey), NULL) +#define SH_EQUAL(tb, a, b) \ + pgstat_cmp_hash_key(&a, &b, sizeof(PgStat_HashKey), NULL) == 0 +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" /* ---------- @@ -102,63 +163,18 @@ * ---------- */ -#ifdef EXEC_BACKEND -static pid_t pgstat_forkexec(void); -#endif +static void pgstat_write_statsfile(void); +static void pgstat_read_statsfile(void); -NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn(); +static void pgstat_reset_after_failure(TimestampTz ts); -static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create); -static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, - Oid tableoid, bool create); -static PgStat_StatSubEntry *pgstat_get_subscription_entry(Oid subid, bool create); -static void pgstat_reset_subscription(PgStat_StatSubEntry *subentry, TimestampTz ts); -static void pgstat_write_statsfiles(bool permanent, bool allDbs); -static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent); -static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep); -static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, - bool permanent); -static void backend_read_statsfile(void); +static bool pgstat_flush_pending_entries(bool nowait); -static bool pgstat_write_statsfile_needed(void); -static bool pgstat_db_requested(Oid databaseid); +static void pgstat_prep_snapshot(void); +static void pgstat_build_snapshot(void); +static void pgstat_build_snapshot_fixed(PgStat_Kind kind); -static PgStat_StatReplSlotEntry *pgstat_get_replslot_entry(NameData name, bool create_it); -static void pgstat_reset_replslot_entry(PgStat_StatReplSlotEntry *slotstats, TimestampTz ts); - -static HTAB *pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid); - -static void pgstat_setup_memcxt(void); - -static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len); -static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len); -static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len); -static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len); -static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len); -static void pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len); -static void pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len); -static void 
pgstat_recv_resetslrucounter(PgStat_MsgResetslrucounter *msg, int len); -static void pgstat_recv_resetreplslotcounter(PgStat_MsgResetreplslotcounter *msg, int len); -static void pgstat_recv_resetsubcounter(PgStat_MsgResetsubcounter *msg, int len); -static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len); -static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len); -static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len); -static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len); -static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len); -static void pgstat_recv_checkpointer(PgStat_MsgCheckpointer *msg, int len); -static void pgstat_recv_wal(PgStat_MsgWal *msg, int len); -static void pgstat_recv_slru(PgStat_MsgSLRU *msg, int len); -static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len); -static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len); -static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len); -static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len); -static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len); -static void pgstat_recv_connect(PgStat_MsgConnect *msg, int len); -static void pgstat_recv_disconnect(PgStat_MsgDisconnect *msg, int len); -static void pgstat_recv_replslot(PgStat_MsgReplSlot *msg, int len); -static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len); -static void pgstat_recv_subscription_drop(PgStat_MsgSubscriptionDrop *msg, int len); -static void pgstat_recv_subscription_error(PgStat_MsgSubscriptionError *msg, int len); +static inline bool pgstat_is_kind_valid(int ikind); /* ---------- @@ -167,6 +183,7 @@ static void pgstat_recv_subscription_error(PgStat_MsgSubscriptionError *msg, int */ bool pgstat_track_counts = false; +int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_NONE; /* ---------- @@ -184,44 +201,33 @@ char *pgstat_stat_tmpname = NULL; * ---------- */ -pgsocket pgStatSock = PGINVALID_SOCKET; +PgStat_LocalState pgStatLocal; /* ---------- * Local data + * + * NB: There should be only variables related to stats infrastructure here, + * not for specific kinds of stats. * ---------- */ -static struct sockaddr_storage pgStatAddr; +/* + * Memory contexts containing the pgStatEntryRefHash table, the + * pgStatSharedRef entries, and pending data respectively. Mostly to make it + * easier to track / attribute memory usage. + */ -static time_t last_pgstat_start_time; - -static bool pgStatRunningInCollector = false; +static MemoryContext pgStatPendingContext = NULL; /* - * Info about current "snapshot" of stats file + * Backend local list of PgStat_EntryRef with unflushed pending stats. + * + * Newly pending entries should only ever be added to the end of the list, + * otherwise pgstat_flush_pending_entries() might not see them immediately. */ -static MemoryContext pgStatLocalContext = NULL; -static HTAB *pgStatDBHash = NULL; +static dlist_head pgStatPending = DLIST_STATIC_INIT(pgStatPending); -/* - * Cluster wide statistics, kept in the stats collector. - * Contains statistics that are not collected per database - * or per table. - */ -static PgStat_ArchiverStats archiverStats; -static PgStat_GlobalStats globalStats; -static PgStat_WalStats walStats; -static PgStat_SLRUStats slruStats[SLRU_NUM_ELEMENTS]; -static HTAB *replSlotStatHash = NULL; -static HTAB *subscriptionStatHash = NULL; - -/* - * List of OIDs of databases we need to write out. 
If an entry is InvalidOid, - * it means to write only the shared-catalog stats ("DB 0"); otherwise, we - * will write both that DB's data and the shared stats. - */ -static List *pending_write_requests = NIL; /* * For assertions that check pgstat is not used before initialization / after @@ -233,455 +239,234 @@ static bool pgstat_is_shutdown = false; #endif +/* + * The different kinds of statistics. + * + * If reasonably possible, handling specific to one kind of stats should go + * through this abstraction, rather than making more of pgstat.c aware. + * + * See comments for struct PgStat_KindInfo for details about the individual + * fields. + * + * XXX: It'd be nicer to define this outside of this file. But there doesn't + * seem to be a great way of doing that, given the split across multiple + * files. + */ +static const PgStat_KindInfo pgstat_kind_infos[PGSTAT_NUM_KINDS] = { + + /* stats kinds for variable-numbered objects */ + + [PGSTAT_KIND_DATABASE] = { + .name = "database", + + .fixed_amount = false, + /* so pg_stat_database entries can be seen in all databases */ + .accessed_across_databases = true, + + .shared_size = sizeof(PgStatShared_Database), + .shared_data_off = offsetof(PgStatShared_Database, stats), + .shared_data_len = sizeof(((PgStatShared_Database *) 0)->stats), + .pending_size = sizeof(PgStat_StatDBEntry), + + .flush_pending_cb = pgstat_database_flush_cb, + .reset_timestamp_cb = pgstat_database_reset_timestamp_cb, + }, + + [PGSTAT_KIND_RELATION] = { + .name = "relation", + + .fixed_amount = false, + + .shared_size = sizeof(PgStatShared_Relation), + .shared_data_off = offsetof(PgStatShared_Relation, stats), + .shared_data_len = sizeof(((PgStatShared_Relation *) 0)->stats), + .pending_size = sizeof(PgStat_TableStatus), + + .flush_pending_cb = pgstat_relation_flush_cb, + .delete_pending_cb = pgstat_relation_delete_pending_cb, + }, + + [PGSTAT_KIND_FUNCTION] = { + .name = "function", + + .fixed_amount = false, + + .shared_size = sizeof(PgStatShared_Function), + .shared_data_off = offsetof(PgStatShared_Function, stats), + .shared_data_len = sizeof(((PgStatShared_Function *) 0)->stats), + .pending_size = sizeof(PgStat_BackendFunctionEntry), + + .flush_pending_cb = pgstat_function_flush_cb, + }, + + [PGSTAT_KIND_REPLSLOT] = { + .name = "replslot", + + .fixed_amount = false, + + .accessed_across_databases = true, + .named_on_disk = true, + + .shared_size = sizeof(PgStatShared_ReplSlot), + .shared_data_off = offsetof(PgStatShared_ReplSlot, stats), + .shared_data_len = sizeof(((PgStatShared_ReplSlot *) 0)->stats), + + .reset_timestamp_cb = pgstat_replslot_reset_timestamp_cb, + .to_serialized_name = pgstat_replslot_to_serialized_name_cb, + .from_serialized_name = pgstat_replslot_from_serialized_name_cb, + }, + + [PGSTAT_KIND_SUBSCRIPTION] = { + .name = "subscription", + + .fixed_amount = false, + /* so pg_stat_subscription_stats entries can be seen in all databases */ + .accessed_across_databases = true, + + .shared_size = sizeof(PgStatShared_Subscription), + .shared_data_off = offsetof(PgStatShared_Subscription, stats), + .shared_data_len = sizeof(((PgStatShared_Subscription *) 0)->stats), + .pending_size = sizeof(PgStat_BackendSubEntry), + + .flush_pending_cb = pgstat_subscription_flush_cb, + .reset_timestamp_cb = pgstat_subscription_reset_timestamp_cb, + }, + + + /* stats for fixed-numbered (mostly 1) objects */ + + [PGSTAT_KIND_ARCHIVER] = { + .name = "archiver", + + .fixed_amount = true, + + .reset_all_cb = pgstat_archiver_reset_all_cb, + .snapshot_cb = 
pgstat_archiver_snapshot_cb, + }, + + [PGSTAT_KIND_BGWRITER] = { + .name = "bgwriter", + + .fixed_amount = true, + + .reset_all_cb = pgstat_bgwriter_reset_all_cb, + .snapshot_cb = pgstat_bgwriter_snapshot_cb, + }, + + [PGSTAT_KIND_CHECKPOINTER] = { + .name = "checkpointer", + + .fixed_amount = true, + + .reset_all_cb = pgstat_checkpointer_reset_all_cb, + .snapshot_cb = pgstat_checkpointer_snapshot_cb, + }, + + [PGSTAT_KIND_SLRU] = { + .name = "slru", + + .fixed_amount = true, + + .reset_all_cb = pgstat_slru_reset_all_cb, + .snapshot_cb = pgstat_slru_snapshot_cb, + }, + + [PGSTAT_KIND_WAL] = { + .name = "wal", + + .fixed_amount = true, + + .reset_all_cb = pgstat_wal_reset_all_cb, + .snapshot_cb = pgstat_wal_snapshot_cb, + }, +}; + + /* ------------------------------------------------------------ - * Public functions called from postmaster follow + * Functions managing the state of the stats system for all backends. * ------------------------------------------------------------ */ /* - * Called from postmaster at startup. Create the resources required - * by the statistics collector process. If unable to do so, do not - * fail --- better to let the postmaster start with stats collection - * disabled. + * Read on-disk stats into memory at server start. + * + * Should only be called by the startup process or in single user mode. */ void -pgstat_init(void) +pgstat_restore_stats(void) +{ + pgstat_read_statsfile(); +} + +/* + * Remove the stats file. This is currently used only if WAL recovery is + * needed after a crash. + * + * Should only be called by the startup process or in single user mode. + */ +void +pgstat_discard_stats(void) { - socklen_t alen; - struct addrinfo *addrs = NULL, - *addr, - hints; int ret; - fd_set rset; - struct timeval tv; - char test_byte; - int sel_res; - int tries = 0; -#define TESTBYTEVAL ((char) 199) + /* NB: this needs to be done even in single user mode */ - /* - * This static assertion verifies that we didn't mess up the calculations - * involved in selecting maximum payload sizes for our UDP messages. - * Because the only consequence of overrunning PGSTAT_MAX_MSG_SIZE would - * be silent performance loss from fragmentation, it seems worth having a - * compile-time cross-check that we didn't. - */ - StaticAssertStmt(sizeof(PgStat_Msg) <= PGSTAT_MAX_MSG_SIZE, - "maximum stats message size exceeds PGSTAT_MAX_MSG_SIZE"); - - /* - * Create the UDP socket for sending and receiving statistic messages - */ - hints.ai_flags = AI_PASSIVE; - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_DGRAM; - hints.ai_protocol = 0; - hints.ai_addrlen = 0; - hints.ai_addr = NULL; - hints.ai_canonname = NULL; - hints.ai_next = NULL; - ret = pg_getaddrinfo_all("localhost", NULL, &hints, &addrs); - if (ret || !addrs) + ret = unlink(PGSTAT_STAT_PERMANENT_FILENAME); + if (ret != 0) { - ereport(LOG, - (errmsg("could not resolve \"localhost\": %s", - gai_strerror(ret)))); - goto startup_failed; - } - - /* - * On some platforms, pg_getaddrinfo_all() may return multiple addresses - * only one of which will actually work (eg, both IPv6 and IPv4 addresses - * when kernel will reject IPv6). Worse, the failure may occur at the - * bind() or perhaps even connect() stage. So we must loop through the - * results till we find a working combination. We will generate LOG - * messages, but no error, for bogus combinations. - */ - for (addr = addrs; addr; addr = addr->ai_next) - { -#ifdef HAVE_UNIX_SOCKETS - /* Ignore AF_UNIX sockets, if any are returned. 
*/ - if (addr->ai_family == AF_UNIX) - continue; -#endif - - if (++tries > 1) - ereport(LOG, - (errmsg("trying another address for the statistics collector"))); - - /* - * Create the socket. - */ - if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) == PGINVALID_SOCKET) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not create socket for statistics collector: %m"))); - continue; - } - - /* - * Bind it to a kernel assigned port on localhost and get the assigned - * port via getsockname(). - */ - if (bind(pgStatSock, addr->ai_addr, addr->ai_addrlen) < 0) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not bind socket for statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - alen = sizeof(pgStatAddr); - if (getsockname(pgStatSock, (struct sockaddr *) &pgStatAddr, &alen) < 0) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not get address of socket for statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - /* - * Connect the socket to its own address. This saves a few cycles by - * not having to respecify the target address on every send. This also - * provides a kernel-level check that only packets from this same - * address will be received. - */ - if (connect(pgStatSock, (struct sockaddr *) &pgStatAddr, alen) < 0) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not connect socket for statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - /* - * Try to send and receive a one-byte test message on the socket. This - * is to catch situations where the socket can be created but will not - * actually pass data (for instance, because kernel packet filtering - * rules prevent it). - */ - test_byte = TESTBYTEVAL; - -retry1: - if (send(pgStatSock, &test_byte, 1, 0) != 1) - { - if (errno == EINTR) - goto retry1; /* if interrupted, just retry */ - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not send test message on socket for statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - /* - * There could possibly be a little delay before the message can be - * received. We arbitrarily allow up to half a second before deciding - * it's broken. - */ - for (;;) /* need a loop to handle EINTR */ - { - FD_ZERO(&rset); - FD_SET(pgStatSock, &rset); - - tv.tv_sec = 0; - tv.tv_usec = 500000; - sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv); - if (sel_res >= 0 || errno != EINTR) - break; - } - if (sel_res < 0) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("select() failed in statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - if (sel_res == 0 || !FD_ISSET(pgStatSock, &rset)) - { - /* - * This is the case we actually think is likely, so take pains to - * give a specific message for it. - * - * errno will not be set meaningfully here, so don't use it. 
- */ - ereport(LOG, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("test message did not get through on socket for statistics collector"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - test_byte++; /* just make sure variable is changed */ - -retry2: - if (recv(pgStatSock, &test_byte, 1, 0) != 1) - { - if (errno == EINTR) - goto retry2; /* if interrupted, just retry */ - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not receive test message on socket for statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - if (test_byte != TESTBYTEVAL) /* strictly paranoia ... */ - { - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("incorrect test message transmission on socket for statistics collector"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - /* If we get here, we have a working socket */ - break; - } - - /* Did we find a working address? */ - if (!addr || pgStatSock == PGINVALID_SOCKET) - goto startup_failed; - - /* - * Set the socket to non-blocking IO. This ensures that if the collector - * falls behind, statistics messages will be discarded; backends won't - * block waiting to send messages to the collector. - */ - if (!pg_set_noblock(pgStatSock)) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not set statistics collector socket to nonblocking mode: %m"))); - goto startup_failed; - } - - /* - * Try to ensure that the socket's receive buffer is at least - * PGSTAT_MIN_RCVBUF bytes, so that it won't easily overflow and lose - * data. Use of UDP protocol means that we are willing to lose data under - * heavy load, but we don't want it to happen just because of ridiculously - * small default buffer sizes (such as 8KB on older Windows versions). - */ - { - int old_rcvbuf; - int new_rcvbuf; - socklen_t rcvbufsize = sizeof(old_rcvbuf); - - if (getsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF, - (char *) &old_rcvbuf, &rcvbufsize) < 0) - { - ereport(LOG, - (errmsg("%s(%s) failed: %m", "getsockopt", "SO_RCVBUF"))); - /* if we can't get existing size, always try to set it */ - old_rcvbuf = 0; - } - - new_rcvbuf = PGSTAT_MIN_RCVBUF; - if (old_rcvbuf < new_rcvbuf) - { - if (setsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF, - (char *) &new_rcvbuf, sizeof(new_rcvbuf)) < 0) - ereport(LOG, - (errmsg("%s(%s) failed: %m", "setsockopt", "SO_RCVBUF"))); - } - } - - pg_freeaddrinfo_all(hints.ai_family, addrs); - - /* Now that we have a long-lived socket, tell fd.c about it. */ - ReserveExternalFD(); - - return; - -startup_failed: - ereport(LOG, - (errmsg("disabling statistics collector for lack of working socket"))); - - if (addrs) - pg_freeaddrinfo_all(hints.ai_family, addrs); - - if (pgStatSock != PGINVALID_SOCKET) - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - - /* - * Adjust GUC variables to suppress useless activity, and for debugging - * purposes (seeing track_counts off is a clue that we failed here). We - * use PGC_S_OVERRIDE because there is no point in trying to turn it back - * on from postgresql.conf without a restart. 
- */ - SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE); -} - -/* - * subroutine for pgstat_reset_all - */ -static void -pgstat_reset_remove_files(const char *directory) -{ - DIR *dir; - struct dirent *entry; - char fname[MAXPGPATH * 2]; - - dir = AllocateDir(directory); - while ((entry = ReadDir(dir, directory)) != NULL) - { - int nchars; - Oid tmp_oid; - - /* - * Skip directory entries that don't match the file names we write. - * See get_dbstat_filename for the database-specific pattern. - */ - if (strncmp(entry->d_name, "global.", 7) == 0) - nchars = 7; + if (errno == ENOENT) + elog(DEBUG2, + "didn't need to unlink permanent stats file \"%s\" - didn't exist", + PGSTAT_STAT_PERMANENT_FILENAME); else - { - nchars = 0; - (void) sscanf(entry->d_name, "db_%u.%n", - &tmp_oid, &nchars); - if (nchars <= 0) - continue; - /* %u allows leading whitespace, so reject that */ - if (strchr("0123456789", entry->d_name[3]) == NULL) - continue; - } - - if (strcmp(entry->d_name + nchars, "tmp") != 0 && - strcmp(entry->d_name + nchars, "stat") != 0) - continue; - - snprintf(fname, sizeof(fname), "%s/%s", directory, - entry->d_name); - unlink(fname); - } - FreeDir(dir); -} - -/* - * Remove the stats files. This is currently used only if WAL - * recovery is needed after a crash. - */ -void -pgstat_reset_all(void) -{ - pgstat_reset_remove_files(pgstat_stat_directory); - pgstat_reset_remove_files(PGSTAT_STAT_PERMANENT_DIRECTORY); -} - -#ifdef EXEC_BACKEND - -/* - * Format up the arglist for, then fork and exec, statistics collector process - */ -static pid_t -pgstat_forkexec(void) -{ - char *av[10]; - int ac = 0; - - av[ac++] = "postgres"; - av[ac++] = "--forkcol"; - av[ac++] = NULL; /* filled in by postmaster_forkexec */ - - av[ac] = NULL; - Assert(ac < lengthof(av)); - - return postmaster_forkexec(ac, av); -} -#endif /* EXEC_BACKEND */ - - -/* - * Called from postmaster at startup or after an existing collector - * died. Attempt to fire up a fresh statistics collector. - * - * Returns PID of child process, or 0 if fail. - * - * Note: if fail, we will be called again from the postmaster main loop. - */ -int -pgstat_start(void) -{ - time_t curtime; - pid_t pgStatPid; - - /* - * Check that the socket is there, else pgstat_init failed and we can do - * nothing useful. - */ - if (pgStatSock == PGINVALID_SOCKET) - return 0; - - /* - * Do nothing if too soon since last collector start. This is a safety - * valve to protect against continuous respawn attempts if the collector - * is dying immediately at launch. Note that since we will be re-called - * from the postmaster main loop, we will get another chance later. - */ - curtime = time(NULL); - if ((unsigned int) (curtime - last_pgstat_start_time) < - (unsigned int) PGSTAT_RESTART_INTERVAL) - return 0; - last_pgstat_start_time = curtime; - - /* - * Okay, fork off the collector. - */ -#ifdef EXEC_BACKEND - switch ((pgStatPid = pgstat_forkexec())) -#else - switch ((pgStatPid = fork_process())) -#endif - { - case -1: ereport(LOG, - (errmsg("could not fork statistics collector: %m"))); - return 0; - -#ifndef EXEC_BACKEND - case 0: - /* in postmaster child ... 
*/ - InitPostmasterChild(); - - /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); - - /* Drop our connection to postmaster's shared memory, as well */ - dsm_detach_all(); - PGSharedMemoryDetach(); - - PgstatCollectorMain(0, NULL); - break; -#endif - - default: - return (int) pgStatPid; + (errcode_for_file_access(), + errmsg("could not unlink permanent statistics file \"%s\": %m", + PGSTAT_STAT_PERMANENT_FILENAME))); + } + else + { + ereport(DEBUG2, + (errcode_for_file_access(), + errmsg("unlinked permanent statistics file \"%s\"", + PGSTAT_STAT_PERMANENT_FILENAME))); } - - /* shouldn't get here */ - return 0; } +/* + * pgstat_before_server_shutdown() needs to be called by exactly one process + * during regular server shutdowns. Otherwise all stats will be lost. + * + * We currently only write out stats for proc_exit(0). We might want to change + * that at some point... But right now pgstat_discard_stats() would be called + * during the start after a disorderly shutdown, anyway. + */ void -allow_immediate_pgstat_restart(void) +pgstat_before_server_shutdown(int code, Datum arg) { - last_pgstat_start_time = 0; + Assert(pgStatLocal.shmem != NULL); + Assert(!pgStatLocal.shmem->is_shutdown); + + /* + * Stats should only be reported after pgstat_initialize() and before + * pgstat_shutdown(). This is a convenient point to catch most violations + * of this rule. + */ + Assert(pgstat_is_initialized && !pgstat_is_shutdown); + + /* flush out our own pending changes before writing out */ + pgstat_report_stat(true); + + /* + * Only write out file during normal shutdown. Don't even signal that + * we've shutdown during irregular shutdowns, because the shutdown + * sequence isn't coordinated to ensure this backend shuts down last. + */ + if (code == 0) + { + pgStatLocal.shmem->is_shutdown = true; + pgstat_write_statsfile(); + } } @@ -701,6 +486,7 @@ static void pgstat_shutdown_hook(int code, Datum arg) { Assert(!pgstat_is_shutdown); + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); /* * If we got as far as discovering our own database ID, we can flush out @@ -709,7 +495,15 @@ pgstat_shutdown_hook(int code, Datum arg) * failed backend starts might never get counted.) */ if (OidIsValid(MyDatabaseId)) - pgstat_report_stat(true); + pgstat_report_disconnect(MyDatabaseId); + + pgstat_report_stat(true); + + /* there shouldn't be any pending changes left */ + Assert(dlist_is_empty(&pgStatPending)); + dlist_init(&pgStatPending); + + pgstat_detach_shmem(); #ifdef USE_ASSERT_CHECKING pgstat_is_shutdown = true; @@ -727,6 +521,8 @@ pgstat_initialize(void) { Assert(!pgstat_is_initialized); + pgstat_attach_shmem(); + pgstat_init_wal(); /* Set up a process-exit hook to clean up */ @@ -745,331 +541,119 @@ pgstat_initialize(void) /* * Must be called by processes that performs DML: tcop/postgres.c, logical - * receiver processes, SPI worker, etc. to send the so far collected - * per-table and function usage statistics to the collector. Note that this - * is called only when not within a transaction, so it is fair to use - * transaction stop time as an approximation of current time. + * receiver processes, SPI worker, etc. to flush pending statistics updates to + * shared memory. * - * "disconnect" is "true" only for the last call before the backend - * exits. This makes sure that no data is lost and that interrupted - * sessions are reported correctly. + * Unless called with 'force', pending stats updates are flushed happen once + * per PGSTAT_MIN_INTERVAL (1000ms). 
When not forced, stats flushes do not + * block on lock acquisition, except if stats updates have been pending for + * longer than PGSTAT_MAX_INTERVAL (60000ms). + * + * Whenever pending stats updates remain at the end of pgstat_report_stat() a + * suggested idle timeout is returned. Currently this is always + * PGSTAT_IDLE_INTERVAL (10000ms). Callers can use the returned time to set up + * a timeout after which to call pgstat_report_stat(true), but are not + * required to to do so. + * + * Note that this is called only when not within a transaction, so it is fair + * to use transaction stop time as an approximation of current time. */ -void -pgstat_report_stat(bool disconnect) +long +pgstat_report_stat(bool force) { - static TimestampTz last_report = 0; - + static TimestampTz pending_since = 0; + static TimestampTz last_flush = 0; + bool partial_flush; TimestampTz now; + bool nowait; pgstat_assert_is_up(); + Assert(!IsTransactionBlock()); + + /* Don't expend a clock check if nothing to do */ + if (dlist_is_empty(&pgStatPending) && + !have_slrustats && + !pgstat_have_pending_wal()) + { + Assert(pending_since == 0); + return 0; + } /* - * Don't expend a clock check if nothing to do. + * There should never be stats to report once stats are shut down. Can't + * assert that before the checks above, as there is an unconditional + * pgstat_report_stat() call in pgstat_shutdown_hook() - which at least + * the process that ran pgstat_before_server_shutdown() will still call. */ - if (!have_relation_stats && - pgStatXactCommit == 0 && pgStatXactRollback == 0 && - !pgstat_have_pending_wal() && - !have_function_stats && !disconnect) - return; + Assert(!pgStatLocal.shmem->is_shutdown); - /* - * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL - * msec since we last sent one, or the backend is about to exit. - */ now = GetCurrentTransactionStopTimestamp(); - if (!disconnect && - !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL)) - return; - last_report = now; + if (!force) + { + if (pending_since > 0 && + TimestampDifferenceExceeds(pending_since, now, PGSTAT_MAX_INTERVAL)) + { + /* don't keep pending updates longer than PGSTAT_MAX_INTERVAL */ + force = true; + } + else if (last_flush > 0 && + !TimestampDifferenceExceeds(last_flush, now, PGSTAT_MIN_INTERVAL)) + { + /* don't flush too frequently */ + if (pending_since == 0) + pending_since = now; - if (disconnect) - pgstat_report_disconnect(MyDatabaseId); + return PGSTAT_IDLE_INTERVAL; + } + } - /* First, send relation statistics */ - pgstat_send_tabstats(now, disconnect); + pgstat_update_dbstats(now); - /* Now, send function statistics */ - pgstat_send_funcstats(); + /* don't wait for lock acquisition when !force */ + nowait = !force; - /* Send WAL statistics */ - pgstat_report_wal(true); + partial_flush = false; - /* Finally send SLRU statistics */ - pgstat_send_slru(); + /* flush database / relation / function / ... stats */ + partial_flush |= pgstat_flush_pending_entries(nowait); + + /* flush wal stats */ + partial_flush |= pgstat_flush_wal(nowait); + + /* flush SLRU stats */ + partial_flush |= pgstat_slru_flush(nowait); + + last_flush = now; + + /* + * If some of the pending stats could not be flushed due to lock + * contention, let the caller know when to retry. 
+ */ + if (partial_flush) + { + /* force should have prevented us from getting here */ + Assert(!force); + + /* remember since when stats have been pending */ + if (pending_since == 0) + pending_since = now; + + return PGSTAT_IDLE_INTERVAL; + } + + pending_since = 0; + + return 0; } /* - * Will tell the collector about objects he can get rid of. + * Only for use by pgstat_reset_counters() */ -void -pgstat_vacuum_stat(void) +static bool +match_db_entries(PgStatShared_HashEntry *entry, Datum match_data) { - HTAB *htab; - PgStat_MsgTabpurge msg; - PgStat_MsgFuncpurge f_msg; - HASH_SEQ_STATUS hstat; - PgStat_StatDBEntry *dbentry; - PgStat_StatTabEntry *tabentry; - PgStat_StatFuncEntry *funcentry; - int len; - - if (pgStatSock == PGINVALID_SOCKET) - return; - - /* - * If not done for this transaction, read the statistics collector stats - * file into some hash tables. - */ - backend_read_statsfile(); - - /* - * Read pg_database and make a list of OIDs of all existing databases - */ - htab = pgstat_collect_oids(DatabaseRelationId, Anum_pg_database_oid); - - /* - * Search the database hash table for dead databases and tell the - * collector to drop them. - */ - hash_seq_init(&hstat, pgStatDBHash); - while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL) - { - Oid dbid = dbentry->databaseid; - - CHECK_FOR_INTERRUPTS(); - - /* the DB entry for shared tables (with InvalidOid) is never dropped */ - if (OidIsValid(dbid) && - hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL) - pgstat_drop_database(dbid); - } - - /* Clean up */ - hash_destroy(htab); - - /* - * Search for all the dead replication slots in stats hashtable and tell - * the stats collector to drop them. - */ - if (replSlotStatHash) - { - PgStat_StatReplSlotEntry *slotentry; - - hash_seq_init(&hstat, replSlotStatHash); - while ((slotentry = (PgStat_StatReplSlotEntry *) hash_seq_search(&hstat)) != NULL) - { - CHECK_FOR_INTERRUPTS(); - - if (SearchNamedReplicationSlot(NameStr(slotentry->slotname), true) == NULL) - { - PgStat_MsgReplSlot msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); - namestrcpy(&msg.m_slotname, NameStr(slotentry->slotname)); - msg.m_create = false; - msg.m_drop = true; - pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); - } - } - } - - /* - * Repeat the above steps for subscriptions, if subscription stats are - * being collected. - */ - if (subscriptionStatHash) - { - PgStat_StatSubEntry *subentry; - - /* - * Read pg_subscription and make a list of OIDs of all existing - * subscriptions. - */ - htab = pgstat_collect_oids(SubscriptionRelationId, Anum_pg_subscription_oid); - - hash_seq_init(&hstat, subscriptionStatHash); - while ((subentry = (PgStat_StatSubEntry *) hash_seq_search(&hstat)) != NULL) - { - CHECK_FOR_INTERRUPTS(); - - if (hash_search(htab, (void *) &(subentry->subid), HASH_FIND, NULL) == NULL) - pgstat_drop_subscription(subentry->subid); - } - - hash_destroy(htab); - } - - /* - * Lookup our own database entry; if not found, nothing more to do. - */ - dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - (void *) &MyDatabaseId, - HASH_FIND, NULL); - if (dbentry == NULL || dbentry->tables == NULL) - return; - - /* - * Similarly to above, make a list of all known relations in this DB. - */ - htab = pgstat_collect_oids(RelationRelationId, Anum_pg_class_oid); - - /* - * Initialize our messages table counter to zero - */ - msg.m_nentries = 0; - - /* - * Check for all tables listed in stats hashtable if they still exist. 
- */ - hash_seq_init(&hstat, dbentry->tables); - while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL) - { - Oid tabid = tabentry->tableid; - - CHECK_FOR_INTERRUPTS(); - - if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL) - continue; - - /* - * Not there, so add this table's Oid to the message - */ - msg.m_tableid[msg.m_nentries++] = tabid; - - /* - * If the message is full, send it out and reinitialize to empty - */ - if (msg.m_nentries >= PGSTAT_NUM_TABPURGE) - { - len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) - + msg.m_nentries * sizeof(Oid); - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, len); - - msg.m_nentries = 0; - } - } - - /* - * Send the rest - */ - if (msg.m_nentries > 0) - { - len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) - + msg.m_nentries * sizeof(Oid); - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, len); - } - - /* Clean up */ - hash_destroy(htab); - - /* - * Now repeat the above steps for functions. However, we needn't bother - * in the common case where no function stats are being collected. - */ - if (dbentry->functions != NULL && - hash_get_num_entries(dbentry->functions) > 0) - { - htab = pgstat_collect_oids(ProcedureRelationId, Anum_pg_proc_oid); - - pgstat_setheader(&f_msg.m_hdr, PGSTAT_MTYPE_FUNCPURGE); - f_msg.m_databaseid = MyDatabaseId; - f_msg.m_nentries = 0; - - hash_seq_init(&hstat, dbentry->functions); - while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL) - { - Oid funcid = funcentry->functionid; - - CHECK_FOR_INTERRUPTS(); - - if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL) - continue; - - /* - * Not there, so add this function's Oid to the message - */ - f_msg.m_functionid[f_msg.m_nentries++] = funcid; - - /* - * If the message is full, send it out and reinitialize to empty - */ - if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE) - { - len = offsetof(PgStat_MsgFuncpurge, m_functionid[0]) - + f_msg.m_nentries * sizeof(Oid); - - pgstat_send(&f_msg, len); - - f_msg.m_nentries = 0; - } - } - - /* - * Send the rest - */ - if (f_msg.m_nentries > 0) - { - len = offsetof(PgStat_MsgFuncpurge, m_functionid[0]) - + f_msg.m_nentries * sizeof(Oid); - - pgstat_send(&f_msg, len); - } - - hash_destroy(htab); - } -} - -/* - * Collect the OIDs of all objects listed in the specified system catalog - * into a temporary hash table. Caller should hash_destroy the result - * when done with it. (However, we make the table in CurrentMemoryContext - * so that it will be freed properly in event of an error.) 
- */ -static HTAB * -pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid) -{ - HTAB *htab; - HASHCTL hash_ctl; - Relation rel; - TableScanDesc scan; - HeapTuple tup; - Snapshot snapshot; - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(Oid); - hash_ctl.hcxt = CurrentMemoryContext; - htab = hash_create("Temporary table of OIDs", - PGSTAT_TAB_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - rel = table_open(catalogid, AccessShareLock); - snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(rel, snapshot, 0, NULL); - while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL) - { - Oid thisoid; - bool isnull; - - thisoid = heap_getattr(tup, anum_oid, RelationGetDescr(rel), &isnull); - Assert(!isnull); - - CHECK_FOR_INTERRUPTS(); - - (void) hash_search(htab, (void *) &thisoid, HASH_ENTER, NULL); - } - table_endscan(scan); - UnregisterSnapshot(snapshot); - table_close(rel, AccessShareLock); - - return htab; + return entry->key.dboid == DatumGetObjectId(MyDatabaseId); } /* @@ -1081,14 +665,11 @@ pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid) void pgstat_reset_counters(void) { - PgStat_MsgResetcounter msg; + TimestampTz ts = GetCurrentTimestamp(); - if (pgStatSock == PGINVALID_SOCKET) - return; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETCOUNTER); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, sizeof(msg)); + pgstat_reset_matching_entries(match_db_entries, + ObjectIdGetDatum(MyDatabaseId), + ts); } /* @@ -1103,38 +684,17 @@ pgstat_reset_counters(void) void pgstat_reset(PgStat_Kind kind, Oid dboid, Oid objoid) { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + TimestampTz ts = GetCurrentTimestamp(); - if (pgStatSock == PGINVALID_SOCKET) - return; + /* not needed atm, and doesn't make sense with the current signature */ + Assert(!pgstat_get_kind_info(kind)->fixed_amount); - switch (kind) - { - case PGSTAT_KIND_FUNCTION: - case PGSTAT_KIND_RELATION: - { - PgStat_MsgResetsinglecounter msg; + /* reset the "single counter" */ + pgstat_reset_entry(kind, dboid, objoid, ts); - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSINGLECOUNTER); - msg.m_databaseid = dboid; - msg.m_resettype = kind; - msg.m_objectid = objoid; - pgstat_send(&msg, sizeof(msg)); - } - break; - - case PGSTAT_KIND_SUBSCRIPTION: - { - PgStat_MsgResetsubcounter msg; - - Assert(dboid == InvalidOid); - msg.m_subid = objoid; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSUBCOUNTER); - } - break; - - default: - elog(ERROR, "unexpected"); - } + if (!kind_info->accessed_across_databases) + pgstat_reset_database_timestamp(dboid, ts); } /* @@ -1146,87 +706,20 @@ pgstat_reset(PgStat_Kind kind, Oid dboid, Oid objoid) void pgstat_reset_of_kind(PgStat_Kind kind) { - if (pgStatSock == PGINVALID_SOCKET) - return; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + TimestampTz ts = GetCurrentTimestamp(); - switch (kind) - { - case PGSTAT_KIND_ARCHIVER: - case PGSTAT_KIND_BGWRITER: - case PGSTAT_KIND_CHECKPOINTER: - case PGSTAT_KIND_WAL: - { - PgStat_MsgResetsharedcounter msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER); - msg.m_resettarget = kind; - pgstat_send(&msg, sizeof(msg)); - } - break; - case PGSTAT_KIND_SLRU: - { - PgStat_MsgResetslrucounter msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSLRUCOUNTER); - msg.m_index = -1; - pgstat_send(&msg, sizeof(msg)); - } - break; - case PGSTAT_KIND_REPLSLOT: - { - PgStat_MsgResetreplslotcounter msg; - - pgstat_setheader(&msg.m_hdr, 
PGSTAT_MTYPE_RESETREPLSLOTCOUNTER); - msg.clearall = true; - pgstat_send(&msg, sizeof(msg)); - } - break; - - case PGSTAT_KIND_SUBSCRIPTION: - { - PgStat_MsgResetsubcounter msg; - - msg.m_subid = InvalidOid; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSUBCOUNTER); - - pgstat_send(&msg, sizeof(msg)); - } - break; - - default: - elog(ERROR, "unexpected"); - } + if (kind_info->fixed_amount) + kind_info->reset_all_cb(ts); + else + pgstat_reset_entries_of_kind(kind, ts); } -/* - * Send some junk data to the collector to increase traffic. + +/* ------------------------------------------------------------ + * Fetching of stats + * ------------------------------------------------------------ */ -void -pgstat_ping(void) -{ - PgStat_MsgDummy msg; - - if (pgStatSock == PGINVALID_SOCKET) - return; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY); - pgstat_send(&msg, sizeof(msg)); -} - -/* - * Notify collector that we need fresh data. - */ -static void -pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid) -{ - PgStat_MsgInquiry msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY); - msg.clock_time = clock_time; - msg.cutoff_time = cutoff_time; - msg.databaseid = databaseid; - pgstat_send(&msg, sizeof(msg)); -} /* * Discard any data collected in the current transaction. Any subsequent @@ -1240,15 +733,19 @@ pgstat_clear_snapshot(void) { pgstat_assert_is_up(); - /* Release memory, if any was allocated */ - if (pgStatLocalContext) - MemoryContextDelete(pgStatLocalContext); + memset(&pgStatLocal.snapshot.fixed_valid, 0, + sizeof(pgStatLocal.snapshot.fixed_valid)); + pgStatLocal.snapshot.stats = NULL; + pgStatLocal.snapshot.mode = PGSTAT_FETCH_CONSISTENCY_NONE; - /* Reset variables */ - pgStatLocalContext = NULL; - pgStatDBHash = NULL; - replSlotStatHash = NULL; - subscriptionStatHash = NULL; + /* Release memory, if any was allocated */ + if (pgStatLocal.snapshot.context) + { + MemoryContextDelete(pgStatLocal.snapshot.context); + + /* Reset variables */ + pgStatLocal.snapshot.context = NULL; + } /* * Historically the backend_status.c facilities lived in this file, and @@ -1258,204 +755,399 @@ pgstat_clear_snapshot(void) pgstat_clear_backend_activity_snapshot(); } -/* - * Support function for the SQL-callable pgstat* functions. Returns - * the collected statistics for one database or NULL. NULL doesn't mean - * that the database doesn't exist, just that there are no statistics, so the - * caller is better off to report ZERO instead. - */ -PgStat_StatDBEntry * -pgstat_fetch_stat_dbentry(Oid dbid) +void * +pgstat_fetch_entry(PgStat_Kind kind, Oid dboid, Oid objoid) { - /* - * If not done for this transaction, read the statistics collector stats - * file into some hash tables. - */ - backend_read_statsfile(); + PgStat_HashKey key; + PgStat_EntryRef *entry_ref; + void *stats_data; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - /* - * Lookup the requested database; return NULL if not found - */ - return (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - (void *) &dbid, - HASH_FIND, NULL); -} + /* should be called from backends */ + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + AssertArg(!kind_info->fixed_amount); -/* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the global statistics struct. 
- */ -PgStat_GlobalStats * -pgstat_fetch_global(void) -{ - backend_read_statsfile(); + pgstat_prep_snapshot(); - return &globalStats; -} + key.kind = kind; + key.dboid = dboid; + key.objoid = objoid; -/* - * Support function for the SQL-callable pgstat* functions. Returns - * the collected statistics for one table or NULL. NULL doesn't mean - * that the table doesn't exist, just that there are no statistics, so the - * caller is better off to report ZERO instead. - */ -PgStat_StatTabEntry * -pgstat_fetch_stat_tabentry(Oid relid) -{ - Oid dbid; - PgStat_StatDBEntry *dbentry; - PgStat_StatTabEntry *tabentry; + /* if we need to build a full snapshot, do so */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + pgstat_build_snapshot(); - /* - * If not done for this transaction, read the statistics collector stats - * file into some hash tables. - */ - backend_read_statsfile(); - - /* - * Lookup our database, then look in its table hash table. - */ - dbid = MyDatabaseId; - dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - (void *) &dbid, - HASH_FIND, NULL); - if (dbentry != NULL && dbentry->tables != NULL) + /* if caching is desired, look up in cache */ + if (pgstat_fetch_consistency > PGSTAT_FETCH_CONSISTENCY_NONE) { - tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - (void *) &relid, - HASH_FIND, NULL); - if (tabentry) - return tabentry; + PgStat_SnapshotEntry *entry = NULL; + + entry = pgstat_snapshot_lookup(pgStatLocal.snapshot.stats, key); + + if (entry) + return entry->data; + + /* + * If we built a full snapshot and the key is not in + * pgStatLocal.snapshot.stats, there are no matching stats. + */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + return NULL; + } + + pgStatLocal.snapshot.mode = pgstat_fetch_consistency; + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL); + + if (entry_ref == NULL || entry_ref->shared_entry->dropped) + { + /* create empty entry when using PGSTAT_FETCH_CONSISTENCY_CACHE */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_CACHE) + { + PgStat_SnapshotEntry *entry = NULL; + bool found; + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, key, &found); + Assert(!found); + entry->data = NULL; + } + return NULL; } /* - * If we didn't find it, maybe it's a shared table. + * Allocate in caller's context for PGSTAT_FETCH_CONSISTENCY_NONE, + * otherwise we could quickly end up with a fair bit of memory used due to + * repeated accesses. */ - dbid = InvalidOid; - dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - (void *) &dbid, - HASH_FIND, NULL); - if (dbentry != NULL && dbentry->tables != NULL) + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE) + stats_data = palloc(kind_info->shared_data_len); + else + stats_data = MemoryContextAlloc(pgStatLocal.snapshot.context, + kind_info->shared_data_len); + memcpy(stats_data, + pgstat_get_entry_data(kind, entry_ref->shared_stats), + kind_info->shared_data_len); + + if (pgstat_fetch_consistency > PGSTAT_FETCH_CONSISTENCY_NONE) { - tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - (void *) &relid, - HASH_FIND, NULL); - if (tabentry) - return tabentry; + PgStat_SnapshotEntry *entry = NULL; + bool found; + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, key, &found); + entry->data = stats_data; } - return NULL; + return stats_data; } - /* - * Support function for the SQL-callable pgstat* functions. Returns - * the collected statistics for one function or NULL. 
+ * If a stats snapshot has been taken, return the timestamp at which that was + * done, and set *have_snapshot to true. Otherwise *have_snapshot is set to + * false. */ -PgStat_StatFuncEntry * -pgstat_fetch_stat_funcentry(Oid func_id) +TimestampTz +pgstat_get_stat_snapshot_timestamp(bool *have_snapshot) { - PgStat_StatDBEntry *dbentry; - PgStat_StatFuncEntry *funcentry = NULL; - - /* load the stats file if needed */ - backend_read_statsfile(); - - /* Lookup our database, then find the requested function. */ - dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); - if (dbentry != NULL && dbentry->functions != NULL) + if (pgStatLocal.snapshot.mode == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) { - funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions, - (void *) &func_id, - HASH_FIND, NULL); + *have_snapshot = true; + return pgStatLocal.snapshot.snapshot_timestamp; } - return funcentry; + *have_snapshot = false; + + return 0; } /* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the archiver statistics struct. + * Ensure snapshot for fixed-numbered 'kind' exists. + * + * Typically used by the pgstat_fetch_* functions for a kind of stats, before + * massaging the data into the desired format. */ -PgStat_ArchiverStats * -pgstat_fetch_stat_archiver(void) +void +pgstat_snapshot_fixed(PgStat_Kind kind) { - backend_read_statsfile(); + AssertArg(pgstat_is_kind_valid(kind)); + AssertArg(pgstat_get_kind_info(kind)->fixed_amount); - return &archiverStats; + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + pgstat_build_snapshot(); + else + pgstat_build_snapshot_fixed(kind); + + Assert(pgStatLocal.snapshot.fixed_valid[kind]); +} + +static void +pgstat_prep_snapshot(void) +{ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE || + pgStatLocal.snapshot.stats != NULL) + return; + + if (!pgStatLocal.snapshot.context) + pgStatLocal.snapshot.context = AllocSetContextCreate(TopMemoryContext, + "PgStat Snapshot", + ALLOCSET_SMALL_SIZES); + + pgStatLocal.snapshot.stats = + pgstat_snapshot_create(pgStatLocal.snapshot.context, + PGSTAT_SNAPSHOT_HASH_SIZE, + NULL); +} + +static void +pgstat_build_snapshot(void) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + + /* should only be called when we need a snapshot */ + Assert(pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT); + + /* snapshot already built */ + if (pgStatLocal.snapshot.mode == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + return; + + pgstat_prep_snapshot(); + + Assert(pgStatLocal.snapshot.stats->members == 0); + + pgStatLocal.snapshot.snapshot_timestamp = GetCurrentTimestamp(); + + /* + * Snapshot all variable stats. + */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + PgStat_Kind kind = p->key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + bool found; + PgStat_SnapshotEntry *entry; + PgStatShared_Common *stats_data; + + /* + * Check if the stats object should be included in the snapshot. + * Unless the stats kind can be accessed from all databases (e.g., + * database stats themselves), we only include stats for the current + * database or objects not associated with a database (e.g. shared + * relations). 
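+	 * For example: with the check below, a relation entry belonging to
+	 * another database is skipped, a shared relation's entry
+	 * (dboid == InvalidOid) is kept, and database entries are kept for
+	 * every database because that stats kind is marked
+	 * accessed_across_databases.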
+ */ + if (p->key.dboid != MyDatabaseId && + p->key.dboid != InvalidOid && + !kind_info->accessed_across_databases) + continue; + + if (p->dropped) + continue; + + Assert(pg_atomic_read_u32(&p->refcount) > 0); + + stats_data = dsa_get_address(pgStatLocal.dsa, p->body); + Assert(stats_data); + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, p->key, &found); + Assert(!found); + + entry->data = MemoryContextAlloc(pgStatLocal.snapshot.context, + kind_info->shared_size); + memcpy(entry->data, + pgstat_get_entry_data(kind, stats_data), + kind_info->shared_size); + } + dshash_seq_term(&hstat); + + /* + * Build snapshot of all fixed-numbered stats. + */ + for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + if (!kind_info->fixed_amount) + { + Assert(kind_info->snapshot_cb == NULL); + continue; + } + + pgstat_build_snapshot_fixed(kind); + } + + pgStatLocal.snapshot.mode = PGSTAT_FETCH_CONSISTENCY_SNAPSHOT; +} + +static void +pgstat_build_snapshot_fixed(PgStat_Kind kind) +{ + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + Assert(kind_info->fixed_amount); + Assert(kind_info->snapshot_cb != NULL); + + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE) + { + /* rebuild every time */ + pgStatLocal.snapshot.fixed_valid[kind] = false; + } + else if (pgStatLocal.snapshot.fixed_valid[kind]) + { + /* in snapshot mode we shouldn't get called again */ + Assert(pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_CACHE); + return; + } + + Assert(!pgStatLocal.snapshot.fixed_valid[kind]); + + kind_info->snapshot_cb(); + + Assert(!pgStatLocal.snapshot.fixed_valid[kind]); + pgStatLocal.snapshot.fixed_valid[kind] = true; +} + + +/* ------------------------------------------------------------ + * Backend-local pending stats infrastructure + * ------------------------------------------------------------ + */ + +/* + * Returns the appropriate PgStat_EntryRef, preparing it to receive pending + * stats if not already done. + * + * If created_entry is non-NULL, it'll be set to true if the entry is newly + * created, false otherwise. + */ +PgStat_EntryRef * +pgstat_prep_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid, bool *created_entry) +{ + PgStat_EntryRef *entry_ref; + + /* need to be able to flush out */ + Assert(pgstat_get_kind_info(kind)->flush_pending_cb != NULL); + + if (unlikely(!pgStatPendingContext)) + { + pgStatPendingContext = + AllocSetContextCreate(CacheMemoryContext, + "PgStat Pending", + ALLOCSET_SMALL_SIZES); + } + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, + true, created_entry); + + if (entry_ref->pending == NULL) + { + size_t entrysize = pgstat_get_kind_info(kind)->pending_size; + + Assert(entrysize != (size_t) -1); + + entry_ref->pending = MemoryContextAllocZero(pgStatPendingContext, entrysize); + dlist_push_tail(&pgStatPending, &entry_ref->pending_node); + } + + return entry_ref; } /* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the bgwriter statistics struct. + * Return an existing stats entry, or NULL. + * + * This should only be used for helper function for pgstatfuncs.c - outside of + * that it shouldn't be needed. 
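+ *
+ * Hypothetical usage sketch (illustrative only; 'relid' stands in for
+ * whatever object id the caller is interested in):
+ *
+ *     PgStat_EntryRef *ref =
+ *         pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, MyDatabaseId, relid);
+ *
+ *     if (ref != NULL)
+ *         ... inspect ref->pending, the not-yet-flushed counters ...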
*/ -PgStat_BgWriterStats * -pgstat_fetch_stat_bgwriter(void) +PgStat_EntryRef * +pgstat_fetch_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid) { - backend_read_statsfile(); + PgStat_EntryRef *entry_ref; - return &globalStats.bgwriter; + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL); + + if (entry_ref == NULL || entry_ref->pending == NULL) + return NULL; + + return entry_ref; +} + +void +pgstat_delete_pending_entry(PgStat_EntryRef *entry_ref) +{ + PgStat_Kind kind = entry_ref->shared_entry->key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + void *pending_data = entry_ref->pending; + + Assert(pending_data != NULL); + /* !fixed_amount stats should be handled explicitly */ + Assert(!pgstat_get_kind_info(kind)->fixed_amount); + + if (kind_info->delete_pending_cb) + kind_info->delete_pending_cb(entry_ref); + + pfree(pending_data); + entry_ref->pending = NULL; + + dlist_delete(&entry_ref->pending_node); } /* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the checkpointer statistics struct. + * Flush out pending stats for database objects (databases, relations, + * functions). */ -PgStat_CheckpointerStats * -pgstat_fetch_stat_checkpointer(void) +static bool +pgstat_flush_pending_entries(bool nowait) { - backend_read_statsfile(); + bool have_pending = false; + dlist_node *cur = NULL; - return &globalStats.checkpointer; -} + /* + * Need to be a bit careful iterating over the list of pending entries. + * Processing a pending entry may queue further pending entries to the end + * of the list that we want to process, so a simple iteration won't do. + * Further complicating matters is that we want to delete the current + * entry in each iteration from the list if we flushed successfully. + * + * So we just keep track of the next pointer in each loop iteration. + */ + if (!dlist_is_empty(&pgStatPending)) + cur = dlist_head_node(&pgStatPending); -/* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the WAL statistics struct. - */ -PgStat_WalStats * -pgstat_fetch_stat_wal(void) -{ - backend_read_statsfile(); + while (cur) + { + PgStat_EntryRef *entry_ref = + dlist_container(PgStat_EntryRef, pending_node, cur); + PgStat_HashKey key = entry_ref->shared_entry->key; + PgStat_Kind kind = key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + bool did_flush; + dlist_node *next; - return &walStats; -} + Assert(!kind_info->fixed_amount); + Assert(kind_info->flush_pending_cb != NULL); -/* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the slru statistics struct. - */ -PgStat_SLRUStats * -pgstat_fetch_slru(void) -{ - backend_read_statsfile(); + /* flush the stats, if possible */ + did_flush = kind_info->flush_pending_cb(entry_ref, nowait); - return slruStats; -} + Assert(did_flush || nowait); -/* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the replication slot statistics struct. 
- */ -PgStat_StatReplSlotEntry * -pgstat_fetch_replslot(NameData slotname) -{ - backend_read_statsfile(); + /* determine next entry, before deleting the pending entry */ + if (dlist_has_next(&pgStatPending, cur)) + next = dlist_next_node(&pgStatPending, cur); + else + next = NULL; - return pgstat_get_replslot_entry(slotname, false); -} + /* if successfully flushed, remove entry */ + if (did_flush) + pgstat_delete_pending_entry(entry_ref); + else + have_pending = true; -/* - * Support function for the SQL-callable pgstat* functions. Returns - * the collected statistics for one subscription or NULL. - */ -PgStat_StatSubEntry * -pgstat_fetch_stat_subscription(Oid subid) -{ - /* Load the stats file if needed */ - backend_read_statsfile(); + cur = next; + } - return pgstat_get_subscription_entry(subid, false); + Assert(dlist_is_empty(&pgStatPending) == !have_pending); + + return have_pending; } @@ -1464,16 +1156,33 @@ pgstat_fetch_stat_subscription(Oid subid) * ------------------------------------------------------------ */ -/* - * Create pgStatLocalContext, if not already done. - */ -static void -pgstat_setup_memcxt(void) +PgStat_Kind +pgstat_get_kind_from_str(char *kind_str) { - if (!pgStatLocalContext) - pgStatLocalContext = AllocSetContextCreate(TopMemoryContext, - "Statistics snapshot", - ALLOCSET_SMALL_SIZES); + for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++) + { + if (pg_strcasecmp(kind_str, pgstat_kind_infos[kind].name) == 0) + return kind; + } + + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid statistics kind: \"%s\"", kind_str))); + return PGSTAT_KIND_DATABASE; /* avoid compiler warnings */ +} + +static inline bool +pgstat_is_kind_valid(int ikind) +{ + return ikind >= PGSTAT_KIND_FIRST_VALID && ikind <= PGSTAT_KIND_LAST; +} + +const PgStat_KindInfo * +pgstat_get_kind_info(PgStat_Kind kind) +{ + AssertArg(pgstat_is_kind_valid(kind)); + + return &pgstat_kind_infos[kind]; } /* @@ -1489,642 +1198,44 @@ pgstat_assert_is_up(void) } #endif -/* - * Set common header fields in a statistics message - */ -void -pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype) -{ - hdr->m_type = mtype; -} - - -/* - * Send out one statistics message to the collector - */ -void -pgstat_send(void *msg, int len) -{ - int rc; - - pgstat_assert_is_up(); - - if (pgStatSock == PGINVALID_SOCKET) - return; - - ((PgStat_MsgHdr *) msg)->m_size = len; - - /* We'll retry after EINTR, but ignore all other failures */ - do - { - rc = send(pgStatSock, msg, len, 0); - } while (rc < 0 && errno == EINTR); - -#ifdef USE_ASSERT_CHECKING - /* In debug builds, log send failures ... */ - if (rc < 0) - elog(LOG, "could not send to statistics collector: %m"); -#endif -} - -/* - * Start up the statistics collector process. This is the body of the - * postmaster child process. - * - * The argc/argv parameters are valid only in EXEC_BACKEND case. - */ -NON_EXEC_STATIC void -PgstatCollectorMain(int argc, char *argv[]) -{ - int len; - PgStat_Msg msg; - int wr; - WaitEvent event; - WaitEventSet *wes; - - /* - * Ignore all signals usually bound to some action in the postmaster, - * except SIGHUP and SIGQUIT. Note we don't need a SIGUSR1 handler to - * support latch operations, because we only use a local latch. 
- */ - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGINT, SIG_IGN); - pqsignal(SIGTERM, SIG_IGN); - pqsignal(SIGQUIT, SignalHandlerForShutdownRequest); - pqsignal(SIGALRM, SIG_IGN); - pqsignal(SIGPIPE, SIG_IGN); - pqsignal(SIGUSR1, SIG_IGN); - pqsignal(SIGUSR2, SIG_IGN); - /* Reset some signals that are accepted by postmaster but not here */ - pqsignal(SIGCHLD, SIG_DFL); - PG_SETMASK(&UnBlockSig); - - MyBackendType = B_STATS_COLLECTOR; - init_ps_display(NULL); - - /* - * Read in existing stats files or initialize the stats to zero. - */ - pgStatRunningInCollector = true; - pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true); - - /* Prepare to wait for our latch or data in our socket. */ - wes = CreateWaitEventSet(CurrentMemoryContext, 3); - AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); - AddWaitEventToSet(wes, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL); - AddWaitEventToSet(wes, WL_SOCKET_READABLE, pgStatSock, NULL, NULL); - - /* - * Loop to process messages until we get SIGQUIT or detect ungraceful - * death of our parent postmaster. - * - * For performance reasons, we don't want to do ResetLatch/WaitLatch after - * every message; instead, do that only after a recv() fails to obtain a - * message. (This effectively means that if backends are sending us stuff - * like mad, we won't notice postmaster death until things slack off a - * bit; which seems fine.) To do that, we have an inner loop that - * iterates as long as recv() succeeds. We do check ConfigReloadPending - * inside the inner loop, which means that such interrupts will get - * serviced but the latch won't get cleared until next time there is a - * break in the action. - */ - for (;;) - { - /* Clear any already-pending wakeups */ - ResetLatch(MyLatch); - - /* - * Quit if we get SIGQUIT from the postmaster. - */ - if (ShutdownRequestPending) - break; - - /* - * Inner loop iterates as long as we keep getting messages, or until - * ShutdownRequestPending becomes set. - */ - while (!ShutdownRequestPending) - { - /* - * Reload configuration if we got SIGHUP from the postmaster. - */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - } - - /* - * Write the stats file(s) if a new request has arrived that is - * not satisfied by existing file(s). - */ - if (pgstat_write_statsfile_needed()) - pgstat_write_statsfiles(false, false); - - /* - * Try to receive and process a message. This will not block, - * since the socket is set to non-blocking mode. - * - * XXX On Windows, we have to force pgwin32_recv to cooperate, - * despite the previous use of pg_set_noblock() on the socket. - * This is extremely broken and should be fixed someday. - */ -#ifdef WIN32 - pgwin32_noblock = 1; -#endif - - len = recv(pgStatSock, (char *) &msg, - sizeof(PgStat_Msg), 0); - -#ifdef WIN32 - pgwin32_noblock = 0; -#endif - - if (len < 0) - { - if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) - break; /* out of inner loop */ - ereport(ERROR, - (errcode_for_socket_access(), - errmsg("could not read statistics message: %m"))); - } - - /* - * We ignore messages that are smaller than our common header - */ - if (len < sizeof(PgStat_MsgHdr)) - continue; - - /* - * The received length must match the length in the header - */ - if (msg.msg_hdr.m_size != len) - continue; - - /* - * O.K. - we accept this message. Process it. 
- */ - switch (msg.msg_hdr.m_type) - { - case PGSTAT_MTYPE_DUMMY: - break; - - case PGSTAT_MTYPE_INQUIRY: - pgstat_recv_inquiry(&msg.msg_inquiry, len); - break; - - case PGSTAT_MTYPE_TABSTAT: - pgstat_recv_tabstat(&msg.msg_tabstat, len); - break; - - case PGSTAT_MTYPE_TABPURGE: - pgstat_recv_tabpurge(&msg.msg_tabpurge, len); - break; - - case PGSTAT_MTYPE_DROPDB: - pgstat_recv_dropdb(&msg.msg_dropdb, len); - break; - - case PGSTAT_MTYPE_RESETCOUNTER: - pgstat_recv_resetcounter(&msg.msg_resetcounter, len); - break; - - case PGSTAT_MTYPE_RESETSHAREDCOUNTER: - pgstat_recv_resetsharedcounter(&msg.msg_resetsharedcounter, - len); - break; - - case PGSTAT_MTYPE_RESETSINGLECOUNTER: - pgstat_recv_resetsinglecounter(&msg.msg_resetsinglecounter, - len); - break; - - case PGSTAT_MTYPE_RESETSLRUCOUNTER: - pgstat_recv_resetslrucounter(&msg.msg_resetslrucounter, - len); - break; - - case PGSTAT_MTYPE_RESETREPLSLOTCOUNTER: - pgstat_recv_resetreplslotcounter(&msg.msg_resetreplslotcounter, - len); - break; - - case PGSTAT_MTYPE_RESETSUBCOUNTER: - pgstat_recv_resetsubcounter(&msg.msg_resetsubcounter, len); - break; - - case PGSTAT_MTYPE_AUTOVAC_START: - pgstat_recv_autovac(&msg.msg_autovacuum_start, len); - break; - - case PGSTAT_MTYPE_VACUUM: - pgstat_recv_vacuum(&msg.msg_vacuum, len); - break; - - case PGSTAT_MTYPE_ANALYZE: - pgstat_recv_analyze(&msg.msg_analyze, len); - break; - - case PGSTAT_MTYPE_ARCHIVER: - pgstat_recv_archiver(&msg.msg_archiver, len); - break; - - case PGSTAT_MTYPE_BGWRITER: - pgstat_recv_bgwriter(&msg.msg_bgwriter, len); - break; - - case PGSTAT_MTYPE_CHECKPOINTER: - pgstat_recv_checkpointer(&msg.msg_checkpointer, len); - break; - - case PGSTAT_MTYPE_WAL: - pgstat_recv_wal(&msg.msg_wal, len); - break; - - case PGSTAT_MTYPE_SLRU: - pgstat_recv_slru(&msg.msg_slru, len); - break; - - case PGSTAT_MTYPE_FUNCSTAT: - pgstat_recv_funcstat(&msg.msg_funcstat, len); - break; - - case PGSTAT_MTYPE_FUNCPURGE: - pgstat_recv_funcpurge(&msg.msg_funcpurge, len); - break; - - case PGSTAT_MTYPE_RECOVERYCONFLICT: - pgstat_recv_recoveryconflict(&msg.msg_recoveryconflict, - len); - break; - - case PGSTAT_MTYPE_DEADLOCK: - pgstat_recv_deadlock(&msg.msg_deadlock, len); - break; - - case PGSTAT_MTYPE_TEMPFILE: - pgstat_recv_tempfile(&msg.msg_tempfile, len); - break; - - case PGSTAT_MTYPE_CHECKSUMFAILURE: - pgstat_recv_checksum_failure(&msg.msg_checksumfailure, - len); - break; - - case PGSTAT_MTYPE_REPLSLOT: - pgstat_recv_replslot(&msg.msg_replslot, len); - break; - - case PGSTAT_MTYPE_CONNECT: - pgstat_recv_connect(&msg.msg_connect, len); - break; - - case PGSTAT_MTYPE_DISCONNECT: - pgstat_recv_disconnect(&msg.msg_disconnect, len); - break; - - case PGSTAT_MTYPE_SUBSCRIPTIONDROP: - pgstat_recv_subscription_drop(&msg.msg_subscriptiondrop, len); - break; - - case PGSTAT_MTYPE_SUBSCRIPTIONERROR: - pgstat_recv_subscription_error(&msg.msg_subscriptionerror, len); - break; - - default: - break; - } - } /* end of inner message-processing loop */ - - /* Sleep until there's something to do */ -#ifndef WIN32 - wr = WaitEventSetWait(wes, -1L, &event, 1, WAIT_EVENT_PGSTAT_MAIN); -#else - - /* - * Windows, at least in its Windows Server 2003 R2 incarnation, - * sometimes loses FD_READ events. Waking up and retrying the recv() - * fixes that, so don't sleep indefinitely. This is a crock of the - * first water, but until somebody wants to debug exactly what's - * happening there, this is the best we can do. 
The two-second - * timeout matches our pre-9.2 behavior, and needs to be short enough - * to not provoke "using stale statistics" complaints from - * backend_read_statsfile. - */ - wr = WaitEventSetWait(wes, 2 * 1000L /* msec */ , &event, 1, - WAIT_EVENT_PGSTAT_MAIN); -#endif - - /* - * Emergency bailout if postmaster has died. This is to avoid the - * necessity for manual cleanup of all postmaster children. - */ - if (wr == 1 && event.events == WL_POSTMASTER_DEATH) - break; - } /* end of outer loop */ - - /* - * Save the final stats to reuse at next startup. - */ - pgstat_write_statsfiles(true, true); - - FreeWaitEventSet(wes); - - exit(0); -} - -/* - * Subroutine to clear stats in a database entry - * - * Tables and functions hashes are initialized to empty. - */ -static void -reset_dbentry_counters(PgStat_StatDBEntry *dbentry) -{ - HASHCTL hash_ctl; - - dbentry->n_xact_commit = 0; - dbentry->n_xact_rollback = 0; - dbentry->n_blocks_fetched = 0; - dbentry->n_blocks_hit = 0; - dbentry->n_tuples_returned = 0; - dbentry->n_tuples_fetched = 0; - dbentry->n_tuples_inserted = 0; - dbentry->n_tuples_updated = 0; - dbentry->n_tuples_deleted = 0; - dbentry->last_autovac_time = 0; - dbentry->n_conflict_tablespace = 0; - dbentry->n_conflict_lock = 0; - dbentry->n_conflict_snapshot = 0; - dbentry->n_conflict_bufferpin = 0; - dbentry->n_conflict_startup_deadlock = 0; - dbentry->n_temp_files = 0; - dbentry->n_temp_bytes = 0; - dbentry->n_deadlocks = 0; - dbentry->n_checksum_failures = 0; - dbentry->last_checksum_failure = 0; - dbentry->n_block_read_time = 0; - dbentry->n_block_write_time = 0; - dbentry->n_sessions = 0; - dbentry->total_session_time = 0; - dbentry->total_active_time = 0; - dbentry->total_idle_in_xact_time = 0; - dbentry->n_sessions_abandoned = 0; - dbentry->n_sessions_fatal = 0; - dbentry->n_sessions_killed = 0; - - dbentry->stat_reset_timestamp = GetCurrentTimestamp(); - dbentry->stats_timestamp = 0; - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatTabEntry); - dbentry->tables = hash_create("Per-database table", - PGSTAT_TAB_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry); - dbentry->functions = hash_create("Per-database function", - PGSTAT_FUNCTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); -} - -/* - * Lookup the hash table entry for the specified database. If no hash - * table entry exists, initialize it, if the create parameter is true. - * Else, return NULL. - */ -static PgStat_StatDBEntry * -pgstat_get_db_entry(Oid databaseid, bool create) -{ - PgStat_StatDBEntry *result; - bool found; - HASHACTION action = (create ? HASH_ENTER : HASH_FIND); - - /* Lookup or create the hash table entry for this database */ - result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - &databaseid, - action, &found); - - if (!create && !found) - return NULL; - - /* - * If not found, initialize the new one. This creates empty hash tables - * for tables and functions, too. - */ - if (!found) - reset_dbentry_counters(result); - - return result; -} - -/* - * Lookup the hash table entry for the specified table. If no hash - * table entry exists, initialize it, if the create parameter is true. - * Else, return NULL. - */ -static PgStat_StatTabEntry * -pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) -{ - PgStat_StatTabEntry *result; - bool found; - HASHACTION action = (create ? 
HASH_ENTER : HASH_FIND); - - /* Lookup or create the hash table entry for this table */ - result = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - &tableoid, - action, &found); - - if (!create && !found) - return NULL; - - /* If not found, initialize the new one. */ - if (!found) - { - result->numscans = 0; - result->tuples_returned = 0; - result->tuples_fetched = 0; - result->tuples_inserted = 0; - result->tuples_updated = 0; - result->tuples_deleted = 0; - result->tuples_hot_updated = 0; - result->n_live_tuples = 0; - result->n_dead_tuples = 0; - result->changes_since_analyze = 0; - result->inserts_since_vacuum = 0; - result->blocks_fetched = 0; - result->blocks_hit = 0; - result->vacuum_timestamp = 0; - result->vacuum_count = 0; - result->autovac_vacuum_timestamp = 0; - result->autovac_vacuum_count = 0; - result->analyze_timestamp = 0; - result->analyze_count = 0; - result->autovac_analyze_timestamp = 0; - result->autovac_analyze_count = 0; - } - - return result; -} - -/* - * Return the entry of replication slot stats with the given name. Return - * NULL if not found and the caller didn't request to create it. - * - * create tells whether to create the new slot entry if it is not found. - */ -static PgStat_StatReplSlotEntry * -pgstat_get_replslot_entry(NameData name, bool create) -{ - PgStat_StatReplSlotEntry *slotent; - bool found; - - if (replSlotStatHash == NULL) - { - HASHCTL hash_ctl; - - /* - * Quick return NULL if the hash table is empty and the caller didn't - * request to create the entry. - */ - if (!create) - return NULL; - - hash_ctl.keysize = sizeof(NameData); - hash_ctl.entrysize = sizeof(PgStat_StatReplSlotEntry); - replSlotStatHash = hash_create("Replication slots hash", - PGSTAT_REPLSLOT_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); - } - - slotent = (PgStat_StatReplSlotEntry *) hash_search(replSlotStatHash, - (void *) &name, - create ? HASH_ENTER : HASH_FIND, - &found); - - if (!slotent) - { - /* not found */ - Assert(!create && !found); - return NULL; - } - - /* initialize the entry */ - if (create && !found) - { - namestrcpy(&(slotent->slotname), NameStr(name)); - pgstat_reset_replslot_entry(slotent, 0); - } - - return slotent; -} - -/* - * Reset the given replication slot stats. - */ -static void -pgstat_reset_replslot_entry(PgStat_StatReplSlotEntry *slotent, TimestampTz ts) -{ - /* reset only counters. Don't clear slot name */ - slotent->spill_txns = 0; - slotent->spill_count = 0; - slotent->spill_bytes = 0; - slotent->stream_txns = 0; - slotent->stream_count = 0; - slotent->stream_bytes = 0; - slotent->total_txns = 0; - slotent->total_bytes = 0; - slotent->stat_reset_timestamp = ts; -} - -/* - * Return the subscription statistics entry with the given subscription OID. - * If no subscription entry exists, initialize it, if the create parameter is - * true. Else, return NULL. - */ -static PgStat_StatSubEntry * -pgstat_get_subscription_entry(Oid subid, bool create) -{ - PgStat_StatSubEntry *subentry; - bool found; - HASHACTION action = (create ? HASH_ENTER : HASH_FIND); - - if (subscriptionStatHash == NULL) - { - HASHCTL hash_ctl; - - /* - * Quick return NULL if the hash table is empty and the caller didn't - * request to create the entry. 
- */ - if (!create) - return NULL; - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatSubEntry); - subscriptionStatHash = hash_create("Subscription hash", - PGSTAT_SUBSCRIPTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); - } - - subentry = (PgStat_StatSubEntry *) hash_search(subscriptionStatHash, - (void *) &subid, - action, &found); - - if (!create && !found) - return NULL; - - /* If not found, initialize the new one */ - if (!found) - pgstat_reset_subscription(subentry, 0); - - return subentry; -} - -/* - * Reset the given subscription stats. - */ -static void -pgstat_reset_subscription(PgStat_StatSubEntry *subentry, TimestampTz ts) -{ - subentry->apply_error_count = 0; - subentry->sync_error_count = 0; - subentry->stat_reset_timestamp = ts; -} - /* ------------------------------------------------------------ * reading and writing of on-disk stats file * ------------------------------------------------------------ */ +/* helpers for pgstat_write_statsfile() */ +static void +write_chunk(FILE *fpout, void *ptr, size_t len) +{ + int rc; + + rc = fwrite(ptr, len, 1, fpout); + + /* we'll check for errors with ferror once at the end */ + (void) rc; +} + +#define write_chunk_s(fpout, ptr) write_chunk(fpout, ptr, sizeof(*ptr)) + /* - * Write the global statistics file, as well as requested DB files. - * - * 'permanent' specifies writing to the permanent files not temporary ones. - * When true (happens only when the collector is shutting down), also remove - * the temporary files so that backends starting up under a new postmaster - * can't read old data before the new collector is ready. - * - * When 'allDbs' is false, only the requested databases (listed in - * pending_write_requests) will be written; otherwise, all databases - * will be written. + * This function is called in the last process that is accessing the shared + * stats so locking is not required. */ static void -pgstat_write_statsfiles(bool permanent, bool allDbs) +pgstat_write_statsfile(void) { - HASH_SEQ_STATUS hstat; - PgStat_StatDBEntry *dbentry; FILE *fpout; int32 format_id; - const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname; - const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; - int rc; + const char *tmpfile = PGSTAT_STAT_PERMANENT_TMPFILE; + const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME; + dshash_seq_status hstat; + PgStatShared_HashEntry *ps; + + pgstat_assert_is_up(); + + /* we're shutting down, so it's ok to just override this */ + pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_NONE; elog(DEBUG2, "writing stats file \"%s\"", statfile); @@ -2141,2066 +1252,352 @@ pgstat_write_statsfiles(bool permanent, bool allDbs) return; } - /* - * Set the timestamp of the stats file. - */ - globalStats.stats_timestamp = GetCurrentTimestamp(); - /* * Write the file header --- currently just a format ID. */ format_id = PGSTAT_FILE_FORMAT_ID; - rc = fwrite(&format_id, sizeof(format_id), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + write_chunk_s(fpout, &format_id); /* - * Write global stats struct + * XXX: The following could now be generalized to just iterate over + * pgstat_kind_infos instead of knowing about the different kinds of + * stats. 
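+	 *
+	 * A rough sketch of what that might look like (hypothetical, not
+	 * implemented here):
+	 *
+	 *     for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++)
+	 *     {
+	 *         const PgStat_KindInfo *info = pgstat_get_kind_info(kind);
+	 *
+	 *         if (!info->fixed_amount)
+	 *             continue;
+	 *         pgstat_build_snapshot_fixed(kind);
+	 *         write_chunk(fpout, ... that kind's snapshot struct ..., info->shared_data_len);
+	 *     }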
*/ - rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout); - (void) rc; /* we'll check for error with ferror */ /* * Write archiver stats struct */ - rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_ARCHIVER); + write_chunk_s(fpout, &pgStatLocal.snapshot.archiver); /* - * Write WAL stats struct + * Write bgwriter stats struct */ - rc = fwrite(&walStats, sizeof(walStats), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_BGWRITER); + write_chunk_s(fpout, &pgStatLocal.snapshot.bgwriter); + + /* + * Write checkpointer stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_CHECKPOINTER); + write_chunk_s(fpout, &pgStatLocal.snapshot.checkpointer); /* * Write SLRU stats struct */ - rc = fwrite(slruStats, sizeof(slruStats), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_SLRU); + write_chunk_s(fpout, &pgStatLocal.snapshot.slru); /* - * Walk through the database table. + * Write WAL stats struct */ - hash_seq_init(&hstat, pgStatDBHash); - while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL) + pgstat_build_snapshot_fixed(PGSTAT_KIND_WAL); + write_chunk_s(fpout, &pgStatLocal.snapshot.wal); + + /* + * Walk through the stats entries + */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((ps = dshash_seq_next(&hstat)) != NULL) { - /* - * Write out the table and function stats for this DB into the - * appropriate per-DB stat file, if required. - */ - if (allDbs || pgstat_db_requested(dbentry->databaseid)) - { - /* Make DB's timestamp consistent with the global stats */ - dbentry->stats_timestamp = globalStats.stats_timestamp; - - pgstat_write_db_statsfile(dbentry, permanent); - } - - /* - * Write out the DB entry. We don't write the tables or functions - * pointers, since they're of no use to any other process. - */ - fputc('D', fpout); - rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - - /* - * Write replication slot stats struct - */ - if (replSlotStatHash) - { - PgStat_StatReplSlotEntry *slotent; - - hash_seq_init(&hstat, replSlotStatHash); - while ((slotent = (PgStat_StatReplSlotEntry *) hash_seq_search(&hstat)) != NULL) - { - fputc('R', fpout); - rc = fwrite(slotent, sizeof(PgStat_StatReplSlotEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - } - - /* - * Write subscription stats struct - */ - if (subscriptionStatHash) - { - PgStat_StatSubEntry *subentry; - - hash_seq_init(&hstat, subscriptionStatHash); - while ((subentry = (PgStat_StatSubEntry *) hash_seq_search(&hstat)) != NULL) - { - fputc('S', fpout); - rc = fwrite(subentry, sizeof(PgStat_StatSubEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - } - - /* - * No more output to be done. Close the temp file and replace the old - * pgstat.stat with it. The ferror() check replaces testing for error - * after each individual fputc or fwrite above. 
- */ - fputc('E', fpout); - - if (ferror(fpout)) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not write temporary statistics file \"%s\": %m", - tmpfile))); - FreeFile(fpout); - unlink(tmpfile); - } - else if (FreeFile(fpout) < 0) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not close temporary statistics file \"%s\": %m", - tmpfile))); - unlink(tmpfile); - } - else if (rename(tmpfile, statfile) < 0) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", - tmpfile, statfile))); - unlink(tmpfile); - } - - if (permanent) - unlink(pgstat_stat_filename); - - /* - * Now throw away the list of requests. Note that requests sent after we - * started the write are still waiting on the network socket. - */ - list_free(pending_write_requests); - pending_write_requests = NIL; -} - -/* - * return the filename for a DB stat file; filename is the output buffer, - * of length len. - */ -static void -get_dbstat_filename(bool permanent, bool tempname, Oid databaseid, - char *filename, int len) -{ - int printed; - - /* NB -- pgstat_reset_remove_files knows about the pattern this uses */ - printed = snprintf(filename, len, "%s/db_%u.%s", - permanent ? PGSTAT_STAT_PERMANENT_DIRECTORY : - pgstat_stat_directory, - databaseid, - tempname ? "tmp" : "stat"); - if (printed >= len) - elog(ERROR, "overlength pgstat path"); -} - -/* - * Write the stat file for a single database. - * - * If writing to the permanent file (happens when the collector is - * shutting down only), remove the temporary file so that backends - * starting up under a new postmaster can't read the old data before - * the new collector is ready. - */ -static void -pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent) -{ - HASH_SEQ_STATUS tstat; - HASH_SEQ_STATUS fstat; - PgStat_StatTabEntry *tabentry; - PgStat_StatFuncEntry *funcentry; - FILE *fpout; - int32 format_id; - Oid dbid = dbentry->databaseid; - int rc; - char tmpfile[MAXPGPATH]; - char statfile[MAXPGPATH]; - - get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH); - get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH); - - elog(DEBUG2, "writing stats file \"%s\"", statfile); - - /* - * Open the statistics temp file to write out the current values. - */ - fpout = AllocateFile(tmpfile, PG_BINARY_W); - if (fpout == NULL) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not open temporary statistics file \"%s\": %m", - tmpfile))); - return; - } - - /* - * Write the file header --- currently just a format ID. - */ - format_id = PGSTAT_FILE_FORMAT_ID; - rc = fwrite(&format_id, sizeof(format_id), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - - /* - * Walk through the database's access stats per table. - */ - hash_seq_init(&tstat, dbentry->tables); - while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL) - { - fputc('T', fpout); - rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - - /* - * Walk through the database's function stats table. - */ - hash_seq_init(&fstat, dbentry->functions); - while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL) - { - fputc('F', fpout); - rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - - /* - * No more output to be done. Close the temp file and replace the old - * pgstat.stat with it. 
The ferror() check replaces testing for error - * after each individual fputc or fwrite above. - */ - fputc('E', fpout); - - if (ferror(fpout)) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not write temporary statistics file \"%s\": %m", - tmpfile))); - FreeFile(fpout); - unlink(tmpfile); - } - else if (FreeFile(fpout) < 0) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not close temporary statistics file \"%s\": %m", - tmpfile))); - unlink(tmpfile); - } - else if (rename(tmpfile, statfile) < 0) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", - tmpfile, statfile))); - unlink(tmpfile); - } - - if (permanent) - { - get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH); - - elog(DEBUG2, "removing temporary stats file \"%s\"", statfile); - unlink(statfile); - } -} - -/* - * Reads in some existing statistics collector files and returns the - * databases hash table that is the top level of the data. - * - * If 'onlydb' is not InvalidOid, it means we only want data for that DB - * plus the shared catalogs ("DB 0"). We'll still populate the DB hash - * table for all databases, but we don't bother even creating table/function - * hash tables for other databases. - * - * 'permanent' specifies reading from the permanent files not temporary ones. - * When true (happens only when the collector is starting up), remove the - * files after reading; the in-memory status is now authoritative, and the - * files would be out of date in case somebody else reads them. - * - * If a 'deep' read is requested, table/function stats are read, otherwise - * the table/function hash tables remain empty. - */ -static HTAB * -pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) -{ - PgStat_StatDBEntry *dbentry; - PgStat_StatDBEntry dbbuf; - HASHCTL hash_ctl; - HTAB *dbhash; - FILE *fpin; - int32 format_id; - bool found; - const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; - int i; - TimestampTz ts; - - /* - * The tables will live in pgStatLocalContext. - */ - pgstat_setup_memcxt(); - - /* - * Create the DB hashtable - */ - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatDBEntry); - hash_ctl.hcxt = pgStatLocalContext; - dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - /* - * Clear out global, archiver, WAL and SLRU statistics so they start from - * zero in case we can't load an existing statsfile. - */ - memset(&globalStats, 0, sizeof(globalStats)); - memset(&archiverStats, 0, sizeof(archiverStats)); - memset(&walStats, 0, sizeof(walStats)); - memset(&slruStats, 0, sizeof(slruStats)); - - /* - * Set the current timestamp (will be kept only in case we can't load an - * existing statsfile). - */ - ts = GetCurrentTimestamp(); - globalStats.bgwriter.stat_reset_timestamp = ts; - archiverStats.stat_reset_timestamp = ts; - walStats.stat_reset_timestamp = ts; - - /* - * Set the same reset timestamp for all SLRU items too. - */ - for (i = 0; i < SLRU_NUM_ELEMENTS; i++) - slruStats[i].stat_reset_timestamp = ts; - - /* - * Try to open the stats file. If it doesn't exist, the backends simply - * return zero for anything and the collector simply starts from scratch - * with empty counters. - * - * ENOENT is a possibility if the stats collector is not running or has - * not yet written the stats file the first time. Any other failure - * condition is suspicious. 
- */ - if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) - { - if (errno != ENOENT) - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errcode_for_file_access(), - errmsg("could not open statistics file \"%s\": %m", - statfile))); - return dbhash; - } - - /* - * Verify it's of the expected format. - */ - if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || - format_id != PGSTAT_FILE_FORMAT_ID) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - goto done; - } - - /* - * Read global stats struct - */ - if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - memset(&globalStats, 0, sizeof(globalStats)); - goto done; - } - - /* - * In the collector, disregard the timestamp we read from the permanent - * stats file; we should be willing to write a temp stats file immediately - * upon the first request from any backend. This only matters if the old - * file's timestamp is less than PGSTAT_STAT_INTERVAL ago, but that's not - * an unusual scenario. - */ - if (pgStatRunningInCollector) - globalStats.stats_timestamp = 0; - - /* - * Read archiver stats struct - */ - if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - memset(&archiverStats, 0, sizeof(archiverStats)); - goto done; - } - - /* - * Read WAL stats struct - */ - if (fread(&walStats, 1, sizeof(walStats), fpin) != sizeof(walStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - memset(&walStats, 0, sizeof(walStats)); - goto done; - } - - /* - * Read SLRU stats struct - */ - if (fread(slruStats, 1, sizeof(slruStats), fpin) != sizeof(slruStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - memset(&slruStats, 0, sizeof(slruStats)); - goto done; - } - - /* - * We found an existing collector stats file. Read it and put all the - * hashtable entries into place. - */ - for (;;) - { - switch (fgetc(fpin)) - { - /* - * 'D' A PgStat_StatDBEntry struct describing a database - * follows. - */ - case 'D': - if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables), - fpin) != offsetof(PgStat_StatDBEntry, tables)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - /* - * Add to the DB hash - */ - dbentry = (PgStat_StatDBEntry *) hash_search(dbhash, - (void *) &dbbuf.databaseid, - HASH_ENTER, - &found); - if (found) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry)); - dbentry->tables = NULL; - dbentry->functions = NULL; - - /* - * In the collector, disregard the timestamp we read from the - * permanent stats file; we should be willing to write a temp - * stats file immediately upon the first request from any - * backend. - */ - if (pgStatRunningInCollector) - dbentry->stats_timestamp = 0; - - /* - * Don't create tables/functions hashtables for uninteresting - * databases. 
- */ - if (onlydb != InvalidOid) - { - if (dbbuf.databaseid != onlydb && - dbbuf.databaseid != InvalidOid) - break; - } - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatTabEntry); - hash_ctl.hcxt = pgStatLocalContext; - dbentry->tables = hash_create("Per-database table", - PGSTAT_TAB_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry); - hash_ctl.hcxt = pgStatLocalContext; - dbentry->functions = hash_create("Per-database function", - PGSTAT_FUNCTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - /* - * If requested, read the data from the database-specific - * file. Otherwise we just leave the hashtables empty. - */ - if (deep) - pgstat_read_db_statsfile(dbentry->databaseid, - dbentry->tables, - dbentry->functions, - permanent); - - break; - - /* - * 'R' A PgStat_StatReplSlotEntry struct describing a - * replication slot follows. - */ - case 'R': - { - PgStat_StatReplSlotEntry slotbuf; - PgStat_StatReplSlotEntry *slotent; - - if (fread(&slotbuf, 1, sizeof(PgStat_StatReplSlotEntry), fpin) - != sizeof(PgStat_StatReplSlotEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - /* Create hash table if we don't have it already. */ - if (replSlotStatHash == NULL) - { - HASHCTL hash_ctl; - - hash_ctl.keysize = sizeof(NameData); - hash_ctl.entrysize = sizeof(PgStat_StatReplSlotEntry); - hash_ctl.hcxt = pgStatLocalContext; - replSlotStatHash = hash_create("Replication slots hash", - PGSTAT_REPLSLOT_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - } - - slotent = (PgStat_StatReplSlotEntry *) hash_search(replSlotStatHash, - (void *) &slotbuf.slotname, - HASH_ENTER, NULL); - memcpy(slotent, &slotbuf, sizeof(PgStat_StatReplSlotEntry)); - break; - } - - /* - * 'S' A PgStat_StatSubEntry struct describing subscription - * statistics. - */ - case 'S': - { - PgStat_StatSubEntry subbuf; - PgStat_StatSubEntry *subentry; - - if (fread(&subbuf, 1, sizeof(PgStat_StatSubEntry), fpin) - != sizeof(PgStat_StatSubEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - if (subscriptionStatHash == NULL) - { - HASHCTL hash_ctl; - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatSubEntry); - hash_ctl.hcxt = pgStatLocalContext; - subscriptionStatHash = hash_create("Subscription hash", - PGSTAT_SUBSCRIPTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - } - - subentry = (PgStat_StatSubEntry *) hash_search(subscriptionStatHash, - (void *) &subbuf.subid, - HASH_ENTER, NULL); - - memcpy(subentry, &subbuf, sizeof(subbuf)); - break; - } - - case 'E': - goto done; - - default: - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - } - -done: - FreeFile(fpin); - - /* If requested to read the permanent file, also get rid of it. */ - if (permanent) - { - elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); - unlink(statfile); - } - - return dbhash; -} - - -/* - * Reads in the existing statistics collector file for the given database, - * filling the passed-in tables and functions hash tables. - * - * As in pgstat_read_statsfiles, if the permanent file is requested, it is - * removed after reading. 
- * - * Note: this code has the ability to skip storing per-table or per-function - * data, if NULL is passed for the corresponding hashtable. That's not used - * at the moment though. - */ -static void -pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, - bool permanent) -{ - PgStat_StatTabEntry *tabentry; - PgStat_StatTabEntry tabbuf; - PgStat_StatFuncEntry funcbuf; - PgStat_StatFuncEntry *funcentry; - FILE *fpin; - int32 format_id; - bool found; - char statfile[MAXPGPATH]; - - get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH); - - /* - * Try to open the stats file. If it doesn't exist, the backends simply - * return zero for anything and the collector simply starts from scratch - * with empty counters. - * - * ENOENT is a possibility if the stats collector is not running or has - * not yet written the stats file the first time. Any other failure - * condition is suspicious. - */ - if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) - { - if (errno != ENOENT) - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errcode_for_file_access(), - errmsg("could not open statistics file \"%s\": %m", - statfile))); - return; - } - - /* - * Verify it's of the expected format. - */ - if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || - format_id != PGSTAT_FILE_FORMAT_ID) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - goto done; - } - - /* - * We found an existing collector stats file. Read it and put all the - * hashtable entries into place. - */ - for (;;) - { - switch (fgetc(fpin)) - { - /* - * 'T' A PgStat_StatTabEntry follows. - */ - case 'T': - if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry), - fpin) != sizeof(PgStat_StatTabEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - /* - * Skip if table data not wanted. - */ - if (tabhash == NULL) - break; - - tabentry = (PgStat_StatTabEntry *) hash_search(tabhash, - (void *) &tabbuf.tableid, - HASH_ENTER, &found); - - if (found) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - memcpy(tabentry, &tabbuf, sizeof(tabbuf)); - break; - - /* - * 'F' A PgStat_StatFuncEntry follows. - */ - case 'F': - if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry), - fpin) != sizeof(PgStat_StatFuncEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - /* - * Skip if function data not wanted. - */ - if (funchash == NULL) - break; - - funcentry = (PgStat_StatFuncEntry *) hash_search(funchash, - (void *) &funcbuf.functionid, - HASH_ENTER, &found); - - if (found) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - memcpy(funcentry, &funcbuf, sizeof(funcbuf)); - break; - - /* - * 'E' The EOF marker of a complete stats file. - */ - case 'E': - goto done; - - default: - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - } - -done: - FreeFile(fpin); - - if (permanent) - { - elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); - unlink(statfile); - } -} - -/* - * Attempt to determine the timestamp of the last db statfile write. - * Returns true if successful; the timestamp is stored in *ts. 
The caller must - * rely on timestamp stored in *ts iff the function returns true. - * - * This needs to be careful about handling databases for which no stats file - * exists, such as databases without a stat entry or those not yet written: - * - * - if there's a database entry in the global file, return the corresponding - * stats_timestamp value. - * - * - if there's no db stat entry (e.g. for a new or inactive database), - * there's no stats_timestamp value, but also nothing to write so we return - * the timestamp of the global statfile. - */ -static bool -pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, - TimestampTz *ts) -{ - PgStat_StatDBEntry dbentry; - PgStat_GlobalStats myGlobalStats; - PgStat_ArchiverStats myArchiverStats; - PgStat_WalStats myWalStats; - PgStat_SLRUStats mySLRUStats[SLRU_NUM_ELEMENTS]; - PgStat_StatReplSlotEntry myReplSlotStats; - PgStat_StatSubEntry mySubStats; - FILE *fpin; - int32 format_id; - const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; - - /* - * Try to open the stats file. As above, anything but ENOENT is worthy of - * complaining about. - */ - if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) - { - if (errno != ENOENT) - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errcode_for_file_access(), - errmsg("could not open statistics file \"%s\": %m", - statfile))); - return false; - } - - /* - * Verify it's of the expected format. - */ - if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || - format_id != PGSTAT_FILE_FORMAT_ID) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* - * Read global stats struct - */ - if (fread(&myGlobalStats, 1, sizeof(myGlobalStats), - fpin) != sizeof(myGlobalStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* - * Read archiver stats struct - */ - if (fread(&myArchiverStats, 1, sizeof(myArchiverStats), - fpin) != sizeof(myArchiverStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* - * Read WAL stats struct - */ - if (fread(&myWalStats, 1, sizeof(myWalStats), fpin) != sizeof(myWalStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* - * Read SLRU stats struct - */ - if (fread(mySLRUStats, 1, sizeof(mySLRUStats), fpin) != sizeof(mySLRUStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* By default, we're going to return the timestamp of the global file. */ - *ts = myGlobalStats.stats_timestamp; - - /* - * We found an existing collector stats file. Read it and look for a - * record for the requested database. If found, use its timestamp. - */ - for (;;) - { - switch (fgetc(fpin)) - { - /* - * 'D' A PgStat_StatDBEntry struct describing a database - * follows. - */ - case 'D': - if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables), - fpin) != offsetof(PgStat_StatDBEntry, tables)) - { - ereport(pgStatRunningInCollector ? 
LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - FreeFile(fpin); - return false; - } - - /* - * If this is the DB we're looking for, save its timestamp and - * we're done. - */ - if (dbentry.databaseid == databaseid) - { - *ts = dbentry.stats_timestamp; - goto done; - } - - break; - - /* - * 'R' A PgStat_StatReplSlotEntry struct describing a - * replication slot follows. - */ - case 'R': - if (fread(&myReplSlotStats, 1, sizeof(PgStat_StatReplSlotEntry), fpin) - != sizeof(PgStat_StatReplSlotEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - FreeFile(fpin); - return false; - } - break; - - /* - * 'S' A PgStat_StatSubEntry struct describing subscription - * statistics follows. - */ - case 'S': - if (fread(&mySubStats, 1, sizeof(PgStat_StatSubEntry), fpin) - != sizeof(PgStat_StatSubEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - FreeFile(fpin); - return false; - } - break; - - case 'E': - goto done; - - default: - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - FreeFile(fpin); - return false; - } - } - } - -done: - FreeFile(fpin); - return true; -} - -/* - * If not already done, read the statistics collector stats file into - * some hash tables. The results will be kept until pgstat_clear_snapshot() - * is called (typically, at end of transaction). - */ -static void -backend_read_statsfile(void) -{ - TimestampTz min_ts = 0; - TimestampTz ref_ts = 0; - Oid inquiry_db; - int count; - - pgstat_assert_is_up(); - - /* already read it? */ - if (pgStatDBHash) - return; - Assert(!pgStatRunningInCollector); - - /* - * In a normal backend, we check staleness of the data for our own DB, and - * so we send MyDatabaseId in inquiry messages. In the autovac launcher, - * check staleness of the shared-catalog data, and send InvalidOid in - * inquiry messages so as not to force writing unnecessary data. - */ - if (IsAutoVacuumLauncherProcess()) - inquiry_db = InvalidOid; - else - inquiry_db = MyDatabaseId; - - /* - * Loop until fresh enough stats file is available or we ran out of time. - * The stats inquiry message is sent repeatedly in case collector drops - * it; but not every single time, as that just swamps the collector. - */ - for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++) - { - bool ok; - TimestampTz file_ts = 0; - TimestampTz cur_ts; + PgStatShared_Common *shstats; + const PgStat_KindInfo *kind_info = NULL; CHECK_FOR_INTERRUPTS(); - ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts); + /* we may have some "dropped" entries not yet removed, skip them */ + Assert(!ps->dropped); + if (ps->dropped) + continue; - cur_ts = GetCurrentTimestamp(); - /* Calculate min acceptable timestamp, if we didn't already */ - if (count == 0 || cur_ts < ref_ts) + shstats = (PgStatShared_Common *) dsa_get_address(pgStatLocal.dsa, ps->body); + + kind_info = pgstat_get_kind_info(ps->key.kind); + + /* if not dropped the valid-entry refcount should exist */ + Assert(pg_atomic_read_u32(&ps->refcount) > 0); + + if (!kind_info->to_serialized_name) { - /* - * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL - * msec before now. This indirectly ensures that the collector - * needn't write the file more often than PGSTAT_STAT_INTERVAL. 
In - * an autovacuum worker, however, we want a lower delay to avoid - * using stale data, so we use PGSTAT_RETRY_DELAY (since the - * number of workers is low, this shouldn't be a problem). - * - * We don't recompute min_ts after sleeping, except in the - * unlikely case that cur_ts went backwards. So we might end up - * accepting a file a bit older than PGSTAT_STAT_INTERVAL. In - * practice that shouldn't happen, though, as long as the sleep - * time is less than PGSTAT_STAT_INTERVAL; and we don't want to - * tell the collector that our cutoff time is less than what we'd - * actually accept. - */ - ref_ts = cur_ts; - if (IsAutoVacuumWorkerProcess()) - min_ts = TimestampTzPlusMilliseconds(ref_ts, - -PGSTAT_RETRY_DELAY); - else - min_ts = TimestampTzPlusMilliseconds(ref_ts, - -PGSTAT_STAT_INTERVAL); + /* normal stats entry, identified by PgStat_HashKey */ + fputc('S', fpout); + write_chunk_s(fpout, &ps->key); + } + else + { + /* stats entry identified by name on disk (e.g. slots) */ + NameData name; + + kind_info->to_serialized_name(shstats, &name); + + fputc('N', fpout); + write_chunk_s(fpout, &ps->key.kind); + write_chunk_s(fpout, &name); } - /* - * If the file timestamp is actually newer than cur_ts, we must have - * had a clock glitch (system time went backwards) or there is clock - * skew between our processor and the stats collector's processor. - * Accept the file, but send an inquiry message anyway to make - * pgstat_recv_inquiry do a sanity check on the collector's time. - */ - if (ok && file_ts > cur_ts) - { - /* - * A small amount of clock skew between processors isn't terribly - * surprising, but a large difference is worth logging. We - * arbitrarily define "large" as 1000 msec. - */ - if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000)) - { - char *filetime; - char *mytime; - - /* Copy because timestamptz_to_str returns a static buffer */ - filetime = pstrdup(timestamptz_to_str(file_ts)); - mytime = pstrdup(timestamptz_to_str(cur_ts)); - ereport(LOG, - (errmsg("statistics collector's time %s is later than backend local time %s", - filetime, mytime))); - pfree(filetime); - pfree(mytime); - } - - pgstat_send_inquiry(cur_ts, min_ts, inquiry_db); - break; - } - - /* Normal acceptance case: file is not older than cutoff time */ - if (ok && file_ts >= min_ts) - break; - - /* Not there or too old, so kick the collector and wait a bit */ - if ((count % PGSTAT_INQ_LOOP_COUNT) == 0) - pgstat_send_inquiry(cur_ts, min_ts, inquiry_db); - - pg_usleep(PGSTAT_RETRY_DELAY * 1000L); + /* Write except the header part of the entry */ + write_chunk(fpout, + pgstat_get_entry_data(ps->key.kind, shstats), + pgstat_get_entry_len(ps->key.kind)); } + dshash_seq_term(&hstat); - if (count >= PGSTAT_POLL_LOOP_COUNT) + /* + * No more output to be done. Close the temp file and replace the old + * pgstat.stat with it. The ferror() check replaces testing for error + * after each individual fputc or fwrite (in write_chunk()) above. + */ + fputc('E', fpout); + + if (ferror(fpout)) + { ereport(LOG, - (errmsg("using stale statistics instead of current ones " - "because stats collector is not responding"))); - - /* - * Autovacuum launcher wants stats about all databases, but a shallow read - * is sufficient. Regular backends want a deep read for just the tables - * they can see (MyDatabaseId + shared catalogs). 
- */ - if (IsAutoVacuumLauncherProcess()) - pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false); - else - pgStatDBHash = pgstat_read_statsfiles(MyDatabaseId, false, true); + (errcode_for_file_access(), + errmsg("could not write temporary statistics file \"%s\": %m", + tmpfile))); + FreeFile(fpout); + unlink(tmpfile); + } + else if (FreeFile(fpout) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close temporary statistics file \"%s\": %m", + tmpfile))); + unlink(tmpfile); + } + else if (rename(tmpfile, statfile) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", + tmpfile, statfile))); + unlink(tmpfile); + } } -/* - * Do we need to write out any stats files? - */ +/* helpers for pgstat_read_statsfile() */ static bool -pgstat_write_statsfile_needed(void) +read_chunk(FILE *fpin, void *ptr, size_t len) { - if (pending_write_requests != NIL) - return true; - - /* Everything was written recently */ - return false; + return fread(ptr, 1, len, fpin) == len; } -/* - * Checks whether stats for a particular DB need to be written to a file. - */ -static bool -pgstat_db_requested(Oid databaseid) -{ - /* - * If any requests are outstanding at all, we should write the stats for - * shared catalogs (the "database" with OID 0). This ensures that - * backends will see up-to-date stats for shared catalogs, even though - * they send inquiry messages mentioning only their own DB. - */ - if (databaseid == InvalidOid && pending_write_requests != NIL) - return true; - - /* Search to see if there's an open request to write this database. */ - if (list_member_oid(pending_write_requests, databaseid)) - return true; - - return false; -} - - -/* ------------------------------------------------------------ - * stats collector message processing functions - * ------------------------------------------------------------ - */ +#define read_chunk_s(fpin, ptr) read_chunk(fpin, ptr, sizeof(*ptr)) /* - * Process stat inquiry requests. + * Reads in existing statistics file into the shared stats hash. + * + * This function is called in the only process that is accessing the shared + * stats so locking is not required. */ static void -pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len) +pgstat_read_statsfile(void) { - PgStat_StatDBEntry *dbentry; - - elog(DEBUG2, "received inquiry for database %u", msg->databaseid); - - /* - * If there's already a write request for this DB, there's nothing to do. - * - * Note that if a request is found, we return early and skip the below - * check for clock skew. This is okay, since the only way for a DB - * request to be present in the list is that we have been here since the - * last write round. It seems sufficient to check for clock skew once per - * write round. - */ - if (list_member_oid(pending_write_requests, msg->databaseid)) - return; - - /* - * Check to see if we last wrote this database at a time >= the requested - * cutoff time. If so, this is a stale request that was generated before - * we updated the DB file, and we don't need to do so again. - * - * If the requestor's local clock time is older than stats_timestamp, we - * should suspect a clock glitch, ie system time going backwards; though - * the more likely explanation is just delayed message receipt. It is - * worth expending a GetCurrentTimestamp call to be sure, since a large - * retreat in the system clock reading could otherwise cause us to neglect - * to update the stats file for a long time. 
- */ - dbentry = pgstat_get_db_entry(msg->databaseid, false); - if (dbentry == NULL) - { - /* - * We have no data for this DB. Enter a write request anyway so that - * the global stats will get updated. This is needed to prevent - * backend_read_statsfile from waiting for data that we cannot supply, - * in the case of a new DB that nobody has yet reported any stats for. - * See the behavior of pgstat_read_db_statsfile_timestamp. - */ - } - else if (msg->clock_time < dbentry->stats_timestamp) - { - TimestampTz cur_ts = GetCurrentTimestamp(); - - if (cur_ts < dbentry->stats_timestamp) - { - /* - * Sure enough, time went backwards. Force a new stats file write - * to get back in sync; but first, log a complaint. - */ - char *writetime; - char *mytime; - - /* Copy because timestamptz_to_str returns a static buffer */ - writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp)); - mytime = pstrdup(timestamptz_to_str(cur_ts)); - ereport(LOG, - (errmsg("stats_timestamp %s is later than collector's time %s for database %u", - writetime, mytime, dbentry->databaseid))); - pfree(writetime); - pfree(mytime); - } - else - { - /* - * Nope, it's just an old request. Assuming msg's clock_time is - * >= its cutoff_time, it must be stale, so we can ignore it. - */ - return; - } - } - else if (msg->cutoff_time <= dbentry->stats_timestamp) - { - /* Stale request, ignore it */ - return; - } - - /* - * We need to write this DB, so create a request. - */ - pending_write_requests = lappend_oid(pending_write_requests, - msg->databaseid); -} - -/* - * Count what the backend has done. - */ -static void -pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - PgStat_StatTabEntry *tabentry; - int i; + FILE *fpin; + int32 format_id; bool found; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - /* - * Update database-wide stats. - */ - dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit); - dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback); - dbentry->n_block_read_time += msg->m_block_read_time; - dbentry->n_block_write_time += msg->m_block_write_time; - - dbentry->total_session_time += msg->m_session_time; - dbentry->total_active_time += msg->m_active_time; - dbentry->total_idle_in_xact_time += msg->m_idle_in_xact_time; - - /* - * Process all table entries in the message. - */ - for (i = 0; i < msg->m_nentries; i++) - { - PgStat_TableEntry *tabmsg = &(msg->m_entry[i]); - - tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - (void *) &(tabmsg->t_id), - HASH_ENTER, &found); - - if (!found) - { - /* - * If it's a new table entry, initialize counters to the values we - * just got. 
- */ - tabentry->numscans = tabmsg->t_counts.t_numscans; - tabentry->tuples_returned = tabmsg->t_counts.t_tuples_returned; - tabentry->tuples_fetched = tabmsg->t_counts.t_tuples_fetched; - tabentry->tuples_inserted = tabmsg->t_counts.t_tuples_inserted; - tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated; - tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted; - tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated; - tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples; - tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples; - tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples; - tabentry->inserts_since_vacuum = tabmsg->t_counts.t_tuples_inserted; - tabentry->blocks_fetched = tabmsg->t_counts.t_blocks_fetched; - tabentry->blocks_hit = tabmsg->t_counts.t_blocks_hit; - - tabentry->vacuum_timestamp = 0; - tabentry->vacuum_count = 0; - tabentry->autovac_vacuum_timestamp = 0; - tabentry->autovac_vacuum_count = 0; - tabentry->analyze_timestamp = 0; - tabentry->analyze_count = 0; - tabentry->autovac_analyze_timestamp = 0; - tabentry->autovac_analyze_count = 0; - } - else - { - /* - * Otherwise add the values to the existing entry. - */ - tabentry->numscans += tabmsg->t_counts.t_numscans; - tabentry->tuples_returned += tabmsg->t_counts.t_tuples_returned; - tabentry->tuples_fetched += tabmsg->t_counts.t_tuples_fetched; - tabentry->tuples_inserted += tabmsg->t_counts.t_tuples_inserted; - tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated; - tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted; - tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated; - - /* - * If table was truncated/dropped, first reset the live/dead - * counters. - */ - if (tabmsg->t_counts.t_truncdropped) - { - tabentry->n_live_tuples = 0; - tabentry->n_dead_tuples = 0; - tabentry->inserts_since_vacuum = 0; - } - tabentry->n_live_tuples += tabmsg->t_counts.t_delta_live_tuples; - tabentry->n_dead_tuples += tabmsg->t_counts.t_delta_dead_tuples; - tabentry->changes_since_analyze += tabmsg->t_counts.t_changed_tuples; - tabentry->inserts_since_vacuum += tabmsg->t_counts.t_tuples_inserted; - tabentry->blocks_fetched += tabmsg->t_counts.t_blocks_fetched; - tabentry->blocks_hit += tabmsg->t_counts.t_blocks_hit; - } - - /* Clamp n_live_tuples in case of negative delta_live_tuples */ - tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); - /* Likewise for n_dead_tuples */ - tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); - - /* - * Add per-table stats to the per-database entry, too. - */ - dbentry->n_tuples_returned += tabmsg->t_counts.t_tuples_returned; - dbentry->n_tuples_fetched += tabmsg->t_counts.t_tuples_fetched; - dbentry->n_tuples_inserted += tabmsg->t_counts.t_tuples_inserted; - dbentry->n_tuples_updated += tabmsg->t_counts.t_tuples_updated; - dbentry->n_tuples_deleted += tabmsg->t_counts.t_tuples_deleted; - dbentry->n_blocks_fetched += tabmsg->t_counts.t_blocks_fetched; - dbentry->n_blocks_hit += tabmsg->t_counts.t_blocks_hit; - } -} - -/* - * Arrange for dead table removal. - */ -static void -pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - int i; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, false); - - /* - * No need to purge if we don't even know the database. - */ - if (!dbentry || !dbentry->tables) - return; - - /* - * Process all table entries in the message. 
- */ - for (i = 0; i < msg->m_nentries; i++) - { - /* Remove from hashtable if present; we don't care if it's not. */ - (void) hash_search(dbentry->tables, - (void *) &(msg->m_tableid[i]), - HASH_REMOVE, NULL); - } -} - -/* - * Arrange for dead database removal - */ -static void -pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len) -{ - Oid dbid = msg->m_databaseid; - PgStat_StatDBEntry *dbentry; - - /* - * Lookup the database in the hashtable. - */ - dbentry = pgstat_get_db_entry(dbid, false); - - /* - * If found, remove it (along with the db statfile). - */ - if (dbentry) - { - char statfile[MAXPGPATH]; - - get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH); - - elog(DEBUG2, "removing stats file \"%s\"", statfile); - unlink(statfile); - - if (dbentry->tables != NULL) - hash_destroy(dbentry->tables); - if (dbentry->functions != NULL) - hash_destroy(dbentry->functions); - - if (hash_search(pgStatDBHash, - (void *) &dbid, - HASH_REMOVE, NULL) == NULL) - ereport(ERROR, - (errmsg("database hash table corrupted during cleanup --- abort"))); - } -} - -/* - * Reset the statistics for the specified database. - */ -static void -pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - - /* - * Lookup the database in the hashtable. Nothing to do if not there. - */ - dbentry = pgstat_get_db_entry(msg->m_databaseid, false); - - if (!dbentry) - return; - - /* - * We simply throw away all the database's table entries by recreating a - * new hash table for them. - */ - if (dbentry->tables != NULL) - hash_destroy(dbentry->tables); - if (dbentry->functions != NULL) - hash_destroy(dbentry->functions); - - dbentry->tables = NULL; - dbentry->functions = NULL; - - /* - * Reset database-level stats, too. This creates empty hash tables for - * tables and functions. - */ - reset_dbentry_counters(dbentry); -} - -/* - * Reset some shared statistics of the cluster. - */ -static void -pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len) -{ - if (msg->m_resettarget == PGSTAT_KIND_BGWRITER || - msg->m_resettarget == PGSTAT_KIND_CHECKPOINTER) - { - /* - * Reset the global, bgwriter and checkpointer statistics for the - * cluster. - */ - memset(&globalStats, 0, sizeof(globalStats)); - globalStats.bgwriter.stat_reset_timestamp = GetCurrentTimestamp(); - } - else if (msg->m_resettarget == PGSTAT_KIND_ARCHIVER) - { - /* Reset the archiver statistics for the cluster. */ - memset(&archiverStats, 0, sizeof(archiverStats)); - archiverStats.stat_reset_timestamp = GetCurrentTimestamp(); - } - else if (msg->m_resettarget == PGSTAT_KIND_WAL) - { - /* Reset the WAL statistics for the cluster. */ - memset(&walStats, 0, sizeof(walStats)); - walStats.stat_reset_timestamp = GetCurrentTimestamp(); - } - - /* - * Presumably the sender of this message validated the target, don't - * complain here if it's not valid - */ -} - -/* - * Reset a statistics for a single object, which may be of current - * database or shared across all databases in the cluster. 
- */
-static void
-pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len)
-{
-	PgStat_StatDBEntry *dbentry;
-
-	if (IsSharedRelation(msg->m_objectid))
-		dbentry = pgstat_get_db_entry(InvalidOid, false);
-	else
-		dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
-	if (!dbentry)
-		return;
-
-	/* Set the reset timestamp for the whole database */
-	dbentry->stat_reset_timestamp = GetCurrentTimestamp();
-
-	/* Remove object if it exists, ignore it if not */
-	if (msg->m_resettype == PGSTAT_KIND_RELATION)
-		(void) hash_search(dbentry->tables, (void *) &(msg->m_objectid),
-						   HASH_REMOVE, NULL);
-	else if (msg->m_resettype == PGSTAT_KIND_FUNCTION)
-		(void) hash_search(dbentry->functions, (void *) &(msg->m_objectid),
-						   HASH_REMOVE, NULL);
-}
-
-/*
- * Reset some SLRU statistics of the cluster.
- */
-static void
-pgstat_recv_resetslrucounter(PgStat_MsgResetslrucounter *msg, int len)
-{
-	int			i;
+	const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
+	PgStat_ShmemControl *shmem = pgStatLocal.shmem;
 	TimestampTz ts = GetCurrentTimestamp();
 
-	for (i = 0; i < SLRU_NUM_ELEMENTS; i++)
+	/* shouldn't be called from postmaster */
+	Assert(IsUnderPostmaster || !IsPostmasterEnvironment);
+
+	elog(DEBUG2, "reading stats file \"%s\"", statfile);
+
+	/*
+	 * Try to open the stats file. If it doesn't exist, the backends simply
+	 * return zero for anything and statistics start from scratch with empty
+	 * counters.
+	 *
+	 * ENOENT is a possibility if stats collection was previously disabled or
+	 * has not yet written the stats file for the first time. Any other
+	 * failure condition is suspicious.
+	 */
+	if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
 	{
-		/* reset entry with the given index, or all entries (index is -1) */
-		if ((msg->m_index == -1) || (msg->m_index == i))
-		{
-			memset(&slruStats[i], 0, sizeof(slruStats[i]));
-			slruStats[i].stat_reset_timestamp = ts;
-		}
-	}
-}
-
-/*
- * Reset some replication slot statistics of the cluster.
- */
-static void
-pgstat_recv_resetreplslotcounter(PgStat_MsgResetreplslotcounter *msg,
-								 int len)
-{
-	PgStat_StatReplSlotEntry *slotent;
-	TimestampTz ts;
-
-	/* Return if we don't have replication slot statistics */
-	if (replSlotStatHash == NULL)
+		if (errno != ENOENT)
+			ereport(LOG,
+					(errcode_for_file_access(),
+					 errmsg("could not open statistics file \"%s\": %m",
+							statfile)));
+		pgstat_reset_after_failure(ts);
 		return;
-
-	ts = GetCurrentTimestamp();
-	if (msg->clearall)
-	{
-		HASH_SEQ_STATUS sstat;
-
-		hash_seq_init(&sstat, replSlotStatHash);
-		while ((slotent = (PgStat_StatReplSlotEntry *) hash_seq_search(&sstat)) != NULL)
-			pgstat_reset_replslot_entry(slotent, ts);
 	}
-	else
-	{
-		/* Get the slot statistics to reset */
-		slotent = pgstat_get_replslot_entry(msg->m_slotname, false);
-
-		/*
-		 * Nothing to do if the given slot entry is not found. This could
-		 * happen when the slot with the given name is removed and the
-		 * corresponding statistics entry is also removed before receiving the
-		 * reset message.
-		 */
-		if (!slotent)
-			return;
-
-		/* Reset the stats for the requested replication slot */
-		pgstat_reset_replslot_entry(slotent, ts);
-	}
-}
-
-/*
- * Reset some subscription statistics of the cluster.
- */ -static void -pgstat_recv_resetsubcounter(PgStat_MsgResetsubcounter *msg, int len) -{ - PgStat_StatSubEntry *subentry; - TimestampTz ts; - - /* Return if we don't have replication subscription statistics */ - if (subscriptionStatHash == NULL) - return; - - ts = GetCurrentTimestamp(); - if (!OidIsValid(msg->m_subid)) - { - HASH_SEQ_STATUS sstat; - - /* Clear all subscription counters */ - hash_seq_init(&sstat, subscriptionStatHash); - while ((subentry = (PgStat_StatSubEntry *) hash_seq_search(&sstat)) != NULL) - pgstat_reset_subscription(subentry, ts); - } - else - { - /* Get the subscription statistics to reset */ - subentry = pgstat_get_subscription_entry(msg->m_subid, false); - - /* - * Nothing to do if the given subscription entry is not found. This - * could happen when the subscription with the subid is removed and - * the corresponding statistics entry is also removed before receiving - * the reset message. - */ - if (!subentry) - return; - - /* Reset the stats for the requested subscription */ - pgstat_reset_subscription(subentry, ts); - } -} - -/* - * Process an autovacuum signaling message. - */ -static void -pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len) -{ - PgStat_StatDBEntry *dbentry; /* - * Store the last autovacuum time in the database's hashtable entry. + * Verify it's of the expected format. */ - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - dbentry->last_autovac_time = msg->m_start_time; -} - -/* - * Process a VACUUM message. - */ -static void -pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - PgStat_StatTabEntry *tabentry; + if (!read_chunk_s(fpin, &format_id) || + format_id != PGSTAT_FILE_FORMAT_ID) + goto error; /* - * Store the data in the table's hashtable entry. + * XXX: The following could now be generalized to just iterate over + * pgstat_kind_infos instead of knowing about the different kinds of + * stats. */ - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); - - tabentry->n_live_tuples = msg->m_live_tuples; - tabentry->n_dead_tuples = msg->m_dead_tuples; /* - * It is quite possible that a non-aggressive VACUUM ended up skipping - * various pages, however, we'll zero the insert counter here regardless. - * It's currently used only to track when we need to perform an "insert" - * autovacuum, which are mainly intended to freeze newly inserted tuples. - * Zeroing this may just mean we'll not try to vacuum the table again - * until enough tuples have been inserted to trigger another insert - * autovacuum. An anti-wraparound autovacuum will catch any persistent - * stragglers. + * Read archiver stats struct */ - tabentry->inserts_since_vacuum = 0; - - if (msg->m_autovacuum) - { - tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime; - tabentry->autovac_vacuum_count++; - } - else - { - tabentry->vacuum_timestamp = msg->m_vacuumtime; - tabentry->vacuum_count++; - } -} - -/* - * Process an ANALYZE message. - */ -static void -pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - PgStat_StatTabEntry *tabentry; + if (!read_chunk_s(fpin, &shmem->archiver.stats)) + goto error; /* - * Store the data in the table's hashtable entry. 
+ * Read bgwriter stats struct */ - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); - - tabentry->n_live_tuples = msg->m_live_tuples; - tabentry->n_dead_tuples = msg->m_dead_tuples; + if (!read_chunk_s(fpin, &shmem->bgwriter.stats)) + goto error; /* - * If commanded, reset changes_since_analyze to zero. This forgets any - * changes that were committed while the ANALYZE was in progress, but we - * have no good way to estimate how many of those there were. + * Read checkpointer stats struct */ - if (msg->m_resetcounter) - tabentry->changes_since_analyze = 0; + if (!read_chunk_s(fpin, &shmem->checkpointer.stats)) + goto error; - if (msg->m_autovacuum) + /* + * Read SLRU stats struct + */ + if (!read_chunk_s(fpin, &shmem->slru.stats)) + goto error; + + /* + * Read WAL stats struct + */ + if (!read_chunk_s(fpin, &shmem->wal.stats)) + goto error; + + /* + * We found an existing statistics file. Read it and put all the hash + * table entries into place. + */ + for (;;) { - tabentry->autovac_analyze_timestamp = msg->m_analyzetime; - tabentry->autovac_analyze_count++; - } - else - { - tabentry->analyze_timestamp = msg->m_analyzetime; - tabentry->analyze_count++; - } -} + char t = fgetc(fpin); -/* - * Process a ARCHIVER message. - */ -static void -pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len) -{ - if (msg->m_failed) - { - /* Failed archival attempt */ - ++archiverStats.failed_count; - memcpy(archiverStats.last_failed_wal, msg->m_xlog, - sizeof(archiverStats.last_failed_wal)); - archiverStats.last_failed_timestamp = msg->m_timestamp; - } - else - { - /* Successful archival operation */ - ++archiverStats.archived_count; - memcpy(archiverStats.last_archived_wal, msg->m_xlog, - sizeof(archiverStats.last_archived_wal)); - archiverStats.last_archived_timestamp = msg->m_timestamp; - } -} - -/* - * Process a BGWRITER message. - */ -static void -pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len) -{ - globalStats.bgwriter.buf_written_clean += msg->m_buf_written_clean; - globalStats.bgwriter.maxwritten_clean += msg->m_maxwritten_clean; - globalStats.bgwriter.buf_alloc += msg->m_buf_alloc; -} - -/* - * Process a CHECKPOINTER message. - */ -static void -pgstat_recv_checkpointer(PgStat_MsgCheckpointer *msg, int len) -{ - globalStats.checkpointer.timed_checkpoints += msg->m_timed_checkpoints; - globalStats.checkpointer.requested_checkpoints += msg->m_requested_checkpoints; - globalStats.checkpointer.checkpoint_write_time += msg->m_checkpoint_write_time; - globalStats.checkpointer.checkpoint_sync_time += msg->m_checkpoint_sync_time; - globalStats.checkpointer.buf_written_checkpoints += msg->m_buf_written_checkpoints; - globalStats.checkpointer.buf_written_backend += msg->m_buf_written_backend; - globalStats.checkpointer.buf_fsync_backend += msg->m_buf_fsync_backend; -} - -/* - * Process a WAL message. - */ -static void -pgstat_recv_wal(PgStat_MsgWal *msg, int len) -{ - walStats.wal_records += msg->m_wal_records; - walStats.wal_fpi += msg->m_wal_fpi; - walStats.wal_bytes += msg->m_wal_bytes; - walStats.wal_buffers_full += msg->m_wal_buffers_full; - walStats.wal_write += msg->m_wal_write; - walStats.wal_sync += msg->m_wal_sync; - walStats.wal_write_time += msg->m_wal_write_time; - walStats.wal_sync_time += msg->m_wal_sync_time; -} - -/* - * Process a SLRU message. 
- */ -static void -pgstat_recv_slru(PgStat_MsgSLRU *msg, int len) -{ - slruStats[msg->m_index].blocks_zeroed += msg->m_blocks_zeroed; - slruStats[msg->m_index].blocks_hit += msg->m_blocks_hit; - slruStats[msg->m_index].blocks_read += msg->m_blocks_read; - slruStats[msg->m_index].blocks_written += msg->m_blocks_written; - slruStats[msg->m_index].blocks_exists += msg->m_blocks_exists; - slruStats[msg->m_index].flush += msg->m_flush; - slruStats[msg->m_index].truncate += msg->m_truncate; -} - -/* - * Process a RECOVERYCONFLICT message. - */ -static void -pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - switch (msg->m_reason) - { - case PROCSIG_RECOVERY_CONFLICT_DATABASE: - - /* - * Since we drop the information about the database as soon as it - * replicates, there is no point in counting these conflicts. - */ - break; - case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: - dbentry->n_conflict_tablespace++; - break; - case PROCSIG_RECOVERY_CONFLICT_LOCK: - dbentry->n_conflict_lock++; - break; - case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: - dbentry->n_conflict_snapshot++; - break; - case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: - dbentry->n_conflict_bufferpin++; - break; - case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: - dbentry->n_conflict_startup_deadlock++; - break; - } -} - -/* - * Process a DEADLOCK message. - */ -static void -pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - dbentry->n_deadlocks++; -} - -/* - * Process a CHECKSUMFAILURE message. - */ -static void -pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - dbentry->n_checksum_failures += msg->m_failurecount; - dbentry->last_checksum_failure = msg->m_failure_time; -} - -/* - * Process a REPLSLOT message. - */ -static void -pgstat_recv_replslot(PgStat_MsgReplSlot *msg, int len) -{ - if (msg->m_drop) - { - Assert(!msg->m_create); - - /* Remove the replication slot statistics with the given name */ - if (replSlotStatHash != NULL) - (void) hash_search(replSlotStatHash, - (void *) &(msg->m_slotname), - HASH_REMOVE, - NULL); - } - else - { - PgStat_StatReplSlotEntry *slotent; - - slotent = pgstat_get_replslot_entry(msg->m_slotname, true); - Assert(slotent); - - if (msg->m_create) + switch (t) { - /* - * If the message for dropping the slot with the same name gets - * lost, slotent has stats for the old slot. So we initialize all - * counters at slot creation. - */ - pgstat_reset_replslot_entry(slotent, 0); - } - else - { - /* Update the replication slot statistics */ - slotent->spill_txns += msg->m_spill_txns; - slotent->spill_count += msg->m_spill_count; - slotent->spill_bytes += msg->m_spill_bytes; - slotent->stream_txns += msg->m_stream_txns; - slotent->stream_count += msg->m_stream_count; - slotent->stream_bytes += msg->m_stream_bytes; - slotent->total_txns += msg->m_total_txns; - slotent->total_bytes += msg->m_total_bytes; + case 'S': + case 'N': + { + PgStat_HashKey key; + PgStatShared_HashEntry *p; + PgStatShared_Common *header; + + CHECK_FOR_INTERRUPTS(); + + if (t == 'S') + { + /* normal stats entry, identified by PgStat_HashKey */ + if (!read_chunk_s(fpin, &key)) + goto error; + + if (!pgstat_is_kind_valid(key.kind)) + goto error; + } + else + { + /* stats entry identified by name on disk (e.g. 
slots) */ + const PgStat_KindInfo *kind_info = NULL; + PgStat_Kind kind; + NameData name; + + if (!read_chunk_s(fpin, &kind)) + goto error; + if (!read_chunk_s(fpin, &name)) + goto error; + if (!pgstat_is_kind_valid(kind)) + goto error; + + kind_info = pgstat_get_kind_info(kind); + + if (!kind_info->from_serialized_name) + goto error; + + if (!kind_info->from_serialized_name(&name, &key)) + { + /* skip over data for entry we don't care about */ + if (fseek(fpin, pgstat_get_entry_len(kind), SEEK_CUR) != 0) + goto error; + + continue; + } + + Assert(key.kind == kind); + } + + /* + * This intentionally doesn't use pgstat_get_entry_ref() - + * putting all stats into checkpointer's + * pgStatEntryRefHash would be wasted effort and memory. + */ + p = dshash_find_or_insert(pgStatLocal.shared_hash, &key, &found); + + /* don't allow duplicate entries */ + if (found) + { + dshash_release_lock(pgStatLocal.shared_hash, p); + elog(WARNING, "found duplicate stats entry %d/%u/%u", + key.kind, key.dboid, key.objoid); + goto error; + } + + header = pgstat_init_entry(key.kind, p); + dshash_release_lock(pgStatLocal.shared_hash, p); + + if (!read_chunk(fpin, + pgstat_get_entry_data(key.kind, header), + pgstat_get_entry_len(key.kind))) + goto error; + + break; + } + case 'E': + goto done; + + default: + goto error; } } + +done: + FreeFile(fpin); + + elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); + unlink(statfile); + + return; + +error: + ereport(LOG, + (errmsg("corrupted statistics file \"%s\"", statfile))); + + /* Set the current timestamp as reset timestamp */ + pgstat_reset_after_failure(ts); + + goto done; } /* - * Process a CONNECT message. + * Helper to reset / drop stats after restoring stats from disk failed, + * potentially after already loading parts. */ static void -pgstat_recv_connect(PgStat_MsgConnect *msg, int len) +pgstat_reset_after_failure(TimestampTz ts) { - PgStat_StatDBEntry *dbentry; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - dbentry->n_sessions++; -} - -/* - * Process a DISCONNECT message. - */ -static void -pgstat_recv_disconnect(PgStat_MsgDisconnect *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - switch (msg->m_cause) + /* reset fixed-numbered stats */ + for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++) { - case DISCONNECT_NOT_YET: - case DISCONNECT_NORMAL: - /* we don't collect these */ - break; - case DISCONNECT_CLIENT_EOF: - dbentry->n_sessions_abandoned++; - break; - case DISCONNECT_FATAL: - dbentry->n_sessions_fatal++; - break; - case DISCONNECT_KILLED: - dbentry->n_sessions_killed++; - break; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + if (!kind_info->fixed_amount) + continue; + + kind_info->reset_all_cb(ts); } -} - -/* - * Process a TEMPFILE message. - */ -static void -pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - dbentry->n_temp_bytes += msg->m_filesize; - dbentry->n_temp_files += 1; -} - -/* - * Count what the backend has done. - */ -static void -pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len) -{ - PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]); - PgStat_StatDBEntry *dbentry; - PgStat_StatFuncEntry *funcentry; - int i; - bool found; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - /* - * Process all function entries in the message. 
- */ - for (i = 0; i < msg->m_nentries; i++, funcmsg++) - { - funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions, - (void *) &(funcmsg->f_id), - HASH_ENTER, &found); - - if (!found) - { - /* - * If it's a new function entry, initialize counters to the values - * we just got. - */ - funcentry->f_numcalls = funcmsg->f_numcalls; - funcentry->f_total_time = funcmsg->f_total_time; - funcentry->f_self_time = funcmsg->f_self_time; - } - else - { - /* - * Otherwise add the values to the existing entry. - */ - funcentry->f_numcalls += funcmsg->f_numcalls; - funcentry->f_total_time += funcmsg->f_total_time; - funcentry->f_self_time += funcmsg->f_self_time; - } - } -} - -/* - * Arrange for dead function removal. - */ -static void -pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - int i; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, false); - - /* - * No need to purge if we don't even know the database. - */ - if (!dbentry || !dbentry->functions) - return; - - /* - * Process all function entries in the message. - */ - for (i = 0; i < msg->m_nentries; i++) - { - /* Remove from hashtable if present; we don't care if it's not. */ - (void) hash_search(dbentry->functions, - (void *) &(msg->m_functionid[i]), - HASH_REMOVE, NULL); - } -} - -/* - * Process a SUBSCRIPTIONDROP message. - */ -static void -pgstat_recv_subscription_drop(PgStat_MsgSubscriptionDrop *msg, int len) -{ - /* Return if we don't have replication subscription statistics */ - if (subscriptionStatHash == NULL) - return; - - /* Remove from hashtable if present; we don't care if it's not */ - (void) hash_search(subscriptionStatHash, (void *) &(msg->m_subid), - HASH_REMOVE, NULL); -} - -/* - * Process a SUBSCRIPTIONERROR message. - */ -static void -pgstat_recv_subscription_error(PgStat_MsgSubscriptionError *msg, int len) -{ - PgStat_StatSubEntry *subentry; - - /* Get the subscription stats */ - subentry = pgstat_get_subscription_entry(msg->m_subid, true); - Assert(subentry); - - if (msg->m_is_apply_error) - subentry->apply_error_count++; - else - subentry->sync_error_count++; + + /* and drop variable-numbered ones */ + pgstat_drop_all_entries(); } diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 9f7034df11..d5551e0af6 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -255,7 +255,6 @@ static pid_t StartupPID = 0, WalReceiverPID = 0, AutoVacPID = 0, PgArchPID = 0, - PgStatPID = 0, SysLoggerPID = 0; /* Startup process's status */ @@ -510,7 +509,6 @@ typedef struct PGPROC *AuxiliaryProcs; PGPROC *PreparedXactProcs; PMSignalData *PMSignalState; - InheritableSocket pgStatSock; pid_t PostmasterPid; TimestampTz PgStartTime; TimestampTz PgReloadTime; @@ -645,9 +643,8 @@ PostmasterMain(int argc, char *argv[]) * CAUTION: when changing this list, check for side-effects on the signal * handling setup of child processes. See tcop/postgres.c, * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c, - * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, - * postmaster/syslogger.c, postmaster/bgworker.c and - * postmaster/checkpointer.c. + * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/syslogger.c, + * postmaster/bgworker.c and postmaster/checkpointer.c. 
*/ pqinitmask(); PG_SETMASK(&BlockSig); @@ -1384,12 +1381,6 @@ PostmasterMain(int argc, char *argv[]) */ RemovePgTempFiles(); - /* - * Initialize stats collection subsystem (this does NOT start the - * collector process!) - */ - pgstat_init(); - /* * Initialize the autovacuum subsystem (again, no process start yet) */ @@ -1845,11 +1836,6 @@ ServerLoop(void) start_autovac_launcher = false; /* signal processed */ } - /* If we have lost the stats collector, try to start a new one */ - if (PgStatPID == 0 && - (pmState == PM_RUN || pmState == PM_HOT_STANDBY)) - PgStatPID = pgstat_start(); - /* If we have lost the archiver, try to start a new one. */ if (PgArchPID == 0 && PgArchStartupAllowed()) PgArchPID = StartArchiver(); @@ -2772,8 +2758,6 @@ SIGHUP_handler(SIGNAL_ARGS) signal_child(PgArchPID, SIGHUP); if (SysLoggerPID != 0) signal_child(SysLoggerPID, SIGHUP); - if (PgStatPID != 0) - signal_child(PgStatPID, SIGHUP); /* Reload authentication config files too */ if (!load_hba()) @@ -3097,8 +3081,6 @@ reaper(SIGNAL_ARGS) AutoVacPID = StartAutoVacLauncher(); if (PgArchStartupAllowed() && PgArchPID == 0) PgArchPID = StartArchiver(); - if (PgStatPID == 0) - PgStatPID = pgstat_start(); /* workers may be scheduled to start now */ maybe_start_bgworkers(); @@ -3165,13 +3147,6 @@ reaper(SIGNAL_ARGS) SignalChildren(SIGUSR2); pmState = PM_SHUTDOWN_2; - - /* - * We can also shut down the stats collector now; there's - * nothing left for it to do. - */ - if (PgStatPID != 0) - signal_child(PgStatPID, SIGQUIT); } else { @@ -3250,22 +3225,6 @@ reaper(SIGNAL_ARGS) continue; } - /* - * Was it the statistics collector? If so, just try to start a new - * one; no need to force reset of the rest of the system. (If fail, - * we'll try again in future cycles of the main loop.) - */ - if (pid == PgStatPID) - { - PgStatPID = 0; - if (!EXIT_STATUS_0(exitstatus)) - LogChildExit(LOG, _("statistics collector process"), - pid, exitstatus); - if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) - PgStatPID = pgstat_start(); - continue; - } - /* Was it the system logger? If so, try to start a new one */ if (pid == SysLoggerPID) { @@ -3707,22 +3666,6 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) signal_child(PgArchPID, (SendStop ? SIGSTOP : SIGQUIT)); } - /* - * Force a power-cycle of the pgstat process too. (This isn't absolutely - * necessary, but it seems like a good idea for robustness, and it - * simplifies the state-machine logic in the case where a shutdown request - * arrives during crash processing.) - */ - if (PgStatPID != 0 && take_action) - { - ereport(DEBUG2, - (errmsg_internal("sending %s to process %d", - "SIGQUIT", - (int) PgStatPID))); - signal_child(PgStatPID, SIGQUIT); - allow_immediate_pgstat_restart(); - } - /* We do NOT restart the syslogger */ if (Shutdown != ImmediateShutdown) @@ -3934,12 +3877,10 @@ PostmasterStateMachine(void) FatalError = true; pmState = PM_WAIT_DEAD_END; - /* Kill the walsenders, archiver and stats collector too */ + /* Kill the walsenders and archiver too */ SignalChildren(SIGQUIT); if (PgArchPID != 0) signal_child(PgArchPID, SIGQUIT); - if (PgStatPID != 0) - signal_child(PgStatPID, SIGQUIT); } } } @@ -3963,8 +3904,7 @@ PostmasterStateMachine(void) { /* * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty - * (ie, no dead_end children remain), and the archiver and stats - * collector are gone too. + * (ie, no dead_end children remain), and the archiver is gone too. 
* * The reason we wait for those two is to protect them against a new * postmaster starting conflicting subprocesses; this isn't an @@ -3974,8 +3914,7 @@ PostmasterStateMachine(void) * normal state transition leading up to PM_WAIT_DEAD_END, or during * FatalError processing. */ - if (dlist_is_empty(&BackendList) && - PgArchPID == 0 && PgStatPID == 0) + if (dlist_is_empty(&BackendList) && PgArchPID == 0) { /* These other guys should be dead already */ Assert(StartupPID == 0); @@ -4183,8 +4122,6 @@ TerminateChildren(int signal) signal_child(AutoVacPID, signal); if (PgArchPID != 0) signal_child(PgArchPID, signal); - if (PgStatPID != 0) - signal_child(PgStatPID, signal); } /* @@ -5115,12 +5052,6 @@ SubPostmasterMain(int argc, char *argv[]) StartBackgroundWorker(); } - if (strcmp(argv[1], "--forkcol") == 0) - { - /* Do not want to attach to shared memory */ - - PgstatCollectorMain(argc, argv); /* does not return */ - } if (strcmp(argv[1], "--forklog") == 0) { /* Do not want to attach to shared memory */ @@ -5224,12 +5155,6 @@ sigusr1_handler(SIGNAL_ARGS) if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) && pmState == PM_RECOVERY && Shutdown == NoShutdown) { - /* - * Likewise, start other special children as needed. - */ - Assert(PgStatPID == 0); - PgStatPID = pgstat_start(); - ereport(LOG, (errmsg("database system is ready to accept read-only connections"))); @@ -6145,7 +6070,6 @@ extern slock_t *ShmemLock; extern slock_t *ProcStructLock; extern PGPROC *AuxiliaryProcs; extern PMSignalData *PMSignalState; -extern pgsocket pgStatSock; extern pg_time_t first_syslogger_file_time; #ifndef WIN32 @@ -6201,8 +6125,6 @@ save_backend_variables(BackendParameters *param, Port *port, param->AuxiliaryProcs = AuxiliaryProcs; param->PreparedXactProcs = PreparedXactProcs; param->PMSignalState = PMSignalState; - if (!write_inheritable_socket(¶m->pgStatSock, pgStatSock, childPid)) - return false; param->PostmasterPid = PostmasterPid; param->PgStartTime = PgStartTime; @@ -6436,7 +6358,6 @@ restore_backend_variables(BackendParameters *param, Port *port) AuxiliaryProcs = param->AuxiliaryProcs; PreparedXactProcs = param->PreparedXactProcs; PMSignalState = param->PMSignalState; - read_inheritable_socket(&pgStatSock, ¶m->pgStatSock); PostmasterPid = param->PostmasterPid; PgStartTime = param->PgStartTime; @@ -6475,8 +6396,6 @@ restore_backend_variables(BackendParameters *param, Port *port) if (postmaster_alive_fds[1] >= 0) ReserveExternalFD(); #endif - if (pgStatSock != PGINVALID_SOCKET) - ReserveExternalFD(); } diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 656ec8f555..30e33dace3 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -1911,7 +1911,6 @@ UpdateDecodingStats(LogicalDecodingContext *ctx) (long long) rb->totalTxns, (long long) rb->totalBytes); - namestrcpy(&repSlotStat.slotname, NameStr(ctx->slot->data.name)); repSlotStat.spill_txns = rb->spillTxns; repSlotStat.spill_count = rb->spillCount; repSlotStat.spill_bytes = rb->spillBytes; diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c index 697fb23634..b2cb31eaad 100644 --- a/src/backend/replication/logical/tablesync.c +++ b/src/backend/replication/logical/tablesync.c @@ -141,7 +141,7 @@ finish_sync_worker(void) if (IsTransactionState()) { CommitTransactionCommand(); - pgstat_report_stat(false); + pgstat_report_stat(true); } /* And flush all writes. 
*/ @@ -580,7 +580,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) if (started_tx) { CommitTransactionCommand(); - pgstat_report_stat(false); + pgstat_report_stat(true); } } @@ -1386,7 +1386,7 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) MyLogicalRepWorker->relstate, MyLogicalRepWorker->relstate_lsn); CommitTransactionCommand(); - pgstat_report_stat(false); + pgstat_report_stat(true); StartTransactionCommand(); @@ -1630,7 +1630,7 @@ AllTablesyncsReady(void) if (started_tx) { CommitTransactionCommand(); - pgstat_report_stat(false); + pgstat_report_stat(true); } /* diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index f3868b3e1f..7ade49652e 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -2937,6 +2937,12 @@ LogicalRepApplyLoop(XLogRecPtr last_received) } send_feedback(last_received, requestReply, requestReply); + + /* + * Force reporting to ensure long idle periods don't lead to + * arbitrarily delayed stats. + */ + pgstat_report_stat(true); } } diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 2217af70d4..c35ea7c35b 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -502,6 +502,14 @@ retry: /* We made this slot active, so it's ours now. */ MyReplicationSlot = s; + + /* + * The call to pgstat_acquire_replslot() protects against stats for + * a different slot, from before a restart or such, being present during + * pgstat_report_replslot(). + */ + if (SlotIsLogical(s)) + pgstat_acquire_replslot(s); } /* @@ -746,20 +754,10 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) elog(DEBUG3, "replication slot drop: %s: removed directory", NameStr(slot->data.name)); /* - * Send a message to drop the replication slot to the stats collector. - * Since there is no guarantee of the order of message transfer on a UDP - * connection, it's possible that a message for creating a new slot - * reaches before a message for removing the old slot. We send the drop - * and create messages while holding ReplicationSlotAllocationLock to - * reduce that possibility. If the messages reached in reverse, we would - * lose one statistics update message. But the next update message will - * create the statistics for the replication slot. - * - * XXX In case, the messages for creation and drop slot of the same name - * get lost and create happens before (auto)vacuum cleans up the dead - * slot, the stats will be accumulated into the old slot. One can imagine - * having OIDs for each slot to avoid the accumulation of stats but that - * doesn't seem worth doing as in practice this won't happen frequently. + * Drop the statistics entry for the replication slot. Do this while + * holding ReplicationSlotAllocationLock so that we don't drop a + * statistics entry for another slot with the same name just created in + * another session. 
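As a rough illustration of the ordering that comment relies on: the stats entry has to be dropped while ReplicationSlotAllocationLock is still held. The function below is invented for illustration only and merely condenses the flow of ReplicationSlotDropPtr(); only pgstat_drop_replslot(), SlotIsLogical() and the lock itself are taken from the surrounding code.

/*
 * Illustration only, not part of the patch.  If the allocation lock were
 * released before dropping the stats entry, another session could create a
 * slot that reuses the dropped slot's identity, and pgstat_drop_replslot()
 * would then discard that new slot's statistics.
 */
static void
drop_slot_with_stats_sketch(ReplicationSlot *slot)
{
	LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);

	/* ... remove the slot's on-disk state and shared-memory entry ... */

	if (SlotIsLogical(slot))
		pgstat_drop_replslot(slot);

	LWLockRelease(ReplicationSlotAllocationLock);
}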
*/ if (SlotIsLogical(slot)) pgstat_drop_replslot(slot); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index d73a40c1bc..f80f90ac3c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2151,7 +2151,7 @@ BufferSync(int flags) if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN) { TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id); - PendingCheckpointerStats.m_buf_written_checkpoints++; + PendingCheckpointerStats.buf_written_checkpoints++; num_written++; } } @@ -2261,7 +2261,7 @@ BgBufferSync(WritebackContext *wb_context) strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); /* Report buffer alloc counts to pgstat */ - PendingBgWriterStats.m_buf_alloc += recent_alloc; + PendingBgWriterStats.buf_alloc += recent_alloc; /* * If we're not running the LRU scan, just stop after doing the stats @@ -2451,7 +2451,7 @@ BgBufferSync(WritebackContext *wb_context) reusable_buffers++; if (++num_written >= bgwriter_lru_maxpages) { - PendingBgWriterStats.m_maxwritten_clean++; + PendingBgWriterStats.maxwritten_clean++; break; } } @@ -2459,7 +2459,7 @@ BgBufferSync(WritebackContext *wb_context) reusable_buffers++; } - PendingBgWriterStats.m_buf_written_clean += num_written; + PendingBgWriterStats.buf_written_clean += num_written; #ifdef BGW_DEBUG elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d", diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index cd4ebe2fc5..88ff59c568 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -145,6 +145,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, BTreeShmemSize()); size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); + size = add_size(size, StatsShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -296,6 +297,7 @@ CreateSharedMemoryAndSemaphores(void) BTreeShmemInit(); SyncScanShmemInit(); AsyncShmemInit(); + StatsShmemInit(); #ifdef EXEC_BACKEND diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 8f7f1b2f7c..c24779d0bb 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -176,7 +176,13 @@ static const char *const BuiltinTrancheNames[] = { /* LWTRANCHE_PARALLEL_APPEND: */ "ParallelAppend", /* LWTRANCHE_PER_XACT_PREDICATE_LIST: */ - "PerXactPredicateList" + "PerXactPredicateList", + /* LWTRANCHE_PGSTATS_DSA: */ + "PgStatsDSA", + /* LWTRANCHE_PGSTATS_HASH: */ + "PgStatsHash", + /* LWTRANCHE_PGSTATS_DATA: */ + "PgStatsData", }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 260b650f15..95dc2e2c83 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3372,6 +3372,14 @@ ProcessInterrupts(void) IdleSessionTimeoutPending = false; } + if (IdleStatsUpdateTimeoutPending) + { + /* timer should have been disarmed */ + Assert(!IsTransactionBlock()); + IdleStatsUpdateTimeoutPending = false; + pgstat_report_stat(true); + } + if (ProcSignalBarrierPending) ProcessProcSignalBarrier(); @@ -4044,6 +4052,7 @@ PostgresMain(const char *dbname, const char *username) volatile bool send_ready_for_query = true; bool idle_in_transaction_timeout_enabled = false; bool idle_session_timeout_enabled = false; + bool idle_stats_update_timeout_enabled = false; AssertArg(dbname != NULL); 
AssertArg(username != NULL); @@ -4407,6 +4416,8 @@ PostgresMain(const char *dbname, const char *username) } else { + long stats_timeout; + /* * Process incoming notifies (including self-notifies), if * any, and send relevant messages to the client. Doing it @@ -4417,7 +4428,14 @@ PostgresMain(const char *dbname, const char *username) if (notifyInterruptPending) ProcessNotifyInterrupt(false); - pgstat_report_stat(false); + /* Start the idle-stats-update timer */ + stats_timeout = pgstat_report_stat(false); + if (stats_timeout > 0) + { + idle_stats_update_timeout_enabled = true; + enable_timeout_after(IDLE_STATS_UPDATE_TIMEOUT, + stats_timeout); + } set_ps_display("idle"); pgstat_report_activity(STATE_IDLE, NULL); @@ -4452,9 +4470,9 @@ PostgresMain(const char *dbname, const char *username) firstchar = ReadCommand(&input_message); /* - * (4) turn off the idle-in-transaction and idle-session timeouts, if - * active. We do this before step (5) so that any last-moment timeout - * is certain to be detected in step (5). + * (4) turn off the idle-in-transaction, idle-session and + * idle-stats-update timeouts if active. We do this before step (5) so + * that any last-moment timeout is certain to be detected in step (5). * * At most one of these timeouts will be active, so there's no need to * worry about combining the timeout.c calls into one. @@ -4469,6 +4487,11 @@ PostgresMain(const char *dbname, const char *username) disable_timeout(IDLE_SESSION_TIMEOUT, false); idle_session_timeout_enabled = false; } + if (idle_stats_update_timeout_enabled) + { + disable_timeout(IDLE_STATS_UPDATE_TIMEOUT, false); + idle_stats_update_timeout_enabled = false; + } /* * (5) disable async signal conditions again. diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile index 791ba68e7e..690312308f 100644 --- a/src/backend/utils/activity/Makefile +++ b/src/backend/utils/activity/Makefile @@ -23,6 +23,7 @@ OBJS = \ pgstat_function.o \ pgstat_relation.o \ pgstat_replslot.o \ + pgstat_shmem.o \ pgstat_slru.o \ pgstat_subscription.o \ pgstat_wal.o \ diff --git a/src/backend/utils/activity/pgstat_archiver.c b/src/backend/utils/activity/pgstat_archiver.c index 09bc12070d..851726fd50 100644 --- a/src/backend/utils/activity/pgstat_archiver.c +++ b/src/backend/utils/activity/pgstat_archiver.c @@ -27,14 +27,85 @@ void pgstat_report_archiver(const char *xlog, bool failed) { - PgStat_MsgArchiver msg; + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + TimestampTz now = GetCurrentTimestamp(); - /* - * Prepare and send the message - */ - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ARCHIVER); - msg.m_failed = failed; - strlcpy(msg.m_xlog, xlog, sizeof(msg.m_xlog)); - msg.m_timestamp = GetCurrentTimestamp(); - pgstat_send(&msg, sizeof(msg)); + pgstat_begin_changecount_write(&stats_shmem->changecount); + + if (failed) + { + ++stats_shmem->stats.failed_count; + memcpy(&stats_shmem->stats.last_failed_wal, xlog, + sizeof(stats_shmem->stats.last_failed_wal)); + stats_shmem->stats.last_failed_timestamp = now; + } + else + { + ++stats_shmem->stats.archived_count; + memcpy(&stats_shmem->stats.last_archived_wal, xlog, + sizeof(stats_shmem->stats.last_archived_wal)); + stats_shmem->stats.last_archived_timestamp = now; + } + + pgstat_end_changecount_write(&stats_shmem->changecount); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the archiver statistics struct. 
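The idle-stats-update timeout wired up in the postgres.c hunks above follows the same pattern as the existing idle-session timeout. Only IDLE_STATS_UPDATE_TIMEOUT, IdleStatsUpdateTimeoutPending, enable_timeout_after() and pgstat_report_stat() appear in those hunks; the handler below is an assumed shape, not the committed one.

/*
 * Assumed shape of the timeout handler: like the other idle timeouts it can
 * only set flags and poke the latch; the real work happens later in
 * ProcessInterrupts().
 */
static void
IdleStatsUpdateTimeoutHandler(void)
{
	IdleStatsUpdateTimeoutPending = true;
	InterruptPending = true;
	SetLatch(MyLatch);
}

/*
 * Resulting cycle when a backend goes idle:
 *
 * 1. pgstat_report_stat(false) flushes if it can; otherwise it returns the
 *    number of milliseconds after which a forced flush is due, and the timer
 *    is armed via enable_timeout_after(IDLE_STATS_UPDATE_TIMEOUT, ...).
 * 2. If a new command arrives first, the timer is simply disarmed again.
 * 3. If the timer fires while the backend is still idle, ProcessInterrupts()
 *    sees IdleStatsUpdateTimeoutPending and calls pgstat_report_stat(true),
 *    forcing the flush.
 */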
+ */ +PgStat_ArchiverStats * +pgstat_fetch_stat_archiver(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_ARCHIVER); + + return &pgStatLocal.snapshot.archiver; +} + +void +pgstat_archiver_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + + /* see explanation above PgStatShared_Archiver for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_archiver_snapshot_cb(void) +{ + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + PgStat_ArchiverStats *stat_snap = &pgStatLocal.snapshot.archiver; + PgStat_ArchiverStats *reset_offset = &stats_shmem->reset_offset; + PgStat_ArchiverStats reset; + + pgstat_copy_changecounted_stats(stat_snap, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ + if (stat_snap->archived_count == reset.archived_count) + { + stat_snap->last_archived_wal[0] = 0; + stat_snap->last_archived_timestamp = 0; + } + stat_snap->archived_count -= reset.archived_count; + + if (stat_snap->failed_count == reset.failed_count) + { + stat_snap->last_failed_wal[0] = 0; + stat_snap->last_failed_timestamp = 0; + } + stat_snap->failed_count -= reset.failed_count; } diff --git a/src/backend/utils/activity/pgstat_bgwriter.c b/src/backend/utils/activity/pgstat_bgwriter.c index dfea88eca1..fbb1edc527 100644 --- a/src/backend/utils/activity/pgstat_bgwriter.c +++ b/src/backend/utils/activity/pgstat_bgwriter.c @@ -20,12 +20,7 @@ #include "utils/pgstat_internal.h" -/* - * BgWriter global statistics counters. Stored directly in a stats - * message structure so they can be sent without needing to copy things - * around. We assume this init to zeroes. - */ -PgStat_MsgBgWriter PendingBgWriterStats; +PgStat_BgWriterStats PendingBgWriterStats = {0}; /* @@ -34,27 +29,82 @@ PgStat_MsgBgWriter PendingBgWriterStats; void pgstat_report_bgwriter(void) { - /* We assume this initializes to zeroes */ - static const PgStat_MsgBgWriter all_zeroes; + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + static const PgStat_BgWriterStats all_zeroes; + Assert(!pgStatLocal.shmem->is_shutdown); pgstat_assert_is_up(); /* * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. + * this case, avoid unnecessarily modifying the stats entry. */ - if (memcmp(&PendingBgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0) + if (memcmp(&PendingBgWriterStats, &all_zeroes, sizeof(all_zeroes)) == 0) return; - /* - * Prepare and send the message - */ - pgstat_setheader(&PendingBgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER); - pgstat_send(&PendingBgWriterStats, sizeof(PendingBgWriterStats)); + pgstat_begin_changecount_write(&stats_shmem->changecount); + +#define BGWRITER_ACC(fld) stats_shmem->stats.fld += PendingBgWriterStats.fld + BGWRITER_ACC(buf_written_clean); + BGWRITER_ACC(maxwritten_clean); + BGWRITER_ACC(buf_alloc); +#undef BGWRITER_ACC + + pgstat_end_changecount_write(&stats_shmem->changecount); /* * Clear out the statistics buffer, so it can be re-used. 
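The pgstat_begin/end_changecount_write() calls used for these fixed-numbered stats implement a seqlock-style protocol: writers bump the counter to an odd value before modifying the struct and back to even afterwards, while readers take no lock and retry until they observe the same even value on both sides of their copy. pgstat_copy_changecounted_stats() itself is not shown in this excerpt; the loop below is a plausible sketch of the reader side, not the committed implementation.

/*
 * Sketch of a changecount-protected read; retries whenever the counter was
 * odd (writer in progress) or changed while the copy was taken.
 */
static void
copy_changecounted_sketch(void *dst, void *src, size_t len, uint32 *changecount)
{
	uint32		before;
	uint32		after;

	do
	{
		before = *changecount;
		pg_read_barrier();

		memcpy(dst, src, len);

		pg_read_barrier();
		after = *changecount;
	} while (before != after || (before & 1) != 0);
}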
*/ MemSet(&PendingBgWriterStats, 0, sizeof(PendingBgWriterStats)); } + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the bgwriter statistics struct. + */ +PgStat_BgWriterStats * +pgstat_fetch_stat_bgwriter(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_BGWRITER); + + return &pgStatLocal.snapshot.bgwriter; +} + +void +pgstat_bgwriter_reset_all_cb(TimestampTz ts) +{ + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + + /* see explanation above PgStatShared_BgWriter for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_bgwriter_snapshot_cb(void) +{ + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + PgStat_BgWriterStats *reset_offset = &stats_shmem->reset_offset; + PgStat_BgWriterStats reset; + + pgstat_copy_changecounted_stats(&pgStatLocal.snapshot.bgwriter, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ +#define BGWRITER_COMP(fld) pgStatLocal.snapshot.bgwriter.fld -= reset.fld; + BGWRITER_COMP(buf_written_clean); + BGWRITER_COMP(maxwritten_clean); + BGWRITER_COMP(buf_alloc); +#undef BGWRITER_COMP +} diff --git a/src/backend/utils/activity/pgstat_checkpointer.c b/src/backend/utils/activity/pgstat_checkpointer.c index 3f4e2054f5..af8d513e7b 100644 --- a/src/backend/utils/activity/pgstat_checkpointer.c +++ b/src/backend/utils/activity/pgstat_checkpointer.c @@ -20,12 +20,7 @@ #include "utils/pgstat_internal.h" -/* - * Checkpointer global statistics counters. Stored directly in a stats - * message structure so they can be sent without needing to copy things - * around. We assume this init to zeroes. - */ -PgStat_MsgCheckpointer PendingCheckpointerStats; +PgStat_CheckpointerStats PendingCheckpointerStats = {0}; /* @@ -35,24 +30,92 @@ void pgstat_report_checkpointer(void) { /* We assume this initializes to zeroes */ - static const PgStat_MsgCheckpointer all_zeroes; + static const PgStat_CheckpointerStats all_zeroes; + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + + Assert(!pgStatLocal.shmem->is_shutdown); + pgstat_assert_is_up(); /* * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. + * this case, avoid unnecessarily modifying the stats entry. 
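The *_reset_all_cb() callbacks above show that resetting fixed-numbered stats never zeroes the shared counters (concurrent writers keep adding to them without knowing about the reset); instead the current values are copied into reset_offset and subtracted again by the snapshot callbacks. A toy example with invented numbers:

/*
 * Toy illustration of the reset-offset protocol; the locals stand in for one
 * counter in the shared struct and its reset_offset copy.
 */
static uint64
reset_offset_example(void)
{
	uint64		shared_counter = 1000;	/* only ever grows */
	uint64		reset_offset = 0;

	/* pg_stat_reset_shared(): copy, don't zero */
	reset_offset = shared_counter;		/* 1000 */

	/* further activity after the reset */
	shared_counter += 25;				/* 1025 */

	/* snapshot callback: what the stats views report */
	return shared_counter - reset_offset;	/* 25 */
}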
*/ - if (memcmp(&PendingCheckpointerStats, &all_zeroes, sizeof(PgStat_MsgCheckpointer)) == 0) + if (memcmp(&PendingCheckpointerStats, &all_zeroes, + sizeof(all_zeroes)) == 0) return; - /* - * Prepare and send the message - */ - pgstat_setheader(&PendingCheckpointerStats.m_hdr, PGSTAT_MTYPE_CHECKPOINTER); - pgstat_send(&PendingCheckpointerStats, sizeof(PendingCheckpointerStats)); + pgstat_begin_changecount_write(&stats_shmem->changecount); + +#define CHECKPOINTER_ACC(fld) stats_shmem->stats.fld += PendingCheckpointerStats.fld + CHECKPOINTER_ACC(timed_checkpoints); + CHECKPOINTER_ACC(requested_checkpoints); + CHECKPOINTER_ACC(checkpoint_write_time); + CHECKPOINTER_ACC(checkpoint_sync_time); + CHECKPOINTER_ACC(buf_written_checkpoints); + CHECKPOINTER_ACC(buf_written_backend); + CHECKPOINTER_ACC(buf_fsync_backend); +#undef CHECKPOINTER_ACC + + pgstat_end_changecount_write(&stats_shmem->changecount); /* * Clear out the statistics buffer, so it can be re-used. */ MemSet(&PendingCheckpointerStats, 0, sizeof(PendingCheckpointerStats)); } + +/* + * pgstat_fetch_stat_checkpointer() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the checkpointer statistics struct. + */ +PgStat_CheckpointerStats * +pgstat_fetch_stat_checkpointer(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_CHECKPOINTER); + + return &pgStatLocal.snapshot.checkpointer; +} + +void +pgstat_checkpointer_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + + /* see explanation above PgStatShared_Checkpointer for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_checkpointer_snapshot_cb(void) +{ + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + PgStat_CheckpointerStats *reset_offset = &stats_shmem->reset_offset; + PgStat_CheckpointerStats reset; + + pgstat_copy_changecounted_stats(&pgStatLocal.snapshot.checkpointer, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ +#define CHECKPOINTER_COMP(fld) pgStatLocal.snapshot.checkpointer.fld -= reset.fld; + CHECKPOINTER_COMP(timed_checkpoints); + CHECKPOINTER_COMP(requested_checkpoints); + CHECKPOINTER_COMP(checkpoint_write_time); + CHECKPOINTER_COMP(checkpoint_sync_time); + CHECKPOINTER_COMP(buf_written_checkpoints); + CHECKPOINTER_COMP(buf_written_backend); + CHECKPOINTER_COMP(buf_fsync_backend); +#undef CHECKPOINTER_COMP +} diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c index 6d27657bdb..649d9c6960 100644 --- a/src/backend/utils/activity/pgstat_database.c +++ b/src/backend/utils/activity/pgstat_database.c @@ -19,13 +19,12 @@ #include "utils/pgstat_internal.h" #include "utils/timestamp.h" +#include "storage/procsignal.h" static bool pgstat_should_report_connstat(void); -int pgStatXactCommit = 0; -int pgStatXactRollback = 0; PgStat_Counter pgStatBlockReadTime = 0; PgStat_Counter pgStatBlockWriteTime = 0; PgStat_Counter pgStatActiveTime = 0; @@ -33,25 +32,18 @@ PgStat_Counter pgStatTransactionIdleTime = 0; SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL; +static int 
pgStatXactCommit = 0; +static int pgStatXactRollback = 0; static PgStat_Counter pgLastSessionReportTime = 0; /* - * Tell the collector that we just dropped a database. - * (If the message gets lost, we will still clean the dead DB eventually - * via future invocations of pgstat_vacuum_stat().) + * Remove entry for the database being dropped. */ void pgstat_drop_database(Oid databaseid) { - PgStat_MsgDropdb msg; - - if (pgStatSock == PGINVALID_SOCKET) - return; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB); - msg.m_databaseid = databaseid; - pgstat_send(&msg, sizeof(msg)); + pgstat_drop_transactional(PGSTAT_KIND_DATABASE, databaseid, InvalidOid); } /* @@ -62,16 +54,24 @@ pgstat_drop_database(Oid databaseid) void pgstat_report_autovac(Oid dboid) { - PgStat_MsgAutovacStart msg; + PgStat_EntryRef *entry_ref; + PgStatShared_Database *dbentry; - if (pgStatSock == PGINVALID_SOCKET) - return; + /* can't get here in single user mode */ + Assert(IsUnderPostmaster); - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START); - msg.m_databaseid = dboid; - msg.m_start_time = GetCurrentTimestamp(); + /* + * End-of-vacuum is reported instantly. Report the start the same way for + * consistency. Vacuum doesn't run frequently and is a long-lasting + * operation so it doesn't matter if we get blocked here a little. + */ + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, + dboid, InvalidOid, false); - pgstat_send(&msg, sizeof(msg)); + dbentry = (PgStatShared_Database *) entry_ref->shared_stats; + dbentry->stats.last_autovac_time = GetCurrentTimestamp(); + + pgstat_unlock_entry(entry_ref); } /* @@ -80,15 +80,39 @@ pgstat_report_autovac(Oid dboid) void pgstat_report_recovery_conflict(int reason) { - PgStat_MsgRecoveryConflict msg; + PgStat_StatDBEntry *dbentry; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + Assert(IsUnderPostmaster); + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYCONFLICT); - msg.m_databaseid = MyDatabaseId; - msg.m_reason = reason; - pgstat_send(&msg, sizeof(msg)); + dbentry = pgstat_prep_database_pending(MyDatabaseId); + + switch (reason) + { + case PROCSIG_RECOVERY_CONFLICT_DATABASE: + + /* + * Since we drop the information about the database as soon as it + * replicates, there is no point in counting these conflicts. 
+ */ + break; + case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + dbentry->n_conflict_tablespace++; + break; + case PROCSIG_RECOVERY_CONFLICT_LOCK: + dbentry->n_conflict_lock++; + break; + case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + dbentry->n_conflict_snapshot++; + break; + case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + dbentry->n_conflict_bufferpin++; + break; + case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + dbentry->n_conflict_startup_deadlock++; + break; + } } /* @@ -97,14 +121,13 @@ pgstat_report_recovery_conflict(int reason) void pgstat_report_deadlock(void) { - PgStat_MsgDeadlock msg; + PgStat_StatDBEntry *dbent; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DEADLOCK); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, sizeof(msg)); + dbent = pgstat_prep_database_pending(MyDatabaseId); + dbent->n_deadlocks++; } /* @@ -113,17 +136,24 @@ pgstat_report_deadlock(void) void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount) { - PgStat_MsgChecksumFailure msg; + PgStat_EntryRef *entry_ref; + PgStatShared_Database *sharedent; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE); - msg.m_databaseid = dboid; - msg.m_failurecount = failurecount; - msg.m_failure_time = GetCurrentTimestamp(); + /* + * Update the shared stats directly - checksum failures should never be + * common enough for that to be a problem. + */ + entry_ref = + pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, dboid, InvalidOid, false); - pgstat_send(&msg, sizeof(msg)); + sharedent = (PgStatShared_Database *) entry_ref->shared_stats; + sharedent->stats.n_checksum_failures += failurecount; + sharedent->stats.last_checksum_failure = GetCurrentTimestamp(); + + pgstat_unlock_entry(entry_ref); } /* @@ -141,15 +171,14 @@ pgstat_report_checksum_failure(void) void pgstat_report_tempfile(size_t filesize) { - PgStat_MsgTempFile msg; + PgStat_StatDBEntry *dbent; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TEMPFILE); - msg.m_databaseid = MyDatabaseId; - msg.m_filesize = filesize; - pgstat_send(&msg, sizeof(msg)); + dbent = pgstat_prep_database_pending(MyDatabaseId); + dbent->n_temp_bytes += filesize; + dbent->n_temp_files++; } /* @@ -158,16 +187,15 @@ pgstat_report_tempfile(size_t filesize) void pgstat_report_connect(Oid dboid) { - PgStat_MsgConnect msg; + PgStat_StatDBEntry *dbentry; if (!pgstat_should_report_connstat()) return; pgLastSessionReportTime = MyStartTimestamp; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CONNECT); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, sizeof(PgStat_MsgConnect)); + dbentry = pgstat_prep_database_pending(MyDatabaseId); + dbentry->n_sessions++; } /* @@ -176,15 +204,42 @@ pgstat_report_connect(Oid dboid) void pgstat_report_disconnect(Oid dboid) { - PgStat_MsgDisconnect msg; + PgStat_StatDBEntry *dbentry; if (!pgstat_should_report_connstat()) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DISCONNECT); - msg.m_databaseid = MyDatabaseId; - msg.m_cause = pgStatSessionEndCause; - pgstat_send(&msg, sizeof(PgStat_MsgDisconnect)); + dbentry = pgstat_prep_database_pending(MyDatabaseId); + + switch (pgStatSessionEndCause) + { + case DISCONNECT_NOT_YET: + case DISCONNECT_NORMAL: + /* we don't collect these */ + break; + case DISCONNECT_CLIENT_EOF: + dbentry->n_sessions_abandoned++; + break; 
+ case DISCONNECT_FATAL: + dbentry->n_sessions_fatal++; + break; + case DISCONNECT_KILLED: + dbentry->n_sessions_killed++; + break; + } +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one database or NULL. NULL doesn't mean + * that the database doesn't exist, just that there are no statistics, so the + * caller is better off to report ZERO instead. + */ +PgStat_StatDBEntry * +pgstat_fetch_stat_dbentry(Oid dboid) +{ + return (PgStat_StatDBEntry *) + pgstat_fetch_entry(PGSTAT_KIND_DATABASE, dboid, InvalidOid); } void @@ -205,57 +260,47 @@ AtEOXact_PgStat_Database(bool isCommit, bool parallel) } /* - * Subroutine for pgstat_send_tabstat: Handle xact commit/rollback and I/O + * Subroutine for pgstat_report_stat(): Handle xact commit/rollback and I/O * timings. */ void -pgstat_update_dbstats(PgStat_MsgTabstat *tsmsg, TimestampTz now) +pgstat_update_dbstats(TimestampTz ts) { - if (OidIsValid(tsmsg->m_databaseid)) - { - tsmsg->m_xact_commit = pgStatXactCommit; - tsmsg->m_xact_rollback = pgStatXactRollback; - tsmsg->m_block_read_time = pgStatBlockReadTime; - tsmsg->m_block_write_time = pgStatBlockWriteTime; + PgStat_StatDBEntry *dbentry; - if (pgstat_should_report_connstat()) - { - long secs; - int usecs; + dbentry = pgstat_prep_database_pending(MyDatabaseId); - /* - * pgLastSessionReportTime is initialized to MyStartTimestamp by - * pgstat_report_connect(). - */ - TimestampDifference(pgLastSessionReportTime, now, &secs, &usecs); - pgLastSessionReportTime = now; - tsmsg->m_session_time = (PgStat_Counter) secs * 1000000 + usecs; - tsmsg->m_active_time = pgStatActiveTime; - tsmsg->m_idle_in_xact_time = pgStatTransactionIdleTime; - } - else - { - tsmsg->m_session_time = 0; - tsmsg->m_active_time = 0; - tsmsg->m_idle_in_xact_time = 0; - } - pgStatXactCommit = 0; - pgStatXactRollback = 0; - pgStatBlockReadTime = 0; - pgStatBlockWriteTime = 0; - pgStatActiveTime = 0; - pgStatTransactionIdleTime = 0; - } - else + /* + * Accumulate xact commit/rollback and I/O timings to stats entry of the + * current database. + */ + dbentry->n_xact_commit += pgStatXactCommit; + dbentry->n_xact_rollback += pgStatXactRollback; + dbentry->n_block_read_time += pgStatBlockReadTime; + dbentry->n_block_write_time += pgStatBlockWriteTime; + + if (pgstat_should_report_connstat()) { - tsmsg->m_xact_commit = 0; - tsmsg->m_xact_rollback = 0; - tsmsg->m_block_read_time = 0; - tsmsg->m_block_write_time = 0; - tsmsg->m_session_time = 0; - tsmsg->m_active_time = 0; - tsmsg->m_idle_in_xact_time = 0; + long secs; + int usecs; + + /* + * pgLastSessionReportTime is initialized to MyStartTimestamp by + * pgstat_report_connect(). + */ + TimestampDifference(pgLastSessionReportTime, ts, &secs, &usecs); + pgLastSessionReportTime = ts; + dbentry->total_session_time += (PgStat_Counter) secs * 1000000 + usecs; + dbentry->total_active_time += pgStatActiveTime; + dbentry->total_idle_in_xact_time += pgStatTransactionIdleTime; } + + pgStatXactCommit = 0; + pgStatXactRollback = 0; + pgStatBlockReadTime = 0; + pgStatBlockWriteTime = 0; + pgStatActiveTime = 0; + pgStatTransactionIdleTime = 0; } /* @@ -270,3 +315,111 @@ pgstat_should_report_connstat(void) { return MyBackendType == B_BACKEND; } + +/* + * Find or create a local PgStat_StatDBEntry entry for dboid. 
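Taken together, the per-database reporting functions above all follow the same pending-entry pattern around pgstat_prep_database_pending() (defined just below): bump a counter in backend-local memory now, and let a later pgstat_report_stat() fold it into shared memory through pgstat_database_flush_cb(). A hypothetical new counter would look like the sketch below; n_widgets is invented purely for illustration and does not exist in the patch.

/*
 * Hypothetical example of adding a per-database counter in the pending
 * style.
 */
void
pgstat_report_widget_use(void)
{
	PgStat_StatDBEntry *dbent;

	if (!pgstat_track_counts)
		return;

	/* cheap: only touches the backend-local pending entry */
	dbent = pgstat_prep_database_pending(MyDatabaseId);
	dbent->n_widgets++;

	/*
	 * No shared memory is touched here.  The next pgstat_report_stat() call
	 * flushes the pending entry via pgstat_database_flush_cb(), which adds
	 * each field to the shared entry under its lock (see
	 * PGSTAT_ACCUM_DBCOUNT below).
	 */
}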
+ */ +PgStat_StatDBEntry * +pgstat_prep_database_pending(Oid dboid) +{ + PgStat_EntryRef *entry_ref; + + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_DATABASE, dboid, InvalidOid, + NULL); + + return entry_ref->pending; + +} + +/* + * Reset the database's reset timestamp, without resetting the contents of the + * database stats. + */ +void +pgstat_reset_database_timestamp(Oid dboid, TimestampTz ts) +{ + PgStat_EntryRef *dbref; + PgStatShared_Database *dbentry; + + dbref = pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, MyDatabaseId, InvalidOid, + false); + + dbentry = (PgStatShared_Database *) dbref->shared_stats; + dbentry->stats.stat_reset_timestamp = ts; + + pgstat_unlock_entry(dbref); +} + +/* + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if lock could not + * immediately acquired, otherwise true is returned. + */ +bool +pgstat_database_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) +{ + PgStatShared_Database *sharedent; + PgStat_StatDBEntry *pendingent; + + pendingent = (PgStat_StatDBEntry *) entry_ref->pending; + sharedent = (PgStatShared_Database *) entry_ref->shared_stats; + + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; + +#define PGSTAT_ACCUM_DBCOUNT(item) \ + (sharedent)->stats.item += (pendingent)->item + + PGSTAT_ACCUM_DBCOUNT(n_xact_commit); + PGSTAT_ACCUM_DBCOUNT(n_xact_rollback); + PGSTAT_ACCUM_DBCOUNT(n_blocks_fetched); + PGSTAT_ACCUM_DBCOUNT(n_blocks_hit); + + PGSTAT_ACCUM_DBCOUNT(n_tuples_returned); + PGSTAT_ACCUM_DBCOUNT(n_tuples_fetched); + PGSTAT_ACCUM_DBCOUNT(n_tuples_inserted); + PGSTAT_ACCUM_DBCOUNT(n_tuples_updated); + PGSTAT_ACCUM_DBCOUNT(n_tuples_deleted); + + /* last_autovac_time is reported immediately */ + Assert(pendingent->last_autovac_time == 0); + + PGSTAT_ACCUM_DBCOUNT(n_conflict_tablespace); + PGSTAT_ACCUM_DBCOUNT(n_conflict_lock); + PGSTAT_ACCUM_DBCOUNT(n_conflict_snapshot); + PGSTAT_ACCUM_DBCOUNT(n_conflict_bufferpin); + PGSTAT_ACCUM_DBCOUNT(n_conflict_startup_deadlock); + + PGSTAT_ACCUM_DBCOUNT(n_temp_bytes); + PGSTAT_ACCUM_DBCOUNT(n_temp_files); + PGSTAT_ACCUM_DBCOUNT(n_deadlocks); + + /* checksum failures are reported immediately */ + Assert(pendingent->n_checksum_failures == 0); + Assert(pendingent->last_checksum_failure == 0); + + PGSTAT_ACCUM_DBCOUNT(n_block_read_time); + PGSTAT_ACCUM_DBCOUNT(n_block_write_time); + + PGSTAT_ACCUM_DBCOUNT(n_sessions); + PGSTAT_ACCUM_DBCOUNT(total_session_time); + PGSTAT_ACCUM_DBCOUNT(total_active_time); + PGSTAT_ACCUM_DBCOUNT(total_idle_in_xact_time); + PGSTAT_ACCUM_DBCOUNT(n_sessions_abandoned); + PGSTAT_ACCUM_DBCOUNT(n_sessions_fatal); + PGSTAT_ACCUM_DBCOUNT(n_sessions_killed); +#undef PGSTAT_ACCUM_DBCOUNT + + pgstat_unlock_entry(entry_ref); + + memset(pendingent, 0, sizeof(*pendingent)); + + return true; +} + +void +pgstat_database_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_Database *) header)->stats.stat_reset_timestamp = ts; +} diff --git a/src/backend/utils/activity/pgstat_function.c b/src/backend/utils/activity/pgstat_function.c index ad9879afb2..427d8c47fc 100644 --- a/src/backend/utils/activity/pgstat_function.c +++ b/src/backend/utils/activity/pgstat_function.c @@ -17,8 +17,10 @@ #include "postgres.h" +#include "fmgr.h" +#include "utils/inval.h" #include "utils/pgstat_internal.h" -#include "utils/timestamp.h" +#include "utils/syscache.h" /* ---------- @@ -28,18 +30,6 @@ int pgstat_track_functions = TRACK_FUNC_OFF; -/* - * Indicates if backend has some function stats that it hasn't yet - * 
sent to the collector. - */ -bool have_function_stats = false; - -/* - * Backends store per-function info that's waiting to be sent to the collector - * in this hash table (indexed by function OID). - */ -static HTAB *pgStatFunctions = NULL; - /* * Total time charged to functions so far in the current backend. * We use this to help separate "self" and "other" time charges. @@ -61,6 +51,10 @@ pgstat_create_function(Oid proid) /* * Ensure that stats are dropped if transaction commits. + * + * NB: This is only reliable because pgstat_init_function_usage() does some + * extra work. If other places start emitting function stats they likely need + * similar logic. */ void pgstat_drop_function(Oid proid) @@ -78,8 +72,9 @@ void pgstat_init_function_usage(FunctionCallInfo fcinfo, PgStat_FunctionCallUsage *fcu) { - PgStat_BackendFunctionEntry *htabent; - bool found; + PgStat_EntryRef *entry_ref; + PgStat_BackendFunctionEntry *pending; + bool created_entry; if (pgstat_track_functions <= fcinfo->flinfo->fn_stats) { @@ -88,29 +83,48 @@ pgstat_init_function_usage(FunctionCallInfo fcinfo, return; } - if (!pgStatFunctions) - { - /* First time through - initialize function stat table */ - HASHCTL hash_ctl; + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_FUNCTION, + MyDatabaseId, + fcinfo->flinfo->fn_oid, + &created_entry); - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_BackendFunctionEntry); - pgStatFunctions = hash_create("Function stat entries", - PGSTAT_FUNCTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); + /* + * If no shared entry already exists, check if the function has been + * deleted concurrently. This can go unnoticed until here because + * executing a statement that just calls a function, does not trigger + * cache invalidation processing. The reason we care about this case is + * that otherwise we could create a new stats entry for an already dropped + * function (for relations etc this is not possible because emitting stats + * requires a lock for the relation to already have been acquired). + * + * It's somewhat ugly to have a behavioral difference based on + * track_functions being enabled/disabled. But it seems acceptable, given + * that there's already behavioral differences depending on whether the + * function is the caches etc. + * + * For correctness it'd be sufficient to set ->dropped to true. However, + * the accepted invalidation will commonly cause "low level" failures in + * PL code, with an OID in the error message. Making this harder to + * test... 
+ */ + if (created_entry) + { + AcceptInvalidationMessages(); + if (!SearchSysCacheExists1(PROCOID, ObjectIdGetDatum(fcinfo->flinfo->fn_oid))) + { + pgstat_drop_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, + fcinfo->flinfo->fn_oid); + ereport(ERROR, errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("function call to dropped function")); + } } - /* Get the stats entry for this function, create if necessary */ - htabent = hash_search(pgStatFunctions, &fcinfo->flinfo->fn_oid, - HASH_ENTER, &found); - if (!found) - MemSet(&htabent->f_counts, 0, sizeof(PgStat_FunctionCounts)); + pending = entry_ref->pending; - fcu->fs = &htabent->f_counts; + fcu->fs = &pending->f_counts; /* save stats for this function, later used to compensate for recursion */ - fcu->save_f_total_time = htabent->f_counts.f_total_time; + fcu->save_f_total_time = pending->f_counts.f_total_time; /* save current backend-wide total time */ fcu->save_total = total_func_time; @@ -167,64 +181,37 @@ pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize) fs->f_numcalls++; fs->f_total_time = f_total; INSTR_TIME_ADD(fs->f_self_time, f_self); - - /* indicate that we have something to send */ - have_function_stats = true; } /* - * Subroutine for pgstat_report_stat: populate and send a function stat message + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if lock could not + * immediately acquired, otherwise true is returned. */ -void -pgstat_send_funcstats(void) +bool +pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) { - /* we assume this inits to all zeroes: */ - static const PgStat_FunctionCounts all_zeroes; + PgStat_BackendFunctionEntry *localent; + PgStatShared_Function *shfuncent; - PgStat_MsgFuncstat msg; - PgStat_BackendFunctionEntry *entry; - HASH_SEQ_STATUS fstat; + localent = (PgStat_BackendFunctionEntry *) entry_ref->pending; + shfuncent = (PgStatShared_Function *) entry_ref->shared_stats; - if (pgStatFunctions == NULL) - return; + /* localent always has non-zero content */ - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FUNCSTAT); - msg.m_databaseid = MyDatabaseId; - msg.m_nentries = 0; + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; - hash_seq_init(&fstat, pgStatFunctions); - while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL) - { - PgStat_FunctionEntry *m_ent; + shfuncent->stats.f_numcalls += localent->f_counts.f_numcalls; + shfuncent->stats.f_total_time += + INSTR_TIME_GET_MICROSEC(localent->f_counts.f_total_time); + shfuncent->stats.f_self_time += + INSTR_TIME_GET_MICROSEC(localent->f_counts.f_self_time); - /* Skip it if no counts accumulated since last time */ - if (memcmp(&entry->f_counts, &all_zeroes, - sizeof(PgStat_FunctionCounts)) == 0) - continue; + pgstat_unlock_entry(entry_ref); - /* need to convert format of time accumulators */ - m_ent = &msg.m_entry[msg.m_nentries]; - m_ent->f_id = entry->f_id; - m_ent->f_numcalls = entry->f_counts.f_numcalls; - m_ent->f_total_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_total_time); - m_ent->f_self_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_self_time); - - if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES) - { - pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) + - msg.m_nentries * sizeof(PgStat_FunctionEntry)); - msg.m_nentries = 0; - } - - /* reset the entry's counts */ - MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts)); - } - - if (msg.m_nentries > 0) - pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) + - msg.m_nentries * 
sizeof(PgStat_FunctionEntry)); - - have_function_stats = false; + return true; } /* @@ -235,12 +222,22 @@ pgstat_send_funcstats(void) PgStat_BackendFunctionEntry * find_funcstat_entry(Oid func_id) { - pgstat_assert_is_up(); + PgStat_EntryRef *entry_ref; - if (pgStatFunctions == NULL) - return NULL; + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, func_id); - return (PgStat_BackendFunctionEntry *) hash_search(pgStatFunctions, - (void *) &func_id, - HASH_FIND, NULL); + if (entry_ref) + return entry_ref->pending; + return NULL; +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one function or NULL. + */ +PgStat_StatFuncEntry * +pgstat_fetch_stat_funcentry(Oid func_id) +{ + return (PgStat_StatFuncEntry *) + pgstat_fetch_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, func_id); } diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index 51a87b6673..bec190c589 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -19,6 +19,7 @@ #include "access/twophase_rmgr.h" #include "access/xact.h" +#include "catalog/partition.h" #include "postmaster/autovacuum.h" #include "utils/memutils.h" #include "utils/pgstat_internal.h" @@ -26,38 +27,6 @@ #include "utils/timestamp.h" -/* - * Structures in which backends store per-table info that's waiting to be - * sent to the collector. - * - * NOTE: once allocated, TabStatusArray structures are never moved or deleted - * for the life of the backend. Also, we zero out the t_id fields of the - * contained PgStat_TableStatus structs whenever they are not actively in use. - * This allows relcache pgstat_info pointers to be treated as long-lived data, - * avoiding repeated searches in pgstat_init_relation() when a relation is - * repeatedly opened during a transaction. - */ -#define TABSTAT_QUANTUM 100 /* we alloc this many at a time */ - - -typedef struct TabStatusArray -{ - struct TabStatusArray *tsa_next; /* link to next array, if any */ - int tsa_used; /* # entries currently used */ - PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM]; /* per-table data */ -} TabStatusArray; - -static TabStatusArray *pgStatTabList = NULL; - -/* - * pgStatTabHash entry: map from relation OID to PgStat_TableStatus pointer - */ -typedef struct TabStatHashEntry -{ - Oid t_id; - PgStat_TableStatus *tsa_entry; -} TabStatHashEntry; - /* Record that's written to 2PC state file when pgstat state is persisted */ typedef struct TwoPhasePgStatRecord { @@ -74,27 +43,13 @@ typedef struct TwoPhasePgStatRecord } TwoPhasePgStatRecord; -static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared); -static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now); +static PgStat_TableStatus *pgstat_prep_relation_pending(Oid rel_id, bool isshared); static void add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level); static void ensure_tabstat_xact_level(PgStat_TableStatus *pgstat_info); static void save_truncdrop_counters(PgStat_TableXactStatus *trans, bool is_drop); static void restore_truncdrop_counters(PgStat_TableXactStatus *trans); -/* - * Indicates if backend has some relation stats that it hasn't yet - * sent to the collector. - */ -bool have_relation_stats; - - -/* - * Hash table for O(1) t_id -> tsa_entry lookup - */ -static HTAB *pgStatTabHash = NULL; - - /* * Copy stats between relations. This is used for things like REINDEX * CONCURRENTLY. 
@@ -103,43 +58,39 @@ void pgstat_copy_relation_stats(Relation dst, Relation src) { PgStat_StatTabEntry *srcstats; + PgStatShared_Relation *dstshstats; + PgStat_EntryRef *dst_ref; - srcstats = pgstat_fetch_stat_tabentry(RelationGetRelid(src)); - + srcstats = pgstat_fetch_stat_tabentry_ext(src->rd_rel->relisshared, + RelationGetRelid(src)); if (!srcstats) return; - if (pgstat_should_count_relation(dst)) - { - /* - * XXX: temporarily this does not actually quite do what the name - * says, and just copy index related fields. A subsequent commit will - * do more. - */ + dst_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, + dst->rd_rel->relisshared ? InvalidOid : MyDatabaseId, + RelationGetRelid(dst), + false); - dst->pgstat_info->t_counts.t_numscans = srcstats->numscans; - dst->pgstat_info->t_counts.t_tuples_returned = srcstats->tuples_returned; - dst->pgstat_info->t_counts.t_tuples_fetched = srcstats->tuples_fetched; - dst->pgstat_info->t_counts.t_blocks_fetched = srcstats->blocks_fetched; - dst->pgstat_info->t_counts.t_blocks_hit = srcstats->blocks_hit; + dstshstats = (PgStatShared_Relation *) dst_ref->shared_stats; + dstshstats->stats = *srcstats; - /* the data will be sent by the next pgstat_report_stat() call */ - } + pgstat_unlock_entry(dst_ref); } /* - * Initialize a relcache entry to count access statistics. - * Called whenever a relation is opened. + * Initialize a relcache entry to count access statistics. Called whenever a + * relation is opened. * - * We assume that a relcache entry's pgstat_info field is zeroed by - * relcache.c when the relcache entry is made; thereafter it is long-lived - * data. We can avoid repeated searches of the TabStatus arrays when the - * same relation is touched repeatedly within a transaction. + * We assume that a relcache entry's pgstat_info field is zeroed by relcache.c + * when the relcache entry is made; thereafter it is long-lived data. + * + * This does not create a reference to a stats entry in shared memory, nor + * allocate memory for the pending stats. That happens in + * pgstat_assoc_relation(). */ void pgstat_init_relation(Relation rel) { - Oid rel_id = rel->rd_id; char relkind = rel->rd_rel->relkind; /* @@ -147,27 +98,68 @@ pgstat_init_relation(Relation rel) */ if (!RELKIND_HAS_STORAGE(relkind) && relkind != RELKIND_PARTITIONED_TABLE) { + rel->pgstat_enabled = false; rel->pgstat_info = NULL; return; } - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) { + if (rel->pgstat_info) + pgstat_unlink_relation(rel); + /* We're not counting at all */ + rel->pgstat_enabled = false; rel->pgstat_info = NULL; return; } - /* - * If we already set up this relation in the current transaction, nothing - * to do. - */ - if (rel->pgstat_info != NULL && - rel->pgstat_info->t_id == rel_id) - return; + rel->pgstat_enabled = true; +} + +/* + * Prepare for statistics for this relation to be collected. + * + * This ensures we have a reference to the stats entry before stats can be + * generated. That is important because a relation drop in another connection + * could otherwise lead to the stats entry being dropped, which then later + * would get recreated when flushing stats. + * + * This is separate from pgstat_init_relation() as it is not uncommon for + * relcache entries to be opened without ever getting stats reported. 
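Because association is now lazy, per-access counting has to check a flag first and attach the pending entry on first use. The removed hunk above already referenced pgstat_should_count_relation(); under the new scheme that check boils down to something like the sketch below, which is only an illustrative equivalent and not the macro's actual definition.

/*
 * Illustrative equivalent of the pgstat_should_count_relation() check under
 * lazy association.
 */
static inline bool
should_count_relation_sketch(Relation rel)
{
	if (rel->pgstat_info != NULL)
		return true;			/* already associated earlier */

	if (rel->pgstat_enabled)
	{
		/* first counted access: create the pending entry and link both ways */
		pgstat_assoc_relation(rel);
		return true;
	}

	return false;				/* counting disabled for this relation */
}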
+ */ +void +pgstat_assoc_relation(Relation rel) +{ + Assert(rel->pgstat_enabled); + Assert(rel->pgstat_info == NULL); /* Else find or make the PgStat_TableStatus entry, and update link */ - rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared); + rel->pgstat_info = pgstat_prep_relation_pending(RelationGetRelid(rel), + rel->rd_rel->relisshared); + + /* don't allow link a stats to multiple relcache entries */ + Assert(rel->pgstat_info->relation == NULL); + + /* mark this relation as the owner */ + rel->pgstat_info->relation = rel; +} + +/* + * Break the mutual link between a relcache entry and pending stats entry. + * This must be called whenever one end of the link is removed. + */ +void +pgstat_unlink_relation(Relation rel) +{ + /* remove the link to stats info if any */ + if (rel->pgstat_info == NULL) + return; + + /* link sanity check */ + Assert(rel->pgstat_info->relation == rel); + rel->pgstat_info->relation = NULL; + rel->pgstat_info = NULL; } /* @@ -187,9 +179,26 @@ pgstat_create_relation(Relation rel) void pgstat_drop_relation(Relation rel) { + int nest_level = GetCurrentTransactionNestLevel(); + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + pgstat_drop_transactional(PGSTAT_KIND_RELATION, rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId, RelationGetRelid(rel)); + + /* + * Transactionally set counters to 0. That ensures that accesses to + * pg_stat_xact_all_tables inside the transaction show 0. + */ + if (pgstat_info && + pgstat_info->trans != NULL && + pgstat_info->trans->nest_level == nest_level) + { + save_truncdrop_counters(pgstat_info->trans, true); + pgstat_info->trans->tuples_inserted = 0; + pgstat_info->trans->tuples_updated = 0; + pgstat_info->trans->tuples_deleted = 0; + } } /* @@ -199,19 +208,52 @@ void pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter livetuples, PgStat_Counter deadtuples) { - PgStat_MsgVacuum msg; + PgStat_EntryRef *entry_ref; + PgStatShared_Relation *shtabentry; + PgStat_StatTabEntry *tabentry; + Oid dboid = (shared ? InvalidOid : MyDatabaseId); + TimestampTz ts; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM); - msg.m_databaseid = shared ? InvalidOid : MyDatabaseId; - msg.m_tableoid = tableoid; - msg.m_autovacuum = IsAutoVacuumWorkerProcess(); - msg.m_vacuumtime = GetCurrentTimestamp(); - msg.m_live_tuples = livetuples; - msg.m_dead_tuples = deadtuples; - pgstat_send(&msg, sizeof(msg)); + /* Store the data in the table's hash table entry. */ + ts = GetCurrentTimestamp(); + + /* block acquiring lock for the same reason as pgstat_report_autovac() */ + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, + dboid, tableoid, false); + + shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats; + tabentry = &shtabentry->stats; + + tabentry->n_live_tuples = livetuples; + tabentry->n_dead_tuples = deadtuples; + + /* + * It is quite possible that a non-aggressive VACUUM ended up skipping + * various pages, however, we'll zero the insert counter here regardless. + * It's currently used only to track when we need to perform an "insert" + * autovacuum, which are mainly intended to freeze newly inserted tuples. + * Zeroing this may just mean we'll not try to vacuum the table again + * until enough tuples have been inserted to trigger another insert + * autovacuum. An anti-wraparound autovacuum will catch any persistent + * stragglers. 
+ */ + tabentry->inserts_since_vacuum = 0; + + if (IsAutoVacuumWorkerProcess()) + { + tabentry->autovac_vacuum_timestamp = ts; + tabentry->autovac_vacuum_count++; + } + else + { + tabentry->vacuum_timestamp = ts; + tabentry->vacuum_count++; + } + + pgstat_unlock_entry(entry_ref); } /* @@ -225,9 +267,12 @@ pgstat_report_analyze(Relation rel, PgStat_Counter livetuples, PgStat_Counter deadtuples, bool resetcounter) { - PgStat_MsgAnalyze msg; + PgStat_EntryRef *entry_ref; + PgStatShared_Relation *shtabentry; + PgStat_StatTabEntry *tabentry; + Oid dboid = (rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId); - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; /* @@ -259,15 +304,39 @@ pgstat_report_analyze(Relation rel, deadtuples = Max(deadtuples, 0); } - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE); - msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId; - msg.m_tableoid = RelationGetRelid(rel); - msg.m_autovacuum = IsAutoVacuumWorkerProcess(); - msg.m_resetcounter = resetcounter; - msg.m_analyzetime = GetCurrentTimestamp(); - msg.m_live_tuples = livetuples; - msg.m_dead_tuples = deadtuples; - pgstat_send(&msg, sizeof(msg)); + /* block acquiring lock for the same reason as pgstat_report_autovac() */ + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, dboid, + RelationGetRelid(rel), + false); + /* can't get dropped while accessed */ + Assert(entry_ref != NULL && entry_ref->shared_stats != NULL); + + shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats; + tabentry = &shtabentry->stats; + + tabentry->n_live_tuples = livetuples; + tabentry->n_dead_tuples = deadtuples; + + /* + * If commanded, reset changes_since_analyze to zero. This forgets any + * changes that were committed while the ANALYZE was in progress, but we + * have no good way to estimate how many of those there were. + */ + if (resetcounter) + tabentry->changes_since_analyze = 0; + + if (IsAutoVacuumWorkerProcess()) + { + tabentry->autovac_analyze_timestamp = GetCurrentTimestamp(); + tabentry->autovac_analyze_count++; + } + else + { + tabentry->analyze_timestamp = GetCurrentTimestamp(); + tabentry->analyze_count++; + } + + pgstat_unlock_entry(entry_ref); } /* @@ -356,30 +425,61 @@ pgstat_update_heap_dead_tuples(Relation rel, int delta) } } +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one table or NULL. NULL doesn't mean + * that the table doesn't exist, just that there are no statistics, so the + * caller is better off to report ZERO instead. + */ +PgStat_StatTabEntry * +pgstat_fetch_stat_tabentry(Oid relid) +{ + PgStat_StatTabEntry *tabentry; + + tabentry = pgstat_fetch_stat_tabentry_ext(false, relid); + if (tabentry != NULL) + return tabentry; + + /* + * If we didn't find it, maybe it's a shared table. + */ + tabentry = pgstat_fetch_stat_tabentry_ext(true, relid); + return tabentry; +} + +/* + * More efficient version of pgstat_fetch_stat_tabentry(), allowing to specify + * whether the to-be-accessed table is a shared relation or not. + */ +PgStat_StatTabEntry * +pgstat_fetch_stat_tabentry_ext(bool shared, Oid reloid) +{ + Oid dboid = (shared ? InvalidOid : MyDatabaseId); + + return (PgStat_StatTabEntry *) + pgstat_fetch_entry(PGSTAT_KIND_RELATION, dboid, reloid); +} + /* * find any existing PgStat_TableStatus entry for rel * - * If no entry, return NULL, don't create a new one + * Find any existing PgStat_TableStatus entry for rel_id in the current + * database. 
If not found, try finding from shared tables. * - * Note: if we got an error in the most recent execution of pgstat_report_stat, - * it's possible that an entry exists but there's no hashtable entry for it. - * That's okay, we'll treat this case as "doesn't exist". + * If no entry found, return NULL, don't create a new one */ PgStat_TableStatus * find_tabstat_entry(Oid rel_id) { - TabStatHashEntry *hash_entry; + PgStat_EntryRef *entry_ref; - /* If hashtable doesn't exist, there are no entries at all */ - if (!pgStatTabHash) - return NULL; + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, MyDatabaseId, rel_id); + if (!entry_ref) + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, InvalidOid, rel_id); - hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_FIND, NULL); - if (!hash_entry) - return NULL; - - /* Note that this step could also return NULL, but that's correct */ - return hash_entry->tsa_entry; + if (entry_ref) + return entry_ref->pending; + return NULL; } /* @@ -536,7 +636,7 @@ AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state) for (trans = xact_state->first; trans != NULL; trans = trans->next) { - PgStat_TableStatus *tabstat; + PgStat_TableStatus *tabstat PG_USED_FOR_ASSERTS_ONLY; TwoPhasePgStatRecord record; Assert(trans->nest_level == 1); @@ -594,7 +694,7 @@ pgstat_twophase_postcommit(TransactionId xid, uint16 info, PgStat_TableStatus *pgstat_info; /* Find or create a tabstat entry for the rel */ - pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared); + pgstat_info = pgstat_prep_relation_pending(rec->t_id, rec->t_shared); /* Same math as in AtEOXact_PgStat, commit case */ pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted; @@ -630,7 +730,7 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info, PgStat_TableStatus *pgstat_info; /* Find or create a tabstat entry for the rel */ - pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared); + pgstat_info = pgstat_prep_relation_pending(rec->t_id, rec->t_shared); /* Same math as in AtEOXact_PgStat, abort case */ if (rec->t_truncdropped) @@ -647,204 +747,116 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info, } /* - * Subroutine for pgstat_report_stat: Send relation statistics + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if lock could not + * immediately acquired, otherwise true is returned. + * + * Some of the stats are copied to the corresponding pending database stats + * entry when successfully flushing. */ -void -pgstat_send_tabstats(TimestampTz now, bool disconnect) +bool +pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) { - /* we assume this inits to all zeroes: */ static const PgStat_TableCounts all_zeroes; - PgStat_MsgTabstat regular_msg; - PgStat_MsgTabstat shared_msg; - TabStatusArray *tsa; - int i; + Oid dboid; + PgStat_TableStatus *lstats; /* pending stats entry */ + PgStatShared_Relation *shtabstats; + PgStat_StatTabEntry *tabentry; /* table entry of shared stats */ + PgStat_StatDBEntry *dbentry; /* pending database entry */ + + dboid = entry_ref->shared_entry->key.dboid; + lstats = (PgStat_TableStatus *) entry_ref->pending; + shtabstats = (PgStatShared_Relation *) entry_ref->shared_stats; /* - * Destroy pgStatTabHash before we start invalidating PgStat_TableEntry - * entries it points to. 
(Should we fail partway through the loop below, - * it's okay to have removed the hashtable already --- the only - * consequence is we'd get multiple entries for the same table in the - * pgStatTabList, and that's safe.) + * Ignore entries that didn't accumulate any actual counts, such as + * indexes that were opened by the planner but not used. */ - if (pgStatTabHash) - hash_destroy(pgStatTabHash); - pgStatTabHash = NULL; - - /* - * Scan through the TabStatusArray struct(s) to find tables that actually - * have counts, and build messages to send. We have to separate shared - * relations from regular ones because the databaseid field in the message - * header has to depend on that. - */ - regular_msg.m_databaseid = MyDatabaseId; - shared_msg.m_databaseid = InvalidOid; - regular_msg.m_nentries = 0; - shared_msg.m_nentries = 0; - - for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next) + if (memcmp(&lstats->t_counts, &all_zeroes, + sizeof(PgStat_TableCounts)) == 0) { - for (i = 0; i < tsa->tsa_used; i++) - { - PgStat_TableStatus *entry = &tsa->tsa_entries[i]; - PgStat_MsgTabstat *this_msg; - PgStat_TableEntry *this_ent; - - /* Shouldn't have any pending transaction-dependent counts */ - Assert(entry->trans == NULL); - - /* - * Ignore entries that didn't accumulate any actual counts, such - * as indexes that were opened by the planner but not used. - */ - if (memcmp(&entry->t_counts, &all_zeroes, - sizeof(PgStat_TableCounts)) == 0) - continue; - - /* - * OK, insert data into the appropriate message, and send if full. - */ - this_msg = entry->t_shared ? &shared_msg : ®ular_msg; - this_ent = &this_msg->m_entry[this_msg->m_nentries]; - this_ent->t_id = entry->t_id; - memcpy(&this_ent->t_counts, &entry->t_counts, - sizeof(PgStat_TableCounts)); - if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES) - { - pgstat_send_tabstat(this_msg, now); - this_msg->m_nentries = 0; - } - } - /* zero out PgStat_TableStatus structs after use */ - MemSet(tsa->tsa_entries, 0, - tsa->tsa_used * sizeof(PgStat_TableStatus)); - tsa->tsa_used = 0; + return true; } - /* - * Send partial messages. Make sure that any pending xact commit/abort - * and connection stats get counted, even if there are no table stats to - * send. - */ - if (regular_msg.m_nentries > 0 || - pgStatXactCommit > 0 || pgStatXactRollback > 0 || disconnect) - pgstat_send_tabstat(®ular_msg, now); - if (shared_msg.m_nentries > 0) - pgstat_send_tabstat(&shared_msg, now); + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; - have_relation_stats = false; + /* add the values to the shared entry. */ + tabentry = &shtabstats->stats; + + tabentry->numscans += lstats->t_counts.t_numscans; + tabentry->tuples_returned += lstats->t_counts.t_tuples_returned; + tabentry->tuples_fetched += lstats->t_counts.t_tuples_fetched; + tabentry->tuples_inserted += lstats->t_counts.t_tuples_inserted; + tabentry->tuples_updated += lstats->t_counts.t_tuples_updated; + tabentry->tuples_deleted += lstats->t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated += lstats->t_counts.t_tuples_hot_updated; + + /* + * If table was truncated/dropped, first reset the live/dead counters. 
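Note that the deltas folded in here can legitimately be negative (deletes, rolled-back inserts), and the shared values may meanwhile have been reset or rewritten by a truncation, which is why the live/dead counters are clamped with Max(..., 0) a few lines further down. A worked example with invented numbers:

/*
 * Invented numbers showing why the clamp below is needed.
 */
static void
clamp_example(PgStat_StatTabEntry *tabentry)
{
	PgStat_Counter delta_live = -40;	/* locally counted deletes */

	tabentry->n_live_tuples = 25;		/* shared value after a stats reset */
	tabentry->n_live_tuples += delta_live;	/* would go to -15 */
	tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0);	/* clamp */
}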
+ */ + if (lstats->t_counts.t_truncdropped) + { + tabentry->n_live_tuples = 0; + tabentry->n_dead_tuples = 0; + tabentry->inserts_since_vacuum = 0; + } + + tabentry->n_live_tuples += lstats->t_counts.t_delta_live_tuples; + tabentry->n_dead_tuples += lstats->t_counts.t_delta_dead_tuples; + tabentry->changes_since_analyze += lstats->t_counts.t_changed_tuples; + tabentry->inserts_since_vacuum += lstats->t_counts.t_tuples_inserted; + tabentry->blocks_fetched += lstats->t_counts.t_blocks_fetched; + tabentry->blocks_hit += lstats->t_counts.t_blocks_hit; + + /* Clamp n_live_tuples in case of negative delta_live_tuples */ + tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); + /* Likewise for n_dead_tuples */ + tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); + + pgstat_unlock_entry(entry_ref); + + /* The entry was successfully flushed, add the same to database stats */ + dbentry = pgstat_prep_database_pending(dboid); + dbentry->n_tuples_returned += lstats->t_counts.t_tuples_returned; + dbentry->n_tuples_fetched += lstats->t_counts.t_tuples_fetched; + dbentry->n_tuples_inserted += lstats->t_counts.t_tuples_inserted; + dbentry->n_tuples_updated += lstats->t_counts.t_tuples_updated; + dbentry->n_tuples_deleted += lstats->t_counts.t_tuples_deleted; + dbentry->n_blocks_fetched += lstats->t_counts.t_blocks_fetched; + dbentry->n_blocks_hit += lstats->t_counts.t_blocks_hit; + + return true; } -/* - * Subroutine for pgstat_send_tabstats: finish and send one tabstat message - */ -static void -pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now) +void +pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref) { - int n; - int len; + PgStat_TableStatus *pending = (PgStat_TableStatus *) entry_ref->pending; - /* It's unlikely we'd get here with no socket, but maybe not impossible */ - if (pgStatSock == PGINVALID_SOCKET) - return; - - /* - * Report and reset accumulated xact commit/rollback and I/O timings - * whenever we send a normal tabstat message - */ - pgstat_update_dbstats(tsmsg, now); - - n = tsmsg->m_nentries; - len = offsetof(PgStat_MsgTabstat, m_entry[0]) + - n * sizeof(PgStat_TableEntry); - - pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT); - pgstat_send(tsmsg, len); + if (pending->relation) + pgstat_unlink_relation(pending->relation); } /* - * find or create a PgStat_TableStatus entry for rel + * Find or create a PgStat_TableStatus entry for rel. New entry is created and + * initialized if not exists. */ static PgStat_TableStatus * -get_tabstat_entry(Oid rel_id, bool isshared) +pgstat_prep_relation_pending(Oid rel_id, bool isshared) { - TabStatHashEntry *hash_entry; - PgStat_TableStatus *entry; - TabStatusArray *tsa; - bool found; + PgStat_EntryRef *entry_ref; + PgStat_TableStatus *pending; - pgstat_assert_is_up(); + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_RELATION, + isshared ? InvalidOid : MyDatabaseId, + rel_id, NULL); + pending = entry_ref->pending; + pending->t_id = rel_id; + pending->t_shared = isshared; - have_relation_stats = true; - - /* - * Create hash table if we don't have it already. - */ - if (pgStatTabHash == NULL) - { - HASHCTL ctl; - - ctl.keysize = sizeof(Oid); - ctl.entrysize = sizeof(TabStatHashEntry); - - pgStatTabHash = hash_create("pgstat TabStatusArray lookup hash table", - TABSTAT_QUANTUM, - &ctl, - HASH_ELEM | HASH_BLOBS); - } - - /* - * Find an entry or create a new one. 
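The relation code above is one instance of the general flush-callback contract for variable-numbered stats: skip entries whose pending counts are all zero, take the per-entry lock (conditionally when nowait is set), accumulate into the shared entry, and report failure only when the lock could not be taken. The sketch below is illustrative only: the "foo" kind and its pending/shared structs are invented, while pgstat_lock_entry(), pgstat_unlock_entry() and the PgStat_EntryRef fields are the ones introduced by this patch.

#include "postgres.h"
#include "utils/pgstat_internal.h"

/* hypothetical per-backend pending counters for a "foo" stats kind */
typedef struct PgStat_PendingFoo
{
	PgStat_Counter calls;
} PgStat_PendingFoo;

/* hypothetical shared-memory representation of the same kind */
typedef struct PgStatShared_Foo
{
	PgStatShared_Common header;
	PgStat_Counter calls;
} PgStatShared_Foo;

static bool
pgstat_foo_flush_cb(PgStat_EntryRef *entry_ref, bool nowait)
{
	PgStat_PendingFoo *pending = (PgStat_PendingFoo *) entry_ref->pending;
	PgStatShared_Foo *shared = (PgStatShared_Foo *) entry_ref->shared_stats;

	/* nothing accumulated locally, nothing to flush */
	if (pending->calls == 0)
		return true;

	/* with nowait, back off instead of sleeping on the entry lock */
	if (!pgstat_lock_entry(entry_ref, nowait))
		return false;

	shared->calls += pending->calls;
	pgstat_unlock_entry(entry_ref);

	return true;
}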
- */ - hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_ENTER, &found); - if (!found) - { - /* initialize new entry with null pointer */ - hash_entry->tsa_entry = NULL; - } - - /* - * If entry is already valid, we're done. - */ - if (hash_entry->tsa_entry) - return hash_entry->tsa_entry; - - /* - * Locate the first pgStatTabList entry with free space, making a new list - * entry if needed. Note that we could get an OOM failure here, but if so - * we have left the hashtable and the list in a consistent state. - */ - if (pgStatTabList == NULL) - { - /* Set up first pgStatTabList entry */ - pgStatTabList = (TabStatusArray *) - MemoryContextAllocZero(TopMemoryContext, - sizeof(TabStatusArray)); - } - - tsa = pgStatTabList; - while (tsa->tsa_used >= TABSTAT_QUANTUM) - { - if (tsa->tsa_next == NULL) - tsa->tsa_next = (TabStatusArray *) - MemoryContextAllocZero(TopMemoryContext, - sizeof(TabStatusArray)); - tsa = tsa->tsa_next; - } - - /* - * Allocate a PgStat_TableStatus entry within this list entry. We assume - * the entry was already zeroed, either at creation or after last use. - */ - entry = &tsa->tsa_entries[tsa->tsa_used++]; - entry->t_id = rel_id; - entry->t_shared = isshared; - - /* - * Now we can fill the entry in pgStatTabHash. - */ - hash_entry->tsa_entry = entry; - - return entry; + return pending; } /* diff --git a/src/backend/utils/activity/pgstat_replslot.c b/src/backend/utils/activity/pgstat_replslot.c index ceefc5d59b..b77c05ab5f 100644 --- a/src/backend/utils/activity/pgstat_replslot.c +++ b/src/backend/utils/activity/pgstat_replslot.c @@ -8,6 +8,14 @@ * storage implementation and the details about individual types of * statistics. * + * Replication slot stats work a bit different than other other + * variable-numbered stats. Slots do not have oids (so they can be created on + * physical replicas). Use the slot index as object id while running. However, + * the slot index can change when restarting. That is addressed by using the + * name when (de-)serializing. After a restart it is possible for slots to + * have been dropped while shut down, which is addressed by not restoring + * stats for slots that cannot be found by name when starting up. + * * Copyright (c) 2001-2022, PostgreSQL Global Development Group * * IDENTIFICATION @@ -22,6 +30,9 @@ #include "utils/pgstat_internal.h" +static int get_replslot_index(const char *name); + + /* * Reset counters for a single replication slot. * @@ -32,18 +43,10 @@ void pgstat_reset_replslot(const char *name) { ReplicationSlot *slot; - PgStat_MsgResetreplslotcounter msg; AssertArg(name != NULL); - if (pgStatSock == PGINVALID_SOCKET) - return; - - /* - * Check if the slot exists with the given name. It is possible that by - * the time this message is executed the slot is dropped but at least this - * check will ensure that the given name is for a valid slot. - */ + /* Check if the slot exits with the given name. 
*/ slot = SearchNamedReplicationSlot(name, true); if (!slot) @@ -59,10 +62,9 @@ pgstat_reset_replslot(const char *name) if (SlotIsPhysical(slot)) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETREPLSLOTCOUNTER); - namestrcpy(&msg.m_slotname, name); - msg.clearall = false; - pgstat_send(&msg, sizeof(msg)); + /* reset this one entry */ + pgstat_reset(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot)); } /* @@ -71,24 +73,34 @@ pgstat_reset_replslot(const char *name) void pgstat_report_replslot(ReplicationSlot *slot, const PgStat_StatReplSlotEntry *repSlotStat) { - PgStat_MsgReplSlot msg; + PgStat_EntryRef *entry_ref; + PgStatShared_ReplSlot *shstatent; + PgStat_StatReplSlotEntry *statent; + + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot), false); + shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats; + statent = &shstatent->stats; /* - * Prepare and send the message + * Any mismatch should have been fixed in pgstat_create_replslot() or + * pgstat_acquire_replslot(). */ - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); - namestrcpy(&msg.m_slotname, NameStr(repSlotStat->slotname)); - msg.m_create = false; - msg.m_drop = false; - msg.m_spill_txns = repSlotStat->spill_txns; - msg.m_spill_count = repSlotStat->spill_count; - msg.m_spill_bytes = repSlotStat->spill_bytes; - msg.m_stream_txns = repSlotStat->stream_txns; - msg.m_stream_count = repSlotStat->stream_count; - msg.m_stream_bytes = repSlotStat->stream_bytes; - msg.m_total_txns = repSlotStat->total_txns; - msg.m_total_bytes = repSlotStat->total_bytes; - pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); + Assert(namestrcmp(&statent->slotname, NameStr(slot->data.name)) == 0); + + /* Update the replication slot statistics */ +#define REPLSLOT_ACC(fld) statent->fld += repSlotStat->fld + REPLSLOT_ACC(spill_txns); + REPLSLOT_ACC(spill_count); + REPLSLOT_ACC(spill_bytes); + REPLSLOT_ACC(stream_txns); + REPLSLOT_ACC(stream_count); + REPLSLOT_ACC(stream_bytes); + REPLSLOT_ACC(total_txns); + REPLSLOT_ACC(total_bytes); +#undef REPLSLOT_ACC + + pgstat_unlock_entry(entry_ref); } /* @@ -100,13 +112,50 @@ pgstat_report_replslot(ReplicationSlot *slot, const PgStat_StatReplSlotEntry *re void pgstat_create_replslot(ReplicationSlot *slot) { - PgStat_MsgReplSlot msg; + PgStat_EntryRef *entry_ref; + PgStatShared_ReplSlot *shstatent; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); - namestrcpy(&msg.m_slotname, NameStr(slot->data.name)); - msg.m_create = true; - msg.m_drop = false; - pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot), false); + shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats; + + /* + * NB: need to accept that there might be stats from an older slot, e.g. + * if we previously crashed after dropping a slot. + */ + memset(&shstatent->stats, 0, sizeof(shstatent->stats)); + namestrcpy(&shstatent->stats.slotname, NameStr(slot->data.name)); + + pgstat_unlock_entry(entry_ref); +} + +/* + * Report replication slot has been acquired. 
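A caller-side sketch of pgstat_report_replslot() above: logical decoding accumulates deltas into a local PgStat_StatReplSlotEntry and hands it over, and the deltas are added to the shared entry under its lock. The wrapper function, variable names and numbers below are invented for illustration.

#include "postgres.h"
#include "pgstat.h"
#include "replication/slot.h"

static void
report_spill_example(ReplicationSlot *slot)
{
	PgStat_StatReplSlotEntry delta;

	memset(&delta, 0, sizeof(delta));

	/* one transaction spilled to disk in four batches, 64kB in total */
	delta.spill_txns = 1;
	delta.spill_count = 4;
	delta.spill_bytes = 65536;

	pgstat_report_replslot(slot, &delta);
}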
+ */ +void +pgstat_acquire_replslot(ReplicationSlot *slot) +{ + PgStat_EntryRef *entry_ref; + PgStatShared_ReplSlot *shstatent; + PgStat_StatReplSlotEntry *statent; + + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot), false); + shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats; + statent = &shstatent->stats; + + /* + * NB: need to accept that there might be stats from an older slot, e.g. + * if we previously crashed after dropping a slot. + */ + if (NameStr(statent->slotname)[0] == 0 || + namestrcmp(&statent->slotname, NameStr(slot->data.name)) != 0) + { + memset(statent, 0, sizeof(*statent)); + namestrcpy(&statent->slotname, NameStr(slot->data.name)); + } + + pgstat_unlock_entry(entry_ref); } /* @@ -115,11 +164,65 @@ pgstat_create_replslot(ReplicationSlot *slot) void pgstat_drop_replslot(ReplicationSlot *slot) { - PgStat_MsgReplSlot msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); - namestrcpy(&msg.m_slotname, NameStr(slot->data.name)); - msg.m_create = false; - msg.m_drop = true; - pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); + pgstat_drop_entry(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot)); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the replication slot statistics struct. + */ +PgStat_StatReplSlotEntry * +pgstat_fetch_replslot(NameData slotname) +{ + int idx = get_replslot_index(NameStr(slotname)); + + if (idx == -1) + return NULL; + + return (PgStat_StatReplSlotEntry *) + pgstat_fetch_entry(PGSTAT_KIND_REPLSLOT, InvalidOid, idx); +} + +void +pgstat_replslot_to_serialized_name_cb(const PgStatShared_Common *header, NameData *name) +{ + namestrcpy(name, NameStr(((PgStatShared_ReplSlot *) header)->stats.slotname)); +} + +bool +pgstat_replslot_from_serialized_name_cb(const NameData *name, PgStat_HashKey *key) +{ + int idx = get_replslot_index(NameStr(*name)); + + /* slot might have been deleted */ + if (idx == -1) + return false; + + key->kind = PGSTAT_KIND_REPLSLOT; + key->dboid = InvalidOid; + key->objoid = idx; + + return true; +} + +void +pgstat_replslot_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_ReplSlot *) header)->stats.stat_reset_timestamp = ts; +} + +static int +get_replslot_index(const char *name) +{ + ReplicationSlot *slot; + + AssertArg(name != NULL); + + slot = SearchNamedReplicationSlot(name, true); + + if (!slot) + return -1; + + return ReplicationSlotIndex(slot); } diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c new file mode 100644 index 0000000000..a32740b2f6 --- /dev/null +++ b/src/backend/utils/activity/pgstat_shmem.c @@ -0,0 +1,987 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_shmem.c + * Storage of stats entries in shared memory + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_shmem.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgstat.h" +#include "storage/shmem.h" +#include "utils/memutils.h" +#include "utils/pgstat_internal.h" + + +#define PGSTAT_ENTRY_REF_HASH_SIZE 128 + +/* hash table entry for finding the PgStat_EntryRef for a key */ +typedef struct PgStat_EntryRefHashEntry +{ + PgStat_HashKey key; /* hash key */ + char status; /* for simplehash use */ + PgStat_EntryRef *entry_ref; +} PgStat_EntryRefHashEntry; + + +/* 
for references to shared statistics entries */ +#define SH_PREFIX pgstat_entry_ref_hash +#define SH_ELEMENT_TYPE PgStat_EntryRefHashEntry +#define SH_KEY_TYPE PgStat_HashKey +#define SH_KEY key +#define SH_HASH_KEY(tb, key) \ + pgstat_hash_hash_key(&key, sizeof(PgStat_HashKey), NULL) +#define SH_EQUAL(tb, a, b) \ + pgstat_cmp_hash_key(&a, &b, sizeof(PgStat_HashKey), NULL) == 0 +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + + +static void pgstat_drop_database_and_contents(Oid dboid); + +static void pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat); + +static void pgstat_release_entry_ref(PgStat_HashKey key, PgStat_EntryRef *entry_ref, bool discard_pending); +static bool pgstat_need_entry_refs_gc(void); +static void pgstat_gc_entry_refs(void); +static void pgstat_release_all_entry_refs(bool discard_pending); +typedef bool (*ReleaseMatchCB) (PgStat_EntryRefHashEntry *, Datum data); +static void pgstat_release_matching_entry_refs(bool discard_pending, ReleaseMatchCB match, Datum match_data); + +static void pgstat_setup_memcxt(void); + + +/* parameter for the shared hash */ +static const dshash_parameters dsh_params = { + sizeof(PgStat_HashKey), + sizeof(PgStatShared_HashEntry), + pgstat_cmp_hash_key, + pgstat_hash_hash_key, + LWTRANCHE_PGSTATS_HASH +}; + + +/* + * Backend local references to shared stats entries. If there are pending + * updates to a stats entry, the PgStat_EntryRef is added to the pgStatPending + * list. + * + * When a stats entry is dropped each backend needs to release its reference + * to it before the memory can be released. To trigger that + * pgStatLocal.shmem->gc_request_count is incremented - which each backend + * compares to their copy of pgStatSharedRefAge on a regular basis. + */ +static pgstat_entry_ref_hash_hash *pgStatEntryRefHash = NULL; +static int pgStatSharedRefAge = 0; /* cache age of pgStatShmLookupCache */ + +/* + * Memory contexts containing the pgStatEntryRefHash table and the + * pgStatSharedRef entries respectively. Kept separate to make it easier to + * track / attribute memory usage. + */ +static MemoryContext pgStatSharedRefContext = NULL; +static MemoryContext pgStatEntryRefHashContext = NULL; + + +/* ------------------------------------------------------------ + * Public functions called from postmaster follow + * ------------------------------------------------------------ + */ + +/* + * The size of the shared memory allocation for stats stored in the shared + * stats hash table. This allocation will be done as part of the main shared + * memory, rather than dynamic shared memory, allowing it to be initialized in + * postmaster. + */ +static Size +pgstat_dsa_init_size(void) +{ + Size sz; + + /* + * The dshash header / initial buckets array needs to fit into "plain" + * shared memory, but it's beneficial to not need dsm segments + * immediately. A size of 256kB seems works well and is not + * disproportional compared to other constant sized shared memory + * allocations. NB: To avoid DSMs further, the user can configure + * min_dynamic_shared_memory. 
+ */ + sz = 256 * 1024; + Assert(dsa_minimum_size() <= sz); + return MAXALIGN(sz); +} + +/* + * Compute shared memory space needed for cumulative statistics + */ +Size +StatsShmemSize(void) +{ + Size sz; + + sz = MAXALIGN(sizeof(PgStat_ShmemControl)); + sz = add_size(sz, pgstat_dsa_init_size()); + + return sz; +} + +/* + * Initialize cumulative statistics system during startup + */ +void +StatsShmemInit(void) +{ + bool found; + Size sz; + + sz = StatsShmemSize(); + pgStatLocal.shmem = (PgStat_ShmemControl *) + ShmemInitStruct("Shared Memory Stats", sz, &found); + + if (!IsUnderPostmaster) + { + dsa_area *dsa; + dshash_table *dsh; + PgStat_ShmemControl *ctl = pgStatLocal.shmem; + char *p = (char *) ctl; + + Assert(!found); + + /* the allocation of pgStatLocal.shmem itself */ + p += MAXALIGN(sizeof(PgStat_ShmemControl)); + + /* + * Create a small dsa allocation in plain shared memory. This is + * required because postmaster cannot use dsm segments. It also + * provides a small efficiency win. + */ + ctl->raw_dsa_area = p; + p += MAXALIGN(pgstat_dsa_init_size()); + dsa = dsa_create_in_place(ctl->raw_dsa_area, + pgstat_dsa_init_size(), + LWTRANCHE_PGSTATS_DSA, 0); + dsa_pin(dsa); + + /* + * To ensure dshash is created in "plain" shared memory, temporarily + * limit size of dsa to the initial size of the dsa. + */ + dsa_set_size_limit(dsa, pgstat_dsa_init_size()); + + /* + * With the limit in place, create the dshash table. XXX: It'd be nice + * if there were dshash_create_in_place(). + */ + dsh = dshash_create(dsa, &dsh_params, 0); + ctl->hash_handle = dshash_get_hash_table_handle(dsh); + + /* lift limit set above */ + dsa_set_size_limit(dsa, -1); + + /* + * Postmaster will never access these again, thus free the local + * dsa/dshash references. + */ + dshash_detach(dsh); + dsa_detach(dsa); + + pg_atomic_init_u64(&ctl->gc_request_count, 1); + + + /* initialize fixed-numbered stats */ + LWLockInitialize(&ctl->archiver.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->bgwriter.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->checkpointer.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->slru.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->wal.lock, LWTRANCHE_PGSTATS_DATA); + } + else + { + Assert(found); + } +} + +void +pgstat_attach_shmem(void) +{ + MemoryContext oldcontext; + + Assert(pgStatLocal.dsa == NULL); + + /* stats shared memory persists for the backend lifetime */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + pgStatLocal.dsa = dsa_attach_in_place(pgStatLocal.shmem->raw_dsa_area, + NULL); + dsa_pin_mapping(pgStatLocal.dsa); + + pgStatLocal.shared_hash = dshash_attach(pgStatLocal.dsa, &dsh_params, + pgStatLocal.shmem->hash_handle, 0); + + MemoryContextSwitchTo(oldcontext); +} + +void +pgstat_detach_shmem(void) +{ + Assert(pgStatLocal.dsa); + + /* we shouldn't leave references to shared stats */ + pgstat_release_all_entry_refs(false); + + dshash_detach(pgStatLocal.shared_hash); + pgStatLocal.shared_hash = NULL; + + dsa_detach(pgStatLocal.dsa); + pgStatLocal.dsa = NULL; +} + + +/* ------------------------------------------------------------ + * Maintenance of shared memory stats entries + * ------------------------------------------------------------ + */ + +PgStatShared_Common * +pgstat_init_entry(PgStat_Kind kind, + PgStatShared_HashEntry *shhashent) +{ + /* Create new stats entry. */ + dsa_pointer chunk; + PgStatShared_Common *shheader; + + /* + * Initialize refcount to 1, marking it as valid / not dropped. 
The entry + * can't be freed before the initialization because it can't be found as + * long as we hold the dshash partition lock. Caller needs to increase + * further if a longer lived reference is needed. + */ + pg_atomic_init_u32(&shhashent->refcount, 1); + shhashent->dropped = false; + + chunk = dsa_allocate0(pgStatLocal.dsa, pgstat_get_kind_info(kind)->shared_size); + shheader = dsa_get_address(pgStatLocal.dsa, chunk); + shheader->magic = 0xdeadbeef; + + /* Link the new entry from the hash entry. */ + shhashent->body = chunk; + + LWLockInitialize(&shheader->lock, LWTRANCHE_PGSTATS_DATA); + + return shheader; +} + +static PgStatShared_Common * +pgstat_reinit_entry(PgStat_Kind kind, PgStatShared_HashEntry *shhashent) +{ + PgStatShared_Common *shheader; + + shheader = dsa_get_address(pgStatLocal.dsa, shhashent->body); + + /* mark as not dropped anymore */ + pg_atomic_fetch_add_u32(&shhashent->refcount, 1); + shhashent->dropped = false; + + /* reinitialize content */ + Assert(shheader->magic == 0xdeadbeef); + memset(shheader, 0, pgstat_get_kind_info(shhashent->key.kind)->shared_size); + shheader->magic = 0xdeadbeef; + + return shheader; +} + +static void +pgstat_setup_shared_refs(void) +{ + if (likely(pgStatEntryRefHash != NULL)) + return; + + pgStatEntryRefHash = + pgstat_entry_ref_hash_create(pgStatEntryRefHashContext, + PGSTAT_ENTRY_REF_HASH_SIZE, NULL); + pgStatSharedRefAge = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + Assert(pgStatSharedRefAge != 0); +} + +/* + * Helper function for pgstat_get_entry_ref(). + */ +static void +pgstat_acquire_entry_ref(PgStat_EntryRef *entry_ref, + PgStatShared_HashEntry *shhashent, + PgStatShared_Common *shheader) +{ + Assert(shheader->magic == 0xdeadbeef); + Assert(pg_atomic_read_u32(&shhashent->refcount) > 0); + + pg_atomic_fetch_add_u32(&shhashent->refcount, 1); + + dshash_release_lock(pgStatLocal.shared_hash, shhashent); + + entry_ref->shared_stats = shheader; + entry_ref->shared_entry = shhashent; +} + +/* + * Helper function for pgstat_get_entry_ref(). + */ +static bool +pgstat_get_entry_ref_cached(PgStat_HashKey key, PgStat_EntryRef **entry_ref_p) +{ + bool found; + PgStat_EntryRefHashEntry *cache_entry; + + /* + * We immediately insert a cache entry, because it avoids 1) multiple + * hashtable lookups in case of a cache miss 2) having to deal with + * out-of-memory errors after incrementing PgStatShared_Common->refcount. + */ + + cache_entry = pgstat_entry_ref_hash_insert(pgStatEntryRefHash, key, &found); + + if (!found || !cache_entry->entry_ref) + { + PgStat_EntryRef *entry_ref; + + cache_entry->entry_ref = entry_ref = + MemoryContextAlloc(pgStatSharedRefContext, + sizeof(PgStat_EntryRef)); + entry_ref->shared_stats = NULL; + entry_ref->shared_entry = NULL; + entry_ref->pending = NULL; + + found = false; + } + else if (cache_entry->entry_ref->shared_stats == NULL) + { + Assert(cache_entry->entry_ref->pending == NULL); + found = false; + } + else + { + PgStat_EntryRef *entry_ref PG_USED_FOR_ASSERTS_ONLY; + + entry_ref = cache_entry->entry_ref; + Assert(entry_ref->shared_entry != NULL); + Assert(entry_ref->shared_stats != NULL); + + Assert(entry_ref->shared_stats->magic == 0xdeadbeef); + /* should have at least our reference */ + Assert(pg_atomic_read_u32(&entry_ref->shared_entry->refcount) > 0); + } + + *entry_ref_p = cache_entry->entry_ref; + return found; +} + +/* + * Get a shared stats reference. If create is true, the shared stats object is + * created if it does not exist. 
+ * + * When create is true, and created_entry is non-NULL, it'll be set to true + * if the entry is newly created, false otherwise. + */ +PgStat_EntryRef * +pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, Oid objoid, bool create, + bool *created_entry) +{ + PgStat_HashKey key = {.kind = kind,.dboid = dboid,.objoid = objoid}; + PgStatShared_HashEntry *shhashent; + PgStatShared_Common *shheader = NULL; + PgStat_EntryRef *entry_ref; + + /* + * passing in created_entry only makes sense if we possibly could create + * entry. + */ + AssertArg(create || created_entry == NULL); + pgstat_assert_is_up(); + Assert(pgStatLocal.shared_hash != NULL); + Assert(!pgStatLocal.shmem->is_shutdown); + + pgstat_setup_memcxt(); + pgstat_setup_shared_refs(); + + if (created_entry != NULL) + *created_entry = false; + + /* + * Check if other backends dropped stats that could not be deleted because + * somebody held references to it. If so, check this backend's references. + * This is not expected to happen often. The location of the check is a + * bit random, but this is a relatively frequently called path, so better + * than most. + */ + if (pgstat_need_entry_refs_gc()) + pgstat_gc_entry_refs(); + + /* + * First check the lookup cache hashtable in local memory. If we find a + * match here we can avoid taking locks / causing contention. + */ + if (pgstat_get_entry_ref_cached(key, &entry_ref)) + return entry_ref; + + Assert(entry_ref != NULL); + + /* + * Do a lookup in the hash table first - it's quite likely that the entry + * already exists, and that way we only need a shared lock. + */ + shhashent = dshash_find(pgStatLocal.shared_hash, &key, false); + + if (create && !shhashent) + { + bool shfound; + + /* + * It's possible that somebody created the entry since the above + * lookup. If so, fall through to the same path as if we'd have if it + * already had been created before the dshash_find() calls. + */ + shhashent = dshash_find_or_insert(pgStatLocal.shared_hash, &key, &shfound); + if (!shfound) + { + shheader = pgstat_init_entry(kind, shhashent); + pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); + + if (created_entry != NULL) + *created_entry = true; + + return entry_ref; + } + } + + if (!shhashent) + { + /* + * If we're not creating, delete the reference again. In all + * likelihood it's just a stats lookup - no point wasting memory for a + * shared ref to nothing... + */ + pgstat_release_entry_ref(key, entry_ref, false); + + return NULL; + } + else + { + /* + * Can get here either because dshash_find() found a match, or if + * dshash_find_or_insert() found a concurrently inserted entry. + */ + + if (shhashent->dropped && create) + { + /* + * There are legitimate cases where the old stats entry might not + * yet have been dropped by the time it's reused. The most obvious + * case are replication slot stats, where a new slot can be + * created with the same index just after dropping. But oid + * wraparound can lead to other cases as well. We just reset the + * stats to their plain state. 
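For the lookup side of this API, a caller-side sketch with an invented helper: with create = false a NULL result simply means that no statistics entry exists (or that it is already marked dropped), which read-only callers treat as "no data" rather than an error.

#include "postgres.h"
#include "miscadmin.h"
#include "utils/pgstat_internal.h"

/* hypothetical helper, for illustration only */
static bool
relation_has_stats(Oid relid)
{
	PgStat_EntryRef *entry_ref;

	entry_ref = pgstat_get_entry_ref(PGSTAT_KIND_RELATION, MyDatabaseId,
									 relid, false, NULL);

	return entry_ref != NULL;
}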
+ */ + shheader = pgstat_reinit_entry(kind, shhashent); + pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); + + if (created_entry != NULL) + *created_entry = true; + + return entry_ref; + } + else if (shhashent->dropped) + { + dshash_release_lock(pgStatLocal.shared_hash, shhashent); + pgstat_release_entry_ref(key, entry_ref, false); + + return NULL; + } + else + { + shheader = dsa_get_address(pgStatLocal.dsa, shhashent->body); + pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); + + return entry_ref; + } + } +} + +static void +pgstat_release_entry_ref(PgStat_HashKey key, PgStat_EntryRef *entry_ref, + bool discard_pending) +{ + if (entry_ref && entry_ref->pending) + { + if (discard_pending) + pgstat_delete_pending_entry(entry_ref); + else + elog(ERROR, "releasing ref with pending data"); + } + + if (entry_ref && entry_ref->shared_stats) + { + Assert(entry_ref->shared_stats->magic == 0xdeadbeef); + Assert(entry_ref->pending == NULL); + + /* + * This can't race with another backend looking up the stats entry and + * increasing the refcount because it is not "legal" to create + * additional references to dropped entries. + */ + if (pg_atomic_fetch_sub_u32(&entry_ref->shared_entry->refcount, 1) == 1) + { + PgStatShared_HashEntry *shent; + + /* + * We're the last referrer to this entry, try to drop the shared + * entry. + */ + + /* only dropped entries can reach a 0 refcount */ + Assert(entry_ref->shared_entry->dropped); + + shent = dshash_find(pgStatLocal.shared_hash, + &entry_ref->shared_entry->key, + true); + if (!shent) + elog(ERROR, "could not find just referenced shared stats entry"); + + Assert(pg_atomic_read_u32(&entry_ref->shared_entry->refcount) == 0); + Assert(entry_ref->shared_entry == shent); + + pgstat_free_entry(shent, NULL); + } + } + + if (!pgstat_entry_ref_hash_delete(pgStatEntryRefHash, key)) + elog(ERROR, "entry ref vanished before deletion"); + + if (entry_ref) + pfree(entry_ref); +} + +bool +pgstat_lock_entry(PgStat_EntryRef *entry_ref, bool nowait) +{ + LWLock *lock = &entry_ref->shared_stats->lock; + + if (nowait) + return LWLockConditionalAcquire(lock, LW_EXCLUSIVE); + + LWLockAcquire(lock, LW_EXCLUSIVE); + return true; +} + +void +pgstat_unlock_entry(PgStat_EntryRef *entry_ref) +{ + LWLockRelease(&entry_ref->shared_stats->lock); +} + +/* + * Helper function to fetch and lock shared stats. + */ +PgStat_EntryRef * +pgstat_get_entry_ref_locked(PgStat_Kind kind, Oid dboid, Oid objoid, + bool nowait) +{ + PgStat_EntryRef *entry_ref; + + /* find shared table stats entry corresponding to the local entry */ + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, true, NULL); + + /* lock the shared entry to protect the content, skip if failed */ + if (!pgstat_lock_entry(entry_ref, nowait)) + return NULL; + + return entry_ref; +} + +void +pgstat_request_entry_refs_gc(void) +{ + pg_atomic_fetch_add_u64(&pgStatLocal.shmem->gc_request_count, 1); +} + +static bool +pgstat_need_entry_refs_gc(void) +{ + uint64 curage; + + if (!pgStatEntryRefHash) + return false; + + /* should have been initialized when creating pgStatEntryRefHash */ + Assert(pgStatSharedRefAge != 0); + + curage = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + + return pgStatSharedRefAge != curage; +} + +static void +pgstat_gc_entry_refs(void) +{ + pgstat_entry_ref_hash_iterator i; + PgStat_EntryRefHashEntry *ent; + uint64 curage; + + curage = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + Assert(curage != 0); + + /* + * Some entries have been dropped. 
Invalidate cache pointer to them. + */ + pgstat_entry_ref_hash_start_iterate(pgStatEntryRefHash, &i); + while ((ent = pgstat_entry_ref_hash_iterate(pgStatEntryRefHash, &i)) != NULL) + { + PgStat_EntryRef *entry_ref = ent->entry_ref; + + Assert(!entry_ref->shared_stats || + entry_ref->shared_stats->magic == 0xdeadbeef); + + if (!entry_ref->shared_entry->dropped) + continue; + + /* cannot gc shared ref that has pending data */ + if (entry_ref->pending != NULL) + continue; + + pgstat_release_entry_ref(ent->key, entry_ref, false); + } + + pgStatSharedRefAge = curage; +} + +static void +pgstat_release_matching_entry_refs(bool discard_pending, ReleaseMatchCB match, + Datum match_data) +{ + pgstat_entry_ref_hash_iterator i; + PgStat_EntryRefHashEntry *ent; + + if (pgStatEntryRefHash == NULL) + return; + + pgstat_entry_ref_hash_start_iterate(pgStatEntryRefHash, &i); + + while ((ent = pgstat_entry_ref_hash_iterate(pgStatEntryRefHash, &i)) + != NULL) + { + Assert(ent->entry_ref != NULL); + + if (match && !match(ent, match_data)) + continue; + + pgstat_release_entry_ref(ent->key, ent->entry_ref, discard_pending); + } +} + +/* + * Release all local references to shared stats entries. + * + * When a process exits it cannot do so while still holding references onto + * stats entries, otherwise the shared stats entries could never be freed. + */ +static void +pgstat_release_all_entry_refs(bool discard_pending) +{ + if (pgStatEntryRefHash == NULL) + return; + + pgstat_release_matching_entry_refs(discard_pending, NULL, 0); + Assert(pgStatEntryRefHash->members == 0); + pgstat_entry_ref_hash_destroy(pgStatEntryRefHash); + pgStatEntryRefHash = NULL; +} + +static bool +match_db(PgStat_EntryRefHashEntry *ent, Datum match_data) +{ + Oid dboid = DatumGetObjectId(match_data); + + return ent->key.dboid == dboid; +} + +static void +pgstat_release_db_entry_refs(Oid dboid) +{ + pgstat_release_matching_entry_refs( /* discard pending = */ true, + match_db, + ObjectIdGetDatum(dboid)); +} + + +/* ------------------------------------------------------------ + * Dropping and resetting of stats entries + * ------------------------------------------------------------ + */ + +static void +pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat) +{ + dsa_pointer pdsa; + + /* + * Fetch dsa pointer before deleting entry - that way we can free the + * memory after releasing the lock. + */ + pdsa = shent->body; + + if (!hstat) + dshash_delete_entry(pgStatLocal.shared_hash, shent); + else + dshash_delete_current(hstat); + + dsa_free(pgStatLocal.dsa, pdsa); +} + +/* + * Helper for both pgstat_drop_database_and_contents() and + * pgstat_drop_entry(). If hstat is non-null delete the shared entry using + * dshash_delete_current(), otherwise use dshash_delete_entry(). In either + * case the entry needs to be already locked. + */ +static bool +pgstat_drop_entry_internal(PgStatShared_HashEntry *shent, + dshash_seq_status *hstat) +{ + Assert(shent->body != InvalidDsaPointer); + + /* should already have released local reference */ + if (pgStatEntryRefHash) + Assert(!pgstat_entry_ref_hash_lookup(pgStatEntryRefHash, shent->key)); + + /* + * Signal that the entry is dropped - this will eventually cause other + * backends to release their references. 
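This refcount-plus-dropped-flag protocol can be modelled in isolation. The toy program below is plain C11 and is not PostgreSQL code; it only illustrates the rule that the creator's initial reference stands for "not dropped", that dropping gives that reference up, and that whichever process releases the last reference frees the entry (new references are only handed out while the entry is not dropped).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

typedef struct Entry
{
	atomic_uint refcount;
	bool		dropped;
} Entry;

static Entry *
entry_create(void)
{
	Entry	   *e = malloc(sizeof(Entry));

	atomic_init(&e->refcount, 1);	/* the "not dropped" reference */
	e->dropped = false;
	return e;
}

static void
entry_release(Entry *e)
{
	/* last reference gone: only reachable after the entry was dropped */
	if (atomic_fetch_sub(&e->refcount, 1) == 1)
		free(e);
}

static void
entry_drop(Entry *e)
{
	e->dropped = true;
	entry_release(e);			/* give up the "not dropped" reference */
}

int
main(void)
{
	Entry	   *e = entry_create();

	atomic_fetch_add(&e->refcount, 1);	/* a backend takes a reference */
	entry_drop(e);				/* object dropped; entry survives for now */
	entry_release(e);			/* backend lets go; entry is freed */
	return 0;
}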
+ */ + if (shent->dropped) + elog(ERROR, "can only drop stats once"); + shent->dropped = true; + + /* release refcount marking entry as not dropped */ + if (pg_atomic_sub_fetch_u32(&shent->refcount, 1) == 0) + { + pgstat_free_entry(shent, hstat); + return true; + } + else + { + if (!hstat) + dshash_release_lock(pgStatLocal.shared_hash, shent); + return false; + } +} + +/* + * Drop stats for the database and all the objects inside that database. + */ +static void +pgstat_drop_database_and_contents(Oid dboid) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + uint64 not_freed_count = 0; + + Assert(OidIsValid(dboid)); + + Assert(pgStatLocal.shared_hash != NULL); + + /* + * This backend might very well be the only backend holding a reference to + * about-to-be-dropped entries. Ensure that we're not preventing it from + * being cleaned up till later. + * + * Doing this separately from the dshash iteration below avoids having to + * do so while holding a partition lock on the shared hashtable. + */ + pgstat_release_db_entry_refs(dboid); + + /* some of the dshash entries are to be removed, take exclusive lock. */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, true); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + if (p->dropped) + continue; + + if (p->key.dboid != dboid) + continue; + + if (!pgstat_drop_entry_internal(p, &hstat)) + { + /* + * Even statistics for a dropped database might currently be + * accessed (consider e.g. database stats for pg_stat_database). + */ + not_freed_count++; + } + } + dshash_seq_term(&hstat); + + /* + * If some of the stats data could not be freed, signal the reference + * holders to run garbage collection of their cached pgStatShmLookupCache. + */ + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); +} + +bool +pgstat_drop_entry(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + PgStat_HashKey key = {.kind = kind,.dboid = dboid,.objoid = objoid}; + PgStatShared_HashEntry *shent; + bool freed = true; + + /* delete local reference */ + if (pgStatEntryRefHash) + { + PgStat_EntryRefHashEntry *lohashent = + pgstat_entry_ref_hash_lookup(pgStatEntryRefHash, key); + + if (lohashent) + pgstat_release_entry_ref(lohashent->key, lohashent->entry_ref, + true); + } + + /* mark entry in shared hashtable as deleted, drop if possible */ + shent = dshash_find(pgStatLocal.shared_hash, &key, true); + if (shent) + { + freed = pgstat_drop_entry_internal(shent, NULL); + + /* + * Database stats contain other stats. Drop those as well when + * dropping the database. XXX: Perhaps this should be done in a + * slightly more principled way? But not obvious what that'd look + * like, and so far this is the only case... 
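Caller-side sketch of that cascade, under the assumption that database stats use InvalidOid as the object id; the wrapper name is invented. A false return means some backend still held a reference, so garbage collection of cached references is requested, matching the pattern used by the transactional drop code elsewhere in this patch.

#include "postgres.h"
#include "utils/pgstat_internal.h"

static void
drop_database_stats_sketch(Oid dboid)
{
	/*
	 * If some backend still holds a reference the entry is only marked
	 * dropped; ask the other backends to release their cached references.
	 */
	if (!pgstat_drop_entry(PGSTAT_KIND_DATABASE, dboid, InvalidOid))
		pgstat_request_entry_refs_gc();
}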
+ */ + if (key.kind == PGSTAT_KIND_DATABASE) + pgstat_drop_database_and_contents(key.dboid); + } + + return freed; +} + +void +pgstat_drop_all_entries(void) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *ps; + uint64 not_freed_count = 0; + + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((ps = dshash_seq_next(&hstat)) != NULL) + { + if (ps->dropped) + continue; + + if (!pgstat_drop_entry_internal(ps, &hstat)) + not_freed_count++; + } + dshash_seq_term(&hstat); + + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); +} + +static void +shared_stat_reset_contents(PgStat_Kind kind, PgStatShared_Common *header, + TimestampTz ts) +{ + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + memset(pgstat_get_entry_data(kind, header), 0, + pgstat_get_entry_len(kind)); + + if (kind_info->reset_timestamp_cb) + kind_info->reset_timestamp_cb(header, ts); +} + +/* + * Reset one variable-numbered stats entry. + */ +void +pgstat_reset_entry(PgStat_Kind kind, Oid dboid, Oid objoid, TimestampTz ts) +{ + PgStat_EntryRef *entry_ref; + + Assert(!pgstat_get_kind_info(kind)->fixed_amount); + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL); + if (!entry_ref || entry_ref->shared_entry->dropped) + return; + + pgstat_lock_entry(entry_ref, false); + shared_stat_reset_contents(kind, entry_ref->shared_stats, ts); + pgstat_unlock_entry(entry_ref); +} + +/* + * Scan through the shared hashtable of stats, resetting statistics if + * approved by the provided do_reset() function. + */ +void +pgstat_reset_matching_entries(bool (*do_reset) (PgStatShared_HashEntry *, Datum), + Datum match_data, TimestampTz ts) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + + /* dshash entry is not modified, take shared lock */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + PgStatShared_Common *header; + + if (p->dropped) + continue; + + if (!do_reset(p, match_data)) + continue; + + header = dsa_get_address(pgStatLocal.dsa, p->body); + + LWLockAcquire(&header->lock, LW_EXCLUSIVE); + + shared_stat_reset_contents(p->key.kind, header, ts); + + LWLockRelease(&header->lock); + } + dshash_seq_term(&hstat); +} + +static bool +match_kind(PgStatShared_HashEntry *p, Datum match_data) +{ + return p->key.kind == DatumGetInt32(match_data); +} + +void +pgstat_reset_entries_of_kind(PgStat_Kind kind, TimestampTz ts) +{ + pgstat_reset_matching_entries(match_kind, Int32GetDatum(kind), ts); +} + +static void +pgstat_setup_memcxt(void) +{ + if (unlikely(!pgStatSharedRefContext)) + pgStatSharedRefContext = + AllocSetContextCreate(CacheMemoryContext, + "PgStat Shared Ref", + ALLOCSET_SMALL_SIZES); + if (unlikely(!pgStatEntryRefHashContext)) + pgStatEntryRefHashContext = + AllocSetContextCreate(CacheMemoryContext, + "PgStat Shared Ref Hash", + ALLOCSET_SMALL_SIZES); +} diff --git a/src/backend/utils/activity/pgstat_slru.c b/src/backend/utils/activity/pgstat_slru.c index d932bc74e0..d0b85b62a5 100644 --- a/src/backend/utils/activity/pgstat_slru.c +++ b/src/backend/utils/activity/pgstat_slru.c @@ -18,18 +18,21 @@ #include "postgres.h" #include "utils/pgstat_internal.h" +#include "utils/timestamp.h" -static inline PgStat_MsgSLRU *get_slru_entry(int slru_idx); +static inline PgStat_SLRUStats *get_slru_entry(int slru_idx); +static void pgstat_reset_slru_counter_internal(int index, TimestampTz ts); /* - * SLRU statistics counts waiting to be sent to the collector. 
These are - * stored directly in stats message format so they can be sent without needing - * to copy things around. We assume this variable inits to zeroes. Entries - * are one-to-one with slru_names[]. + * SLRU statistics counts waiting to be flushed out. We assume this variable + * inits to zeroes. Entries are one-to-one with slru_names[]. Changes of + * SLRU counters are reported within critical sections so we use static memory + * in order to avoid memory allocation. */ -static PgStat_MsgSLRU SLRUStats[SLRU_NUM_ELEMENTS]; +static PgStat_SLRUStats pending_SLRUStats[SLRU_NUM_ELEMENTS]; +bool have_slrustats = false; /* @@ -41,17 +44,11 @@ static PgStat_MsgSLRU SLRUStats[SLRU_NUM_ELEMENTS]; void pgstat_reset_slru(const char *name) { - PgStat_MsgResetslrucounter msg; + TimestampTz ts = GetCurrentTimestamp(); AssertArg(name != NULL); - if (pgStatSock == PGINVALID_SOCKET) - return; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSLRUCOUNTER); - msg.m_index = pgstat_get_slru_index(name); - - pgstat_send(&msg, sizeof(msg)); + pgstat_reset_slru_counter_internal(pgstat_get_slru_index(name), ts); } /* @@ -61,43 +58,55 @@ pgstat_reset_slru(const char *name) void pgstat_count_slru_page_zeroed(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_zeroed += 1; + get_slru_entry(slru_idx)->blocks_zeroed += 1; } void pgstat_count_slru_page_hit(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_hit += 1; + get_slru_entry(slru_idx)->blocks_hit += 1; } void pgstat_count_slru_page_exists(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_exists += 1; + get_slru_entry(slru_idx)->blocks_exists += 1; } void pgstat_count_slru_page_read(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_read += 1; + get_slru_entry(slru_idx)->blocks_read += 1; } void pgstat_count_slru_page_written(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_written += 1; + get_slru_entry(slru_idx)->blocks_written += 1; } void pgstat_count_slru_flush(int slru_idx) { - get_slru_entry(slru_idx)->m_flush += 1; + get_slru_entry(slru_idx)->flush += 1; } void pgstat_count_slru_truncate(int slru_idx) { - get_slru_entry(slru_idx)->m_truncate += 1; + get_slru_entry(slru_idx)->truncate += 1; +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the slru statistics struct. + */ +PgStat_SLRUStats * +pgstat_fetch_slru(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_SLRU); + + return pgStatLocal.snapshot.slru; } /* @@ -135,45 +144,81 @@ pgstat_get_slru_index(const char *name) } /* - * Send SLRU statistics to the collector + * Flush out locally pending SLRU stats entries + * + * If nowait is true, this function returns false on lock failure. Otherwise + * this function always returns true. Writer processes are mutually excluded + * using LWLock, but readers are expected to use change-count protocol to avoid + * interference with writers. + * + * If nowait is true, this function returns true if the lock could not be + * acquired. Otherwise return false. 
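A hypothetical call site shows why the pending counters live in a static array rather than allocated memory: SLRU counter updates can happen inside critical sections, and the pgstat_count_slru_* helpers only increment pending_SLRUStats, so they are safe to call there. The wrapper function below is invented for illustration.

#include "postgres.h"
#include "pgstat.h"

static void
count_slru_lookup(int slru_idx, bool found_in_buffer)
{
	if (found_in_buffer)
		pgstat_count_slru_page_hit(slru_idx);
	else
		pgstat_count_slru_page_read(slru_idx);
}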
*/ -void -pgstat_send_slru(void) +bool +pgstat_slru_flush(bool nowait) { - /* We assume this initializes to zeroes */ - static const PgStat_MsgSLRU all_zeroes; + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + int i; - for (int i = 0; i < SLRU_NUM_ELEMENTS; i++) + if (!have_slrustats) + return false; + + if (!nowait) + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + else if (!LWLockConditionalAcquire(&stats_shmem->lock, LW_EXCLUSIVE)) + return true; + + for (i = 0; i < SLRU_NUM_ELEMENTS; i++) { - /* - * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. - */ - if (memcmp(&SLRUStats[i], &all_zeroes, sizeof(PgStat_MsgSLRU)) == 0) - continue; + PgStat_SLRUStats *sharedent = &stats_shmem->stats[i]; + PgStat_SLRUStats *pendingent = &pending_SLRUStats[i]; - /* set the SLRU type before each send */ - SLRUStats[i].m_index = i; - - /* - * Prepare and send the message - */ - pgstat_setheader(&SLRUStats[i].m_hdr, PGSTAT_MTYPE_SLRU); - pgstat_send(&SLRUStats[i], sizeof(PgStat_MsgSLRU)); - - /* - * Clear out the statistics buffer, so it can be re-used. - */ - MemSet(&SLRUStats[i], 0, sizeof(PgStat_MsgSLRU)); +#define SLRU_ACC(fld) sharedent->fld += pendingent->fld + SLRU_ACC(blocks_zeroed); + SLRU_ACC(blocks_hit); + SLRU_ACC(blocks_read); + SLRU_ACC(blocks_written); + SLRU_ACC(blocks_exists); + SLRU_ACC(flush); + SLRU_ACC(truncate); +#undef SLRU_ACC } + + /* done, clear the pending entry */ + MemSet(pending_SLRUStats, 0, sizeof(pending_SLRUStats)); + + LWLockRelease(&stats_shmem->lock); + + have_slrustats = false; + + return false; +} + +void +pgstat_slru_reset_all_cb(TimestampTz ts) +{ + for (int i = 0; i < SLRU_NUM_ELEMENTS; i++) + pgstat_reset_slru_counter_internal(i, ts); +} + +void +pgstat_slru_snapshot_cb(void) +{ + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + + memcpy(pgStatLocal.snapshot.slru, &stats_shmem->stats, + sizeof(stats_shmem->stats)); + + LWLockRelease(&stats_shmem->lock); } /* * Returns pointer to entry with counters for given SLRU (based on the name * stored in SlruCtl as lwlock tranche name). 
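Reader-side sketch for pgstat_fetch_slru() defined earlier in this file: it returns the backend's snapshot array, assumed here to have one element per SLRU in the same order as slru_names[]; the summing helper is invented.

#include "postgres.h"
#include "pgstat.h"
#include "utils/pgstat_internal.h"

static PgStat_Counter
total_slru_blocks_hit(void)
{
	PgStat_SLRUStats *slru = pgstat_fetch_slru();
	PgStat_Counter total = 0;

	for (int i = 0; i < SLRU_NUM_ELEMENTS; i++)
		total += slru[i].blocks_hit;

	return total;
}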
*/ -static inline PgStat_MsgSLRU * +static inline PgStat_SLRUStats * get_slru_entry(int slru_idx) { pgstat_assert_is_up(); @@ -186,5 +231,20 @@ get_slru_entry(int slru_idx) Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS)); - return &SLRUStats[slru_idx]; + have_slrustats = true; + + return &pending_SLRUStats[slru_idx]; +} + +static void +pgstat_reset_slru_counter_internal(int index, TimestampTz ts) +{ + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + + memset(&stats_shmem->stats[index], 0, sizeof(PgStat_SLRUStats)); + stats_shmem->stats[index].stat_reset_timestamp = ts; + + LWLockRelease(&stats_shmem->lock); } diff --git a/src/backend/utils/activity/pgstat_subscription.c b/src/backend/utils/activity/pgstat_subscription.c index 689029b30a..e1072bd5ba 100644 --- a/src/backend/utils/activity/pgstat_subscription.c +++ b/src/backend/utils/activity/pgstat_subscription.c @@ -26,12 +26,17 @@ void pgstat_report_subscription_error(Oid subid, bool is_apply_error) { - PgStat_MsgSubscriptionError msg; + PgStat_EntryRef *entry_ref; + PgStat_BackendSubEntry *pending; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_SUBSCRIPTIONERROR); - msg.m_subid = subid; - msg.m_is_apply_error = is_apply_error; - pgstat_send(&msg, sizeof(PgStat_MsgSubscriptionError)); + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_SUBSCRIPTION, + InvalidOid, subid, NULL); + pending = entry_ref->pending; + + if (is_apply_error) + pending->apply_error_count++; + else + pending->sync_error_count++; } /* @@ -54,12 +59,52 @@ pgstat_create_subscription(Oid subid) void pgstat_drop_subscription(Oid subid) { - PgStat_MsgSubscriptionDrop msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_SUBSCRIPTIONDROP); - msg.m_subid = subid; - pgstat_send(&msg, sizeof(PgStat_MsgSubscriptionDrop)); - pgstat_drop_transactional(PGSTAT_KIND_SUBSCRIPTION, InvalidOid, subid); } + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one subscription or NULL. + */ +PgStat_StatSubEntry * +pgstat_fetch_stat_subscription(Oid subid) +{ + return (PgStat_StatSubEntry *) + pgstat_fetch_entry(PGSTAT_KIND_SUBSCRIPTION, InvalidOid, subid); +} + +/* + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if lock could not + * immediately acquired, otherwise true is returned. + */ +bool +pgstat_subscription_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) +{ + PgStat_BackendSubEntry *localent; + PgStatShared_Subscription *shsubent; + + localent = (PgStat_BackendSubEntry *) entry_ref->pending; + shsubent = (PgStatShared_Subscription *) entry_ref->shared_stats; + + /* localent always has non-zero content */ + + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; + +#define SUB_ACC(fld) shsubent->stats.fld += localent->fld + SUB_ACC(apply_error_count); + SUB_ACC(sync_error_count); +#undef SUB_ACC + + pgstat_unlock_entry(entry_ref); + return true; +} + +void +pgstat_subscription_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_Subscription *) header)->stats.stat_reset_timestamp = ts; +} diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c index 8855598f52..5a878bd115 100644 --- a/src/backend/utils/activity/pgstat_wal.c +++ b/src/backend/utils/activity/pgstat_wal.c @@ -21,13 +21,7 @@ #include "executor/instrument.h" -/* - * WAL global statistics counters. 
Stored directly in a stats message - * structure so they can be sent without needing to copy things around. We - * assume these init to zeroes. - */ -PgStat_MsgWal WalStats; - +PgStat_WalStats PendingWalStats = {0}; /* * WAL usage counters saved from pgWALUsage at the previous call to @@ -39,101 +33,100 @@ static WalUsage prevWalUsage; /* - * Send WAL statistics to the collector. + * Calculate how much WAL usage counters have increased and update + * shared statistics. * - * If 'force' is not set, WAL stats message is only sent if enough time has - * passed since last one was sent to reach PGSTAT_STAT_INTERVAL. + * Must be called by processes that generate WAL, that do not call + * pgstat_report_stat(), like walwriter. */ void pgstat_report_wal(bool force) { - static TimestampTz sendTime = 0; + pgstat_flush_wal(force); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the WAL statistics struct. + */ +PgStat_WalStats * +pgstat_fetch_stat_wal(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_WAL); + + return &pgStatLocal.snapshot.wal; +} + +/* + * Calculate how much WAL usage counters have increased by subtracting the + * previous counters from the current ones. + * + * If nowait is true, this function returns true if the lock could not be + * acquired. Otherwise return false. + */ +bool +pgstat_flush_wal(bool nowait) +{ + PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal; + WalUsage diff = {0}; + + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + Assert(pgStatLocal.shmem != NULL && + !pgStatLocal.shmem->is_shutdown); /* - * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. - * - * Check wal_records counter to determine whether any WAL activity has - * happened since last time. Note that other WalUsage counters don't need - * to be checked because they are incremented always together with - * wal_records counter. - * - * m_wal_buffers_full also doesn't need to be checked because it's - * incremented only when at least one WAL record is generated (i.e., - * wal_records counter is incremented). But for safely, we assert that - * m_wal_buffers_full is always zero when no WAL record is generated - * - * This function can be called by a process like walwriter that normally - * generates no WAL records. To determine whether any WAL activity has - * happened at that process since the last time, the numbers of WAL writes - * and syncs are also checked. + * This function can be called even if nothing at all has happened. Avoid + * taking lock for nothing in that case. */ - if (pgWalUsage.wal_records == prevWalUsage.wal_records && - WalStats.m_wal_write == 0 && WalStats.m_wal_sync == 0) - { - Assert(WalStats.m_wal_buffers_full == 0); - return; - } - - if (!force) - { - TimestampTz now = GetCurrentTimestamp(); - - /* - * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL - * msec since we last sent one to avoid overloading the stats - * collector. - */ - if (!TimestampDifferenceExceeds(sendTime, now, PGSTAT_STAT_INTERVAL)) - return; - sendTime = now; - } + if (!pgstat_have_pending_wal()) + return false; /* - * Set the counters related to generated WAL data if the counters were - * updated. + * We don't update the WAL usage portion of the local WalStats elsewhere. + * Calculate how much WAL usage counters were increased by subtracting the + * previous counters from the current ones. 
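A worked example of this bookkeeping, with invented numbers: only the increment since the previous flush is pushed into shared memory, so a backend whose totals grew from 100 to 160 WAL records reports a delta of 60.

#include "postgres.h"
#include "executor/instrument.h"

static void
wal_usage_diff_example(void)
{
	WalUsage	prev = {.wal_records = 100,.wal_fpi = 3,.wal_bytes = 8192};
	WalUsage	cur = {.wal_records = 160,.wal_fpi = 5,.wal_bytes = 20480};
	WalUsage	diff = {0};

	/* diff = cur - prev, field by field */
	WalUsageAccumDiff(&diff, &cur, &prev);

	/* now diff.wal_records == 60, diff.wal_fpi == 2, diff.wal_bytes == 12288 */
}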
*/ - if (pgWalUsage.wal_records != prevWalUsage.wal_records) - { - WalUsage walusage; + WalUsageAccumDiff(&diff, &pgWalUsage, &prevWalUsage); + PendingWalStats.wal_records = diff.wal_records; + PendingWalStats.wal_fpi = diff.wal_fpi; + PendingWalStats.wal_bytes = diff.wal_bytes; - /* - * Calculate how much WAL usage counters were increased by subtracting - * the previous counters from the current ones. Fill the results in - * WAL stats message. - */ - MemSet(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &prevWalUsage); + if (!nowait) + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + else if (!LWLockConditionalAcquire(&stats_shmem->lock, LW_EXCLUSIVE)) + return true; - WalStats.m_wal_records = walusage.wal_records; - WalStats.m_wal_fpi = walusage.wal_fpi; - WalStats.m_wal_bytes = walusage.wal_bytes; +#define WALSTAT_ACC(fld) stats_shmem->stats.fld += PendingWalStats.fld + WALSTAT_ACC(wal_records); + WALSTAT_ACC(wal_fpi); + WALSTAT_ACC(wal_bytes); + WALSTAT_ACC(wal_buffers_full); + WALSTAT_ACC(wal_write); + WALSTAT_ACC(wal_sync); + WALSTAT_ACC(wal_write_time); + WALSTAT_ACC(wal_sync_time); +#undef WALSTAT_ACC - /* - * Save the current counters for the subsequent calculation of WAL - * usage. - */ - prevWalUsage = pgWalUsage; - } + LWLockRelease(&stats_shmem->lock); /* - * Prepare and send the message + * Save the current counters for the subsequent calculation of WAL usage. */ - pgstat_setheader(&WalStats.m_hdr, PGSTAT_MTYPE_WAL); - pgstat_send(&WalStats, sizeof(WalStats)); + prevWalUsage = pgWalUsage; /* * Clear out the statistics buffer, so it can be re-used. */ - MemSet(&WalStats, 0, sizeof(WalStats)); + MemSet(&PendingWalStats, 0, sizeof(PendingWalStats)); + + return false; } void pgstat_init_wal(void) { /* - * Initialize prevWalUsage with pgWalUsage so that pgstat_report_wal() can + * Initialize prevWalUsage with pgWalUsage so that pgstat_flush_wal() can * calculate how much pgWalUsage counters are increased by subtracting * prevWalUsage from pgWalUsage. 
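Reader-side sketch: pgstat_fetch_stat_wal(), added earlier in this file, returns the backend's snapshot of the shared WAL counters. The logging helper below is invented for illustration.

#include "postgres.h"
#include "pgstat.h"

static void
log_wal_stats_sketch(void)
{
	PgStat_WalStats *wal = pgstat_fetch_stat_wal();

	elog(LOG, "wal writes: " INT64_FORMAT ", buffers full: " INT64_FORMAT,
		 wal->wal_write, wal->wal_buffers_full);
}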
*/ @@ -151,6 +144,28 @@ bool pgstat_have_pending_wal(void) { return pgWalUsage.wal_records != prevWalUsage.wal_records || - WalStats.m_wal_write != 0 || - WalStats.m_wal_sync != 0; + PendingWalStats.wal_write != 0 || + PendingWalStats.wal_sync != 0; +} + +void +pgstat_wal_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal; + + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + memset(&stats_shmem->stats, 0, sizeof(stats_shmem->stats)); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_wal_snapshot_cb(void) +{ + PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal; + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&pgStatLocal.snapshot.wal, &stats_shmem->stats, + sizeof(pgStatLocal.snapshot.wal)); + LWLockRelease(&stats_shmem->lock); } diff --git a/src/backend/utils/activity/pgstat_xact.c b/src/backend/utils/activity/pgstat_xact.c index 3f33087378..230ffa5afc 100644 --- a/src/backend/utils/activity/pgstat_xact.c +++ b/src/backend/utils/activity/pgstat_xact.c @@ -68,6 +68,7 @@ static void AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) { dlist_mutable_iter iter; + int not_freed_count = 0; if (xact_state->pending_drops_count == 0) { @@ -79,6 +80,7 @@ AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) { PgStat_PendingDroppedStatsItem *pending = dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur); + xl_xact_stats_item *it = &pending->item; if (isCommit && !pending->is_create) { @@ -86,7 +88,8 @@ AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) * Transaction that dropped an object committed. Drop the stats * too. */ - /* will do work in subsequent commit */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; } else if (!isCommit && pending->is_create) { @@ -94,13 +97,17 @@ AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) * Transaction that created an object aborted. Drop the stats * associated with the object. */ - /* will do work in subsequent commit */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; } dlist_delete(&pending->node); xact_state->pending_drops_count--; pfree(pending); } + + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); } /* @@ -135,6 +142,7 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, { PgStat_SubXactStatus *parent_xact_state; dlist_mutable_iter iter; + int not_freed_count = 0; if (xact_state->pending_drops_count == 0) return; @@ -145,6 +153,7 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, { PgStat_PendingDroppedStatsItem *pending = dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur); + xl_xact_stats_item *it = &pending->item; dlist_delete(&pending->node); xact_state->pending_drops_count--; @@ -155,7 +164,8 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, * Subtransaction creating a new stats object aborted. Drop the * stats object. 
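The entries processed here come from the transactional create/drop API. A sketch of the DDL-side pattern, mirroring the drop path used by pgstat_drop_subscription() elsewhere in this patch (the wrapper names are invented): stats created in a transaction that aborts, or dropped in one that commits, end up in the per-(sub)transaction lists handled above.

#include "postgres.h"
#include "utils/pgstat_internal.h"

/* invented wrappers, for illustration only */
static void
create_my_subscription_stats(Oid subid)
{
	/* records the create; if the transaction aborts, the stats are dropped */
	pgstat_create_transactional(PGSTAT_KIND_SUBSCRIPTION, InvalidOid, subid);
}

static void
drop_my_subscription_stats(Oid subid)
{
	/* records the drop; the entry is removed only once the drop commits */
	pgstat_drop_transactional(PGSTAT_KIND_SUBSCRIPTION, InvalidOid, subid);
}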
*/ - /* will do work in subsequent commit */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; pfree(pending); } else if (isCommit) @@ -175,6 +185,8 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, } Assert(xact_state->pending_drops_count == 0); + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); } /* @@ -307,13 +319,21 @@ pgstat_get_transactional_drops(bool isCommit, xl_xact_stats_item **items) void pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_item *items, bool is_redo) { + int not_freed_count = 0; + if (ndrops == 0) return; for (int i = 0; i < ndrops; i++) { - /* will do work in subsequent commit */ + xl_xact_stats_item *it = &items[i]; + + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; } + + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); } static void @@ -345,6 +365,15 @@ create_drop_transactional_internal(PgStat_Kind kind, Oid dboid, Oid objoid, bool void pgstat_create_transactional(PgStat_Kind kind, Oid dboid, Oid objoid) { + if (pgstat_get_entry_ref(kind, dboid, objoid, false, NULL)) + { + ereport(WARNING, + errmsg("resetting existing stats for type %s, db=%d, oid=%d", + (pgstat_get_kind_info(kind))->name, dboid, objoid)); + + pgstat_reset(kind, dboid, objoid); + } + create_drop_transactional_internal(kind, dboid, objoid, /* create */ true); } diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 1c8aba4925..87c15b9c6f 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -230,9 +230,6 @@ pgstat_get_wait_activity(WaitEventActivity w) case WAIT_EVENT_LOGICAL_LAUNCHER_MAIN: event_name = "LogicalLauncherMain"; break; - case WAIT_EVENT_PGSTAT_MAIN: - event_name = "PgStatMain"; - break; case WAIT_EVENT_RECOVERY_WAL_STREAM: event_name = "RecoveryWalStream"; break; diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index be5470a107..248d318f86 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -2046,7 +2046,15 @@ pg_stat_get_xact_function_self_time(PG_FUNCTION_ARGS) Datum pg_stat_get_snapshot_timestamp(PG_FUNCTION_ARGS) { - PG_RETURN_TIMESTAMPTZ(pgstat_fetch_global()->stats_timestamp); + bool have_snapshot; + TimestampTz ts; + + ts = pgstat_get_stat_snapshot_timestamp(&have_snapshot); + + if (!have_snapshot) + PG_RETURN_NULL(); + + PG_RETURN_TIMESTAMPTZ(ts); } /* Discard the active statistics snapshot */ diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index a15ce9edb1..1f29670a13 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -73,6 +73,7 @@ #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "optimizer/optimizer.h" +#include "pgstat.h" #include "rewrite/rewriteDefine.h" #include "rewrite/rowsecurity.h" #include "storage/lmgr.h" @@ -2409,6 +2410,9 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) */ RelationCloseSmgr(relation); + /* break mutual link with stats entry */ + pgstat_unlink_relation(relation); + /* * Free all the subsidiary data structures of the relcache entry, then the * entry itself. 
@@ -2716,8 +2720,9 @@ RelationClearRelation(Relation relation, bool rebuild) SWAPFIELD(RowSecurityDesc *, rd_rsdesc); /* toast OID override must be preserved */ SWAPFIELD(Oid, rd_toastoid); - /* pgstat_info must be preserved */ + /* pgstat_info / enabled must be preserved */ SWAPFIELD(struct PgStat_TableStatus *, pgstat_info); + SWAPFIELD(bool, pgstat_enabled); /* preserve old partition key if we have one */ if (keep_partkey) { diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 3419c099b2..1a5d29ac9b 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -36,6 +36,7 @@ volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false; volatile sig_atomic_t IdleSessionTimeoutPending = false; volatile sig_atomic_t ProcSignalBarrierPending = false; volatile sig_atomic_t LogMemoryContextPending = false; +volatile sig_atomic_t IdleStatsUpdateTimeoutPending = false; volatile uint32 InterruptHoldoffCount = 0; volatile uint32 QueryCancelHoldoffCount = 0; volatile uint32 CritSectionCount = 0; diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index bdc77af719..0d3cfe8240 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -288,9 +288,6 @@ GetBackendTypeDesc(BackendType backendType) case B_ARCHIVER: backendDesc = "archiver"; break; - case B_STATS_COLLECTOR: - backendDesc = "stats collector"; - break; case B_LOGGER: backendDesc = "logger"; break; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 342169b195..a85c2e0260 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -80,6 +80,7 @@ static void StatementTimeoutHandler(void); static void LockTimeoutHandler(void); static void IdleInTransactionSessionTimeoutHandler(void); static void IdleSessionTimeoutHandler(void); +static void IdleStatsUpdateTimeoutHandler(void); static void ClientCheckTimeoutHandler(void); static bool ThereIsAtLeastOneRole(void); static void process_startup_options(Port *port, bool am_superuser); @@ -725,6 +726,8 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, IdleInTransactionSessionTimeoutHandler); RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler); RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler); + RegisterTimeout(IDLE_STATS_UPDATE_TIMEOUT, + IdleStatsUpdateTimeoutHandler); } /* @@ -752,6 +755,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, * Use before_shmem_exit() so that ShutdownXLOG() can rely on DSM * segments etc to work (which in turn is required for pgstats). 
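The new IDLE_STATS_UPDATE_TIMEOUT and IdleStatsUpdateTimeoutPending flag follow the usual timeout pattern: the handler only sets a flag and the latch, and the idle path does the actual flush. A sketch of how a consumer could wire this up, under the assumption that pgstat_report_stat()'s return value is a delay in milliseconds until another flush attempt is useful (0 meaning nothing is pending); maybe_flush_idle_stats() is hypothetical, and the real wiring lives in the main backend loop, not in this hunk.

static void
maybe_flush_idle_stats(void)
{
    long        stats_timeout;

    if (IdleStatsUpdateTimeoutPending)
    {
        /* the timeout fired while idle: force out whatever is pending */
        IdleStatsUpdateTimeoutPending = false;
        (void) pgstat_report_stat(true);
        return;
    }

    /* going idle: flush opportunistically, arm the timeout if work remains */
    stats_timeout = pgstat_report_stat(false);
    if (stats_timeout > 0 && !get_timeout_active(IDLE_STATS_UPDATE_TIMEOUT))
        enable_timeout_after(IDLE_STATS_UPDATE_TIMEOUT, stats_timeout);
    else if (stats_timeout == 0 && get_timeout_active(IDLE_STATS_UPDATE_TIMEOUT))
        disable_timeout(IDLE_STATS_UPDATE_TIMEOUT, false);
}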
*/ + before_shmem_exit(pgstat_before_server_shutdown, 0); before_shmem_exit(ShutdownXLOG, 0); } @@ -1334,6 +1338,14 @@ IdleSessionTimeoutHandler(void) SetLatch(MyLatch); } +static void +IdleStatsUpdateTimeoutHandler(void) +{ + IdleStatsUpdateTimeoutPending = true; + InterruptPending = true; + SetLatch(MyLatch); +} + static void ClientCheckTimeoutHandler(void) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5538465d7d..f7758ea4a7 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -375,6 +375,16 @@ static const struct config_enum_entry track_function_options[] = { StaticAssertDecl(lengthof(track_function_options) == (TRACK_FUNC_ALL + 2), "array length mismatch"); +static const struct config_enum_entry stats_fetch_consistency[] = { + {"none", PGSTAT_FETCH_CONSISTENCY_NONE, false}, + {"cache", PGSTAT_FETCH_CONSISTENCY_CACHE, false}, + {"snapshot", PGSTAT_FETCH_CONSISTENCY_SNAPSHOT, false}, + {NULL, 0, false} +}; + +StaticAssertDecl(lengthof(stats_fetch_consistency) == (PGSTAT_FETCH_CONSISTENCY_SNAPSHOT + 2), + "array length mismatch"); + static const struct config_enum_entry xmlbinary_options[] = { {"base64", XMLBINARY_BASE64, false}, {"hex", XMLBINARY_HEX, false}, @@ -4918,6 +4928,17 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + + { + {"stats_fetch_consistency", PGC_USERSET, STATS_COLLECTOR, + gettext_noop("Sets the consistency of accesses to statistics data"), + NULL + }, + &pgstat_fetch_consistency, + PGSTAT_FETCH_CONSISTENCY_CACHE, stats_fetch_consistency, + NULL, NULL, NULL + }, + { {"wal_compression", PGC_SUSET, WAL_SETTINGS, gettext_noop("Compresses full-page writes written in WAL file with specified method."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 93d221a37b..5f9a37bed3 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -614,6 +614,7 @@ #track_wal_io_timing = off #track_functions = none # none, pl, all #stats_temp_directory = 'pg_stat_tmp' +#stats_fetch_consistency = none # - Monitoring - diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 9321d7f264..66c404c666 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -94,6 +94,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending; extern PGDLLIMPORT volatile sig_atomic_t LogMemoryContextPending; +extern PGDLLIMPORT volatile sig_atomic_t IdleStatsUpdateTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending; extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost; @@ -333,7 +334,6 @@ typedef enum BackendType B_WAL_SENDER, B_WAL_WRITER, B_ARCHIVER, - B_STATS_COLLECTOR, B_LOGGER, } BackendType; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 99115bacde..1d2d3de86c 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -14,10 +14,8 @@ #include "datatype/timestamp.h" #include "portability/instr_time.h" #include "postmaster/pgarch.h" /* for MAX_XFN_CHARS */ -#include "replication/logicalproto.h" #include "utils/backend_progress.h" /* for backward compatibility */ #include "utils/backend_status.h" /* for backward compatibility */ -#include "utils/hsearch.h" #include "utils/relcache.h" #include "utils/wait_event.h" /* for backward compatibility */ @@ -27,8 +25,8 @@ * 
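stats_fetch_consistency, added to guc.c above, selects how much work reads of cumulative statistics do to stay self-consistent. A hypothetical helper spelling out the three modes; stats_access_should_reuse_data() is illustrative only, and the real dispatch happens in the shared-memory fetch code.

static bool
stats_access_should_reuse_data(void)
{
    switch (pgstat_fetch_consistency)
    {
        case PGSTAT_FETCH_CONSISTENCY_NONE:
            /* cheapest: re-read shared memory on every access */
            return false;
        case PGSTAT_FETCH_CONSISTENCY_CACHE:
            /* reuse each entry once read, until the snapshot is cleared */
            return true;
        case PGSTAT_FETCH_CONSISTENCY_SNAPSHOT:
            /* materialize all stats at the first access in a transaction */
            return true;
    }

    return false;               /* keep compiler quiet */
}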
---------- */ #define PGSTAT_STAT_PERMANENT_DIRECTORY "pg_stat" -#define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat" -#define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp" +#define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/pgstat.stat" +#define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/pgstat.tmp" /* Default directory to store temporary statistics data in */ #define PG_STAT_TMP_DIR "pg_stat_tmp" @@ -66,6 +64,13 @@ typedef enum TrackFunctionsLevel TRACK_FUNC_ALL } TrackFunctionsLevel; +typedef enum PgStat_FetchConsistency +{ + PGSTAT_FETCH_CONSISTENCY_NONE, + PGSTAT_FETCH_CONSISTENCY_CACHE, + PGSTAT_FETCH_CONSISTENCY_SNAPSHOT, +} PgStat_FetchConsistency; + /* Values to track the cause of session termination */ typedef enum SessionEndType { @@ -92,7 +97,7 @@ typedef int64 PgStat_Counter; * PgStat_FunctionCounts The actual per-function counts kept by a backend * * This struct should contain only actual event counters, because we memcmp - * it against zeroes to detect whether there are any counts to transmit. + * it against zeroes to detect whether there are any pending stats. * * Note that the time counters are in instr_time format here. We convert to * microseconds in PgStat_Counter format when flushing out pending statistics. @@ -106,12 +111,11 @@ typedef struct PgStat_FunctionCounts } PgStat_FunctionCounts; /* ---------- - * PgStat_BackendFunctionEntry Entry in backend's per-function hash table + * PgStat_BackendFunctionEntry Non-flushed function stats. * ---------- */ typedef struct PgStat_BackendFunctionEntry { - Oid f_id; PgStat_FunctionCounts f_counts; } PgStat_BackendFunctionEntry; @@ -131,13 +135,22 @@ typedef struct PgStat_FunctionCallUsage instr_time f_start; } PgStat_FunctionCallUsage; +/* ---------- + * PgStat_BackendSubEntry Non-flushed subscription stats. + * ---------- + */ +typedef struct PgStat_BackendSubEntry +{ + PgStat_Counter apply_error_count; + PgStat_Counter sync_error_count; +} PgStat_BackendSubEntry; + /* ---------- * PgStat_TableCounts The actual per-table counts kept by a backend * * This struct should contain only actual event counters, because we memcmp - * it against zeroes to detect whether there are any counts to transmit. - * It is a component of PgStat_TableStatus (within-backend state) and - * PgStat_TableEntry (the transmitted message format). + * it against zeroes to detect whether there are any stats updates to apply. + * It is a component of PgStat_TableStatus (within-backend state). * * Note: for a table, tuples_returned is the number of tuples successfully * fetched by heap_getnext, while tuples_fetched is the number of tuples @@ -194,6 +207,7 @@ typedef struct PgStat_TableStatus bool t_shared; /* is it a shared catalog? 
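The "memcmp against zeroes" test referenced in the comments above is the reason PgStat_FunctionCounts and PgStat_TableCounts may contain only event counters. A minimal sketch of that test in the usual backend environment; table_has_pending_counts() is an illustrative name, not part of the patch.

static bool
table_has_pending_counts(const PgStat_TableStatus *tabstat)
{
    /* a static struct is guaranteed to be all-zeroes */
    static const PgStat_TableCounts all_zeroes;

    return memcmp(&tabstat->t_counts, &all_zeroes,
                  sizeof(PgStat_TableCounts)) != 0;
}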
*/ struct PgStat_TableXactStatus *trans; /* lowest subxact's counts */ PgStat_TableCounts t_counts; /* event counts to be sent */ + Relation relation; /* rel that is using this entry */ } PgStat_TableStatus; /* ---------- @@ -221,569 +235,14 @@ typedef struct PgStat_TableXactStatus /* ------------------------------------------------------------ - * Message formats follow - * ------------------------------------------------------------ - */ - -/* ---------- - * The types of backend -> collector messages - * ---------- - */ -typedef enum StatMsgType -{ - PGSTAT_MTYPE_DUMMY, - PGSTAT_MTYPE_INQUIRY, - PGSTAT_MTYPE_TABSTAT, - PGSTAT_MTYPE_TABPURGE, - PGSTAT_MTYPE_DROPDB, - PGSTAT_MTYPE_RESETCOUNTER, - PGSTAT_MTYPE_RESETSHAREDCOUNTER, - PGSTAT_MTYPE_RESETSINGLECOUNTER, - PGSTAT_MTYPE_RESETSLRUCOUNTER, - PGSTAT_MTYPE_RESETREPLSLOTCOUNTER, - PGSTAT_MTYPE_RESETSUBCOUNTER, - PGSTAT_MTYPE_AUTOVAC_START, - PGSTAT_MTYPE_VACUUM, - PGSTAT_MTYPE_ANALYZE, - PGSTAT_MTYPE_ARCHIVER, - PGSTAT_MTYPE_BGWRITER, - PGSTAT_MTYPE_CHECKPOINTER, - PGSTAT_MTYPE_WAL, - PGSTAT_MTYPE_SLRU, - PGSTAT_MTYPE_FUNCSTAT, - PGSTAT_MTYPE_FUNCPURGE, - PGSTAT_MTYPE_RECOVERYCONFLICT, - PGSTAT_MTYPE_TEMPFILE, - PGSTAT_MTYPE_DEADLOCK, - PGSTAT_MTYPE_CHECKSUMFAILURE, - PGSTAT_MTYPE_REPLSLOT, - PGSTAT_MTYPE_CONNECT, - PGSTAT_MTYPE_DISCONNECT, - PGSTAT_MTYPE_SUBSCRIPTIONDROP, - PGSTAT_MTYPE_SUBSCRIPTIONERROR, -} StatMsgType; - -/* ---------- - * PgStat_MsgHdr The common message header - * ---------- - */ -typedef struct PgStat_MsgHdr -{ - StatMsgType m_type; - int m_size; -} PgStat_MsgHdr; - -/* ---------- - * Space available in a message. This will keep the UDP packets below 1K, - * which should fit unfragmented into the MTU of the loopback interface. - * (Larger values of PGSTAT_MAX_MSG_SIZE would work for that on most - * platforms, but we're being conservative here.) - * ---------- - */ -#define PGSTAT_MAX_MSG_SIZE 1000 -#define PGSTAT_MSG_PAYLOAD (PGSTAT_MAX_MSG_SIZE - sizeof(PgStat_MsgHdr)) - - -/* ---------- - * PgStat_MsgDummy A dummy message, ignored by the collector - * ---------- - */ -typedef struct PgStat_MsgDummy -{ - PgStat_MsgHdr m_hdr; -} PgStat_MsgDummy; - -/* ---------- - * PgStat_MsgInquiry Sent by a backend to ask the collector - * to write the stats file(s). - * - * Ordinarily, an inquiry message prompts writing of the global stats file, - * the stats file for shared catalogs, and the stats file for the specified - * database. If databaseid is InvalidOid, only the first two are written. - * - * New file(s) will be written only if the existing file has a timestamp - * older than the specified cutoff_time; this prevents duplicated effort - * when multiple requests arrive at nearly the same time, assuming that - * backends send requests with cutoff_times a little bit in the past. - * - * clock_time should be the requestor's current local time; the collector - * uses this to check for the system clock going backward, but it has no - * effect unless that occurs. We assume clock_time >= cutoff_time, though. 
- * ---------- - */ -typedef struct PgStat_MsgInquiry -{ - PgStat_MsgHdr m_hdr; - TimestampTz clock_time; /* observed local clock time */ - TimestampTz cutoff_time; /* minimum acceptable file timestamp */ - Oid databaseid; /* requested DB (InvalidOid => shared only) */ -} PgStat_MsgInquiry; - -/* ---------- - * PgStat_TableEntry Per-table info in a MsgTabstat - * ---------- - */ -typedef struct PgStat_TableEntry -{ - Oid t_id; - PgStat_TableCounts t_counts; -} PgStat_TableEntry; - -/* ---------- - * PgStat_MsgTabstat Sent by the backend to report table - * and buffer access statistics. - * ---------- - */ -#define PGSTAT_NUM_TABENTRIES \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - 3 * sizeof(int) - 5 * sizeof(PgStat_Counter)) \ - / sizeof(PgStat_TableEntry)) - -typedef struct PgStat_MsgTabstat -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - int m_xact_commit; - int m_xact_rollback; - PgStat_Counter m_block_read_time; /* times in microseconds */ - PgStat_Counter m_block_write_time; - PgStat_Counter m_session_time; - PgStat_Counter m_active_time; - PgStat_Counter m_idle_in_xact_time; - PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES]; -} PgStat_MsgTabstat; - -/* ---------- - * PgStat_MsgTabpurge Sent by the backend to tell the collector - * about dead tables. - * ---------- - */ -#define PGSTAT_NUM_TABPURGE \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(Oid)) - -typedef struct PgStat_MsgTabpurge -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - Oid m_tableid[PGSTAT_NUM_TABPURGE]; -} PgStat_MsgTabpurge; - -/* ---------- - * PgStat_MsgDropdb Sent by the backend to tell the collector - * about a dropped database - * ---------- - */ -typedef struct PgStat_MsgDropdb -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgDropdb; - -/* ---------- - * PgStat_MsgResetcounter Sent by the backend to tell the collector - * to reset counters - * ---------- - */ -typedef struct PgStat_MsgResetcounter -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgResetcounter; - -/* ---------- - * PgStat_MsgResetsharedcounter Sent by the backend to tell the collector - * to reset a shared counter - * ---------- - */ -typedef struct PgStat_MsgResetsharedcounter -{ - PgStat_MsgHdr m_hdr; - PgStat_Kind m_resettarget; -} PgStat_MsgResetsharedcounter; - -/* ---------- - * PgStat_MsgResetsinglecounter Sent by the backend to tell the collector - * to reset a single counter - * ---------- - */ -typedef struct PgStat_MsgResetsinglecounter -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - PgStat_Kind m_resettype; - Oid m_objectid; -} PgStat_MsgResetsinglecounter; - -/* ---------- - * PgStat_MsgResetslrucounter Sent by the backend to tell the collector - * to reset a SLRU counter - * ---------- - */ -typedef struct PgStat_MsgResetslrucounter -{ - PgStat_MsgHdr m_hdr; - int m_index; -} PgStat_MsgResetslrucounter; - -/* ---------- - * PgStat_MsgResetreplslotcounter Sent by the backend to tell the collector - * to reset replication slot counter(s) - * ---------- - */ -typedef struct PgStat_MsgResetreplslotcounter -{ - PgStat_MsgHdr m_hdr; - NameData m_slotname; - bool clearall; -} PgStat_MsgResetreplslotcounter; - -/* ---------- - * PgStat_MsgResetsubcounter Sent by the backend to tell the collector - * to reset subscription counter(s) - * ---------- - */ -typedef struct PgStat_MsgResetsubcounter -{ - PgStat_MsgHdr m_hdr; - Oid m_subid; /* InvalidOid means reset all subscription - * stats */ -} PgStat_MsgResetsubcounter; - -/* ---------- - * PgStat_MsgAutovacStart Sent by 
the autovacuum daemon to signal - * that a database is going to be processed - * ---------- - */ -typedef struct PgStat_MsgAutovacStart -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - TimestampTz m_start_time; -} PgStat_MsgAutovacStart; - -/* ---------- - * PgStat_MsgVacuum Sent by the backend or autovacuum daemon - * after VACUUM - * ---------- - */ -typedef struct PgStat_MsgVacuum -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - Oid m_tableoid; - bool m_autovacuum; - TimestampTz m_vacuumtime; - PgStat_Counter m_live_tuples; - PgStat_Counter m_dead_tuples; -} PgStat_MsgVacuum; - -/* ---------- - * PgStat_MsgAnalyze Sent by the backend or autovacuum daemon - * after ANALYZE - * ---------- - */ -typedef struct PgStat_MsgAnalyze -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - Oid m_tableoid; - bool m_autovacuum; - bool m_resetcounter; - TimestampTz m_analyzetime; - PgStat_Counter m_live_tuples; - PgStat_Counter m_dead_tuples; -} PgStat_MsgAnalyze; - -/* ---------- - * PgStat_MsgArchiver Sent by the archiver to update statistics. - * ---------- - */ -typedef struct PgStat_MsgArchiver -{ - PgStat_MsgHdr m_hdr; - bool m_failed; /* Failed attempt */ - char m_xlog[MAX_XFN_CHARS + 1]; - TimestampTz m_timestamp; -} PgStat_MsgArchiver; - -/* ---------- - * PgStat_MsgBgWriter Sent by the bgwriter to update statistics. - * ---------- - */ -typedef struct PgStat_MsgBgWriter -{ - PgStat_MsgHdr m_hdr; - - PgStat_Counter m_buf_written_clean; - PgStat_Counter m_maxwritten_clean; - PgStat_Counter m_buf_alloc; -} PgStat_MsgBgWriter; - -/* ---------- - * PgStat_MsgCheckpointer Sent by the checkpointer to update statistics. - * ---------- - */ -typedef struct PgStat_MsgCheckpointer -{ - PgStat_MsgHdr m_hdr; - - PgStat_Counter m_timed_checkpoints; - PgStat_Counter m_requested_checkpoints; - PgStat_Counter m_buf_written_checkpoints; - PgStat_Counter m_buf_written_backend; - PgStat_Counter m_buf_fsync_backend; - PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */ - PgStat_Counter m_checkpoint_sync_time; -} PgStat_MsgCheckpointer; - -/* ---------- - * PgStat_MsgWal Sent by backends and background processes to update WAL statistics. - * ---------- - */ -typedef struct PgStat_MsgWal -{ - PgStat_MsgHdr m_hdr; - PgStat_Counter m_wal_records; - PgStat_Counter m_wal_fpi; - uint64 m_wal_bytes; - PgStat_Counter m_wal_buffers_full; - PgStat_Counter m_wal_write; - PgStat_Counter m_wal_sync; - PgStat_Counter m_wal_write_time; /* time spent writing wal records in - * microseconds */ - PgStat_Counter m_wal_sync_time; /* time spent syncing wal records in - * microseconds */ -} PgStat_MsgWal; - -/* ---------- - * PgStat_MsgSLRU Sent by a backend to update SLRU statistics. - * ---------- - */ -typedef struct PgStat_MsgSLRU -{ - PgStat_MsgHdr m_hdr; - PgStat_Counter m_index; - PgStat_Counter m_blocks_zeroed; - PgStat_Counter m_blocks_hit; - PgStat_Counter m_blocks_read; - PgStat_Counter m_blocks_written; - PgStat_Counter m_blocks_exists; - PgStat_Counter m_flush; - PgStat_Counter m_truncate; -} PgStat_MsgSLRU; - -/* ---------- - * PgStat_MsgReplSlot Sent by a backend or a wal sender to update replication - * slot statistics. 
- * ---------- - */ -typedef struct PgStat_MsgReplSlot -{ - PgStat_MsgHdr m_hdr; - NameData m_slotname; - bool m_create; - bool m_drop; - PgStat_Counter m_spill_txns; - PgStat_Counter m_spill_count; - PgStat_Counter m_spill_bytes; - PgStat_Counter m_stream_txns; - PgStat_Counter m_stream_count; - PgStat_Counter m_stream_bytes; - PgStat_Counter m_total_txns; - PgStat_Counter m_total_bytes; -} PgStat_MsgReplSlot; - -/* ---------- - * PgStat_MsgSubscriptionDrop Sent by the backend and autovacuum to tell the - * collector about the dead subscription. - * ---------- - */ -typedef struct PgStat_MsgSubscriptionDrop -{ - PgStat_MsgHdr m_hdr; - Oid m_subid; -} PgStat_MsgSubscriptionDrop; - -/* ---------- - * PgStat_MsgSubscriptionError Sent by the apply worker or the table sync - * worker to report an error on the subscription. - * ---------- - */ -typedef struct PgStat_MsgSubscriptionError -{ - PgStat_MsgHdr m_hdr; - - Oid m_subid; - bool m_is_apply_error; -} PgStat_MsgSubscriptionError; - -/* ---------- - * PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict - * ---------- - */ -typedef struct PgStat_MsgRecoveryConflict -{ - PgStat_MsgHdr m_hdr; - - Oid m_databaseid; - int m_reason; -} PgStat_MsgRecoveryConflict; - -/* ---------- - * PgStat_MsgTempFile Sent by the backend upon creating a temp file - * ---------- - */ -typedef struct PgStat_MsgTempFile -{ - PgStat_MsgHdr m_hdr; - - Oid m_databaseid; - size_t m_filesize; -} PgStat_MsgTempFile; - -/* ---------- - * PgStat_FunctionEntry Per-function info in a MsgFuncstat - * ---------- - */ -typedef struct PgStat_FunctionEntry -{ - Oid f_id; - PgStat_Counter f_numcalls; - PgStat_Counter f_total_time; /* times in microseconds */ - PgStat_Counter f_self_time; -} PgStat_FunctionEntry; - -/* ---------- - * PgStat_MsgFuncstat Sent by the backend to report function - * usage statistics. - * ---------- - */ -#define PGSTAT_NUM_FUNCENTRIES \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(PgStat_FunctionEntry)) - -typedef struct PgStat_MsgFuncstat -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - PgStat_FunctionEntry m_entry[PGSTAT_NUM_FUNCENTRIES]; -} PgStat_MsgFuncstat; - -/* ---------- - * PgStat_MsgFuncpurge Sent by the backend to tell the collector - * about dead functions. - * ---------- - */ -#define PGSTAT_NUM_FUNCPURGE \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(Oid)) - -typedef struct PgStat_MsgFuncpurge -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - Oid m_functionid[PGSTAT_NUM_FUNCPURGE]; -} PgStat_MsgFuncpurge; - -/* ---------- - * PgStat_MsgDeadlock Sent by the backend to tell the collector - * about a deadlock that occurred. - * ---------- - */ -typedef struct PgStat_MsgDeadlock -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgDeadlock; - -/* ---------- - * PgStat_MsgChecksumFailure Sent by the backend to tell the collector - * about checksum failures noticed. 
- * ---------- - */ -typedef struct PgStat_MsgChecksumFailure -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_failurecount; - TimestampTz m_failure_time; -} PgStat_MsgChecksumFailure; - -/* ---------- - * PgStat_MsgConnect Sent by the backend upon connection - * establishment - * ---------- - */ -typedef struct PgStat_MsgConnect -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgConnect; - -/* ---------- - * PgStat_MsgDisconnect Sent by the backend when disconnecting - * ---------- - */ -typedef struct PgStat_MsgDisconnect -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - SessionEndType m_cause; -} PgStat_MsgDisconnect; - -/* ---------- - * PgStat_Msg Union over all possible messages. - * ---------- - */ -typedef union PgStat_Msg -{ - PgStat_MsgHdr msg_hdr; - PgStat_MsgDummy msg_dummy; - PgStat_MsgInquiry msg_inquiry; - PgStat_MsgTabstat msg_tabstat; - PgStat_MsgTabpurge msg_tabpurge; - PgStat_MsgDropdb msg_dropdb; - PgStat_MsgResetcounter msg_resetcounter; - PgStat_MsgResetsharedcounter msg_resetsharedcounter; - PgStat_MsgResetsinglecounter msg_resetsinglecounter; - PgStat_MsgResetslrucounter msg_resetslrucounter; - PgStat_MsgResetreplslotcounter msg_resetreplslotcounter; - PgStat_MsgResetsubcounter msg_resetsubcounter; - PgStat_MsgAutovacStart msg_autovacuum_start; - PgStat_MsgVacuum msg_vacuum; - PgStat_MsgAnalyze msg_analyze; - PgStat_MsgArchiver msg_archiver; - PgStat_MsgBgWriter msg_bgwriter; - PgStat_MsgCheckpointer msg_checkpointer; - PgStat_MsgWal msg_wal; - PgStat_MsgSLRU msg_slru; - PgStat_MsgFuncstat msg_funcstat; - PgStat_MsgFuncpurge msg_funcpurge; - PgStat_MsgRecoveryConflict msg_recoveryconflict; - PgStat_MsgDeadlock msg_deadlock; - PgStat_MsgTempFile msg_tempfile; - PgStat_MsgChecksumFailure msg_checksumfailure; - PgStat_MsgReplSlot msg_replslot; - PgStat_MsgConnect msg_connect; - PgStat_MsgDisconnect msg_disconnect; - PgStat_MsgSubscriptionError msg_subscriptionerror; - PgStat_MsgSubscriptionDrop msg_subscriptiondrop; -} PgStat_Msg; - - -/* ------------------------------------------------------------ - * Statistic collector data structures follow + * Data structures on disk and in shared memory follow * * PGSTAT_FILE_FORMAT_ID should be changed whenever any of these * data structures change. * ------------------------------------------------------------ */ -#define PGSTAT_FILE_FORMAT_ID 0x01A5BCA6 +#define PGSTAT_FILE_FORMAT_ID 0x01A5BCA7 typedef struct PgStat_ArchiverStats { @@ -808,7 +267,6 @@ typedef struct PgStat_BgWriterStats typedef struct PgStat_CheckpointerStats { - TimestampTz stats_timestamp; /* time of stats file update */ PgStat_Counter timed_checkpoints; PgStat_Counter requested_checkpoints; PgStat_Counter checkpoint_write_time; /* times in milliseconds */ @@ -820,7 +278,6 @@ typedef struct PgStat_CheckpointerStats typedef struct PgStat_StatDBEntry { - Oid databaseid; PgStat_Counter n_xact_commit; PgStat_Counter n_xact_rollback; PgStat_Counter n_blocks_fetched; @@ -852,34 +309,16 @@ typedef struct PgStat_StatDBEntry PgStat_Counter n_sessions_killed; TimestampTz stat_reset_timestamp; - TimestampTz stats_timestamp; /* time of db stats file update */ - - /* - * tables and functions must be last in the struct, because we don't write - * the pointers out to the stats file. 
- */ - HTAB *tables; - HTAB *functions; } PgStat_StatDBEntry; typedef struct PgStat_StatFuncEntry { - Oid functionid; - PgStat_Counter f_numcalls; PgStat_Counter f_total_time; /* times in microseconds */ PgStat_Counter f_self_time; } PgStat_StatFuncEntry; -typedef struct PgStat_GlobalStats -{ - TimestampTz stats_timestamp; /* time of stats file update */ - - PgStat_CheckpointerStats checkpointer; - PgStat_BgWriterStats bgwriter; -} PgStat_GlobalStats; - typedef struct PgStat_StatReplSlotEntry { NameData slotname; @@ -908,8 +347,6 @@ typedef struct PgStat_SLRUStats typedef struct PgStat_StatSubEntry { - Oid subid; /* hash key (must be first) */ - PgStat_Counter apply_error_count; PgStat_Counter sync_error_count; TimestampTz stat_reset_timestamp; @@ -917,8 +354,6 @@ typedef struct PgStat_StatSubEntry typedef struct PgStat_StatTabEntry { - Oid tableid; - PgStat_Counter numscans; PgStat_Counter tuples_returned; @@ -966,22 +401,19 @@ typedef struct PgStat_WalStats */ /* functions called from postmaster */ -extern void pgstat_init(void); -extern void pgstat_reset_all(void); -extern int pgstat_start(void); -extern void allow_immediate_pgstat_restart(void); +extern Size StatsShmemSize(void); +extern void StatsShmemInit(void); -#ifdef EXEC_BACKEND -extern void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn(); -#endif +/* Functions called during server startup / shutdown */ +extern void pgstat_restore_stats(void); +extern void pgstat_discard_stats(void); +extern void pgstat_before_server_shutdown(int code, Datum arg); /* Functions for backend initialization */ extern void pgstat_initialize(void); /* Functions called from backends */ -extern void pgstat_report_stat(bool force); -extern void pgstat_vacuum_stat(void); -extern void pgstat_ping(void); +extern long pgstat_report_stat(bool force); extern void pgstat_reset_counters(void); extern void pgstat_reset(PgStat_Kind kind, Oid dboid, Oid objectid); @@ -989,24 +421,17 @@ extern void pgstat_reset_of_kind(PgStat_Kind kind); /* stats accessors */ extern void pgstat_clear_snapshot(void); -extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void); -extern PgStat_BgWriterStats *pgstat_fetch_stat_bgwriter(void); -extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void); -extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid); -extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid); -extern PgStat_GlobalStats *pgstat_fetch_global(void); -extern PgStat_StatReplSlotEntry *pgstat_fetch_replslot(NameData slotname); -extern PgStat_StatSubEntry *pgstat_fetch_stat_subscription(Oid subid); -extern PgStat_SLRUStats *pgstat_fetch_slru(void); -extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid); -extern PgStat_WalStats *pgstat_fetch_stat_wal(void); +extern TimestampTz pgstat_get_stat_snapshot_timestamp(bool *have_snapshot); +/* helpers */ +extern PgStat_Kind pgstat_get_kind_from_str(char *kind_str); /* * Functions in pgstat_archiver.c */ extern void pgstat_report_archiver(const char *xlog, bool failed); +extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void); /* @@ -1014,6 +439,7 @@ extern void pgstat_report_archiver(const char *xlog, bool failed); */ extern void pgstat_report_bgwriter(void); +extern PgStat_BgWriterStats *pgstat_fetch_stat_bgwriter(void); /* @@ -1021,6 +447,7 @@ extern void pgstat_report_bgwriter(void); */ extern void pgstat_report_checkpointer(void); +extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void); /* @@ -1044,6 +471,7 @@ extern void 
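pgstat_get_stat_snapshot_timestamp(), declared here, replaces reading stats_timestamp off the old global stats struct; the have_snapshot flag is what lets pg_stat_get_snapshot_timestamp() (see the pgstatfuncs.c hunk earlier) return NULL instead of a bogus timestamp. A hypothetical caller, assuming the existing timestamptz_to_str() helper.

static void
log_snapshot_age(void)
{
    bool        have_snapshot;
    TimestampTz ts = pgstat_get_stat_snapshot_timestamp(&have_snapshot);

    if (have_snapshot)
        elog(DEBUG1, "stats snapshot taken at %s", timestamptz_to_str(ts));
    else
        elog(DEBUG1, "no stats snapshot has been taken yet");
}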
pgstat_report_connect(Oid dboid); #define pgstat_count_conn_txn_idle_time(n) \ (pgStatTransactionIdleTime += (n)) +extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid); /* * Functions in pgstat_function.c @@ -1058,6 +486,7 @@ extern void pgstat_init_function_usage(struct FunctionCallInfoBaseData *fcinfo, extern void pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize); +extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid); extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id); @@ -1070,6 +499,8 @@ extern void pgstat_drop_relation(Relation rel); extern void pgstat_copy_relation_stats(Relation dstrel, Relation srcrel); extern void pgstat_init_relation(Relation rel); +extern void pgstat_assoc_relation(Relation rel); +extern void pgstat_unlink_relation(Relation rel); extern void pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter livetuples, PgStat_Counter deadtuples); @@ -1077,8 +508,14 @@ extern void pgstat_report_analyze(Relation rel, PgStat_Counter livetuples, PgStat_Counter deadtuples, bool resetcounter); +/* + * If stats are enabled, but pending data hasn't been prepared yet, call + * pgstat_assoc_relation() to do so. See its comment for why this is done + * separately from pgstat_init_relation(). + */ #define pgstat_should_count_relation(rel) \ - (likely((rel)->pgstat_info != NULL)) + (likely((rel)->pgstat_info != NULL) ? true : \ + ((rel)->pgstat_enabled ? pgstat_assoc_relation(rel), true : false)) /* nontransactional event counts are simple enough to inline */ @@ -1129,6 +566,9 @@ extern void pgstat_twophase_postcommit(TransactionId xid, uint16 info, extern void pgstat_twophase_postabort(TransactionId xid, uint16 info, void *recdata, uint32 len); +extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid); +extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry_ext(bool shared, + Oid relid); extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id); @@ -1140,7 +580,9 @@ extern void pgstat_reset_replslot(const char *name); struct ReplicationSlot; extern void pgstat_report_replslot(struct ReplicationSlot *slot, const PgStat_StatReplSlotEntry *repSlotStat); extern void pgstat_create_replslot(struct ReplicationSlot *slot); +extern void pgstat_acquire_replslot(struct ReplicationSlot *slot); extern void pgstat_drop_replslot(struct ReplicationSlot *slot); +extern PgStat_StatReplSlotEntry *pgstat_fetch_replslot(NameData slotname); /* @@ -1157,6 +599,7 @@ extern void pgstat_count_slru_flush(int slru_idx); extern void pgstat_count_slru_truncate(int slru_idx); extern const char *pgstat_get_slru_name(int slru_idx); extern int pgstat_get_slru_index(const char *name); +extern PgStat_SLRUStats *pgstat_fetch_slru(void); /* @@ -1166,6 +609,7 @@ extern int pgstat_get_slru_index(const char *name); extern void pgstat_report_subscription_error(Oid subid, bool is_apply_error); extern void pgstat_create_subscription(Oid subid); extern void pgstat_drop_subscription(Oid subid); +extern PgStat_StatSubEntry *pgstat_fetch_stat_subscription(Oid subid); /* @@ -1186,6 +630,7 @@ extern void pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_ */ extern void pgstat_report_wal(bool force); +extern PgStat_WalStats *pgstat_fetch_stat_wal(void); /* @@ -1195,6 +640,8 @@ extern void pgstat_report_wal(bool force); /* GUC parameters */ extern PGDLLIMPORT bool pgstat_track_counts; extern PGDLLIMPORT int pgstat_track_functions; +extern PGDLLIMPORT int pgstat_fetch_consistency; + extern char *pgstat_stat_directory; extern char 
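The reworked pgstat_should_count_relation() macro above makes association with pending stats lazy: the first time a stats-enabled relation is counted, pgstat_assoc_relation() sets up rel->pgstat_info. A minimal sketch of a counting helper in the style of the existing pgstat_count_heap_scan()-type macros; count_one_scan() is illustrative only, and t_numscans is assumed to remain a PgStat_TableCounts field.

static void
count_one_scan(Relation rel)
{
    /* associates pending stats on first use, or bails out if disabled */
    if (pgstat_should_count_relation(rel))
        rel->pgstat_info->t_counts.t_numscans++;
}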
*pgstat_stat_tmpname; extern char *pgstat_stat_filename; @@ -1205,7 +652,7 @@ extern char *pgstat_stat_filename; */ /* updated directly by bgwriter and bufmgr */ -extern PgStat_MsgBgWriter PendingBgWriterStats; +extern PgStat_BgWriterStats PendingBgWriterStats; /* @@ -1216,7 +663,7 @@ extern PgStat_MsgBgWriter PendingBgWriterStats; * Checkpointer statistics counters are updated directly by checkpointer and * bufmgr. */ -extern PgStat_MsgCheckpointer PendingCheckpointerStats; +extern PgStat_CheckpointerStats PendingCheckpointerStats; /* @@ -1243,7 +690,7 @@ extern SessionEndType pgStatSessionEndCause; */ /* updated directly by backends and background processes */ -extern PgStat_MsgWal WalStats; +extern PgStat_WalStats PendingWalStats; #endif /* PGSTAT_H */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index c3d5889d7b..33eb4c1033 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -190,6 +190,9 @@ typedef enum BuiltinTrancheIds LWTRANCHE_SHARED_TIDBITMAP, LWTRANCHE_PARALLEL_APPEND, LWTRANCHE_PER_XACT_PREDICATE_LIST, + LWTRANCHE_PGSTATS_DSA, + LWTRANCHE_PGSTATS_HASH, + LWTRANCHE_PGSTATS_DATA, LWTRANCHE_FIRST_USER_DEFINED } BuiltinTrancheIds; diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h index c3f83c74c6..ab27bc47c5 100644 --- a/src/include/utils/pgstat_internal.h +++ b/src/include/utils/pgstat_internal.h @@ -14,21 +14,134 @@ #define PGSTAT_INTERNAL_H +#include "common/hashfn.h" +#include "lib/dshash.h" +#include "lib/ilist.h" #include "pgstat.h" +#include "storage/lwlock.h" +#include "utils/dsa.h" -#define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file - * updates; in milliseconds. */ - -/* ---------- - * The initial size hints for the hash tables used in the collector. - * ---------- +/* + * Types related to shared memory storage of statistics. + * + * Per-object statistics are stored in the "shared stats" hashtable. That + * table's entries (PgStatShared_HashEntry) contain a pointer to the actual stats + * data for the object (the size of the stats data varies depending on the + * kind of stats). The table is keyed by PgStat_HashKey. + * + * Once a backend has a reference to a shared stats entry, it increments the + * entry's refcount. Even after stats data is dropped (e.g., due to a DROP + * TABLE), the entry itself can only be deleted once all references have been + * released. + * + * These refcounts, in combination with a backend local hashtable + * (pgStatEntryRefHash, with entries pointing to PgStat_EntryRef) in front of + * the shared hash table, mean that most stats work can happen without + * touching the shared hash table, reducing contention. + * + * Once there are pending stats updates for a table PgStat_EntryRef->pending + * is allocated to contain a working space for as-of-yet-unapplied stats + * updates. Once the stats are flushed, PgStat_EntryRef->pending is freed. + * + * Each stat kind in the shared hash table has a fixed member + * PgStatShared_Common as the first element. */ -#define PGSTAT_DB_HASH_SIZE 16 -#define PGSTAT_TAB_HASH_SIZE 512 -#define PGSTAT_FUNCTION_HASH_SIZE 512 -#define PGSTAT_SUBSCRIPTION_HASH_SIZE 32 -#define PGSTAT_REPLSLOT_HASH_SIZE 32 + +/* struct for shared statistics hash entry key. */ +typedef struct PgStat_HashKey +{ + PgStat_Kind kind; /* statistics entry kind */ + Oid dboid; /* database ID. InvalidOid for shared objects. */ + Oid objoid; /* object ID, either table or function. 
*/ +} PgStat_HashKey; + +/* + * Shared statistics hash entry. Doesn't itself contain any stats, but points + * to them (with ->body). That allows the stats entries themselves to be of + * variable size. + */ +typedef struct PgStatShared_HashEntry +{ + PgStat_HashKey key; /* hash key */ + + /* + * If dropped is set, backends need to release their references so that + * the memory for the entry can be freed. No new references may be made + * once marked as dropped. + */ + bool dropped; + + /* + * Refcount managing lifetime of the entry itself (as opposed to the + * dshash entry pointing to it). The stats lifetime has to be separate + * from the hash table entry lifetime because we allow backends to point + * to a stats entry without holding a hash table lock (and some other + * reasons). + * + * As long as the entry is not dropped, 1 is added to the refcount + * representing that the entry should not be dropped. In addition each + * backend that has a reference to the entry needs to increment the + * refcount as long as it does. + * + * May only be incremented / decremented while holding at least a shared + * lock on the dshash partition containing the entry. It needs to be an + * atomic variable because multiple backends can increment the refcount + * with just a shared lock. + * + * When the refcount reaches 0 the entry needs to be freed. + */ + pg_atomic_uint32 refcount; + + /* + * Pointer to shared stats. The stats entry always starts with + * PgStatShared_Common, embedded in a larger struct containing the + * PgStat_Kind specific stats fields. + */ + dsa_pointer body; +} PgStatShared_HashEntry; + +/* + * Common header struct for PgStatShm_Stat*Entry. + */ +typedef struct PgStatShared_Common +{ + uint32 magic; /* just a validity cross-check */ + /* lock protecting stats contents (i.e. data following the header) */ + LWLock lock; +} PgStatShared_Common; + +/* + * A backend local reference to a shared stats entry. As long as at least one + * such reference exists, the shared stats entry will not be released. + * + * If there are pending stats update to the shared stats, these are stored in + * ->pending. + */ +typedef struct PgStat_EntryRef +{ + /* + * Pointer to the PgStatShared_HashEntry entry in the shared stats + * hashtable. + */ + PgStatShared_HashEntry *shared_entry; + + /* + * Pointer to the stats data (i.e. PgStatShared_HashEntry->body), resolved + * as a local pointer, to avoid repeated dsa_get_address() calls. + */ + PgStatShared_Common *shared_stats; + + /* + * Pending statistics data that will need to be flushed to shared memory + * stats eventually. Each stats kind utilizing pending data defines what + * format its pending data has and needs to provide a + * PgStat_KindInfo->flush_pending_cb callback to merge pending into shared + * stats. + */ + void *pending; + dlist_node pending_node; /* membership in pgStatPending list */ +} PgStat_EntryRef; /* @@ -43,11 +156,11 @@ typedef struct PgStat_SubXactStatus struct PgStat_SubXactStatus *prev; /* higher-level subxact if any */ /* - * Dropping the statistics for objects that dropped transactionally itself - * needs to be transactional. Therefore we collect the stats dropped in - * the current (sub-)transaction and only execute the stats drop when we - * know if the transaction commits/aborts. To handle replicas and crashes, - * stats drops are included in commit records. + * Statistics for transactionally dropped objects need to be + * transactionally dropped as well. 
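The refcount rules spelled out above boil down to: pin only while holding at least a shared lock on the entry's dshash partition, never pin a dropped entry, and free the entry once the count reaches zero. A conceptual sketch of the pin/unpin steps under those assumptions; pin_entry() and unpin_entry() are illustrative names, and the real logic, including actually freeing the entry, is in pgstat_shmem.c.

static void
pin_entry(PgStatShared_HashEntry *shent)
{
    /* caller holds (at least) a shared lock on the dshash partition */
    Assert(!shent->dropped);
    pg_atomic_fetch_add_u32(&shent->refcount, 1);
}

static bool
unpin_entry(PgStatShared_HashEntry *shent)
{
    /* true if we released the last reference and the entry can be freed */
    return pg_atomic_sub_fetch_u32(&shent->refcount, 1) == 0;
}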
Collect the stats dropped in the + * current (sub-)transaction and only execute the stats drop when we know + * if the transaction commits/aborts. To handle replicas and crashes, + * stats drops are included in commit / abort records. */ dlist_head pending_drops; int pending_drops_count; @@ -64,10 +177,96 @@ typedef struct PgStat_SubXactStatus } PgStat_SubXactStatus; +/* + * Metadata for a specific kind of statistics. + */ +typedef struct PgStat_KindInfo +{ + /* + * Do a fixed number of stats objects exist for this kind of stats (e.g. + * bgwriter stats) or not (e.g. tables). + */ + bool fixed_amount:1; + + /* + * Can stats of this kind be accessed from another database? Determines + * whether a stats object gets included in stats snapshots. + */ + bool accessed_across_databases:1; + + /* + * For variable-numbered stats: Identified on-disk using a name, rather + * than PgStat_HashKey. Probably only needed for replication slot stats. + */ + bool named_on_disk:1; + + /* + * The size of an entry in the shared stats hash table (pointed to by + * PgStatShared_HashEntry->body). + */ + uint32 shared_size; + + /* + * The offset/size of statistics inside the shared stats entry. Used when + * [de-]serializing statistics to / from disk respectively. Separate from + * shared_size because [de-]serialization may not include in-memory state + * like lwlocks. + */ + uint32 shared_data_off; + uint32 shared_data_len; + + /* + * The size of the pending data for this kind. E.g. how large + * PgStat_EntryRef->pending is. Used for allocations. + * + * 0 signals that an entry of this kind should never have a pending entry. + */ + uint32 pending_size; + + /* + * For variable-numbered stats: flush pending stats. Required if pending + * data is used. + */ + bool (*flush_pending_cb) (PgStat_EntryRef *sr, bool nowait); + + /* + * For variable-numbered stats: delete pending stats. Optional. + */ + void (*delete_pending_cb) (PgStat_EntryRef *sr); + + /* + * For variable-numbered stats: reset the reset timestamp. Optional. + */ + void (*reset_timestamp_cb) (PgStatShared_Common *header, TimestampTz ts); + + /* + * For variable-numbered stats with named_on_disk. Optional. + */ + void (*to_serialized_name) (const PgStatShared_Common *header, NameData *name); + bool (*from_serialized_name) (const NameData *name, PgStat_HashKey *key); + + /* + * For fixed-numbered statistics: Reset All. + */ + void (*reset_all_cb) (TimestampTz ts); + + /* + * For fixed-numbered statistics: Build snapshot for entry + */ + void (*snapshot_cb) (void); + + /* name of the kind of stats */ + const char *const name; +} PgStat_KindInfo; + + /* * List of SLRU names that we keep stats for. There is no central registry of * SLRUs, so we use this fixed list instead. The "other" entry is used for * all SLRUs without an explicit entry (e.g. SLRUs in extensions). + * + * This is only defined here so that SLRU_NUM_ELEMENTS is known for later type + * definitions. */ static const char *const slru_names[] = { "CommitTs", @@ -83,33 +282,271 @@ static const char *const slru_names[] = { #define SLRU_NUM_ELEMENTS lengthof(slru_names) +/* ---------- + * Types and definitions for different kinds of fixed-amount stats. + * + * Single-writer stats use the changecount mechanism to achieve low-overhead + * writes - they're obviously more performance critical than reads. Check the + * definition of struct PgBackendStatus for some explanation of the + * changecount mechanism. 
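Each PgStat_KindInfo bundles the per-kind callbacks that generic code dispatches through. A sketch of how flushing one entry's pending data is meant to be driven, using pgstat_get_kind_info() as declared further down in this header; flush_one_pending_entry() is a hypothetical wrapper, the real loop lives in pgstat.c.

static bool
flush_one_pending_entry(PgStat_EntryRef *entry_ref, bool nowait)
{
    PgStat_Kind kind = entry_ref->shared_entry->key.kind;
    const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);

    /* only kinds that use pending data define a flush callback */
    Assert(entry_ref->pending != NULL);
    Assert(kind_info->flush_pending_cb != NULL);

    /* hand off to the kind-specific callback */
    return kind_info->flush_pending_cb(entry_ref, nowait);
}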
+ * + * Because the obvious implementation of resetting single-writer stats isn't + * compatible with that (another backend needs to write), we don't scribble on + * shared stats while resetting. Instead, just record the current counter + * values in a copy of the stats data, which is protected by ->lock. See + * pgstat_fetch_stat_(archiver|bgwriter|checkpointer) for the reader side. + * + * The only exception to that is the the stat_reset_timestamp in these + * structs, which is protected by ->lock, because it has to be written by + * another backend while resetting + * ---------- + */ + +typedef struct PgStatShared_Archiver +{ + /* lock protects ->reset_offset as well as stats->stat_reset_timestamp */ + LWLock lock; + uint32 changecount; + PgStat_ArchiverStats stats; + PgStat_ArchiverStats reset_offset; +} PgStatShared_Archiver; + +typedef struct PgStatShared_BgWriter +{ + /* lock protects ->reset_offset as well as stats->stat_reset_timestamp */ + LWLock lock; + uint32 changecount; + PgStat_BgWriterStats stats; + PgStat_BgWriterStats reset_offset; +} PgStatShared_BgWriter; + +typedef struct PgStatShared_Checkpointer +{ + /* lock protects ->reset_offset as well as stats->stat_reset_timestamp */ + LWLock lock; + uint32 changecount; + PgStat_CheckpointerStats stats; + PgStat_CheckpointerStats reset_offset; +} PgStatShared_Checkpointer; + +typedef struct PgStatShared_SLRU +{ + /* lock protects ->stats */ + LWLock lock; + PgStat_SLRUStats stats[SLRU_NUM_ELEMENTS]; +} PgStatShared_SLRU; + +typedef struct PgStatShared_Wal +{ + /* lock protects ->stats */ + LWLock lock; + PgStat_WalStats stats; +} PgStatShared_Wal; + + + +/* ---------- + * Types and definitions for different kinds of variable-amount stats. + * + * Each struct has to start with PgStatShared_Common, containing information + * common across the different types of stats. Kind-specific data follows. + * ---------- + */ + +typedef struct PgStatShared_Database +{ + PgStatShared_Common header; + PgStat_StatDBEntry stats; +} PgStatShared_Database; + +typedef struct PgStatShared_Relation +{ + PgStatShared_Common header; + PgStat_StatTabEntry stats; +} PgStatShared_Relation; + +typedef struct PgStatShared_Function +{ + PgStatShared_Common header; + PgStat_StatFuncEntry stats; +} PgStatShared_Function; + +typedef struct PgStatShared_Subscription +{ + PgStatShared_Common header; + PgStat_StatSubEntry stats; +} PgStatShared_Subscription; + +typedef struct PgStatShared_ReplSlot +{ + PgStatShared_Common header; + PgStat_StatReplSlotEntry stats; +} PgStatShared_ReplSlot; + + +/* + * Central shared memory entry for the cumulative stats system. + * + * Fixed amount stats, the dynamic shared memory hash table for + * non-fixed-amount stats, as well as remaining bits and pieces are all + * reached from here. + */ +typedef struct PgStat_ShmemControl +{ + void *raw_dsa_area; + + /* + * Stats for variable-numbered objects are kept in this shared hash table. + * See comment above PgStat_Kind for details. + */ + dshash_table_handle hash_handle; /* shared dbstat hash */ + + /* Has the stats system already been shut down? Just a debugging check. */ + bool is_shutdown; + + /* + * Whenever statistics for dropped objects could not be freed - because + * backends still have references - the dropping backend calls + * pgstat_request_entry_refs_gc() incrementing this counter. Eventually + * that causes backends to run pgstat_gc_entry_refs(), allowing memory to + * be reclaimed. 
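For the single-writer structs above, readers never block the writer: they copy the struct under the changecount protocol and then subtract the reset_offset recorded at the last reset, which is the part protected by ->lock. A sketch of that reader side for the bgwriter stats, assuming the PgStat_BgWriterStats counters keep their existing names (buf_written_clean and so on); the per-field subtraction is abbreviated.

static void
read_bgwriter_stats(PgStat_BgWriterStats *out)
{
    PgStatShared_BgWriter *shmem = &pgStatLocal.shmem->bgwriter;
    PgStat_BgWriterStats reset;

    /* lock-free copy of the live counters, retried if the writer interferes */
    pgstat_copy_changecounted_stats(out, &shmem->stats, sizeof(*out),
                                    &shmem->changecount);

    /* the reset baseline is written by other backends, hence the lock */
    LWLockAcquire(&shmem->lock, LW_SHARED);
    memcpy(&reset, &shmem->reset_offset, sizeof(reset));
    LWLockRelease(&shmem->lock);

    out->buf_written_clean -= reset.buf_written_clean;
    /* ... and likewise for the remaining counters ... */
}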
+ */ + pg_atomic_uint64 gc_request_count; + + /* + * Stats data for fixed-numbered objects. + */ + PgStatShared_Archiver archiver; + PgStatShared_BgWriter bgwriter; + PgStatShared_Checkpointer checkpointer; + PgStatShared_SLRU slru; + PgStatShared_Wal wal; +} PgStat_ShmemControl; + + +/* + * Cached statistics snapshot + */ +typedef struct PgStat_Snapshot +{ + PgStat_FetchConsistency mode; + + /* time at which snapshot was taken */ + TimestampTz snapshot_timestamp; + + bool fixed_valid[PGSTAT_NUM_KINDS]; + + PgStat_ArchiverStats archiver; + + PgStat_BgWriterStats bgwriter; + + PgStat_CheckpointerStats checkpointer; + + PgStat_SLRUStats slru[SLRU_NUM_ELEMENTS]; + + PgStat_WalStats wal; + + /* to free snapshot in bulk */ + MemoryContext context; + struct pgstat_snapshot_hash *stats; +} PgStat_Snapshot; + + +/* + * Collection of backend-local stats state. + */ +typedef struct PgStat_LocalState +{ + PgStat_ShmemControl *shmem; + dsa_area *dsa; + dshash_table *shared_hash; + + /* the current statistics snapshot */ + PgStat_Snapshot snapshot; +} PgStat_LocalState; + + +/* + * Inline functions defined further below. + */ + +static inline void pgstat_begin_changecount_write(uint32 *cc); +static inline void pgstat_end_changecount_write(uint32 *cc); +static inline uint32 pgstat_begin_changecount_read(uint32 *cc); +static inline bool pgstat_end_changecount_read(uint32 *cc, uint32 cc_before); + +static inline void pgstat_copy_changecounted_stats(void *dst, void *src, size_t len, + uint32 *cc); + +static inline int pgstat_cmp_hash_key(const void *a, const void *b, size_t size, void *arg); +static inline uint32 pgstat_hash_hash_key(const void *d, size_t size, void *arg); +static inline size_t pgstat_get_entry_len(PgStat_Kind kind); +static inline void *pgstat_get_entry_data(PgStat_Kind kind, PgStatShared_Common *entry); + + /* * Functions in pgstat.c */ -extern void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype); -extern void pgstat_send(void *msg, int len); +const PgStat_KindInfo *pgstat_get_kind_info(PgStat_Kind kind); + #ifdef USE_ASSERT_CHECKING extern void pgstat_assert_is_up(void); #else #define pgstat_assert_is_up() ((void)true) #endif +extern void pgstat_delete_pending_entry(PgStat_EntryRef *entry_ref); +extern PgStat_EntryRef *pgstat_prep_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid, bool *created_entry); +extern PgStat_EntryRef *pgstat_fetch_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid); + +extern void *pgstat_fetch_entry(PgStat_Kind kind, Oid dboid, Oid objoid); +extern void pgstat_snapshot_fixed(PgStat_Kind kind); + + +/* + * Functions in pgstat_archiver.c + */ + +extern void pgstat_archiver_reset_all_cb(TimestampTz ts); +extern void pgstat_archiver_snapshot_cb(void); + + +/* + * Functions in pgstat_bgwriter.c + */ + +extern void pgstat_bgwriter_reset_all_cb(TimestampTz ts); +extern void pgstat_bgwriter_snapshot_cb(void); + + +/* + * Functions in pgstat_checkpointer.c + */ + +extern void pgstat_checkpointer_reset_all_cb(TimestampTz ts); +extern void pgstat_checkpointer_snapshot_cb(void); + /* * Functions in pgstat_database.c */ -extern void AtEOXact_PgStat_Database(bool isCommit, bool parallel); extern void pgstat_report_disconnect(Oid dboid); -extern void pgstat_update_dbstats(PgStat_MsgTabstat *tsmsg, TimestampTz now); +extern void pgstat_update_dbstats(TimestampTz ts); +extern void AtEOXact_PgStat_Database(bool isCommit, bool parallel); + +extern PgStat_StatDBEntry *pgstat_prep_database_pending(Oid dboid); +extern void pgstat_reset_database_timestamp(Oid dboid, 
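pgstat_fetch_entry() is the generic read-side entry point; the per-kind accessors re-exported from pgstat.h are thin wrappers over it and honor stats_fetch_consistency via the snapshot machinery above. A hypothetical wrapper for table stats, assuming PGSTAT_KIND_RELATION is the relation member of PgStat_Kind (not shown in this hunk).

static PgStat_StatTabEntry *
fetch_table_stats(Oid dboid, Oid relid)
{
    /* returns NULL if no stats exist for the object */
    return (PgStat_StatTabEntry *)
        pgstat_fetch_entry(PGSTAT_KIND_RELATION, dboid, relid);
}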
TimestampTz ts); +extern bool pgstat_database_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_database_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); /* * Functions in pgstat_function.c */ -extern void pgstat_send_funcstats(void); +extern bool pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); /* @@ -120,23 +557,73 @@ extern void AtEOXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isC extern void AtEOSubXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit, int nestDepth); extern void AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state); extern void PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state); -extern void pgstat_send_tabstats(TimestampTz now, bool disconnect); + +extern bool pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref); + + +/* + * Functions in pgstat_replslot.c + */ + +extern void pgstat_replslot_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); +extern void pgstat_replslot_to_serialized_name_cb(const PgStatShared_Common *tmp, NameData *name); +extern bool pgstat_replslot_from_serialized_name_cb(const NameData *name, PgStat_HashKey *key); + + +/* + * Functions in pgstat_shmem.c + */ + +extern void pgstat_attach_shmem(void); +extern void pgstat_detach_shmem(void); + +extern PgStat_EntryRef *pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, Oid objoid, + bool create, bool *found); +extern bool pgstat_lock_entry(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_unlock_entry(PgStat_EntryRef *entry_ref); +extern bool pgstat_drop_entry(PgStat_Kind kind, Oid dboid, Oid objoid); +extern void pgstat_drop_all_entries(void); +extern PgStat_EntryRef *pgstat_get_entry_ref_locked(PgStat_Kind kind, Oid dboid, Oid objoid, + bool nowait); +extern void pgstat_reset_entry(PgStat_Kind kind, Oid dboid, Oid objoid, TimestampTz ts); +extern void pgstat_reset_entries_of_kind(PgStat_Kind kind, TimestampTz ts); +extern void pgstat_reset_matching_entries(bool (*do_reset) (PgStatShared_HashEntry *, Datum), + Datum match_data, + TimestampTz ts); + +extern void pgstat_request_entry_refs_gc(void); +extern PgStatShared_Common *pgstat_init_entry(PgStat_Kind kind, + PgStatShared_HashEntry *shhashent); /* * Functions in pgstat_slru.c */ -extern void pgstat_send_slru(void); +extern bool pgstat_slru_flush(bool nowait); +extern void pgstat_slru_reset_all_cb(TimestampTz ts); +extern void pgstat_slru_snapshot_cb(void); /* * Functions in pgstat_wal.c */ +extern bool pgstat_flush_wal(bool nowait); extern void pgstat_init_wal(void); extern bool pgstat_have_pending_wal(void); +extern void pgstat_wal_reset_all_cb(TimestampTz ts); +extern void pgstat_wal_snapshot_cb(void); + + +/* + * Functions in pgstat_subscription.c + */ + +extern bool pgstat_subscription_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_subscription_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); /* * Functions in pgstat_xact.c @@ -151,29 +638,145 @@ extern void pgstat_create_transactional(PgStat_Kind kind, Oid dboid, Oid objoid) * Variables in pgstat.c */ -extern pgsocket pgStatSock; +extern PgStat_LocalState pgStatLocal; /* - * Variables in pgstat_database.c + * Variables in pgstat_slru.c */ -extern int pgStatXactCommit; -extern int pgStatXactRollback; +extern bool have_slrustats; /* - * Variables in pgstat_functions.c + * Implementation of inline functions declared above. 
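Writers that update variable-numbered shared entries directly, rather than through pending data, use the pgstat_get_entry_ref_locked()/pgstat_unlock_entry() pair declared above. A sketch of such a direct update of a database counter, assuming PGSTAT_KIND_DATABASE and the n_checksum_failures field, neither of which appears in this hunk; add_checksum_failures() is illustrative only.

static void
add_checksum_failures(Oid dboid, int failurecount)
{
    PgStat_EntryRef *entry_ref;
    PgStatShared_Database *sharedent;

    /* blocks until the entry's lock is held (nowait = false) */
    entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, dboid,
                                            InvalidOid, false);

    sharedent = (PgStatShared_Database *) entry_ref->shared_stats;
    sharedent->stats.n_checksum_failures += failurecount;

    pgstat_unlock_entry(entry_ref);
}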
*/ -extern bool have_function_stats; +/* + * Helpers for changecount manipulation. See comments around struct + * PgBackendStatus for details. + */ + +static inline void +pgstat_begin_changecount_write(uint32 *cc) +{ + Assert((*cc & 1) == 0); + + START_CRIT_SECTION(); + (*cc)++; + pg_write_barrier(); +} + +static inline void +pgstat_end_changecount_write(uint32 *cc) +{ + Assert((*cc & 1) == 1); + + pg_write_barrier(); + + (*cc)++; + + END_CRIT_SECTION(); +} + +static inline uint32 +pgstat_begin_changecount_read(uint32 *cc) +{ + uint32 before_cc = *cc; + + CHECK_FOR_INTERRUPTS(); + + pg_read_barrier(); + + return before_cc; +} + +/* + * Returns true if the read succeeded, false if it needs to be repeated. + */ +static inline bool +pgstat_end_changecount_read(uint32 *cc, uint32 before_cc) +{ + uint32 after_cc; + + pg_read_barrier(); + + after_cc = *cc; + + /* was a write in progress when we started? */ + if (before_cc & 1) + return false; + + /* did writes start and complete while we read? */ + return before_cc == after_cc; +} /* - * Variables in pgstat_relation.c + * helper function for PgStat_KindInfo->snapshot_cb + * PgStat_KindInfo->reset_all_cb callbacks. + * + * Copies out the specified memory area following change-count protocol. */ +static inline void +pgstat_copy_changecounted_stats(void *dst, void *src, size_t len, + uint32 *cc) +{ + uint32 cc_before; -extern bool have_relation_stats; + do + { + cc_before = pgstat_begin_changecount_read(cc); + memcpy(dst, src, len); + } + while (!pgstat_end_changecount_read(cc, cc_before)); +} + +/* helpers for dshash / simplehash hashtables */ +static inline int +pgstat_cmp_hash_key(const void *a, const void *b, size_t size, void *arg) +{ + AssertArg(size == sizeof(PgStat_HashKey) && arg == NULL); + return memcmp(a, b, sizeof(PgStat_HashKey)); +} + +static inline uint32 +pgstat_hash_hash_key(const void *d, size_t size, void *arg) +{ + const PgStat_HashKey *key = (PgStat_HashKey *) d; + uint32 hash; + + AssertArg(size == sizeof(PgStat_HashKey) && arg == NULL); + + hash = murmurhash32(key->kind); + hash = hash_combine(hash, murmurhash32(key->dboid)); + hash = hash_combine(hash, murmurhash32(key->objoid)); + + return hash; +} + +/* + * The length of the data portion of a shared memory stats entry (i.e. without + * transient data such as refcounts, lwlocks, ...). + */ +static inline size_t +pgstat_get_entry_len(PgStat_Kind kind) +{ + return pgstat_get_kind_info(kind)->shared_data_len; +} + +/* + * Returns a pointer to the data portion of a shared memory stats entry. 
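The writer side of the changecount helpers above brackets its updates with pgstat_begin_changecount_write()/pgstat_end_changecount_write(), which also enter and leave a critical section. A sketch of flushing one pending checkpointer counter this way; flush_one_checkpointer_counter() is hypothetical, and the real flush transfers every PendingCheckpointerStats field.

static void
flush_one_checkpointer_counter(void)
{
    PgStatShared_Checkpointer *shmem = &pgStatLocal.shmem->checkpointer;

    /* an odd changecount tells readers a write is in progress */
    pgstat_begin_changecount_write(&shmem->changecount);
    shmem->stats.timed_checkpoints += PendingCheckpointerStats.timed_checkpoints;
    pgstat_end_changecount_write(&shmem->changecount);

    /* pending counter has been transferred; start accumulating afresh */
    PendingCheckpointerStats.timed_checkpoints = 0;
}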
+ */ +static inline void * +pgstat_get_entry_data(PgStat_Kind kind, PgStatShared_Common *entry) +{ + size_t off = pgstat_get_kind_info(kind)->shared_data_off; + + Assert(off != 0 && off < PG_UINT32_MAX); + + return ((char *) (entry)) + off; +} #endif /* PGSTAT_INTERNAL_H */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 121dbbc9a9..eadbd00904 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -246,6 +246,7 @@ typedef struct RelationData */ Oid rd_toastoid; /* Real TOAST table's OID, or InvalidOid */ + bool pgstat_enabled; /* should relation stats be counted */ /* use "struct" here to avoid needing to include pgstat.h: */ struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ } RelationData; diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h index 099f91c61d..c068986d09 100644 --- a/src/include/utils/timeout.h +++ b/src/include/utils/timeout.h @@ -32,6 +32,7 @@ typedef enum TimeoutId STANDBY_LOCK_TIMEOUT, IDLE_IN_TRANSACTION_SESSION_TIMEOUT, IDLE_SESSION_TIMEOUT, + IDLE_STATS_UPDATE_TIMEOUT, CLIENT_CONNECTION_CHECK_TIMEOUT, STARTUP_PROGRESS_TIMEOUT, /* First user-definable timeout reason */ diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index d870c59263..b578e2ec75 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -42,7 +42,6 @@ typedef enum WAIT_EVENT_CHECKPOINTER_MAIN, WAIT_EVENT_LOGICAL_APPLY_MAIN, WAIT_EVENT_LOGICAL_LAUNCHER_MAIN, - WAIT_EVENT_PGSTAT_MAIN, WAIT_EVENT_RECOVERY_WAL_STREAM, WAIT_EVENT_SYSLOGGER_MAIN, WAIT_EVENT_WAL_RECEIVER_MAIN, diff --git a/src/test/modules/worker_spi/worker_spi.c b/src/test/modules/worker_spi/worker_spi.c index 48829df29c..5b541ec47f 100644 --- a/src/test/modules/worker_spi/worker_spi.c +++ b/src/test/modules/worker_spi/worker_spi.c @@ -265,7 +265,7 @@ worker_spi_main(Datum main_arg) PopActiveSnapshot(); CommitTransactionCommand(); debug_query_string = NULL; - pgstat_report_stat(false); + pgstat_report_stat(true); pgstat_report_activity(STATE_IDLE, NULL); } diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index 494fb26237..64e2ff6b29 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -17,6 +17,8 @@ SET enable_indexscan TO on; -- for the moment, we don't want index-only scans here SET enable_indexonlyscan TO off; -- save counters +BEGIN; +SET LOCAL stats_fetch_consistency = snapshot; CREATE TABLE prevstats AS SELECT t.seq_scan, t.seq_tup_read, t.idx_scan, t.idx_tup_fetch, (b.heap_blks_read + b.heap_blks_hit) AS heap_blks, @@ -25,6 +27,7 @@ SELECT t.seq_scan, t.seq_tup_read, t.idx_scan, t.idx_tup_fetch, FROM pg_catalog.pg_stat_user_tables AS t, pg_catalog.pg_statio_user_tables AS b WHERE t.relname='tenk2' AND b.relname='tenk2'; +COMMIT; -- function to wait for counters to advance create function wait_for_stats() returns void as $$ declare @@ -34,6 +37,8 @@ declare updated3 bool; updated4 bool; begin + SET LOCAL stats_fetch_consistency = snapshot; + -- We don't want to wait forever. No timeout suffices if the OS drops our -- stats traffic because an earlier test file left a full UDP buffer. 
-- Hence, don't use PG_TEST_TIMEOUT_DEFAULT, which may be large for @@ -163,6 +168,8 @@ SELECT wait_for_stats(); (1 row) -- check effects +BEGIN; +SET LOCAL stats_fetch_consistency = snapshot; SELECT relname, n_tup_ins, n_tup_upd, n_tup_del, n_live_tup, n_dead_tup FROM pg_stat_user_tables WHERE relname like 'trunc_stats_test%' order by relname; @@ -202,6 +209,7 @@ FROM prevstats AS pr; t (1 row) +COMMIT; DROP TABLE trunc_stats_test, trunc_stats_test1, trunc_stats_test2, trunc_stats_test3, trunc_stats_test4; DROP TABLE prevstats; -- test BRIN index doesn't block HOT update - we include this test here, as it diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql index d0ba1f6d7b..85a253bcd4 100644 --- a/src/test/regress/sql/stats.sql +++ b/src/test/regress/sql/stats.sql @@ -15,6 +15,8 @@ SET enable_indexscan TO on; SET enable_indexonlyscan TO off; -- save counters +BEGIN; +SET LOCAL stats_fetch_consistency = snapshot; CREATE TABLE prevstats AS SELECT t.seq_scan, t.seq_tup_read, t.idx_scan, t.idx_tup_fetch, (b.heap_blks_read + b.heap_blks_hit) AS heap_blks, @@ -23,6 +25,7 @@ SELECT t.seq_scan, t.seq_tup_read, t.idx_scan, t.idx_tup_fetch, FROM pg_catalog.pg_stat_user_tables AS t, pg_catalog.pg_statio_user_tables AS b WHERE t.relname='tenk2' AND b.relname='tenk2'; +COMMIT; -- function to wait for counters to advance create function wait_for_stats() returns void as $$ @@ -33,6 +36,8 @@ declare updated3 bool; updated4 bool; begin + SET LOCAL stats_fetch_consistency = snapshot; + -- We don't want to wait forever. No timeout suffices if the OS drops our -- stats traffic because an earlier test file left a full UDP buffer. -- Hence, don't use PG_TEST_TIMEOUT_DEFAULT, which may be large for @@ -158,6 +163,9 @@ RESET enable_bitmapscan; SELECT wait_for_stats(); -- check effects +BEGIN; +SET LOCAL stats_fetch_consistency = snapshot; + SELECT relname, n_tup_ins, n_tup_upd, n_tup_del, n_live_tup, n_dead_tup FROM pg_stat_user_tables WHERE relname like 'trunc_stats_test%' order by relname; @@ -177,6 +185,8 @@ SELECT st.heap_blks_read + st.heap_blks_hit >= pr.heap_blks + cl.relpages, SELECT pr.snap_ts < pg_stat_get_snapshot_timestamp() as snapshot_newer FROM prevstats AS pr; +COMMIT; + DROP TABLE trunc_stats_test, trunc_stats_test1, trunc_stats_test2, trunc_stats_test3, trunc_stats_test4; DROP TABLE prevstats; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index dc38e16405..566ecbf091 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1933,51 +1933,39 @@ PgFdwPathExtraData PgFdwRelationInfo PgFdwScanState PgIfAddrCallback +PgStatShared_Archiver +PgStatShared_BgWriter +PgStatShared_Checkpointer +PgStatShared_Common +PgStatShared_Database +PgStatShared_Function +PgStatShared_HashEntry +PgStatShared_Relation +PgStatShared_ReplSlot +PgStatShared_SLRU +PgStatShared_Subscription +PgStatShared_Wal PgStat_ArchiverStats PgStat_BackendFunctionEntry +PgStat_BackendSubEntry PgStat_BgWriterStats PgStat_CheckpointerStats PgStat_Counter +PgStat_EntryRef +PgStat_EntryRefHashEntry +PgStat_FetchConsistency PgStat_FunctionCallUsage PgStat_FunctionCounts -PgStat_FunctionEntry -PgStat_GlobalStats +PgStat_HashKey PgStat_Kind -PgStat_Msg -PgStat_MsgAnalyze -PgStat_MsgAnlAncestors -PgStat_MsgArchiver -PgStat_MsgAutovacStart -PgStat_MsgBgWriter -PgStat_MsgCheckpointer -PgStat_MsgChecksumFailure -PgStat_MsgConnect -PgStat_MsgDeadlock -PgStat_MsgDisconnect -PgStat_MsgDropdb -PgStat_MsgDummy -PgStat_MsgFuncpurge -PgStat_MsgFuncstat 
-PgStat_MsgHdr -PgStat_MsgInquiry -PgStat_MsgRecoveryConflict -PgStat_MsgReplSlot -PgStat_MsgResetcounter -PgStat_MsgResetreplslotcounter -PgStat_MsgResetsharedcounter -PgStat_MsgResetsinglecounter -PgStat_MsgResetslrucounter -PgStat_MsgResetsubcounter -PgStat_MsgSLRU -PgStat_MsgSubscriptionDrop -PgStat_MsgSubscriptionError -PgStat_MsgTabpurge -PgStat_MsgTabstat -PgStat_MsgTempFile -PgStat_MsgVacuum -PgStat_MsgWal +PgStat_KindInfo +PgStat_LocalState PgStat_PendingDroppedStatsItem +PgStat_ReplSlotStats PgStat_SLRUStats +PgStat_ShmemControl +PgStat_Snapshot +PgStat_SnapshotEntry PgStat_StatDBEntry PgStat_StatFuncEntry PgStat_StatReplSlotEntry @@ -1985,7 +1973,6 @@ PgStat_StatSubEntry PgStat_StatTabEntry PgStat_SubXactStatus PgStat_TableCounts -PgStat_TableEntry PgStat_TableStatus PgStat_TableXactStatus PgStat_WalStats @@ -2533,7 +2520,6 @@ StartReplicationCmd StartupStatusEnum StatEntry StatExtEntry -StatMsgType StateFileChunk StatisticExtInfo Stats @@ -2647,8 +2633,6 @@ TXNEntryFile TYPCATEGORY T_Action T_WorkerStatus -TabStatHashEntry -TabStatusArray TableAmRoutine TableAttachInfo TableDataInfo @@ -3433,6 +3417,7 @@ pgssHashKey pgssSharedState pgssStoreKind pgssVersion +pgstat_entry_ref_hash_hash pgstat_page pgstattuple_type pgthreadlock_t diff --git a/src/tools/valgrind.supp b/src/tools/valgrind.supp index e3a179d210..4e8c482757 100644 --- a/src/tools/valgrind.supp +++ b/src/tools/valgrind.supp @@ -14,24 +14,6 @@ # These may contain uninitialized padding bytes. Since recipients also ignore # those bytes as padding, this is harmless. -{ - padding_pgstat_send - Memcheck:Param - socketcall.send(msg) - - fun:*send* - fun:pgstat_send -} - -{ - padding_pgstat_sendto - Memcheck:Param - socketcall.sendto(msg) - - fun:*send* - fun:pgstat_send -} - { padding_pgstat_write Memcheck:Param