From 5891c7a8ed8f2d3d577e7eea34dacff12d7b6bbd Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 6 Apr 2022 21:29:46 -0700 Subject: [PATCH] pgstat: store statistics in shared memory. Previously the statistics collector received statistics updates via UDP and shared statistics data by writing them out to temporary files regularly. These files can reach tens of megabytes and are written out up to twice a second. This has repeatedly prevented us from adding additional useful statistics. Now statistics are stored in shared memory. Statistics for variable-numbered objects are stored in a dshash hashtable (backed by dynamic shared memory). Fixed-numbered stats are stored in plain shared memory. The header for pgstat.c contains an overview of the architecture. The stats collector is not needed anymore, remove it. By utilizing the transactional statistics drop infrastructure introduced in a prior commit statistics entries cannot "leak" anymore. Previously leaked statistics were dropped by pgstat_vacuum_stat(), called from [auto-]vacuum. On systems with many small relations pgstat_vacuum_stat() could be quite expensive. Now that replicas drop statistics entries for dropped objects, it is not necessary anymore to reset stats when starting from a cleanly shut down replica. Subsequent commits will perform some further code cleanup, adapt docs and add tests. Bumps PGSTAT_FILE_FORMAT_ID. Author: Kyotaro Horiguchi Author: Andres Freund Author: Melanie Plageman Reviewed-By: Andres Freund Reviewed-By: Thomas Munro Reviewed-By: Justin Pryzby Reviewed-By: "David G. Johnston" Reviewed-By: Tomas Vondra (in a much earlier version) Reviewed-By: Arthur Zakirov (in a much earlier version) Reviewed-By: Antonin Houska (in a much earlier version) Discussion: https://postgr.es/m/20220303021600.hs34ghqcw6zcokdh@alap3.anarazel.de Discussion: https://postgr.es/m/20220308205351.2xcn6k4x5yivcxyd@alap3.anarazel.de Discussion: https://postgr.es/m/20210319235115.y3wz7hpnnrshdyv6@alap3.anarazel.de --- doc/src/sgml/monitoring.sgml | 19 +- src/backend/access/transam/xlog.c | 39 +- src/backend/commands/vacuum.c | 7 - src/backend/commands/vacuumparallel.c | 2 + src/backend/postmaster/autovacuum.c | 197 +- src/backend/postmaster/checkpointer.c | 20 +- src/backend/postmaster/interrupt.c | 5 +- src/backend/postmaster/pgstat.c | 4921 ++++------------- src/backend/postmaster/postmaster.c | 91 +- src/backend/replication/logical/logical.c | 1 - src/backend/replication/logical/tablesync.c | 8 +- src/backend/replication/logical/worker.c | 6 + src/backend/replication/slot.c | 26 +- src/backend/storage/buffer/bufmgr.c | 8 +- src/backend/storage/ipc/ipci.c | 2 + src/backend/storage/lmgr/lwlock.c | 8 +- src/backend/tcop/postgres.c | 31 +- src/backend/utils/activity/Makefile | 1 + src/backend/utils/activity/pgstat_archiver.c | 89 +- src/backend/utils/activity/pgstat_bgwriter.c | 82 +- .../utils/activity/pgstat_checkpointer.c | 93 +- src/backend/utils/activity/pgstat_database.c | 345 +- src/backend/utils/activity/pgstat_function.c | 167 +- src/backend/utils/activity/pgstat_relation.c | 598 +- src/backend/utils/activity/pgstat_replslot.c | 185 +- src/backend/utils/activity/pgstat_shmem.c | 987 ++++ src/backend/utils/activity/pgstat_slru.c | 158 +- .../utils/activity/pgstat_subscription.c | 67 +- src/backend/utils/activity/pgstat_wal.c | 169 +- src/backend/utils/activity/pgstat_xact.c | 37 +- src/backend/utils/activity/wait_event.c | 3 - src/backend/utils/adt/pgstatfuncs.c | 10 +- src/backend/utils/cache/relcache.c | 7 +- 
src/backend/utils/init/globals.c | 1 + src/backend/utils/init/miscinit.c | 3 - src/backend/utils/init/postinit.c | 12 + src/backend/utils/misc/guc.c | 21 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/miscadmin.h | 2 +- src/include/pgstat.h | 679 +-- src/include/storage/lwlock.h | 3 + src/include/utils/pgstat_internal.h | 665 ++- src/include/utils/rel.h | 1 + src/include/utils/timeout.h | 1 + src/include/utils/wait_event.h | 1 - src/test/modules/worker_spi/worker_spi.c | 2 +- src/test/regress/expected/stats.out | 8 + src/test/regress/sql/stats.sql | 10 + src/tools/pgindent/typedefs.list | 63 +- src/tools/valgrind.supp | 18 - 50 files changed, 4395 insertions(+), 5485 deletions(-) create mode 100644 src/backend/utils/activity/pgstat_shmem.c diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 487331c115..24924647b5 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1110,10 +1110,6 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser LogicalLauncherMain Waiting in main loop of logical replication launcher process. - - PgStatMain - Waiting in main loop of statistics collector process. - RecoveryWalStream Waiting in main loop of startup process for WAL to arrive, during @@ -2115,6 +2111,18 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser Waiting to access the list of predicate locks held by serializable transactions. + + PgStatsDSA + Waiting for stats dynamic shared memory allocator access + + + PgStatsHash + Waiting for stats shared memory hash table access + + + PgStatsData + Waiting for shared memory stats data access + SerializableXactHash Waiting to read or update information about serializable @@ -5142,7 +5150,8 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i timestamp with time zone - Returns the timestamp of the current statistics snapshot. + Returns the timestamp of the current statistics snapshot, or NULL if + no statistics snapshot has been taken. diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8ae0a0ba53..c076e48445 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1842,7 +1842,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) WriteRqst.Flush = 0; XLogWrite(WriteRqst, tli, false); LWLockRelease(WALWriteLock); - WalStats.m_wal_buffers_full++; + PendingWalStats.wal_buffers_full++; TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); } /* Re-acquire WALBufMappingLock and retry */ @@ -2200,10 +2200,10 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); - WalStats.m_wal_write_time += INSTR_TIME_GET_MICROSEC(duration); + PendingWalStats.wal_write_time += INSTR_TIME_GET_MICROSEC(duration); } - WalStats.m_wal_write++; + PendingWalStats.wal_write++; if (written <= 0) { @@ -4877,6 +4877,7 @@ StartupXLOG(void) XLogCtlInsert *Insert; CheckPoint checkPoint; bool wasShutdown; + bool didCrash; bool haveTblspcMap; bool haveBackupLabel; XLogRecPtr EndOfLog; @@ -4994,7 +4995,10 @@ StartupXLOG(void) { RemoveTempXlogFiles(); SyncDataDirectory(); + didCrash = true; } + else + didCrash = false; /* * Prepare for WAL recovery if needed. @@ -5106,6 +5110,22 @@ StartupXLOG(void) */ restoreTwoPhaseData(); + /* + * When starting with crash recovery, reset pgstat data - it might not be + * valid. Otherwise restore pgstat data. 
It's safe to do this here, + * because postmaster will not yet have started any other processes. + * + * NB: Restoring replication slot stats relies on slot state to have + * already been restored from disk. + * + * TODO: With a bit of extra work we could just start with a pgstat file + * associated with the checkpoint redo location we're starting from. + */ + if (didCrash) + pgstat_discard_stats(); + else + pgstat_restore_stats(); + lastFullPageWrites = checkPoint.fullPageWrites; RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; @@ -5180,11 +5200,6 @@ StartupXLOG(void) LocalMinRecoveryPointTLI = 0; } - /* - * Reset pgstat data, because it may be invalid after recovery. - */ - pgstat_reset_all(); - /* Check that the GUCs used to generate the WAL allow recovery */ CheckRequiredParameterValues(); @@ -6081,8 +6096,8 @@ LogCheckpointEnd(bool restartpoint) CheckpointStats.ckpt_sync_end_t); /* Accumulate checkpoint timing summary data, in milliseconds. */ - PendingCheckpointerStats.m_checkpoint_write_time += write_msecs; - PendingCheckpointerStats.m_checkpoint_sync_time += sync_msecs; + PendingCheckpointerStats.checkpoint_write_time += write_msecs; + PendingCheckpointerStats.checkpoint_sync_time += sync_msecs; /* * All of the published timing statistics are accounted for. Only @@ -8009,10 +8024,10 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); - WalStats.m_wal_sync_time += INSTR_TIME_GET_MICROSEC(duration); + PendingWalStats.wal_sync_time += INSTR_TIME_GET_MICROSEC(duration); } - WalStats.m_wal_sync++; + PendingWalStats.wal_sync++; } /* diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 04dbbe5530..e0fc7e8d79 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -351,13 +351,6 @@ vacuum(List *relations, VacuumParams *params, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("PROCESS_TOAST required with VACUUM FULL"))); - /* - * Send info about dead objects to the cumulative stats system, unless - * we are in autovacuum --- autovacuum.c does this for itself. - */ - if ((params->options & VACOPT_VACUUM) && !IsAutoVacuumWorkerProcess()) - pgstat_vacuum_stat(); - /* * Create special memory context for cross-transaction storage. * diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 974a29e7a9..6b4f742578 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -28,6 +28,7 @@ #include "access/amapi.h" #include "access/table.h" +#include "access/xact.h" #include "catalog/index.h" #include "commands/vacuum.h" #include "optimizer/paths.h" @@ -35,6 +36,7 @@ #include "storage/bufmgr.h" #include "tcop/tcopprot.h" #include "utils/lsyscache.h" +#include "utils/rel.h" /* * DSM keys for parallel vacuum. Unlike other parallel execution code, since diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index c6d30fa527..f36c40e852 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -44,11 +44,12 @@ * Note that there can be more than one worker in a database concurrently. * They will store the table they are currently vacuuming in shared memory, so * that other workers avoid being blocked waiting for the vacuum lock for that - * table. 
They will also reload the pgstats data just before vacuuming each - * table, to avoid vacuuming a table that was just finished being vacuumed by - * another worker and thus is no longer noted in shared memory. However, - * there is a window (caused by pgstat delay) on which a worker may choose a - * table that was already vacuumed; this is a bug in the current design. + * table. They will also fetch the last time the table was vacuumed from + * pgstats just before vacuuming each table, to avoid vacuuming a table that + * was just finished being vacuumed by another worker and thus is no longer + * noted in shared memory. However, there is a small window (due to not yet + * holding the relation lock) during which a worker may choose a table that was + * already vacuumed; this is a bug in the current design. * * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -129,9 +130,6 @@ int autovacuum_vac_cost_limit; int Log_autovacuum_min_duration = 600000; -/* how long to keep pgstat data in the launcher, in milliseconds */ -#define STATS_READ_DELAY 1000 - /* the minimum allowed time between two awakenings of the launcher */ #define MIN_AUTOVAC_SLEEPTIME 100.0 /* milliseconds */ #define MAX_AUTOVAC_SLEEPTIME 300 /* seconds */ @@ -342,15 +340,11 @@ static void autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy); static AutoVacOpts *extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc); -static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared, - PgStat_StatDBEntry *shared, - PgStat_StatDBEntry *dbentry); static void perform_work_item(AutoVacuumWorkItem *workitem); static void autovac_report_activity(autovac_table *tab); static void autovac_report_workitem(AutoVacuumWorkItem *workitem, const char *nspname, const char *relname); static void avl_sigusr2_handler(SIGNAL_ARGS); -static void autovac_refresh_stats(void); @@ -555,12 +549,6 @@ AutoVacLauncherMain(int argc, char *argv[]) DatabaseListCxt = NULL; dlist_init(&DatabaseList); - /* - * Make sure pgstat also considers our stat data as gone. Note: we - * mustn't use autovac_refresh_stats here. - */ - pgstat_clear_snapshot(); - /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); @@ -611,6 +599,12 @@ AutoVacLauncherMain(int argc, char *argv[]) SetConfigOption("default_transaction_isolation", "read committed", PGC_SUSET, PGC_S_OVERRIDE); + /* + * Even when system is configured to use a different fetch consistency, + * for autovac we always want fresh stats. + */ + SetConfigOption("stats_fetch_consistency", "none", PGC_SUSET, PGC_S_OVERRIDE); + /* * In emergency mode, just start a worker (unless shutdown was requested) * and go away. @@ -963,9 +957,6 @@ rebuild_database_list(Oid newdb) HTAB *dbhash; dlist_iter iter; - /* use fresh stats */ - autovac_refresh_stats(); - newcxt = AllocSetContextCreate(AutovacMemCxt, "Autovacuum database list", ALLOCSET_DEFAULT_SIZES); @@ -1184,9 +1175,6 @@ do_start_worker(void) ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(tmpcxt); - /* use fresh stats */ - autovac_refresh_stats(); - /* Get a list of databases */ dblist = get_database_list(); @@ -1642,6 +1630,12 @@ AutoVacWorkerMain(int argc, char *argv[]) SetConfigOption("synchronous_commit", "local", PGC_SUSET, PGC_S_OVERRIDE); + /* + * Even when system is configured to use a different fetch consistency, + * for autovac we always want fresh stats. 
+ */ + SetConfigOption("stats_fetch_consistency", "none", PGC_SUSET, PGC_S_OVERRIDE); + /* * Get the info about the database we're going to work on. */ @@ -1966,8 +1960,6 @@ do_autovacuum(void) HASHCTL ctl; HTAB *table_toast_map; ListCell *volatile cell; - PgStat_StatDBEntry *shared; - PgStat_StatDBEntry *dbentry; BufferAccessStrategy bstrategy; ScanKeyData key; TupleDesc pg_class_desc; @@ -1986,22 +1978,9 @@ do_autovacuum(void) ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(AutovacMemCxt); - /* - * may be NULL if we couldn't find an entry (only happens if we are - * forcing a vacuum for anti-wrap purposes). - */ - dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); - /* Start a transaction so our commands have one to play into. */ StartTransactionCommand(); - /* - * Clean up any dead statistics entries for this DB. We always want to do - * this exactly once per DB-processing cycle, even if we find nothing - * worth vacuuming in the database. - */ - pgstat_vacuum_stat(); - /* * Compute the multixact age for which freezing is urgent. This is * normally autovacuum_multixact_freeze_max_age, but may be less if we are @@ -2039,9 +2018,6 @@ do_autovacuum(void) /* StartTransactionCommand changed elsewhere */ MemoryContextSwitchTo(AutovacMemCxt); - /* The database hash where pgstat keeps shared relations */ - shared = pgstat_fetch_stat_dbentry(InvalidOid); - classRel = table_open(RelationRelationId, AccessShareLock); /* create a copy so we can use it after closing pg_class */ @@ -2119,8 +2095,8 @@ do_autovacuum(void) /* Fetch reloptions and the pgstat entry for this table */ relopts = extract_autovac_opts(tuple, pg_class_desc); - tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared, - shared, dbentry); + tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, + relid); /* Check if it needs vacuum or analyze */ relation_needs_vacanalyze(relid, relopts, classForm, tabentry, @@ -2203,8 +2179,8 @@ do_autovacuum(void) } /* Fetch the pgstat entry for this table */ - tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared, - shared, dbentry); + tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, + relid); relation_needs_vacanalyze(relid, relopts, classForm, tabentry, effective_multixact_freeze_max_age, @@ -2418,12 +2394,8 @@ do_autovacuum(void) /* * Check whether pgstat data still says we need to vacuum this table. * It could have changed if something else processed the table while - * we weren't looking. - * - * Note: we have a special case in pgstat code to ensure that the - * stats we read are as up-to-date as possible, to avoid the problem - * that somebody just finished vacuuming this table. The window to - * the race condition is not closed but it is very small. + * we weren't looking. This doesn't entirely close the race condition, + * but it is very small. */ MemoryContextSwitchTo(AutovacMemCxt); tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc, @@ -2768,29 +2740,6 @@ extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc) return av; } -/* - * get_pgstat_tabentry_relid - * - * Fetch the pgstat entry of a table, either local to a database or shared. 
- */ -static PgStat_StatTabEntry * -get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared, - PgStat_StatDBEntry *dbentry) -{ - PgStat_StatTabEntry *tabentry = NULL; - - if (isshared) - { - if (PointerIsValid(shared)) - tabentry = hash_search(shared->tables, &relid, - HASH_FIND, NULL); - } - else if (PointerIsValid(dbentry)) - tabentry = hash_search(dbentry->tables, &relid, - HASH_FIND, NULL); - - return tabentry; -} /* * table_recheck_autovac @@ -2812,7 +2761,6 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, autovac_table *tab = NULL; bool wraparound; AutoVacOpts *avopts; - static bool reuse_stats = false; /* fetch the relation's relcache entry */ classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid)); @@ -2836,35 +2784,6 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, avopts = &hentry->ar_reloptions; } - /* - * Reuse the stats to recheck whether a relation needs to be vacuumed or - * analyzed if it was reloaded before and has not been cleared yet. This - * is necessary to avoid frequent refresh of stats, especially when there - * are very large number of relations and the refresh can cause lots of - * overhead. - * - * If we determined that a relation needs to be vacuumed or analyzed, - * based on the old stats, we refresh stats and recheck the necessity - * again. Because a relation may have already been vacuumed or analyzed by - * someone since the last reload of stats. - */ - if (reuse_stats) - { - recheck_relation_needs_vacanalyze(relid, avopts, classForm, - effective_multixact_freeze_max_age, - &dovacuum, &doanalyze, &wraparound); - - /* Quick exit if a relation doesn't need to be vacuumed or analyzed */ - if (!doanalyze && !dovacuum) - { - heap_freetuple(classTup); - return NULL; - } - } - - /* Use fresh stats and recheck again */ - autovac_refresh_stats(); - recheck_relation_needs_vacanalyze(relid, avopts, classForm, effective_multixact_freeze_max_age, &dovacuum, &doanalyze, &wraparound); @@ -2962,21 +2881,6 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, tab->at_dobalance = !(avopts && (avopts->vacuum_cost_limit > 0 || avopts->vacuum_cost_delay > 0)); - - /* - * When we decide to do vacuum or analyze, the existing stats cannot - * be reused in the next cycle because it's cleared at the end of - * vacuum or analyze (by AtEOXact_PgStat()). - */ - reuse_stats = false; - } - else - { - /* - * If neither vacuum nor analyze is necessary, the existing stats is - * not cleared and can be reused in the next cycle. - */ - reuse_stats = true; } heap_freetuple(classTup); @@ -3001,17 +2905,10 @@ recheck_relation_needs_vacanalyze(Oid relid, bool *wraparound) { PgStat_StatTabEntry *tabentry; - PgStat_StatDBEntry *shared = NULL; - PgStat_StatDBEntry *dbentry = NULL; - - if (classForm->relisshared) - shared = pgstat_fetch_stat_dbentry(InvalidOid); - else - dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); /* fetch the pgstat table entry */ - tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared, - shared, dbentry); + tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, + relid); relation_needs_vacanalyze(relid, avopts, classForm, tabentry, effective_multixact_freeze_max_age, @@ -3169,11 +3066,11 @@ relation_needs_vacanalyze(Oid relid, } /* - * If we found the table in the stats hash, and autovacuum is currently - * enabled, make a threshold-based decision whether to vacuum and/or - * analyze. 
If autovacuum is currently disabled, we must be here for - * anti-wraparound vacuuming only, so don't vacuum (or analyze) anything - * that's not being forced. + * If we found stats for the table, and autovacuum is currently enabled, + * make a threshold-based decision whether to vacuum and/or analyze. If + * autovacuum is currently disabled, we must be here for anti-wraparound + * vacuuming only, so don't vacuum (or analyze) anything that's not being + * forced. */ if (PointerIsValid(tabentry) && AutoVacuumingActive()) { @@ -3472,35 +3369,3 @@ AutoVacuumShmemInit(void) else Assert(found); } - -/* - * autovac_refresh_stats - * Refresh pgstats data for an autovacuum process - * - * Cause the next pgstats read operation to obtain fresh data, but throttle - * such refreshing in the autovacuum launcher. This is mostly to avoid - * rereading the pgstats files too many times in quick succession when there - * are many databases. - * - * Note: we avoid throttling in the autovac worker, as it would be - * counterproductive in the recheck logic. - */ -static void -autovac_refresh_stats(void) -{ - if (IsAutoVacuumLauncherProcess()) - { - static TimestampTz last_read = 0; - TimestampTz current_time; - - current_time = GetCurrentTimestamp(); - - if (!TimestampDifferenceExceeds(last_read, current_time, - STATS_READ_DELAY)) - return; - - last_read = current_time; - } - - pgstat_clear_snapshot(); -} diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index e733c70368..c937c39f50 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -212,6 +212,16 @@ CheckpointerMain(void) */ last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); + /* + * Write out stats after shutdown. This needs to be called by exactly one + * process during a normal shutdown, and since checkpointer is shut down + * very late... + * + * Walsenders are shut down after the checkpointer, but currently don't + * report stats. If that changes, we need a more complicated solution. + */ + before_shmem_exit(pgstat_before_server_shutdown, 0); + /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid @@ -358,7 +368,7 @@ CheckpointerMain(void) if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) { do_checkpoint = true; - PendingCheckpointerStats.m_requested_checkpoints++; + PendingCheckpointerStats.requested_checkpoints++; } /* @@ -372,7 +382,7 @@ CheckpointerMain(void) if (elapsed_secs >= CheckPointTimeout) { if (!do_checkpoint) - PendingCheckpointerStats.m_timed_checkpoints++; + PendingCheckpointerStats.timed_checkpoints++; do_checkpoint = true; flags |= CHECKPOINT_CAUSE_TIME; } @@ -569,7 +579,7 @@ HandleCheckpointerInterrupts(void) * updates the statistics, increment the checkpoint request and flush * out pending statistic. 
*/ - PendingCheckpointerStats.m_requested_checkpoints++; + PendingCheckpointerStats.requested_checkpoints++; ShutdownXLOG(0, 0); pgstat_report_checkpointer(); pgstat_report_wal(true); @@ -1262,9 +1272,9 @@ AbsorbSyncRequests(void) LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); /* Transfer stats counts into pending pgstats message */ - PendingCheckpointerStats.m_buf_written_backend + PendingCheckpointerStats.buf_written_backend += CheckpointerShmem->num_backend_writes; - PendingCheckpointerStats.m_buf_fsync_backend + PendingCheckpointerStats.buf_fsync_backend += CheckpointerShmem->num_backend_fsync; CheckpointerShmem->num_backend_writes = 0; diff --git a/src/backend/postmaster/interrupt.c b/src/backend/postmaster/interrupt.c index 3f412dad2e..1aed2e2e99 100644 --- a/src/backend/postmaster/interrupt.c +++ b/src/backend/postmaster/interrupt.c @@ -98,9 +98,8 @@ SignalHandlerForCrashExit(SIGNAL_ARGS) * shut down and exit. * * Typically, this handler would be used for SIGTERM, but some processes use - * other signals. In particular, the checkpointer exits on SIGUSR2, the - * stats collector on SIGQUIT, and the WAL writer exits on either SIGINT - * or SIGTERM. + * other signals. In particular, the checkpointer exits on SIGUSR2, and the + * WAL writer exits on either SIGINT or SIGTERM. * * ShutdownRequestPending should be checked at a convenient place within the * main loop, or else the main loop should call HandleMainLoopInterrupts. diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 20c4629e55..a9f3a7ef49 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -1,100 +1,161 @@ /* ---------- * pgstat.c + * Infrastructure for the cumulative statistics system. * - * All the statistics collector stuff hacked up in one big, ugly file. + * The cumulative statistics system accumulates statistics for different kinds + * of objects. Some kinds of statistics are collected for a fixed number of + * objects (most commonly 1), e.g., checkpointer statistics. Other kinds of + * statistics are collected for a varying number of objects + * (e.g. relations). See PgStat_KindInfo for a list of currently handled + * statistics. * - * TODO: - Separate collector, postmaster and backend stuff - * into different files. + * Statistics are loaded from the filesystem during startup (by the startup + * process), unless preceded by a crash, in which case all stats are + * discarded. They are written out by the checkpointer process just before + * shutting down, except when shutting down in immediate mode. * - * - Add some automatic call for pgstat vacuuming. + * Fixed-numbered stats are stored in plain (non-dynamic) shared memory. * - * - Add a pgstat config column to pg_database, so this - * entire thing can be enabled/disabled on a per db basis. + * Statistics for variable-numbered objects are stored in dynamic shared + * memory and can be found via a dshash hashtable. The statistics counters are + * not part of the dshash entry (PgStatShared_HashEntry) directly, but are + * separately allocated (PgStatShared_HashEntry->body). The separate + * allocation allows different kinds of statistics to be stored in the same + * hashtable without wasting space in PgStatShared_HashEntry. * - * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * Variable-numbered stats are addressed by PgStat_HashKey while running. It + * is not possible to have statistics for an object that cannot be addressed + * that way at runtime. 
A wider identifier can be used when serializing to + * disk (used for replication slot stats). * - * src/backend/postmaster/pgstat.c + * To avoid contention on the shared hashtable, each backend has a + * backend-local hashtable (pgStatEntryRefHash) in front of the shared + * hashtable, containing references (PgStat_EntryRef) to shared hashtable + * entries. The shared hashtable only needs to be accessed when no prior + * reference is found in the local hashtable. Besides pointing to the + * shared hashtable entry (PgStatShared_HashEntry), PgStat_EntryRef also + * contains a pointer to the shared statistics data, as a process-local + * address, to reduce access costs. + * + * The names for structs stored in shared memory are prefixed with + * PgStatShared instead of PgStat. Each stats entry in shared memory is + * protected by a dedicated lwlock. + * + * Most stats updates are first accumulated locally in each process as pending + * entries, then later flushed to shared memory (just after commit, or by + * idle-timeout). This practically eliminates contention on individual stats + * entries. For most kinds of variable-numbered stats, pending data is stored + * in PgStat_EntryRef->pending. All entries with pending data are in the + * pgStatPending list. Pending statistics updates are flushed out by + * pgstat_report_stat(). + * + * The behavior of different kinds of statistics is determined by the kind's + * entry in pgstat_kind_infos; see PgStat_KindInfo for details. + * + * The consistency of read accesses to statistics can be configured using the + * stats_fetch_consistency GUC (see config.sgml and monitoring.sgml for the + * settings). When using PGSTAT_FETCH_CONSISTENCY_CACHE or + * PGSTAT_FETCH_CONSISTENCY_SNAPSHOT, statistics are stored in + * pgStatLocal.snapshot. + * + * To keep things manageable, stats handling is split across several + * files. Infrastructure pieces are in: + * - pgstat.c - this file, to tie it all together + * - pgstat_shmem.c - nearly everything dealing with shared memory, including + * the maintenance of hashtable entries + * - pgstat_xact.c - transactional integration, including the transactional + * creation and dropping of stats entries + * + * Each statistics kind is handled in a dedicated file: + * - pgstat_archiver.c + * - pgstat_bgwriter.c + * - pgstat_checkpointer.c + * - pgstat_database.c + * - pgstat_function.c + * - pgstat_relation.c + * - pgstat_slru.c + * - pgstat_subscription.c + * - pgstat_wal.c + * + * Whenever possible, infrastructure files should not contain code related to + * specific kinds of stats.
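/*
 * [Editorial illustration; not part of this patch.]  A self-contained sketch
 * of the caching pattern described above: a process-local table of
 * references sits in front of a shared hash table, so the shared table (and
 * its locks) only needs to be consulted the first time this process touches
 * a given key; later accesses go through the cached, process-local pointer.
 * All names here are invented for the sketch.  The real structures are
 * PgStat_EntryRef, PgStatShared_HashEntry and the dshash table managed by
 * pgstat_shmem.c, and the real local cache is a hash table
 * (pgStatEntryRefHash), not the direct-mapped array used below.
 */
#include <stdint.h>

#define LOCAL_CACHE_SIZE	128
#define SHARED_TABLE_SIZE	1024

typedef struct SketchSharedEntry
{
	uint64_t	key;
	uint64_t	counter;		/* stands in for the stats payload */
} SketchSharedEntry;

typedef struct SketchLocalRef
{
	uint64_t	key;
	SketchSharedEntry *shared;	/* process-local pointer to "shared" entry */
	int			valid;
} SketchLocalRef;

/* stand-ins for the dshash table and for pgStatEntryRefHash */
static SketchSharedEntry sketch_shared[SHARED_TABLE_SIZE];
static SketchLocalRef sketch_local[LOCAL_CACHE_SIZE];

/*
 * In PostgreSQL this step is a locked dshash lookup that may create the
 * entry; here it is just a direct-mapped slot, which is enough to show the
 * control flow.
 */
static SketchSharedEntry *
sketch_shared_lookup(uint64_t key)
{
	SketchSharedEntry *entry = &sketch_shared[key % SHARED_TABLE_SIZE];

	entry->key = key;
	return entry;
}

static SketchSharedEntry *
sketch_get_entry(uint64_t key)
{
	SketchLocalRef *ref = &sketch_local[key % LOCAL_CACHE_SIZE];

	/* fast path: already resolved by this process, no shared access needed */
	if (ref->valid && ref->key == key)
		return ref->shared;

	/* slow path: consult the shared table once, then cache the reference */
	ref->key = key;
	ref->shared = sketch_shared_lookup(key);
	ref->valid = 1;

	return ref->shared;
}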
+ * + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/postmaster/pgstat.c * ---------- */ #include "postgres.h" #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_SYS_SELECT_H -#include -#endif -#include "access/heapam.h" -#include "access/htup_details.h" -#include "access/tableam.h" #include "access/transam.h" #include "access/xact.h" -#include "catalog/catalog.h" -#include "catalog/pg_database.h" -#include "catalog/pg_proc.h" -#include "catalog/pg_subscription.h" -#include "common/ip.h" -#include "libpq/libpq.h" -#include "libpq/pqsignal.h" -#include "mb/pg_wchar.h" -#include "miscadmin.h" +#include "lib/dshash.h" #include "pgstat.h" -#include "postmaster/autovacuum.h" -#include "postmaster/fork_process.h" -#include "postmaster/interrupt.h" -#include "postmaster/postmaster.h" -#include "replication/slot.h" -#include "replication/walsender.h" -#include "storage/backendid.h" -#include "storage/dsm.h" +#include "port/atomics.h" #include "storage/fd.h" #include "storage/ipc.h" -#include "storage/latch.h" -#include "storage/lmgr.h" +#include "storage/lwlock.h" #include "storage/pg_shmem.h" -#include "storage/proc.h" -#include "storage/procsignal.h" -#include "utils/builtins.h" +#include "storage/shmem.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/pgstat_internal.h" -#include "utils/ps_status.h" -#include "utils/rel.h" -#include "utils/snapmgr.h" #include "utils/timestamp.h" /* ---------- * Timer definitions. + * + * In milliseconds. * ---------- */ -#define PGSTAT_RETRY_DELAY 10 /* How long to wait between checks for a - * new file; in milliseconds. */ +/* minimum interval non-forced stats flushes.*/ +#define PGSTAT_MIN_INTERVAL 1000 +/* how long until to block flushing pending stats updates */ +#define PGSTAT_MAX_INTERVAL 60000 +/* when to call pgstat_report_stat() again, even when idle */ +#define PGSTAT_IDLE_INTERVAL 10000 -#define PGSTAT_MAX_WAIT_TIME 10000 /* Maximum time to wait for a stats - * file update; in milliseconds. */ +/* ---------- + * Initial size hints for the hash tables used in statistics. + * ---------- + */ -#define PGSTAT_INQ_INTERVAL 640 /* How often to ping the collector for a - * new file; in milliseconds. */ +#define PGSTAT_SNAPSHOT_HASH_SIZE 512 -#define PGSTAT_RESTART_INTERVAL 60 /* How often to attempt to restart a - * failed statistics collector; in - * seconds. */ -#define PGSTAT_POLL_LOOP_COUNT (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY) -#define PGSTAT_INQ_LOOP_COUNT (PGSTAT_INQ_INTERVAL / PGSTAT_RETRY_DELAY) +/* hash table for statistics snapshots entry */ +typedef struct PgStat_SnapshotEntry +{ + PgStat_HashKey key; + char status; /* for simplehash use */ + void *data; /* the stats data itself */ +} PgStat_SnapshotEntry; -/* Minimum receive buffer size for the collector's socket. 
*/ -#define PGSTAT_MIN_RCVBUF (100 * 1024) + +/* ---------- + * Backend-local Hash Table Definitions + * ---------- + */ + +/* for stats snapshot entries */ +#define SH_PREFIX pgstat_snapshot +#define SH_ELEMENT_TYPE PgStat_SnapshotEntry +#define SH_KEY_TYPE PgStat_HashKey +#define SH_KEY key +#define SH_HASH_KEY(tb, key) \ + pgstat_hash_hash_key(&key, sizeof(PgStat_HashKey), NULL) +#define SH_EQUAL(tb, a, b) \ + pgstat_cmp_hash_key(&a, &b, sizeof(PgStat_HashKey), NULL) == 0 +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" /* ---------- @@ -102,63 +163,18 @@ * ---------- */ -#ifdef EXEC_BACKEND -static pid_t pgstat_forkexec(void); -#endif +static void pgstat_write_statsfile(void); +static void pgstat_read_statsfile(void); -NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn(); +static void pgstat_reset_after_failure(TimestampTz ts); -static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create); -static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, - Oid tableoid, bool create); -static PgStat_StatSubEntry *pgstat_get_subscription_entry(Oid subid, bool create); -static void pgstat_reset_subscription(PgStat_StatSubEntry *subentry, TimestampTz ts); -static void pgstat_write_statsfiles(bool permanent, bool allDbs); -static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent); -static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep); -static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, - bool permanent); -static void backend_read_statsfile(void); +static bool pgstat_flush_pending_entries(bool nowait); -static bool pgstat_write_statsfile_needed(void); -static bool pgstat_db_requested(Oid databaseid); +static void pgstat_prep_snapshot(void); +static void pgstat_build_snapshot(void); +static void pgstat_build_snapshot_fixed(PgStat_Kind kind); -static PgStat_StatReplSlotEntry *pgstat_get_replslot_entry(NameData name, bool create_it); -static void pgstat_reset_replslot_entry(PgStat_StatReplSlotEntry *slotstats, TimestampTz ts); - -static HTAB *pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid); - -static void pgstat_setup_memcxt(void); - -static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len); -static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len); -static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len); -static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len); -static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len); -static void pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len); -static void pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len); -static void pgstat_recv_resetslrucounter(PgStat_MsgResetslrucounter *msg, int len); -static void pgstat_recv_resetreplslotcounter(PgStat_MsgResetreplslotcounter *msg, int len); -static void pgstat_recv_resetsubcounter(PgStat_MsgResetsubcounter *msg, int len); -static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len); -static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len); -static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len); -static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len); -static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len); -static void pgstat_recv_checkpointer(PgStat_MsgCheckpointer *msg, int len); -static void pgstat_recv_wal(PgStat_MsgWal *msg, int len); -static void 
pgstat_recv_slru(PgStat_MsgSLRU *msg, int len); -static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len); -static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len); -static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len); -static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len); -static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len); -static void pgstat_recv_connect(PgStat_MsgConnect *msg, int len); -static void pgstat_recv_disconnect(PgStat_MsgDisconnect *msg, int len); -static void pgstat_recv_replslot(PgStat_MsgReplSlot *msg, int len); -static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len); -static void pgstat_recv_subscription_drop(PgStat_MsgSubscriptionDrop *msg, int len); -static void pgstat_recv_subscription_error(PgStat_MsgSubscriptionError *msg, int len); +static inline bool pgstat_is_kind_valid(int ikind); /* ---------- @@ -167,6 +183,7 @@ static void pgstat_recv_subscription_error(PgStat_MsgSubscriptionError *msg, int */ bool pgstat_track_counts = false; +int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_NONE; /* ---------- @@ -184,44 +201,33 @@ char *pgstat_stat_tmpname = NULL; * ---------- */ -pgsocket pgStatSock = PGINVALID_SOCKET; +PgStat_LocalState pgStatLocal; /* ---------- * Local data + * + * NB: There should be only variables related to stats infrastructure here, + * not for specific kinds of stats. * ---------- */ -static struct sockaddr_storage pgStatAddr; +/* + * Memory contexts containing the pgStatEntryRefHash table, the + * pgStatSharedRef entries, and pending data respectively. Mostly to make it + * easier to track / attribute memory usage. + */ -static time_t last_pgstat_start_time; - -static bool pgStatRunningInCollector = false; +static MemoryContext pgStatPendingContext = NULL; /* - * Info about current "snapshot" of stats file + * Backend local list of PgStat_EntryRef with unflushed pending stats. + * + * Newly pending entries should only ever be added to the end of the list, + * otherwise pgstat_flush_pending_entries() might not see them immediately. */ -static MemoryContext pgStatLocalContext = NULL; -static HTAB *pgStatDBHash = NULL; +static dlist_head pgStatPending = DLIST_STATIC_INIT(pgStatPending); -/* - * Cluster wide statistics, kept in the stats collector. - * Contains statistics that are not collected per database - * or per table. - */ -static PgStat_ArchiverStats archiverStats; -static PgStat_GlobalStats globalStats; -static PgStat_WalStats walStats; -static PgStat_SLRUStats slruStats[SLRU_NUM_ELEMENTS]; -static HTAB *replSlotStatHash = NULL; -static HTAB *subscriptionStatHash = NULL; - -/* - * List of OIDs of databases we need to write out. If an entry is InvalidOid, - * it means to write only the shared-catalog stats ("DB 0"); otherwise, we - * will write both that DB's data and the shared stats. - */ -static List *pending_write_requests = NIL; /* * For assertions that check pgstat is not used before initialization / after @@ -233,455 +239,234 @@ static bool pgstat_is_shutdown = false; #endif +/* + * The different kinds of statistics. + * + * If reasonably possible, handling specific to one kind of stats should go + * through this abstraction, rather than making more of pgstat.c aware. + * + * See comments for struct PgStat_KindInfo for details about the individual + * fields. + * + * XXX: It'd be nicer to define this outside of this file. 
But there doesn't + * seem to be a great way of doing that, given the split across multiple + * files. + */ +static const PgStat_KindInfo pgstat_kind_infos[PGSTAT_NUM_KINDS] = { + + /* stats kinds for variable-numbered objects */ + + [PGSTAT_KIND_DATABASE] = { + .name = "database", + + .fixed_amount = false, + /* so pg_stat_database entries can be seen in all databases */ + .accessed_across_databases = true, + + .shared_size = sizeof(PgStatShared_Database), + .shared_data_off = offsetof(PgStatShared_Database, stats), + .shared_data_len = sizeof(((PgStatShared_Database *) 0)->stats), + .pending_size = sizeof(PgStat_StatDBEntry), + + .flush_pending_cb = pgstat_database_flush_cb, + .reset_timestamp_cb = pgstat_database_reset_timestamp_cb, + }, + + [PGSTAT_KIND_RELATION] = { + .name = "relation", + + .fixed_amount = false, + + .shared_size = sizeof(PgStatShared_Relation), + .shared_data_off = offsetof(PgStatShared_Relation, stats), + .shared_data_len = sizeof(((PgStatShared_Relation *) 0)->stats), + .pending_size = sizeof(PgStat_TableStatus), + + .flush_pending_cb = pgstat_relation_flush_cb, + .delete_pending_cb = pgstat_relation_delete_pending_cb, + }, + + [PGSTAT_KIND_FUNCTION] = { + .name = "function", + + .fixed_amount = false, + + .shared_size = sizeof(PgStatShared_Function), + .shared_data_off = offsetof(PgStatShared_Function, stats), + .shared_data_len = sizeof(((PgStatShared_Function *) 0)->stats), + .pending_size = sizeof(PgStat_BackendFunctionEntry), + + .flush_pending_cb = pgstat_function_flush_cb, + }, + + [PGSTAT_KIND_REPLSLOT] = { + .name = "replslot", + + .fixed_amount = false, + + .accessed_across_databases = true, + .named_on_disk = true, + + .shared_size = sizeof(PgStatShared_ReplSlot), + .shared_data_off = offsetof(PgStatShared_ReplSlot, stats), + .shared_data_len = sizeof(((PgStatShared_ReplSlot *) 0)->stats), + + .reset_timestamp_cb = pgstat_replslot_reset_timestamp_cb, + .to_serialized_name = pgstat_replslot_to_serialized_name_cb, + .from_serialized_name = pgstat_replslot_from_serialized_name_cb, + }, + + [PGSTAT_KIND_SUBSCRIPTION] = { + .name = "subscription", + + .fixed_amount = false, + /* so pg_stat_subscription_stats entries can be seen in all databases */ + .accessed_across_databases = true, + + .shared_size = sizeof(PgStatShared_Subscription), + .shared_data_off = offsetof(PgStatShared_Subscription, stats), + .shared_data_len = sizeof(((PgStatShared_Subscription *) 0)->stats), + .pending_size = sizeof(PgStat_BackendSubEntry), + + .flush_pending_cb = pgstat_subscription_flush_cb, + .reset_timestamp_cb = pgstat_subscription_reset_timestamp_cb, + }, + + + /* stats for fixed-numbered (mostly 1) objects */ + + [PGSTAT_KIND_ARCHIVER] = { + .name = "archiver", + + .fixed_amount = true, + + .reset_all_cb = pgstat_archiver_reset_all_cb, + .snapshot_cb = pgstat_archiver_snapshot_cb, + }, + + [PGSTAT_KIND_BGWRITER] = { + .name = "bgwriter", + + .fixed_amount = true, + + .reset_all_cb = pgstat_bgwriter_reset_all_cb, + .snapshot_cb = pgstat_bgwriter_snapshot_cb, + }, + + [PGSTAT_KIND_CHECKPOINTER] = { + .name = "checkpointer", + + .fixed_amount = true, + + .reset_all_cb = pgstat_checkpointer_reset_all_cb, + .snapshot_cb = pgstat_checkpointer_snapshot_cb, + }, + + [PGSTAT_KIND_SLRU] = { + .name = "slru", + + .fixed_amount = true, + + .reset_all_cb = pgstat_slru_reset_all_cb, + .snapshot_cb = pgstat_slru_snapshot_cb, + }, + + [PGSTAT_KIND_WAL] = { + .name = "wal", + + .fixed_amount = true, + + .reset_all_cb = pgstat_wal_reset_all_cb, + .snapshot_cb = pgstat_wal_snapshot_cb, + }, 
+}; + + /* ------------------------------------------------------------ - * Public functions called from postmaster follow + * Functions managing the state of the stats system for all backends. * ------------------------------------------------------------ */ /* - * Called from postmaster at startup. Create the resources required - * by the statistics collector process. If unable to do so, do not - * fail --- better to let the postmaster start with stats collection - * disabled. + * Read on-disk stats into memory at server start. + * + * Should only be called by the startup process or in single user mode. */ void -pgstat_init(void) +pgstat_restore_stats(void) +{ + pgstat_read_statsfile(); +} + +/* + * Remove the stats file. This is currently used only if WAL recovery is + * needed after a crash. + * + * Should only be called by the startup process or in single user mode. + */ +void +pgstat_discard_stats(void) { - socklen_t alen; - struct addrinfo *addrs = NULL, - *addr, - hints; int ret; - fd_set rset; - struct timeval tv; - char test_byte; - int sel_res; - int tries = 0; -#define TESTBYTEVAL ((char) 199) + /* NB: this needs to be done even in single user mode */ - /* - * This static assertion verifies that we didn't mess up the calculations - * involved in selecting maximum payload sizes for our UDP messages. - * Because the only consequence of overrunning PGSTAT_MAX_MSG_SIZE would - * be silent performance loss from fragmentation, it seems worth having a - * compile-time cross-check that we didn't. - */ - StaticAssertStmt(sizeof(PgStat_Msg) <= PGSTAT_MAX_MSG_SIZE, - "maximum stats message size exceeds PGSTAT_MAX_MSG_SIZE"); - - /* - * Create the UDP socket for sending and receiving statistic messages - */ - hints.ai_flags = AI_PASSIVE; - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_DGRAM; - hints.ai_protocol = 0; - hints.ai_addrlen = 0; - hints.ai_addr = NULL; - hints.ai_canonname = NULL; - hints.ai_next = NULL; - ret = pg_getaddrinfo_all("localhost", NULL, &hints, &addrs); - if (ret || !addrs) + ret = unlink(PGSTAT_STAT_PERMANENT_FILENAME); + if (ret != 0) { - ereport(LOG, - (errmsg("could not resolve \"localhost\": %s", - gai_strerror(ret)))); - goto startup_failed; - } - - /* - * On some platforms, pg_getaddrinfo_all() may return multiple addresses - * only one of which will actually work (eg, both IPv6 and IPv4 addresses - * when kernel will reject IPv6). Worse, the failure may occur at the - * bind() or perhaps even connect() stage. So we must loop through the - * results till we find a working combination. We will generate LOG - * messages, but no error, for bogus combinations. - */ - for (addr = addrs; addr; addr = addr->ai_next) - { -#ifdef HAVE_UNIX_SOCKETS - /* Ignore AF_UNIX sockets, if any are returned. */ - if (addr->ai_family == AF_UNIX) - continue; -#endif - - if (++tries > 1) - ereport(LOG, - (errmsg("trying another address for the statistics collector"))); - - /* - * Create the socket. - */ - if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) == PGINVALID_SOCKET) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not create socket for statistics collector: %m"))); - continue; - } - - /* - * Bind it to a kernel assigned port on localhost and get the assigned - * port via getsockname(). 
- */ - if (bind(pgStatSock, addr->ai_addr, addr->ai_addrlen) < 0) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not bind socket for statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - alen = sizeof(pgStatAddr); - if (getsockname(pgStatSock, (struct sockaddr *) &pgStatAddr, &alen) < 0) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not get address of socket for statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - /* - * Connect the socket to its own address. This saves a few cycles by - * not having to respecify the target address on every send. This also - * provides a kernel-level check that only packets from this same - * address will be received. - */ - if (connect(pgStatSock, (struct sockaddr *) &pgStatAddr, alen) < 0) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not connect socket for statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - /* - * Try to send and receive a one-byte test message on the socket. This - * is to catch situations where the socket can be created but will not - * actually pass data (for instance, because kernel packet filtering - * rules prevent it). - */ - test_byte = TESTBYTEVAL; - -retry1: - if (send(pgStatSock, &test_byte, 1, 0) != 1) - { - if (errno == EINTR) - goto retry1; /* if interrupted, just retry */ - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not send test message on socket for statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - /* - * There could possibly be a little delay before the message can be - * received. We arbitrarily allow up to half a second before deciding - * it's broken. - */ - for (;;) /* need a loop to handle EINTR */ - { - FD_ZERO(&rset); - FD_SET(pgStatSock, &rset); - - tv.tv_sec = 0; - tv.tv_usec = 500000; - sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv); - if (sel_res >= 0 || errno != EINTR) - break; - } - if (sel_res < 0) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("select() failed in statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - if (sel_res == 0 || !FD_ISSET(pgStatSock, &rset)) - { - /* - * This is the case we actually think is likely, so take pains to - * give a specific message for it. - * - * errno will not be set meaningfully here, so don't use it. - */ - ereport(LOG, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("test message did not get through on socket for statistics collector"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - test_byte++; /* just make sure variable is changed */ - -retry2: - if (recv(pgStatSock, &test_byte, 1, 0) != 1) - { - if (errno == EINTR) - goto retry2; /* if interrupted, just retry */ - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not receive test message on socket for statistics collector: %m"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - if (test_byte != TESTBYTEVAL) /* strictly paranoia ... */ - { - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("incorrect test message transmission on socket for statistics collector"))); - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - continue; - } - - /* If we get here, we have a working socket */ - break; - } - - /* Did we find a working address? 
*/ - if (!addr || pgStatSock == PGINVALID_SOCKET) - goto startup_failed; - - /* - * Set the socket to non-blocking IO. This ensures that if the collector - * falls behind, statistics messages will be discarded; backends won't - * block waiting to send messages to the collector. - */ - if (!pg_set_noblock(pgStatSock)) - { - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not set statistics collector socket to nonblocking mode: %m"))); - goto startup_failed; - } - - /* - * Try to ensure that the socket's receive buffer is at least - * PGSTAT_MIN_RCVBUF bytes, so that it won't easily overflow and lose - * data. Use of UDP protocol means that we are willing to lose data under - * heavy load, but we don't want it to happen just because of ridiculously - * small default buffer sizes (such as 8KB on older Windows versions). - */ - { - int old_rcvbuf; - int new_rcvbuf; - socklen_t rcvbufsize = sizeof(old_rcvbuf); - - if (getsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF, - (char *) &old_rcvbuf, &rcvbufsize) < 0) - { - ereport(LOG, - (errmsg("%s(%s) failed: %m", "getsockopt", "SO_RCVBUF"))); - /* if we can't get existing size, always try to set it */ - old_rcvbuf = 0; - } - - new_rcvbuf = PGSTAT_MIN_RCVBUF; - if (old_rcvbuf < new_rcvbuf) - { - if (setsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF, - (char *) &new_rcvbuf, sizeof(new_rcvbuf)) < 0) - ereport(LOG, - (errmsg("%s(%s) failed: %m", "setsockopt", "SO_RCVBUF"))); - } - } - - pg_freeaddrinfo_all(hints.ai_family, addrs); - - /* Now that we have a long-lived socket, tell fd.c about it. */ - ReserveExternalFD(); - - return; - -startup_failed: - ereport(LOG, - (errmsg("disabling statistics collector for lack of working socket"))); - - if (addrs) - pg_freeaddrinfo_all(hints.ai_family, addrs); - - if (pgStatSock != PGINVALID_SOCKET) - closesocket(pgStatSock); - pgStatSock = PGINVALID_SOCKET; - - /* - * Adjust GUC variables to suppress useless activity, and for debugging - * purposes (seeing track_counts off is a clue that we failed here). We - * use PGC_S_OVERRIDE because there is no point in trying to turn it back - * on from postgresql.conf without a restart. - */ - SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE); -} - -/* - * subroutine for pgstat_reset_all - */ -static void -pgstat_reset_remove_files(const char *directory) -{ - DIR *dir; - struct dirent *entry; - char fname[MAXPGPATH * 2]; - - dir = AllocateDir(directory); - while ((entry = ReadDir(dir, directory)) != NULL) - { - int nchars; - Oid tmp_oid; - - /* - * Skip directory entries that don't match the file names we write. - * See get_dbstat_filename for the database-specific pattern. - */ - if (strncmp(entry->d_name, "global.", 7) == 0) - nchars = 7; + if (errno == ENOENT) + elog(DEBUG2, + "didn't need to unlink permanent stats file \"%s\" - didn't exist", + PGSTAT_STAT_PERMANENT_FILENAME); else - { - nchars = 0; - (void) sscanf(entry->d_name, "db_%u.%n", - &tmp_oid, &nchars); - if (nchars <= 0) - continue; - /* %u allows leading whitespace, so reject that */ - if (strchr("0123456789", entry->d_name[3]) == NULL) - continue; - } - - if (strcmp(entry->d_name + nchars, "tmp") != 0 && - strcmp(entry->d_name + nchars, "stat") != 0) - continue; - - snprintf(fname, sizeof(fname), "%s/%s", directory, - entry->d_name); - unlink(fname); - } - FreeDir(dir); -} - -/* - * Remove the stats files. This is currently used only if WAL - * recovery is needed after a crash. 
- */ -void -pgstat_reset_all(void) -{ - pgstat_reset_remove_files(pgstat_stat_directory); - pgstat_reset_remove_files(PGSTAT_STAT_PERMANENT_DIRECTORY); -} - -#ifdef EXEC_BACKEND - -/* - * Format up the arglist for, then fork and exec, statistics collector process - */ -static pid_t -pgstat_forkexec(void) -{ - char *av[10]; - int ac = 0; - - av[ac++] = "postgres"; - av[ac++] = "--forkcol"; - av[ac++] = NULL; /* filled in by postmaster_forkexec */ - - av[ac] = NULL; - Assert(ac < lengthof(av)); - - return postmaster_forkexec(ac, av); -} -#endif /* EXEC_BACKEND */ - - -/* - * Called from postmaster at startup or after an existing collector - * died. Attempt to fire up a fresh statistics collector. - * - * Returns PID of child process, or 0 if fail. - * - * Note: if fail, we will be called again from the postmaster main loop. - */ -int -pgstat_start(void) -{ - time_t curtime; - pid_t pgStatPid; - - /* - * Check that the socket is there, else pgstat_init failed and we can do - * nothing useful. - */ - if (pgStatSock == PGINVALID_SOCKET) - return 0; - - /* - * Do nothing if too soon since last collector start. This is a safety - * valve to protect against continuous respawn attempts if the collector - * is dying immediately at launch. Note that since we will be re-called - * from the postmaster main loop, we will get another chance later. - */ - curtime = time(NULL); - if ((unsigned int) (curtime - last_pgstat_start_time) < - (unsigned int) PGSTAT_RESTART_INTERVAL) - return 0; - last_pgstat_start_time = curtime; - - /* - * Okay, fork off the collector. - */ -#ifdef EXEC_BACKEND - switch ((pgStatPid = pgstat_forkexec())) -#else - switch ((pgStatPid = fork_process())) -#endif - { - case -1: ereport(LOG, - (errmsg("could not fork statistics collector: %m"))); - return 0; - -#ifndef EXEC_BACKEND - case 0: - /* in postmaster child ... */ - InitPostmasterChild(); - - /* Close the postmaster's sockets */ - ClosePostmasterPorts(false); - - /* Drop our connection to postmaster's shared memory, as well */ - dsm_detach_all(); - PGSharedMemoryDetach(); - - PgstatCollectorMain(0, NULL); - break; -#endif - - default: - return (int) pgStatPid; + (errcode_for_file_access(), + errmsg("could not unlink permanent statistics file \"%s\": %m", + PGSTAT_STAT_PERMANENT_FILENAME))); + } + else + { + ereport(DEBUG2, + (errcode_for_file_access(), + errmsg("unlinked permanent statistics file \"%s\"", + PGSTAT_STAT_PERMANENT_FILENAME))); } - - /* shouldn't get here */ - return 0; } +/* + * pgstat_before_server_shutdown() needs to be called by exactly one process + * during regular server shutdowns. Otherwise all stats will be lost. + * + * We currently only write out stats for proc_exit(0). We might want to change + * that at some point... But right now pgstat_discard_stats() would be called + * during the start after a disorderly shutdown, anyway. + */ void -allow_immediate_pgstat_restart(void) +pgstat_before_server_shutdown(int code, Datum arg) { - last_pgstat_start_time = 0; + Assert(pgStatLocal.shmem != NULL); + Assert(!pgStatLocal.shmem->is_shutdown); + + /* + * Stats should only be reported after pgstat_initialize() and before + * pgstat_shutdown(). This is a convenient point to catch most violations + * of this rule. + */ + Assert(pgstat_is_initialized && !pgstat_is_shutdown); + + /* flush out our own pending changes before writing out */ + pgstat_report_stat(true); + + /* + * Only write out file during normal shutdown. 
Don't even signal that + * we've shut down during irregular shutdowns, because the shutdown + * sequence isn't coordinated to ensure this backend shuts down last. + */ + if (code == 0) + { + pgStatLocal.shmem->is_shutdown = true; + pgstat_write_statsfile(); + } } @@ -701,6 +486,7 @@ static void pgstat_shutdown_hook(int code, Datum arg) { Assert(!pgstat_is_shutdown); + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); /* * If we got as far as discovering our own database ID, we can flush out @@ -709,7 +495,15 @@ pgstat_shutdown_hook(int code, Datum arg) * failed backend starts might never get counted.) */ if (OidIsValid(MyDatabaseId)) - pgstat_report_stat(true); + pgstat_report_disconnect(MyDatabaseId); + + pgstat_report_stat(true); + + /* there shouldn't be any pending changes left */ + Assert(dlist_is_empty(&pgStatPending)); + dlist_init(&pgStatPending); + + pgstat_detach_shmem(); #ifdef USE_ASSERT_CHECKING pgstat_is_shutdown = true; @@ -727,6 +521,8 @@ pgstat_initialize(void) { Assert(!pgstat_is_initialized); + pgstat_attach_shmem(); + pgstat_init_wal(); /* Set up a process-exit hook to clean up */ @@ -745,331 +541,119 @@ /* * Must be called by processes that performs DML: tcop/postgres.c, logical - * receiver processes, SPI worker, etc. to send the so far collected - * per-table and function usage statistics to the collector. Note that this - * is called only when not within a transaction, so it is fair to use - * transaction stop time as an approximation of current time. + * receiver processes, SPI worker, etc. to flush pending statistics updates to + * shared memory. * - * "disconnect" is "true" only for the last call before the backend - * exits. This makes sure that no data is lost and that interrupted - * sessions are reported correctly. + * Unless called with 'force', pending stats updates are flushed at most once + * per PGSTAT_MIN_INTERVAL (1000ms). When not forced, stats flushes do not + * block on lock acquisition, except if stats updates have been pending for + * longer than PGSTAT_MAX_INTERVAL (60000ms). + * + * Whenever pending stats updates remain at the end of pgstat_report_stat(), a + * suggested idle timeout is returned. Currently this is always + * PGSTAT_IDLE_INTERVAL (10000ms). Callers can use the returned time to set up + * a timeout after which to call pgstat_report_stat(true), but are not + * required to do so. + * + * Note that this is called only when not within a transaction, so it is fair + * to use transaction stop time as an approximation of current time. */ -void -pgstat_report_stat(bool disconnect) +long +pgstat_report_stat(bool force) { - static TimestampTz last_report = 0; - + static TimestampTz pending_since = 0; + static TimestampTz last_flush = 0; + bool partial_flush; TimestampTz now; + bool nowait; pgstat_assert_is_up(); + Assert(!IsTransactionBlock()); + + /* Don't expend a clock check if nothing to do */ + if (dlist_is_empty(&pgStatPending) && + !have_slrustats && + !pgstat_have_pending_wal()) + { + Assert(pending_since == 0); + return 0; + } /* - * Don't expend a clock check if nothing to do. + * There should never be stats to report once stats are shut down. Can't + * assert that before the checks above, as there is an unconditional + * pgstat_report_stat() call in pgstat_shutdown_hook() - which at least + * the process that ran pgstat_before_server_shutdown() will still call.
*/ - if (!have_relation_stats && - pgStatXactCommit == 0 && pgStatXactRollback == 0 && - !pgstat_have_pending_wal() && - !have_function_stats && !disconnect) - return; + Assert(!pgStatLocal.shmem->is_shutdown); - /* - * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL - * msec since we last sent one, or the backend is about to exit. - */ now = GetCurrentTransactionStopTimestamp(); - if (!disconnect && - !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL)) - return; - last_report = now; + if (!force) + { + if (pending_since > 0 && + TimestampDifferenceExceeds(pending_since, now, PGSTAT_MAX_INTERVAL)) + { + /* don't keep pending updates longer than PGSTAT_MAX_INTERVAL */ + force = true; + } + else if (last_flush > 0 && + !TimestampDifferenceExceeds(last_flush, now, PGSTAT_MIN_INTERVAL)) + { + /* don't flush too frequently */ + if (pending_since == 0) + pending_since = now; - if (disconnect) - pgstat_report_disconnect(MyDatabaseId); + return PGSTAT_IDLE_INTERVAL; + } + } - /* First, send relation statistics */ - pgstat_send_tabstats(now, disconnect); + pgstat_update_dbstats(now); - /* Now, send function statistics */ - pgstat_send_funcstats(); + /* don't wait for lock acquisition when !force */ + nowait = !force; - /* Send WAL statistics */ - pgstat_report_wal(true); + partial_flush = false; - /* Finally send SLRU statistics */ - pgstat_send_slru(); + /* flush database / relation / function / ... stats */ + partial_flush |= pgstat_flush_pending_entries(nowait); + + /* flush wal stats */ + partial_flush |= pgstat_flush_wal(nowait); + + /* flush SLRU stats */ + partial_flush |= pgstat_slru_flush(nowait); + + last_flush = now; + + /* + * If some of the pending stats could not be flushed due to lock + * contention, let the caller know when to retry. + */ + if (partial_flush) + { + /* force should have prevented us from getting here */ + Assert(!force); + + /* remember since when stats have been pending */ + if (pending_since == 0) + pending_since = now; + + return PGSTAT_IDLE_INTERVAL; + } + + pending_since = 0; + + return 0; } /* - * Will tell the collector about objects he can get rid of. + * Only for use by pgstat_reset_counters() */ -void -pgstat_vacuum_stat(void) +static bool +match_db_entries(PgStatShared_HashEntry *entry, Datum match_data) { - HTAB *htab; - PgStat_MsgTabpurge msg; - PgStat_MsgFuncpurge f_msg; - HASH_SEQ_STATUS hstat; - PgStat_StatDBEntry *dbentry; - PgStat_StatTabEntry *tabentry; - PgStat_StatFuncEntry *funcentry; - int len; - - if (pgStatSock == PGINVALID_SOCKET) - return; - - /* - * If not done for this transaction, read the statistics collector stats - * file into some hash tables. - */ - backend_read_statsfile(); - - /* - * Read pg_database and make a list of OIDs of all existing databases - */ - htab = pgstat_collect_oids(DatabaseRelationId, Anum_pg_database_oid); - - /* - * Search the database hash table for dead databases and tell the - * collector to drop them. - */ - hash_seq_init(&hstat, pgStatDBHash); - while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL) - { - Oid dbid = dbentry->databaseid; - - CHECK_FOR_INTERRUPTS(); - - /* the DB entry for shared tables (with InvalidOid) is never dropped */ - if (OidIsValid(dbid) && - hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL) - pgstat_drop_database(dbid); - } - - /* Clean up */ - hash_destroy(htab); - - /* - * Search for all the dead replication slots in stats hashtable and tell - * the stats collector to drop them. 
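[Editor's illustration, not part of the diff] The reworked pgstat_report_stat(false) above no longer flushes unconditionally; it may return a suggested retry interval when pending stats remain. A minimal caller-side sketch, assuming a hypothetical arm_stats_flush_timeout() helper standing in for whatever timeout mechanism the calling process has available:

static void
example_go_idle(void)
{
	long		idle_timeout_ms;

	/* opportunistic, non-blocking flush while going idle */
	idle_timeout_ms = pgstat_report_stat(false);

	if (idle_timeout_ms > 0)
	{
		/*
		 * Some pending stats could not be flushed yet (rate limiting or
		 * lock contention); arrange to call pgstat_report_stat(true) once
		 * the suggested interval has elapsed.  arm_stats_flush_timeout()
		 * is a hypothetical stand-in for the caller's timeout facility.
		 */
		arm_stats_flush_timeout(idle_timeout_ms);
	}
}
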
- */ - if (replSlotStatHash) - { - PgStat_StatReplSlotEntry *slotentry; - - hash_seq_init(&hstat, replSlotStatHash); - while ((slotentry = (PgStat_StatReplSlotEntry *) hash_seq_search(&hstat)) != NULL) - { - CHECK_FOR_INTERRUPTS(); - - if (SearchNamedReplicationSlot(NameStr(slotentry->slotname), true) == NULL) - { - PgStat_MsgReplSlot msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); - namestrcpy(&msg.m_slotname, NameStr(slotentry->slotname)); - msg.m_create = false; - msg.m_drop = true; - pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); - } - } - } - - /* - * Repeat the above steps for subscriptions, if subscription stats are - * being collected. - */ - if (subscriptionStatHash) - { - PgStat_StatSubEntry *subentry; - - /* - * Read pg_subscription and make a list of OIDs of all existing - * subscriptions. - */ - htab = pgstat_collect_oids(SubscriptionRelationId, Anum_pg_subscription_oid); - - hash_seq_init(&hstat, subscriptionStatHash); - while ((subentry = (PgStat_StatSubEntry *) hash_seq_search(&hstat)) != NULL) - { - CHECK_FOR_INTERRUPTS(); - - if (hash_search(htab, (void *) &(subentry->subid), HASH_FIND, NULL) == NULL) - pgstat_drop_subscription(subentry->subid); - } - - hash_destroy(htab); - } - - /* - * Lookup our own database entry; if not found, nothing more to do. - */ - dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - (void *) &MyDatabaseId, - HASH_FIND, NULL); - if (dbentry == NULL || dbentry->tables == NULL) - return; - - /* - * Similarly to above, make a list of all known relations in this DB. - */ - htab = pgstat_collect_oids(RelationRelationId, Anum_pg_class_oid); - - /* - * Initialize our messages table counter to zero - */ - msg.m_nentries = 0; - - /* - * Check for all tables listed in stats hashtable if they still exist. - */ - hash_seq_init(&hstat, dbentry->tables); - while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL) - { - Oid tabid = tabentry->tableid; - - CHECK_FOR_INTERRUPTS(); - - if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL) - continue; - - /* - * Not there, so add this table's Oid to the message - */ - msg.m_tableid[msg.m_nentries++] = tabid; - - /* - * If the message is full, send it out and reinitialize to empty - */ - if (msg.m_nentries >= PGSTAT_NUM_TABPURGE) - { - len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) - + msg.m_nentries * sizeof(Oid); - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, len); - - msg.m_nentries = 0; - } - } - - /* - * Send the rest - */ - if (msg.m_nentries > 0) - { - len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) - + msg.m_nentries * sizeof(Oid); - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, len); - } - - /* Clean up */ - hash_destroy(htab); - - /* - * Now repeat the above steps for functions. However, we needn't bother - * in the common case where no function stats are being collected. 
- */ - if (dbentry->functions != NULL && - hash_get_num_entries(dbentry->functions) > 0) - { - htab = pgstat_collect_oids(ProcedureRelationId, Anum_pg_proc_oid); - - pgstat_setheader(&f_msg.m_hdr, PGSTAT_MTYPE_FUNCPURGE); - f_msg.m_databaseid = MyDatabaseId; - f_msg.m_nentries = 0; - - hash_seq_init(&hstat, dbentry->functions); - while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL) - { - Oid funcid = funcentry->functionid; - - CHECK_FOR_INTERRUPTS(); - - if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL) - continue; - - /* - * Not there, so add this function's Oid to the message - */ - f_msg.m_functionid[f_msg.m_nentries++] = funcid; - - /* - * If the message is full, send it out and reinitialize to empty - */ - if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE) - { - len = offsetof(PgStat_MsgFuncpurge, m_functionid[0]) - + f_msg.m_nentries * sizeof(Oid); - - pgstat_send(&f_msg, len); - - f_msg.m_nentries = 0; - } - } - - /* - * Send the rest - */ - if (f_msg.m_nentries > 0) - { - len = offsetof(PgStat_MsgFuncpurge, m_functionid[0]) - + f_msg.m_nentries * sizeof(Oid); - - pgstat_send(&f_msg, len); - } - - hash_destroy(htab); - } -} - -/* - * Collect the OIDs of all objects listed in the specified system catalog - * into a temporary hash table. Caller should hash_destroy the result - * when done with it. (However, we make the table in CurrentMemoryContext - * so that it will be freed properly in event of an error.) - */ -static HTAB * -pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid) -{ - HTAB *htab; - HASHCTL hash_ctl; - Relation rel; - TableScanDesc scan; - HeapTuple tup; - Snapshot snapshot; - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(Oid); - hash_ctl.hcxt = CurrentMemoryContext; - htab = hash_create("Temporary table of OIDs", - PGSTAT_TAB_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - rel = table_open(catalogid, AccessShareLock); - snapshot = RegisterSnapshot(GetLatestSnapshot()); - scan = table_beginscan(rel, snapshot, 0, NULL); - while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL) - { - Oid thisoid; - bool isnull; - - thisoid = heap_getattr(tup, anum_oid, RelationGetDescr(rel), &isnull); - Assert(!isnull); - - CHECK_FOR_INTERRUPTS(); - - (void) hash_search(htab, (void *) &thisoid, HASH_ENTER, NULL); - } - table_endscan(scan); - UnregisterSnapshot(snapshot); - table_close(rel, AccessShareLock); - - return htab; + return entry->key.dboid == DatumGetObjectId(MyDatabaseId); } /* @@ -1081,14 +665,11 @@ pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid) void pgstat_reset_counters(void) { - PgStat_MsgResetcounter msg; + TimestampTz ts = GetCurrentTimestamp(); - if (pgStatSock == PGINVALID_SOCKET) - return; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETCOUNTER); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, sizeof(msg)); + pgstat_reset_matching_entries(match_db_entries, + ObjectIdGetDatum(MyDatabaseId), + ts); } /* @@ -1103,38 +684,17 @@ pgstat_reset_counters(void) void pgstat_reset(PgStat_Kind kind, Oid dboid, Oid objoid) { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + TimestampTz ts = GetCurrentTimestamp(); - if (pgStatSock == PGINVALID_SOCKET) - return; + /* not needed atm, and doesn't make sense with the current signature */ + Assert(!pgstat_get_kind_info(kind)->fixed_amount); - switch (kind) - { - case PGSTAT_KIND_FUNCTION: - case PGSTAT_KIND_RELATION: - { - PgStat_MsgResetsinglecounter msg; + /* reset the "single counter" */ + 
pgstat_reset_entry(kind, dboid, objoid, ts); - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSINGLECOUNTER); - msg.m_databaseid = dboid; - msg.m_resettype = kind; - msg.m_objectid = objoid; - pgstat_send(&msg, sizeof(msg)); - } - break; - - case PGSTAT_KIND_SUBSCRIPTION: - { - PgStat_MsgResetsubcounter msg; - - Assert(dboid == InvalidOid); - msg.m_subid = objoid; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSUBCOUNTER); - } - break; - - default: - elog(ERROR, "unexpected"); - } + if (!kind_info->accessed_across_databases) + pgstat_reset_database_timestamp(dboid, ts); } /* @@ -1146,87 +706,20 @@ pgstat_reset(PgStat_Kind kind, Oid dboid, Oid objoid) void pgstat_reset_of_kind(PgStat_Kind kind) { - if (pgStatSock == PGINVALID_SOCKET) - return; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + TimestampTz ts = GetCurrentTimestamp(); - switch (kind) - { - case PGSTAT_KIND_ARCHIVER: - case PGSTAT_KIND_BGWRITER: - case PGSTAT_KIND_CHECKPOINTER: - case PGSTAT_KIND_WAL: - { - PgStat_MsgResetsharedcounter msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER); - msg.m_resettarget = kind; - pgstat_send(&msg, sizeof(msg)); - } - break; - case PGSTAT_KIND_SLRU: - { - PgStat_MsgResetslrucounter msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSLRUCOUNTER); - msg.m_index = -1; - pgstat_send(&msg, sizeof(msg)); - } - break; - case PGSTAT_KIND_REPLSLOT: - { - PgStat_MsgResetreplslotcounter msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETREPLSLOTCOUNTER); - msg.clearall = true; - pgstat_send(&msg, sizeof(msg)); - } - break; - - case PGSTAT_KIND_SUBSCRIPTION: - { - PgStat_MsgResetsubcounter msg; - - msg.m_subid = InvalidOid; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSUBCOUNTER); - - pgstat_send(&msg, sizeof(msg)); - } - break; - - default: - elog(ERROR, "unexpected"); - } + if (kind_info->fixed_amount) + kind_info->reset_all_cb(ts); + else + pgstat_reset_entries_of_kind(kind, ts); } -/* - * Send some junk data to the collector to increase traffic. + +/* ------------------------------------------------------------ + * Fetching of stats + * ------------------------------------------------------------ */ -void -pgstat_ping(void) -{ - PgStat_MsgDummy msg; - - if (pgStatSock == PGINVALID_SOCKET) - return; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY); - pgstat_send(&msg, sizeof(msg)); -} - -/* - * Notify collector that we need fresh data. - */ -static void -pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid) -{ - PgStat_MsgInquiry msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY); - msg.clock_time = clock_time; - msg.cutoff_time = cutoff_time; - msg.databaseid = databaseid; - pgstat_send(&msg, sizeof(msg)); -} /* * Discard any data collected in the current transaction. 
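[Editor's illustration, not part of the diff] The match_db_entries() / pgstat_reset_matching_entries() pairing used by the new pgstat_reset_counters() above generalizes to other predicates. A minimal sketch of an alternative predicate (name hypothetical) that matches only relation stats of the database passed in via the Datum argument:

static bool
example_match_db_relations(PgStatShared_HashEntry *entry, Datum match_data)
{
	/* only relation stats belonging to the database passed via match_data */
	return entry->key.kind == PGSTAT_KIND_RELATION &&
		entry->key.dboid == DatumGetObjectId(match_data);
}

It would be passed the same way as match_db_entries(), e.g. pgstat_reset_matching_entries(example_match_db_relations, ObjectIdGetDatum(MyDatabaseId), GetCurrentTimestamp()).
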
Any subsequent @@ -1240,15 +733,19 @@ pgstat_clear_snapshot(void) { pgstat_assert_is_up(); - /* Release memory, if any was allocated */ - if (pgStatLocalContext) - MemoryContextDelete(pgStatLocalContext); + memset(&pgStatLocal.snapshot.fixed_valid, 0, + sizeof(pgStatLocal.snapshot.fixed_valid)); + pgStatLocal.snapshot.stats = NULL; + pgStatLocal.snapshot.mode = PGSTAT_FETCH_CONSISTENCY_NONE; - /* Reset variables */ - pgStatLocalContext = NULL; - pgStatDBHash = NULL; - replSlotStatHash = NULL; - subscriptionStatHash = NULL; + /* Release memory, if any was allocated */ + if (pgStatLocal.snapshot.context) + { + MemoryContextDelete(pgStatLocal.snapshot.context); + + /* Reset variables */ + pgStatLocal.snapshot.context = NULL; + } /* * Historically the backend_status.c facilities lived in this file, and @@ -1258,204 +755,399 @@ pgstat_clear_snapshot(void) pgstat_clear_backend_activity_snapshot(); } -/* - * Support function for the SQL-callable pgstat* functions. Returns - * the collected statistics for one database or NULL. NULL doesn't mean - * that the database doesn't exist, just that there are no statistics, so the - * caller is better off to report ZERO instead. - */ -PgStat_StatDBEntry * -pgstat_fetch_stat_dbentry(Oid dbid) +void * +pgstat_fetch_entry(PgStat_Kind kind, Oid dboid, Oid objoid) { - /* - * If not done for this transaction, read the statistics collector stats - * file into some hash tables. - */ - backend_read_statsfile(); + PgStat_HashKey key; + PgStat_EntryRef *entry_ref; + void *stats_data; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - /* - * Lookup the requested database; return NULL if not found - */ - return (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - (void *) &dbid, - HASH_FIND, NULL); -} + /* should be called from backends */ + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + AssertArg(!kind_info->fixed_amount); -/* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the global statistics struct. - */ -PgStat_GlobalStats * -pgstat_fetch_global(void) -{ - backend_read_statsfile(); + pgstat_prep_snapshot(); - return &globalStats; -} + key.kind = kind; + key.dboid = dboid; + key.objoid = objoid; -/* - * Support function for the SQL-callable pgstat* functions. Returns - * the collected statistics for one table or NULL. NULL doesn't mean - * that the table doesn't exist, just that there are no statistics, so the - * caller is better off to report ZERO instead. - */ -PgStat_StatTabEntry * -pgstat_fetch_stat_tabentry(Oid relid) -{ - Oid dbid; - PgStat_StatDBEntry *dbentry; - PgStat_StatTabEntry *tabentry; + /* if we need to build a full snapshot, do so */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + pgstat_build_snapshot(); - /* - * If not done for this transaction, read the statistics collector stats - * file into some hash tables. - */ - backend_read_statsfile(); - - /* - * Lookup our database, then look in its table hash table. 
- */ - dbid = MyDatabaseId; - dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - (void *) &dbid, - HASH_FIND, NULL); - if (dbentry != NULL && dbentry->tables != NULL) + /* if caching is desired, look up in cache */ + if (pgstat_fetch_consistency > PGSTAT_FETCH_CONSISTENCY_NONE) { - tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - (void *) &relid, - HASH_FIND, NULL); - if (tabentry) - return tabentry; + PgStat_SnapshotEntry *entry = NULL; + + entry = pgstat_snapshot_lookup(pgStatLocal.snapshot.stats, key); + + if (entry) + return entry->data; + + /* + * If we built a full snapshot and the key is not in + * pgStatLocal.snapshot.stats, there are no matching stats. + */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + return NULL; + } + + pgStatLocal.snapshot.mode = pgstat_fetch_consistency; + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL); + + if (entry_ref == NULL || entry_ref->shared_entry->dropped) + { + /* create empty entry when using PGSTAT_FETCH_CONSISTENCY_CACHE */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_CACHE) + { + PgStat_SnapshotEntry *entry = NULL; + bool found; + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, key, &found); + Assert(!found); + entry->data = NULL; + } + return NULL; } /* - * If we didn't find it, maybe it's a shared table. + * Allocate in caller's context for PGSTAT_FETCH_CONSISTENCY_NONE, + * otherwise we could quickly end up with a fair bit of memory used due to + * repeated accesses. */ - dbid = InvalidOid; - dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - (void *) &dbid, - HASH_FIND, NULL); - if (dbentry != NULL && dbentry->tables != NULL) + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE) + stats_data = palloc(kind_info->shared_data_len); + else + stats_data = MemoryContextAlloc(pgStatLocal.snapshot.context, + kind_info->shared_data_len); + memcpy(stats_data, + pgstat_get_entry_data(kind, entry_ref->shared_stats), + kind_info->shared_data_len); + + if (pgstat_fetch_consistency > PGSTAT_FETCH_CONSISTENCY_NONE) { - tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - (void *) &relid, - HASH_FIND, NULL); - if (tabentry) - return tabentry; + PgStat_SnapshotEntry *entry = NULL; + bool found; + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, key, &found); + entry->data = stats_data; } - return NULL; + return stats_data; } - /* - * Support function for the SQL-callable pgstat* functions. Returns - * the collected statistics for one function or NULL. + * If a stats snapshot has been taken, return the timestamp at which that was + * done, and set *have_snapshot to true. Otherwise *have_snapshot is set to + * false. */ -PgStat_StatFuncEntry * -pgstat_fetch_stat_funcentry(Oid func_id) +TimestampTz +pgstat_get_stat_snapshot_timestamp(bool *have_snapshot) { - PgStat_StatDBEntry *dbentry; - PgStat_StatFuncEntry *funcentry = NULL; - - /* load the stats file if needed */ - backend_read_statsfile(); - - /* Lookup our database, then find the requested function. 
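[Editor's illustration, not part of the diff] A minimal sketch of how a per-kind fetch function is expected to sit on top of pgstat_fetch_entry() above; the wrapper name is hypothetical, while PGSTAT_KIND_RELATION and PgStat_StatTabEntry are existing identifiers:

static PgStat_StatTabEntry *
example_fetch_table_stats(Oid dboid, Oid reloid)
{
	/*
	 * pgstat_fetch_entry() hands back a copy of the shared entry (or NULL),
	 * honoring the configured stats fetch consistency mode; the copy lives
	 * in the caller's context or the snapshot context, as described above.
	 */
	return (PgStat_StatTabEntry *)
		pgstat_fetch_entry(PGSTAT_KIND_RELATION, dboid, reloid);
}
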
*/ - dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); - if (dbentry != NULL && dbentry->functions != NULL) + if (pgStatLocal.snapshot.mode == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) { - funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions, - (void *) &func_id, - HASH_FIND, NULL); + *have_snapshot = true; + return pgStatLocal.snapshot.snapshot_timestamp; } - return funcentry; + *have_snapshot = false; + + return 0; } /* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the archiver statistics struct. + * Ensure snapshot for fixed-numbered 'kind' exists. + * + * Typically used by the pgstat_fetch_* functions for a kind of stats, before + * massaging the data into the desired format. */ -PgStat_ArchiverStats * -pgstat_fetch_stat_archiver(void) +void +pgstat_snapshot_fixed(PgStat_Kind kind) { - backend_read_statsfile(); + AssertArg(pgstat_is_kind_valid(kind)); + AssertArg(pgstat_get_kind_info(kind)->fixed_amount); - return &archiverStats; + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + pgstat_build_snapshot(); + else + pgstat_build_snapshot_fixed(kind); + + Assert(pgStatLocal.snapshot.fixed_valid[kind]); +} + +static void +pgstat_prep_snapshot(void) +{ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE || + pgStatLocal.snapshot.stats != NULL) + return; + + if (!pgStatLocal.snapshot.context) + pgStatLocal.snapshot.context = AllocSetContextCreate(TopMemoryContext, + "PgStat Snapshot", + ALLOCSET_SMALL_SIZES); + + pgStatLocal.snapshot.stats = + pgstat_snapshot_create(pgStatLocal.snapshot.context, + PGSTAT_SNAPSHOT_HASH_SIZE, + NULL); +} + +static void +pgstat_build_snapshot(void) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + + /* should only be called when we need a snapshot */ + Assert(pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT); + + /* snapshot already built */ + if (pgStatLocal.snapshot.mode == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + return; + + pgstat_prep_snapshot(); + + Assert(pgStatLocal.snapshot.stats->members == 0); + + pgStatLocal.snapshot.snapshot_timestamp = GetCurrentTimestamp(); + + /* + * Snapshot all variable stats. + */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + PgStat_Kind kind = p->key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + bool found; + PgStat_SnapshotEntry *entry; + PgStatShared_Common *stats_data; + + /* + * Check if the stats object should be included in the snapshot. + * Unless the stats kind can be accessed from all databases (e.g., + * database stats themselves), we only include stats for the current + * database or objects not associated with a database (e.g. shared + * relations). + */ + if (p->key.dboid != MyDatabaseId && + p->key.dboid != InvalidOid && + !kind_info->accessed_across_databases) + continue; + + if (p->dropped) + continue; + + Assert(pg_atomic_read_u32(&p->refcount) > 0); + + stats_data = dsa_get_address(pgStatLocal.dsa, p->body); + Assert(stats_data); + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, p->key, &found); + Assert(!found); + + entry->data = MemoryContextAlloc(pgStatLocal.snapshot.context, + kind_info->shared_size); + memcpy(entry->data, + pgstat_get_entry_data(kind, stats_data), + kind_info->shared_size); + } + dshash_seq_term(&hstat); + + /* + * Build snapshot of all fixed-numbered stats. 
+ */ + for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + if (!kind_info->fixed_amount) + { + Assert(kind_info->snapshot_cb == NULL); + continue; + } + + pgstat_build_snapshot_fixed(kind); + } + + pgStatLocal.snapshot.mode = PGSTAT_FETCH_CONSISTENCY_SNAPSHOT; +} + +static void +pgstat_build_snapshot_fixed(PgStat_Kind kind) +{ + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + Assert(kind_info->fixed_amount); + Assert(kind_info->snapshot_cb != NULL); + + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE) + { + /* rebuild every time */ + pgStatLocal.snapshot.fixed_valid[kind] = false; + } + else if (pgStatLocal.snapshot.fixed_valid[kind]) + { + /* in snapshot mode we shouldn't get called again */ + Assert(pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_CACHE); + return; + } + + Assert(!pgStatLocal.snapshot.fixed_valid[kind]); + + kind_info->snapshot_cb(); + + Assert(!pgStatLocal.snapshot.fixed_valid[kind]); + pgStatLocal.snapshot.fixed_valid[kind] = true; +} + + +/* ------------------------------------------------------------ + * Backend-local pending stats infrastructure + * ------------------------------------------------------------ + */ + +/* + * Returns the appropriate PgStat_EntryRef, preparing it to receive pending + * stats if not already done. + * + * If created_entry is non-NULL, it'll be set to true if the entry is newly + * created, false otherwise. + */ +PgStat_EntryRef * +pgstat_prep_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid, bool *created_entry) +{ + PgStat_EntryRef *entry_ref; + + /* need to be able to flush out */ + Assert(pgstat_get_kind_info(kind)->flush_pending_cb != NULL); + + if (unlikely(!pgStatPendingContext)) + { + pgStatPendingContext = + AllocSetContextCreate(CacheMemoryContext, + "PgStat Pending", + ALLOCSET_SMALL_SIZES); + } + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, + true, created_entry); + + if (entry_ref->pending == NULL) + { + size_t entrysize = pgstat_get_kind_info(kind)->pending_size; + + Assert(entrysize != (size_t) -1); + + entry_ref->pending = MemoryContextAllocZero(pgStatPendingContext, entrysize); + dlist_push_tail(&pgStatPending, &entry_ref->pending_node); + } + + return entry_ref; } /* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the bgwriter statistics struct. + * Return an existing stats entry, or NULL. + * + * This should only be used for helper function for pgstatfuncs.c - outside of + * that it shouldn't be needed. 
*/ -PgStat_BgWriterStats * -pgstat_fetch_stat_bgwriter(void) +PgStat_EntryRef * +pgstat_fetch_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid) { - backend_read_statsfile(); + PgStat_EntryRef *entry_ref; - return &globalStats.bgwriter; + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL); + + if (entry_ref == NULL || entry_ref->pending == NULL) + return NULL; + + return entry_ref; +} + +void +pgstat_delete_pending_entry(PgStat_EntryRef *entry_ref) +{ + PgStat_Kind kind = entry_ref->shared_entry->key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + void *pending_data = entry_ref->pending; + + Assert(pending_data != NULL); + /* !fixed_amount stats should be handled explicitly */ + Assert(!pgstat_get_kind_info(kind)->fixed_amount); + + if (kind_info->delete_pending_cb) + kind_info->delete_pending_cb(entry_ref); + + pfree(pending_data); + entry_ref->pending = NULL; + + dlist_delete(&entry_ref->pending_node); } /* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the checkpointer statistics struct. + * Flush out pending stats for database objects (databases, relations, + * functions). */ -PgStat_CheckpointerStats * -pgstat_fetch_stat_checkpointer(void) +static bool +pgstat_flush_pending_entries(bool nowait) { - backend_read_statsfile(); + bool have_pending = false; + dlist_node *cur = NULL; - return &globalStats.checkpointer; -} + /* + * Need to be a bit careful iterating over the list of pending entries. + * Processing a pending entry may queue further pending entries to the end + * of the list that we want to process, so a simple iteration won't do. + * Further complicating matters is that we want to delete the current + * entry in each iteration from the list if we flushed successfully. + * + * So we just keep track of the next pointer in each loop iteration. + */ + if (!dlist_is_empty(&pgStatPending)) + cur = dlist_head_node(&pgStatPending); -/* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the WAL statistics struct. - */ -PgStat_WalStats * -pgstat_fetch_stat_wal(void) -{ - backend_read_statsfile(); + while (cur) + { + PgStat_EntryRef *entry_ref = + dlist_container(PgStat_EntryRef, pending_node, cur); + PgStat_HashKey key = entry_ref->shared_entry->key; + PgStat_Kind kind = key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + bool did_flush; + dlist_node *next; - return &walStats; -} + Assert(!kind_info->fixed_amount); + Assert(kind_info->flush_pending_cb != NULL); -/* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the slru statistics struct. - */ -PgStat_SLRUStats * -pgstat_fetch_slru(void) -{ - backend_read_statsfile(); + /* flush the stats, if possible */ + did_flush = kind_info->flush_pending_cb(entry_ref, nowait); - return slruStats; -} + Assert(did_flush || nowait); -/* - * Support function for the SQL-callable pgstat* functions. Returns - * a pointer to the replication slot statistics struct. 
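[Editor's illustration, not part of the diff] For orientation, a minimal sketch of how a backend-local counter update is expected to use pgstat_prep_pending_entry() from the pending-stats infrastructure above; the stats kind, pending struct, and field shown here are hypothetical placeholders for a real kind's definitions:

typedef struct ExamplePendingStats	/* hypothetical pending struct */
{
	PgStat_Counter event_count;
} ExamplePendingStats;

static void
example_count_event(Oid dboid, Oid objoid)
{
	PgStat_EntryRef *entry_ref;
	ExamplePendingStats *pending;

	/* creates and links the pending entry on first use */
	entry_ref = pgstat_prep_pending_entry(EXAMPLE_STATS_KIND,	/* hypothetical */
										  dboid, objoid, NULL);

	pending = (ExamplePendingStats *) entry_ref->pending;
	pending->event_count++;

	/*
	 * The kind's flush_pending_cb later folds this into shared memory when
	 * pgstat_flush_pending_entries() runs.
	 */
}
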
- */ -PgStat_StatReplSlotEntry * -pgstat_fetch_replslot(NameData slotname) -{ - backend_read_statsfile(); + /* determine next entry, before deleting the pending entry */ + if (dlist_has_next(&pgStatPending, cur)) + next = dlist_next_node(&pgStatPending, cur); + else + next = NULL; - return pgstat_get_replslot_entry(slotname, false); -} + /* if successfully flushed, remove entry */ + if (did_flush) + pgstat_delete_pending_entry(entry_ref); + else + have_pending = true; -/* - * Support function for the SQL-callable pgstat* functions. Returns - * the collected statistics for one subscription or NULL. - */ -PgStat_StatSubEntry * -pgstat_fetch_stat_subscription(Oid subid) -{ - /* Load the stats file if needed */ - backend_read_statsfile(); + cur = next; + } - return pgstat_get_subscription_entry(subid, false); + Assert(dlist_is_empty(&pgStatPending) == !have_pending); + + return have_pending; } @@ -1464,16 +1156,33 @@ pgstat_fetch_stat_subscription(Oid subid) * ------------------------------------------------------------ */ -/* - * Create pgStatLocalContext, if not already done. - */ -static void -pgstat_setup_memcxt(void) +PgStat_Kind +pgstat_get_kind_from_str(char *kind_str) { - if (!pgStatLocalContext) - pgStatLocalContext = AllocSetContextCreate(TopMemoryContext, - "Statistics snapshot", - ALLOCSET_SMALL_SIZES); + for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++) + { + if (pg_strcasecmp(kind_str, pgstat_kind_infos[kind].name) == 0) + return kind; + } + + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid statistics kind: \"%s\"", kind_str))); + return PGSTAT_KIND_DATABASE; /* avoid compiler warnings */ +} + +static inline bool +pgstat_is_kind_valid(int ikind) +{ + return ikind >= PGSTAT_KIND_FIRST_VALID && ikind <= PGSTAT_KIND_LAST; +} + +const PgStat_KindInfo * +pgstat_get_kind_info(PgStat_Kind kind) +{ + AssertArg(pgstat_is_kind_valid(kind)); + + return &pgstat_kind_infos[kind]; } /* @@ -1489,642 +1198,44 @@ pgstat_assert_is_up(void) } #endif -/* - * Set common header fields in a statistics message - */ -void -pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype) -{ - hdr->m_type = mtype; -} - - -/* - * Send out one statistics message to the collector - */ -void -pgstat_send(void *msg, int len) -{ - int rc; - - pgstat_assert_is_up(); - - if (pgStatSock == PGINVALID_SOCKET) - return; - - ((PgStat_MsgHdr *) msg)->m_size = len; - - /* We'll retry after EINTR, but ignore all other failures */ - do - { - rc = send(pgStatSock, msg, len, 0); - } while (rc < 0 && errno == EINTR); - -#ifdef USE_ASSERT_CHECKING - /* In debug builds, log send failures ... */ - if (rc < 0) - elog(LOG, "could not send to statistics collector: %m"); -#endif -} - -/* - * Start up the statistics collector process. This is the body of the - * postmaster child process. - * - * The argc/argv parameters are valid only in EXEC_BACKEND case. - */ -NON_EXEC_STATIC void -PgstatCollectorMain(int argc, char *argv[]) -{ - int len; - PgStat_Msg msg; - int wr; - WaitEvent event; - WaitEventSet *wes; - - /* - * Ignore all signals usually bound to some action in the postmaster, - * except SIGHUP and SIGQUIT. Note we don't need a SIGUSR1 handler to - * support latch operations, because we only use a local latch. 
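[Editor's illustration, not part of the diff] As a usage note for the lookup helpers just added, a minimal sketch combining pgstat_get_kind_from_str() and pgstat_reset_of_kind(); the wrapper name is hypothetical:

static void
example_reset_kind_by_name(char *kind_str)
{
	/* errors out with ERRCODE_INVALID_PARAMETER_VALUE on an unknown name */
	PgStat_Kind kind = pgstat_get_kind_from_str(kind_str);

	pgstat_reset_of_kind(kind);
}
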
- */ - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGINT, SIG_IGN); - pqsignal(SIGTERM, SIG_IGN); - pqsignal(SIGQUIT, SignalHandlerForShutdownRequest); - pqsignal(SIGALRM, SIG_IGN); - pqsignal(SIGPIPE, SIG_IGN); - pqsignal(SIGUSR1, SIG_IGN); - pqsignal(SIGUSR2, SIG_IGN); - /* Reset some signals that are accepted by postmaster but not here */ - pqsignal(SIGCHLD, SIG_DFL); - PG_SETMASK(&UnBlockSig); - - MyBackendType = B_STATS_COLLECTOR; - init_ps_display(NULL); - - /* - * Read in existing stats files or initialize the stats to zero. - */ - pgStatRunningInCollector = true; - pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true); - - /* Prepare to wait for our latch or data in our socket. */ - wes = CreateWaitEventSet(CurrentMemoryContext, 3); - AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); - AddWaitEventToSet(wes, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL); - AddWaitEventToSet(wes, WL_SOCKET_READABLE, pgStatSock, NULL, NULL); - - /* - * Loop to process messages until we get SIGQUIT or detect ungraceful - * death of our parent postmaster. - * - * For performance reasons, we don't want to do ResetLatch/WaitLatch after - * every message; instead, do that only after a recv() fails to obtain a - * message. (This effectively means that if backends are sending us stuff - * like mad, we won't notice postmaster death until things slack off a - * bit; which seems fine.) To do that, we have an inner loop that - * iterates as long as recv() succeeds. We do check ConfigReloadPending - * inside the inner loop, which means that such interrupts will get - * serviced but the latch won't get cleared until next time there is a - * break in the action. - */ - for (;;) - { - /* Clear any already-pending wakeups */ - ResetLatch(MyLatch); - - /* - * Quit if we get SIGQUIT from the postmaster. - */ - if (ShutdownRequestPending) - break; - - /* - * Inner loop iterates as long as we keep getting messages, or until - * ShutdownRequestPending becomes set. - */ - while (!ShutdownRequestPending) - { - /* - * Reload configuration if we got SIGHUP from the postmaster. - */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - } - - /* - * Write the stats file(s) if a new request has arrived that is - * not satisfied by existing file(s). - */ - if (pgstat_write_statsfile_needed()) - pgstat_write_statsfiles(false, false); - - /* - * Try to receive and process a message. This will not block, - * since the socket is set to non-blocking mode. - * - * XXX On Windows, we have to force pgwin32_recv to cooperate, - * despite the previous use of pg_set_noblock() on the socket. - * This is extremely broken and should be fixed someday. - */ -#ifdef WIN32 - pgwin32_noblock = 1; -#endif - - len = recv(pgStatSock, (char *) &msg, - sizeof(PgStat_Msg), 0); - -#ifdef WIN32 - pgwin32_noblock = 0; -#endif - - if (len < 0) - { - if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) - break; /* out of inner loop */ - ereport(ERROR, - (errcode_for_socket_access(), - errmsg("could not read statistics message: %m"))); - } - - /* - * We ignore messages that are smaller than our common header - */ - if (len < sizeof(PgStat_MsgHdr)) - continue; - - /* - * The received length must match the length in the header - */ - if (msg.msg_hdr.m_size != len) - continue; - - /* - * O.K. - we accept this message. Process it. 
- */ - switch (msg.msg_hdr.m_type) - { - case PGSTAT_MTYPE_DUMMY: - break; - - case PGSTAT_MTYPE_INQUIRY: - pgstat_recv_inquiry(&msg.msg_inquiry, len); - break; - - case PGSTAT_MTYPE_TABSTAT: - pgstat_recv_tabstat(&msg.msg_tabstat, len); - break; - - case PGSTAT_MTYPE_TABPURGE: - pgstat_recv_tabpurge(&msg.msg_tabpurge, len); - break; - - case PGSTAT_MTYPE_DROPDB: - pgstat_recv_dropdb(&msg.msg_dropdb, len); - break; - - case PGSTAT_MTYPE_RESETCOUNTER: - pgstat_recv_resetcounter(&msg.msg_resetcounter, len); - break; - - case PGSTAT_MTYPE_RESETSHAREDCOUNTER: - pgstat_recv_resetsharedcounter(&msg.msg_resetsharedcounter, - len); - break; - - case PGSTAT_MTYPE_RESETSINGLECOUNTER: - pgstat_recv_resetsinglecounter(&msg.msg_resetsinglecounter, - len); - break; - - case PGSTAT_MTYPE_RESETSLRUCOUNTER: - pgstat_recv_resetslrucounter(&msg.msg_resetslrucounter, - len); - break; - - case PGSTAT_MTYPE_RESETREPLSLOTCOUNTER: - pgstat_recv_resetreplslotcounter(&msg.msg_resetreplslotcounter, - len); - break; - - case PGSTAT_MTYPE_RESETSUBCOUNTER: - pgstat_recv_resetsubcounter(&msg.msg_resetsubcounter, len); - break; - - case PGSTAT_MTYPE_AUTOVAC_START: - pgstat_recv_autovac(&msg.msg_autovacuum_start, len); - break; - - case PGSTAT_MTYPE_VACUUM: - pgstat_recv_vacuum(&msg.msg_vacuum, len); - break; - - case PGSTAT_MTYPE_ANALYZE: - pgstat_recv_analyze(&msg.msg_analyze, len); - break; - - case PGSTAT_MTYPE_ARCHIVER: - pgstat_recv_archiver(&msg.msg_archiver, len); - break; - - case PGSTAT_MTYPE_BGWRITER: - pgstat_recv_bgwriter(&msg.msg_bgwriter, len); - break; - - case PGSTAT_MTYPE_CHECKPOINTER: - pgstat_recv_checkpointer(&msg.msg_checkpointer, len); - break; - - case PGSTAT_MTYPE_WAL: - pgstat_recv_wal(&msg.msg_wal, len); - break; - - case PGSTAT_MTYPE_SLRU: - pgstat_recv_slru(&msg.msg_slru, len); - break; - - case PGSTAT_MTYPE_FUNCSTAT: - pgstat_recv_funcstat(&msg.msg_funcstat, len); - break; - - case PGSTAT_MTYPE_FUNCPURGE: - pgstat_recv_funcpurge(&msg.msg_funcpurge, len); - break; - - case PGSTAT_MTYPE_RECOVERYCONFLICT: - pgstat_recv_recoveryconflict(&msg.msg_recoveryconflict, - len); - break; - - case PGSTAT_MTYPE_DEADLOCK: - pgstat_recv_deadlock(&msg.msg_deadlock, len); - break; - - case PGSTAT_MTYPE_TEMPFILE: - pgstat_recv_tempfile(&msg.msg_tempfile, len); - break; - - case PGSTAT_MTYPE_CHECKSUMFAILURE: - pgstat_recv_checksum_failure(&msg.msg_checksumfailure, - len); - break; - - case PGSTAT_MTYPE_REPLSLOT: - pgstat_recv_replslot(&msg.msg_replslot, len); - break; - - case PGSTAT_MTYPE_CONNECT: - pgstat_recv_connect(&msg.msg_connect, len); - break; - - case PGSTAT_MTYPE_DISCONNECT: - pgstat_recv_disconnect(&msg.msg_disconnect, len); - break; - - case PGSTAT_MTYPE_SUBSCRIPTIONDROP: - pgstat_recv_subscription_drop(&msg.msg_subscriptiondrop, len); - break; - - case PGSTAT_MTYPE_SUBSCRIPTIONERROR: - pgstat_recv_subscription_error(&msg.msg_subscriptionerror, len); - break; - - default: - break; - } - } /* end of inner message-processing loop */ - - /* Sleep until there's something to do */ -#ifndef WIN32 - wr = WaitEventSetWait(wes, -1L, &event, 1, WAIT_EVENT_PGSTAT_MAIN); -#else - - /* - * Windows, at least in its Windows Server 2003 R2 incarnation, - * sometimes loses FD_READ events. Waking up and retrying the recv() - * fixes that, so don't sleep indefinitely. This is a crock of the - * first water, but until somebody wants to debug exactly what's - * happening there, this is the best we can do. 
The two-second - * timeout matches our pre-9.2 behavior, and needs to be short enough - * to not provoke "using stale statistics" complaints from - * backend_read_statsfile. - */ - wr = WaitEventSetWait(wes, 2 * 1000L /* msec */ , &event, 1, - WAIT_EVENT_PGSTAT_MAIN); -#endif - - /* - * Emergency bailout if postmaster has died. This is to avoid the - * necessity for manual cleanup of all postmaster children. - */ - if (wr == 1 && event.events == WL_POSTMASTER_DEATH) - break; - } /* end of outer loop */ - - /* - * Save the final stats to reuse at next startup. - */ - pgstat_write_statsfiles(true, true); - - FreeWaitEventSet(wes); - - exit(0); -} - -/* - * Subroutine to clear stats in a database entry - * - * Tables and functions hashes are initialized to empty. - */ -static void -reset_dbentry_counters(PgStat_StatDBEntry *dbentry) -{ - HASHCTL hash_ctl; - - dbentry->n_xact_commit = 0; - dbentry->n_xact_rollback = 0; - dbentry->n_blocks_fetched = 0; - dbentry->n_blocks_hit = 0; - dbentry->n_tuples_returned = 0; - dbentry->n_tuples_fetched = 0; - dbentry->n_tuples_inserted = 0; - dbentry->n_tuples_updated = 0; - dbentry->n_tuples_deleted = 0; - dbentry->last_autovac_time = 0; - dbentry->n_conflict_tablespace = 0; - dbentry->n_conflict_lock = 0; - dbentry->n_conflict_snapshot = 0; - dbentry->n_conflict_bufferpin = 0; - dbentry->n_conflict_startup_deadlock = 0; - dbentry->n_temp_files = 0; - dbentry->n_temp_bytes = 0; - dbentry->n_deadlocks = 0; - dbentry->n_checksum_failures = 0; - dbentry->last_checksum_failure = 0; - dbentry->n_block_read_time = 0; - dbentry->n_block_write_time = 0; - dbentry->n_sessions = 0; - dbentry->total_session_time = 0; - dbentry->total_active_time = 0; - dbentry->total_idle_in_xact_time = 0; - dbentry->n_sessions_abandoned = 0; - dbentry->n_sessions_fatal = 0; - dbentry->n_sessions_killed = 0; - - dbentry->stat_reset_timestamp = GetCurrentTimestamp(); - dbentry->stats_timestamp = 0; - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatTabEntry); - dbentry->tables = hash_create("Per-database table", - PGSTAT_TAB_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry); - dbentry->functions = hash_create("Per-database function", - PGSTAT_FUNCTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); -} - -/* - * Lookup the hash table entry for the specified database. If no hash - * table entry exists, initialize it, if the create parameter is true. - * Else, return NULL. - */ -static PgStat_StatDBEntry * -pgstat_get_db_entry(Oid databaseid, bool create) -{ - PgStat_StatDBEntry *result; - bool found; - HASHACTION action = (create ? HASH_ENTER : HASH_FIND); - - /* Lookup or create the hash table entry for this database */ - result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, - &databaseid, - action, &found); - - if (!create && !found) - return NULL; - - /* - * If not found, initialize the new one. This creates empty hash tables - * for tables and functions, too. - */ - if (!found) - reset_dbentry_counters(result); - - return result; -} - -/* - * Lookup the hash table entry for the specified table. If no hash - * table entry exists, initialize it, if the create parameter is true. - * Else, return NULL. - */ -static PgStat_StatTabEntry * -pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) -{ - PgStat_StatTabEntry *result; - bool found; - HASHACTION action = (create ? 
HASH_ENTER : HASH_FIND); - - /* Lookup or create the hash table entry for this table */ - result = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - &tableoid, - action, &found); - - if (!create && !found) - return NULL; - - /* If not found, initialize the new one. */ - if (!found) - { - result->numscans = 0; - result->tuples_returned = 0; - result->tuples_fetched = 0; - result->tuples_inserted = 0; - result->tuples_updated = 0; - result->tuples_deleted = 0; - result->tuples_hot_updated = 0; - result->n_live_tuples = 0; - result->n_dead_tuples = 0; - result->changes_since_analyze = 0; - result->inserts_since_vacuum = 0; - result->blocks_fetched = 0; - result->blocks_hit = 0; - result->vacuum_timestamp = 0; - result->vacuum_count = 0; - result->autovac_vacuum_timestamp = 0; - result->autovac_vacuum_count = 0; - result->analyze_timestamp = 0; - result->analyze_count = 0; - result->autovac_analyze_timestamp = 0; - result->autovac_analyze_count = 0; - } - - return result; -} - -/* - * Return the entry of replication slot stats with the given name. Return - * NULL if not found and the caller didn't request to create it. - * - * create tells whether to create the new slot entry if it is not found. - */ -static PgStat_StatReplSlotEntry * -pgstat_get_replslot_entry(NameData name, bool create) -{ - PgStat_StatReplSlotEntry *slotent; - bool found; - - if (replSlotStatHash == NULL) - { - HASHCTL hash_ctl; - - /* - * Quick return NULL if the hash table is empty and the caller didn't - * request to create the entry. - */ - if (!create) - return NULL; - - hash_ctl.keysize = sizeof(NameData); - hash_ctl.entrysize = sizeof(PgStat_StatReplSlotEntry); - replSlotStatHash = hash_create("Replication slots hash", - PGSTAT_REPLSLOT_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); - } - - slotent = (PgStat_StatReplSlotEntry *) hash_search(replSlotStatHash, - (void *) &name, - create ? HASH_ENTER : HASH_FIND, - &found); - - if (!slotent) - { - /* not found */ - Assert(!create && !found); - return NULL; - } - - /* initialize the entry */ - if (create && !found) - { - namestrcpy(&(slotent->slotname), NameStr(name)); - pgstat_reset_replslot_entry(slotent, 0); - } - - return slotent; -} - -/* - * Reset the given replication slot stats. - */ -static void -pgstat_reset_replslot_entry(PgStat_StatReplSlotEntry *slotent, TimestampTz ts) -{ - /* reset only counters. Don't clear slot name */ - slotent->spill_txns = 0; - slotent->spill_count = 0; - slotent->spill_bytes = 0; - slotent->stream_txns = 0; - slotent->stream_count = 0; - slotent->stream_bytes = 0; - slotent->total_txns = 0; - slotent->total_bytes = 0; - slotent->stat_reset_timestamp = ts; -} - -/* - * Return the subscription statistics entry with the given subscription OID. - * If no subscription entry exists, initialize it, if the create parameter is - * true. Else, return NULL. - */ -static PgStat_StatSubEntry * -pgstat_get_subscription_entry(Oid subid, bool create) -{ - PgStat_StatSubEntry *subentry; - bool found; - HASHACTION action = (create ? HASH_ENTER : HASH_FIND); - - if (subscriptionStatHash == NULL) - { - HASHCTL hash_ctl; - - /* - * Quick return NULL if the hash table is empty and the caller didn't - * request to create the entry. 
- */ - if (!create) - return NULL; - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatSubEntry); - subscriptionStatHash = hash_create("Subscription hash", - PGSTAT_SUBSCRIPTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS); - } - - subentry = (PgStat_StatSubEntry *) hash_search(subscriptionStatHash, - (void *) &subid, - action, &found); - - if (!create && !found) - return NULL; - - /* If not found, initialize the new one */ - if (!found) - pgstat_reset_subscription(subentry, 0); - - return subentry; -} - -/* - * Reset the given subscription stats. - */ -static void -pgstat_reset_subscription(PgStat_StatSubEntry *subentry, TimestampTz ts) -{ - subentry->apply_error_count = 0; - subentry->sync_error_count = 0; - subentry->stat_reset_timestamp = ts; -} - /* ------------------------------------------------------------ * reading and writing of on-disk stats file * ------------------------------------------------------------ */ +/* helpers for pgstat_write_statsfile() */ +static void +write_chunk(FILE *fpout, void *ptr, size_t len) +{ + int rc; + + rc = fwrite(ptr, len, 1, fpout); + + /* we'll check for errors with ferror once at the end */ + (void) rc; +} + +#define write_chunk_s(fpout, ptr) write_chunk(fpout, ptr, sizeof(*ptr)) + /* - * Write the global statistics file, as well as requested DB files. - * - * 'permanent' specifies writing to the permanent files not temporary ones. - * When true (happens only when the collector is shutting down), also remove - * the temporary files so that backends starting up under a new postmaster - * can't read old data before the new collector is ready. - * - * When 'allDbs' is false, only the requested databases (listed in - * pending_write_requests) will be written; otherwise, all databases - * will be written. + * This function is called in the last process that is accessing the shared + * stats so locking is not required. */ static void -pgstat_write_statsfiles(bool permanent, bool allDbs) +pgstat_write_statsfile(void) { - HASH_SEQ_STATUS hstat; - PgStat_StatDBEntry *dbentry; FILE *fpout; int32 format_id; - const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname; - const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; - int rc; + const char *tmpfile = PGSTAT_STAT_PERMANENT_TMPFILE; + const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME; + dshash_seq_status hstat; + PgStatShared_HashEntry *ps; + + pgstat_assert_is_up(); + + /* we're shutting down, so it's ok to just override this */ + pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_NONE; elog(DEBUG2, "writing stats file \"%s\"", statfile); @@ -2141,2066 +1252,352 @@ pgstat_write_statsfiles(bool permanent, bool allDbs) return; } - /* - * Set the timestamp of the stats file. - */ - globalStats.stats_timestamp = GetCurrentTimestamp(); - /* * Write the file header --- currently just a format ID. */ format_id = PGSTAT_FILE_FORMAT_ID; - rc = fwrite(&format_id, sizeof(format_id), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + write_chunk_s(fpout, &format_id); /* - * Write global stats struct + * XXX: The following could now be generalized to just iterate over + * pgstat_kind_infos instead of knowing about the different kinds of + * stats. 
*/ - rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout); - (void) rc; /* we'll check for error with ferror */ /* * Write archiver stats struct */ - rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_ARCHIVER); + write_chunk_s(fpout, &pgStatLocal.snapshot.archiver); /* - * Write WAL stats struct + * Write bgwriter stats struct */ - rc = fwrite(&walStats, sizeof(walStats), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_BGWRITER); + write_chunk_s(fpout, &pgStatLocal.snapshot.bgwriter); + + /* + * Write checkpointer stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_CHECKPOINTER); + write_chunk_s(fpout, &pgStatLocal.snapshot.checkpointer); /* * Write SLRU stats struct */ - rc = fwrite(slruStats, sizeof(slruStats), 1, fpout); - (void) rc; /* we'll check for error with ferror */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_SLRU); + write_chunk_s(fpout, &pgStatLocal.snapshot.slru); /* - * Walk through the database table. + * Write WAL stats struct */ - hash_seq_init(&hstat, pgStatDBHash); - while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL) + pgstat_build_snapshot_fixed(PGSTAT_KIND_WAL); + write_chunk_s(fpout, &pgStatLocal.snapshot.wal); + + /* + * Walk through the stats entries + */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((ps = dshash_seq_next(&hstat)) != NULL) { - /* - * Write out the table and function stats for this DB into the - * appropriate per-DB stat file, if required. - */ - if (allDbs || pgstat_db_requested(dbentry->databaseid)) - { - /* Make DB's timestamp consistent with the global stats */ - dbentry->stats_timestamp = globalStats.stats_timestamp; - - pgstat_write_db_statsfile(dbentry, permanent); - } - - /* - * Write out the DB entry. We don't write the tables or functions - * pointers, since they're of no use to any other process. - */ - fputc('D', fpout); - rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - - /* - * Write replication slot stats struct - */ - if (replSlotStatHash) - { - PgStat_StatReplSlotEntry *slotent; - - hash_seq_init(&hstat, replSlotStatHash); - while ((slotent = (PgStat_StatReplSlotEntry *) hash_seq_search(&hstat)) != NULL) - { - fputc('R', fpout); - rc = fwrite(slotent, sizeof(PgStat_StatReplSlotEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - } - - /* - * Write subscription stats struct - */ - if (subscriptionStatHash) - { - PgStat_StatSubEntry *subentry; - - hash_seq_init(&hstat, subscriptionStatHash); - while ((subentry = (PgStat_StatSubEntry *) hash_seq_search(&hstat)) != NULL) - { - fputc('S', fpout); - rc = fwrite(subentry, sizeof(PgStat_StatSubEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - } - - /* - * No more output to be done. Close the temp file and replace the old - * pgstat.stat with it. The ferror() check replaces testing for error - * after each individual fputc or fwrite above. 
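[Editor's illustration, not part of the diff] Since the new pgstat_write_statsfile() above emits fixed-size chunks via write_chunk()/write_chunk_s(), the read side presumably uses a mirror-image helper; a minimal sketch, with hypothetical names and error handling:

static bool
example_read_chunk(FILE *fpin, void *ptr, size_t len)
{
	/* a short read is treated by the caller as a corrupt stats file */
	return fread(ptr, 1, len, fpin) == len;
}

#define example_read_chunk_s(fpin, ptr) example_read_chunk(fpin, ptr, sizeof(*ptr))
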
- */ - fputc('E', fpout); - - if (ferror(fpout)) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not write temporary statistics file \"%s\": %m", - tmpfile))); - FreeFile(fpout); - unlink(tmpfile); - } - else if (FreeFile(fpout) < 0) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not close temporary statistics file \"%s\": %m", - tmpfile))); - unlink(tmpfile); - } - else if (rename(tmpfile, statfile) < 0) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", - tmpfile, statfile))); - unlink(tmpfile); - } - - if (permanent) - unlink(pgstat_stat_filename); - - /* - * Now throw away the list of requests. Note that requests sent after we - * started the write are still waiting on the network socket. - */ - list_free(pending_write_requests); - pending_write_requests = NIL; -} - -/* - * return the filename for a DB stat file; filename is the output buffer, - * of length len. - */ -static void -get_dbstat_filename(bool permanent, bool tempname, Oid databaseid, - char *filename, int len) -{ - int printed; - - /* NB -- pgstat_reset_remove_files knows about the pattern this uses */ - printed = snprintf(filename, len, "%s/db_%u.%s", - permanent ? PGSTAT_STAT_PERMANENT_DIRECTORY : - pgstat_stat_directory, - databaseid, - tempname ? "tmp" : "stat"); - if (printed >= len) - elog(ERROR, "overlength pgstat path"); -} - -/* - * Write the stat file for a single database. - * - * If writing to the permanent file (happens when the collector is - * shutting down only), remove the temporary file so that backends - * starting up under a new postmaster can't read the old data before - * the new collector is ready. - */ -static void -pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent) -{ - HASH_SEQ_STATUS tstat; - HASH_SEQ_STATUS fstat; - PgStat_StatTabEntry *tabentry; - PgStat_StatFuncEntry *funcentry; - FILE *fpout; - int32 format_id; - Oid dbid = dbentry->databaseid; - int rc; - char tmpfile[MAXPGPATH]; - char statfile[MAXPGPATH]; - - get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH); - get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH); - - elog(DEBUG2, "writing stats file \"%s\"", statfile); - - /* - * Open the statistics temp file to write out the current values. - */ - fpout = AllocateFile(tmpfile, PG_BINARY_W); - if (fpout == NULL) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not open temporary statistics file \"%s\": %m", - tmpfile))); - return; - } - - /* - * Write the file header --- currently just a format ID. - */ - format_id = PGSTAT_FILE_FORMAT_ID; - rc = fwrite(&format_id, sizeof(format_id), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - - /* - * Walk through the database's access stats per table. - */ - hash_seq_init(&tstat, dbentry->tables); - while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL) - { - fputc('T', fpout); - rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - - /* - * Walk through the database's function stats table. - */ - hash_seq_init(&fstat, dbentry->functions); - while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL) - { - fputc('F', fpout); - rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout); - (void) rc; /* we'll check for error with ferror */ - } - - /* - * No more output to be done. Close the temp file and replace the old - * pgstat.stat with it. 
The ferror() check replaces testing for error - * after each individual fputc or fwrite above. - */ - fputc('E', fpout); - - if (ferror(fpout)) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not write temporary statistics file \"%s\": %m", - tmpfile))); - FreeFile(fpout); - unlink(tmpfile); - } - else if (FreeFile(fpout) < 0) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not close temporary statistics file \"%s\": %m", - tmpfile))); - unlink(tmpfile); - } - else if (rename(tmpfile, statfile) < 0) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", - tmpfile, statfile))); - unlink(tmpfile); - } - - if (permanent) - { - get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH); - - elog(DEBUG2, "removing temporary stats file \"%s\"", statfile); - unlink(statfile); - } -} - -/* - * Reads in some existing statistics collector files and returns the - * databases hash table that is the top level of the data. - * - * If 'onlydb' is not InvalidOid, it means we only want data for that DB - * plus the shared catalogs ("DB 0"). We'll still populate the DB hash - * table for all databases, but we don't bother even creating table/function - * hash tables for other databases. - * - * 'permanent' specifies reading from the permanent files not temporary ones. - * When true (happens only when the collector is starting up), remove the - * files after reading; the in-memory status is now authoritative, and the - * files would be out of date in case somebody else reads them. - * - * If a 'deep' read is requested, table/function stats are read, otherwise - * the table/function hash tables remain empty. - */ -static HTAB * -pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) -{ - PgStat_StatDBEntry *dbentry; - PgStat_StatDBEntry dbbuf; - HASHCTL hash_ctl; - HTAB *dbhash; - FILE *fpin; - int32 format_id; - bool found; - const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; - int i; - TimestampTz ts; - - /* - * The tables will live in pgStatLocalContext. - */ - pgstat_setup_memcxt(); - - /* - * Create the DB hashtable - */ - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatDBEntry); - hash_ctl.hcxt = pgStatLocalContext; - dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - /* - * Clear out global, archiver, WAL and SLRU statistics so they start from - * zero in case we can't load an existing statsfile. - */ - memset(&globalStats, 0, sizeof(globalStats)); - memset(&archiverStats, 0, sizeof(archiverStats)); - memset(&walStats, 0, sizeof(walStats)); - memset(&slruStats, 0, sizeof(slruStats)); - - /* - * Set the current timestamp (will be kept only in case we can't load an - * existing statsfile). - */ - ts = GetCurrentTimestamp(); - globalStats.bgwriter.stat_reset_timestamp = ts; - archiverStats.stat_reset_timestamp = ts; - walStats.stat_reset_timestamp = ts; - - /* - * Set the same reset timestamp for all SLRU items too. - */ - for (i = 0; i < SLRU_NUM_ELEMENTS; i++) - slruStats[i].stat_reset_timestamp = ts; - - /* - * Try to open the stats file. If it doesn't exist, the backends simply - * return zero for anything and the collector simply starts from scratch - * with empty counters. - * - * ENOENT is a possibility if the stats collector is not running or has - * not yet written the stats file the first time. Any other failure - * condition is suspicious. 
- */ - if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) - { - if (errno != ENOENT) - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errcode_for_file_access(), - errmsg("could not open statistics file \"%s\": %m", - statfile))); - return dbhash; - } - - /* - * Verify it's of the expected format. - */ - if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || - format_id != PGSTAT_FILE_FORMAT_ID) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - goto done; - } - - /* - * Read global stats struct - */ - if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - memset(&globalStats, 0, sizeof(globalStats)); - goto done; - } - - /* - * In the collector, disregard the timestamp we read from the permanent - * stats file; we should be willing to write a temp stats file immediately - * upon the first request from any backend. This only matters if the old - * file's timestamp is less than PGSTAT_STAT_INTERVAL ago, but that's not - * an unusual scenario. - */ - if (pgStatRunningInCollector) - globalStats.stats_timestamp = 0; - - /* - * Read archiver stats struct - */ - if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - memset(&archiverStats, 0, sizeof(archiverStats)); - goto done; - } - - /* - * Read WAL stats struct - */ - if (fread(&walStats, 1, sizeof(walStats), fpin) != sizeof(walStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - memset(&walStats, 0, sizeof(walStats)); - goto done; - } - - /* - * Read SLRU stats struct - */ - if (fread(slruStats, 1, sizeof(slruStats), fpin) != sizeof(slruStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - memset(&slruStats, 0, sizeof(slruStats)); - goto done; - } - - /* - * We found an existing collector stats file. Read it and put all the - * hashtable entries into place. - */ - for (;;) - { - switch (fgetc(fpin)) - { - /* - * 'D' A PgStat_StatDBEntry struct describing a database - * follows. - */ - case 'D': - if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables), - fpin) != offsetof(PgStat_StatDBEntry, tables)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - /* - * Add to the DB hash - */ - dbentry = (PgStat_StatDBEntry *) hash_search(dbhash, - (void *) &dbbuf.databaseid, - HASH_ENTER, - &found); - if (found) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry)); - dbentry->tables = NULL; - dbentry->functions = NULL; - - /* - * In the collector, disregard the timestamp we read from the - * permanent stats file; we should be willing to write a temp - * stats file immediately upon the first request from any - * backend. - */ - if (pgStatRunningInCollector) - dbentry->stats_timestamp = 0; - - /* - * Don't create tables/functions hashtables for uninteresting - * databases. 
- */ - if (onlydb != InvalidOid) - { - if (dbbuf.databaseid != onlydb && - dbbuf.databaseid != InvalidOid) - break; - } - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatTabEntry); - hash_ctl.hcxt = pgStatLocalContext; - dbentry->tables = hash_create("Per-database table", - PGSTAT_TAB_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry); - hash_ctl.hcxt = pgStatLocalContext; - dbentry->functions = hash_create("Per-database function", - PGSTAT_FUNCTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - /* - * If requested, read the data from the database-specific - * file. Otherwise we just leave the hashtables empty. - */ - if (deep) - pgstat_read_db_statsfile(dbentry->databaseid, - dbentry->tables, - dbentry->functions, - permanent); - - break; - - /* - * 'R' A PgStat_StatReplSlotEntry struct describing a - * replication slot follows. - */ - case 'R': - { - PgStat_StatReplSlotEntry slotbuf; - PgStat_StatReplSlotEntry *slotent; - - if (fread(&slotbuf, 1, sizeof(PgStat_StatReplSlotEntry), fpin) - != sizeof(PgStat_StatReplSlotEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - /* Create hash table if we don't have it already. */ - if (replSlotStatHash == NULL) - { - HASHCTL hash_ctl; - - hash_ctl.keysize = sizeof(NameData); - hash_ctl.entrysize = sizeof(PgStat_StatReplSlotEntry); - hash_ctl.hcxt = pgStatLocalContext; - replSlotStatHash = hash_create("Replication slots hash", - PGSTAT_REPLSLOT_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - } - - slotent = (PgStat_StatReplSlotEntry *) hash_search(replSlotStatHash, - (void *) &slotbuf.slotname, - HASH_ENTER, NULL); - memcpy(slotent, &slotbuf, sizeof(PgStat_StatReplSlotEntry)); - break; - } - - /* - * 'S' A PgStat_StatSubEntry struct describing subscription - * statistics. - */ - case 'S': - { - PgStat_StatSubEntry subbuf; - PgStat_StatSubEntry *subentry; - - if (fread(&subbuf, 1, sizeof(PgStat_StatSubEntry), fpin) - != sizeof(PgStat_StatSubEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - if (subscriptionStatHash == NULL) - { - HASHCTL hash_ctl; - - hash_ctl.keysize = sizeof(Oid); - hash_ctl.entrysize = sizeof(PgStat_StatSubEntry); - hash_ctl.hcxt = pgStatLocalContext; - subscriptionStatHash = hash_create("Subscription hash", - PGSTAT_SUBSCRIPTION_HASH_SIZE, - &hash_ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - } - - subentry = (PgStat_StatSubEntry *) hash_search(subscriptionStatHash, - (void *) &subbuf.subid, - HASH_ENTER, NULL); - - memcpy(subentry, &subbuf, sizeof(subbuf)); - break; - } - - case 'E': - goto done; - - default: - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - } - -done: - FreeFile(fpin); - - /* If requested to read the permanent file, also get rid of it. */ - if (permanent) - { - elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); - unlink(statfile); - } - - return dbhash; -} - - -/* - * Reads in the existing statistics collector file for the given database, - * filling the passed-in tables and functions hash tables. - * - * As in pgstat_read_statsfiles, if the permanent file is requested, it is - * removed after reading. 
- * - * Note: this code has the ability to skip storing per-table or per-function - * data, if NULL is passed for the corresponding hashtable. That's not used - * at the moment though. - */ -static void -pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, - bool permanent) -{ - PgStat_StatTabEntry *tabentry; - PgStat_StatTabEntry tabbuf; - PgStat_StatFuncEntry funcbuf; - PgStat_StatFuncEntry *funcentry; - FILE *fpin; - int32 format_id; - bool found; - char statfile[MAXPGPATH]; - - get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH); - - /* - * Try to open the stats file. If it doesn't exist, the backends simply - * return zero for anything and the collector simply starts from scratch - * with empty counters. - * - * ENOENT is a possibility if the stats collector is not running or has - * not yet written the stats file the first time. Any other failure - * condition is suspicious. - */ - if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) - { - if (errno != ENOENT) - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errcode_for_file_access(), - errmsg("could not open statistics file \"%s\": %m", - statfile))); - return; - } - - /* - * Verify it's of the expected format. - */ - if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || - format_id != PGSTAT_FILE_FORMAT_ID) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - goto done; - } - - /* - * We found an existing collector stats file. Read it and put all the - * hashtable entries into place. - */ - for (;;) - { - switch (fgetc(fpin)) - { - /* - * 'T' A PgStat_StatTabEntry follows. - */ - case 'T': - if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry), - fpin) != sizeof(PgStat_StatTabEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - /* - * Skip if table data not wanted. - */ - if (tabhash == NULL) - break; - - tabentry = (PgStat_StatTabEntry *) hash_search(tabhash, - (void *) &tabbuf.tableid, - HASH_ENTER, &found); - - if (found) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - memcpy(tabentry, &tabbuf, sizeof(tabbuf)); - break; - - /* - * 'F' A PgStat_StatFuncEntry follows. - */ - case 'F': - if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry), - fpin) != sizeof(PgStat_StatFuncEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - /* - * Skip if function data not wanted. - */ - if (funchash == NULL) - break; - - funcentry = (PgStat_StatFuncEntry *) hash_search(funchash, - (void *) &funcbuf.functionid, - HASH_ENTER, &found); - - if (found) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - - memcpy(funcentry, &funcbuf, sizeof(funcbuf)); - break; - - /* - * 'E' The EOF marker of a complete stats file. - */ - case 'E': - goto done; - - default: - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - goto done; - } - } - -done: - FreeFile(fpin); - - if (permanent) - { - elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); - unlink(statfile); - } -} - -/* - * Attempt to determine the timestamp of the last db statfile write. - * Returns true if successful; the timestamp is stored in *ts. 
The caller must - * rely on timestamp stored in *ts iff the function returns true. - * - * This needs to be careful about handling databases for which no stats file - * exists, such as databases without a stat entry or those not yet written: - * - * - if there's a database entry in the global file, return the corresponding - * stats_timestamp value. - * - * - if there's no db stat entry (e.g. for a new or inactive database), - * there's no stats_timestamp value, but also nothing to write so we return - * the timestamp of the global statfile. - */ -static bool -pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, - TimestampTz *ts) -{ - PgStat_StatDBEntry dbentry; - PgStat_GlobalStats myGlobalStats; - PgStat_ArchiverStats myArchiverStats; - PgStat_WalStats myWalStats; - PgStat_SLRUStats mySLRUStats[SLRU_NUM_ELEMENTS]; - PgStat_StatReplSlotEntry myReplSlotStats; - PgStat_StatSubEntry mySubStats; - FILE *fpin; - int32 format_id; - const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; - - /* - * Try to open the stats file. As above, anything but ENOENT is worthy of - * complaining about. - */ - if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) - { - if (errno != ENOENT) - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errcode_for_file_access(), - errmsg("could not open statistics file \"%s\": %m", - statfile))); - return false; - } - - /* - * Verify it's of the expected format. - */ - if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || - format_id != PGSTAT_FILE_FORMAT_ID) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* - * Read global stats struct - */ - if (fread(&myGlobalStats, 1, sizeof(myGlobalStats), - fpin) != sizeof(myGlobalStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* - * Read archiver stats struct - */ - if (fread(&myArchiverStats, 1, sizeof(myArchiverStats), - fpin) != sizeof(myArchiverStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* - * Read WAL stats struct - */ - if (fread(&myWalStats, 1, sizeof(myWalStats), fpin) != sizeof(myWalStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* - * Read SLRU stats struct - */ - if (fread(mySLRUStats, 1, sizeof(mySLRUStats), fpin) != sizeof(mySLRUStats)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", statfile))); - FreeFile(fpin); - return false; - } - - /* By default, we're going to return the timestamp of the global file. */ - *ts = myGlobalStats.stats_timestamp; - - /* - * We found an existing collector stats file. Read it and look for a - * record for the requested database. If found, use its timestamp. - */ - for (;;) - { - switch (fgetc(fpin)) - { - /* - * 'D' A PgStat_StatDBEntry struct describing a database - * follows. - */ - case 'D': - if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables), - fpin) != offsetof(PgStat_StatDBEntry, tables)) - { - ereport(pgStatRunningInCollector ? 
LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - FreeFile(fpin); - return false; - } - - /* - * If this is the DB we're looking for, save its timestamp and - * we're done. - */ - if (dbentry.databaseid == databaseid) - { - *ts = dbentry.stats_timestamp; - goto done; - } - - break; - - /* - * 'R' A PgStat_StatReplSlotEntry struct describing a - * replication slot follows. - */ - case 'R': - if (fread(&myReplSlotStats, 1, sizeof(PgStat_StatReplSlotEntry), fpin) - != sizeof(PgStat_StatReplSlotEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - FreeFile(fpin); - return false; - } - break; - - /* - * 'S' A PgStat_StatSubEntry struct describing subscription - * statistics follows. - */ - case 'S': - if (fread(&mySubStats, 1, sizeof(PgStat_StatSubEntry), fpin) - != sizeof(PgStat_StatSubEntry)) - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - FreeFile(fpin); - return false; - } - break; - - case 'E': - goto done; - - default: - { - ereport(pgStatRunningInCollector ? LOG : WARNING, - (errmsg("corrupted statistics file \"%s\"", - statfile))); - FreeFile(fpin); - return false; - } - } - } - -done: - FreeFile(fpin); - return true; -} - -/* - * If not already done, read the statistics collector stats file into - * some hash tables. The results will be kept until pgstat_clear_snapshot() - * is called (typically, at end of transaction). - */ -static void -backend_read_statsfile(void) -{ - TimestampTz min_ts = 0; - TimestampTz ref_ts = 0; - Oid inquiry_db; - int count; - - pgstat_assert_is_up(); - - /* already read it? */ - if (pgStatDBHash) - return; - Assert(!pgStatRunningInCollector); - - /* - * In a normal backend, we check staleness of the data for our own DB, and - * so we send MyDatabaseId in inquiry messages. In the autovac launcher, - * check staleness of the shared-catalog data, and send InvalidOid in - * inquiry messages so as not to force writing unnecessary data. - */ - if (IsAutoVacuumLauncherProcess()) - inquiry_db = InvalidOid; - else - inquiry_db = MyDatabaseId; - - /* - * Loop until fresh enough stats file is available or we ran out of time. - * The stats inquiry message is sent repeatedly in case collector drops - * it; but not every single time, as that just swamps the collector. - */ - for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++) - { - bool ok; - TimestampTz file_ts = 0; - TimestampTz cur_ts; + PgStatShared_Common *shstats; + const PgStat_KindInfo *kind_info = NULL; CHECK_FOR_INTERRUPTS(); - ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts); + /* we may have some "dropped" entries not yet removed, skip them */ + Assert(!ps->dropped); + if (ps->dropped) + continue; - cur_ts = GetCurrentTimestamp(); - /* Calculate min acceptable timestamp, if we didn't already */ - if (count == 0 || cur_ts < ref_ts) + shstats = (PgStatShared_Common *) dsa_get_address(pgStatLocal.dsa, ps->body); + + kind_info = pgstat_get_kind_info(ps->key.kind); + + /* if not dropped the valid-entry refcount should exist */ + Assert(pg_atomic_read_u32(&ps->refcount) > 0); + + if (!kind_info->to_serialized_name) { - /* - * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL - * msec before now. This indirectly ensures that the collector - * needn't write the file more often than PGSTAT_STAT_INTERVAL. 
In - * an autovacuum worker, however, we want a lower delay to avoid - * using stale data, so we use PGSTAT_RETRY_DELAY (since the - * number of workers is low, this shouldn't be a problem). - * - * We don't recompute min_ts after sleeping, except in the - * unlikely case that cur_ts went backwards. So we might end up - * accepting a file a bit older than PGSTAT_STAT_INTERVAL. In - * practice that shouldn't happen, though, as long as the sleep - * time is less than PGSTAT_STAT_INTERVAL; and we don't want to - * tell the collector that our cutoff time is less than what we'd - * actually accept. - */ - ref_ts = cur_ts; - if (IsAutoVacuumWorkerProcess()) - min_ts = TimestampTzPlusMilliseconds(ref_ts, - -PGSTAT_RETRY_DELAY); - else - min_ts = TimestampTzPlusMilliseconds(ref_ts, - -PGSTAT_STAT_INTERVAL); + /* normal stats entry, identified by PgStat_HashKey */ + fputc('S', fpout); + write_chunk_s(fpout, &ps->key); + } + else + { + /* stats entry identified by name on disk (e.g. slots) */ + NameData name; + + kind_info->to_serialized_name(shstats, &name); + + fputc('N', fpout); + write_chunk_s(fpout, &ps->key.kind); + write_chunk_s(fpout, &name); } - /* - * If the file timestamp is actually newer than cur_ts, we must have - * had a clock glitch (system time went backwards) or there is clock - * skew between our processor and the stats collector's processor. - * Accept the file, but send an inquiry message anyway to make - * pgstat_recv_inquiry do a sanity check on the collector's time. - */ - if (ok && file_ts > cur_ts) - { - /* - * A small amount of clock skew between processors isn't terribly - * surprising, but a large difference is worth logging. We - * arbitrarily define "large" as 1000 msec. - */ - if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000)) - { - char *filetime; - char *mytime; - - /* Copy because timestamptz_to_str returns a static buffer */ - filetime = pstrdup(timestamptz_to_str(file_ts)); - mytime = pstrdup(timestamptz_to_str(cur_ts)); - ereport(LOG, - (errmsg("statistics collector's time %s is later than backend local time %s", - filetime, mytime))); - pfree(filetime); - pfree(mytime); - } - - pgstat_send_inquiry(cur_ts, min_ts, inquiry_db); - break; - } - - /* Normal acceptance case: file is not older than cutoff time */ - if (ok && file_ts >= min_ts) - break; - - /* Not there or too old, so kick the collector and wait a bit */ - if ((count % PGSTAT_INQ_LOOP_COUNT) == 0) - pgstat_send_inquiry(cur_ts, min_ts, inquiry_db); - - pg_usleep(PGSTAT_RETRY_DELAY * 1000L); + /* Write except the header part of the entry */ + write_chunk(fpout, + pgstat_get_entry_data(ps->key.kind, shstats), + pgstat_get_entry_len(ps->key.kind)); } + dshash_seq_term(&hstat); - if (count >= PGSTAT_POLL_LOOP_COUNT) + /* + * No more output to be done. Close the temp file and replace the old + * pgstat.stat with it. The ferror() check replaces testing for error + * after each individual fputc or fwrite (in write_chunk()) above. + */ + fputc('E', fpout); + + if (ferror(fpout)) + { ereport(LOG, - (errmsg("using stale statistics instead of current ones " - "because stats collector is not responding"))); - - /* - * Autovacuum launcher wants stats about all databases, but a shallow read - * is sufficient. Regular backends want a deep read for just the tables - * they can see (MyDatabaseId + shared catalogs). 
- */ - if (IsAutoVacuumLauncherProcess()) - pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false); - else - pgStatDBHash = pgstat_read_statsfiles(MyDatabaseId, false, true); + (errcode_for_file_access(), + errmsg("could not write temporary statistics file \"%s\": %m", + tmpfile))); + FreeFile(fpout); + unlink(tmpfile); + } + else if (FreeFile(fpout) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close temporary statistics file \"%s\": %m", + tmpfile))); + unlink(tmpfile); + } + else if (rename(tmpfile, statfile) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", + tmpfile, statfile))); + unlink(tmpfile); + } } -/* - * Do we need to write out any stats files? - */ +/* helpers for pgstat_read_statsfile() */ static bool -pgstat_write_statsfile_needed(void) +read_chunk(FILE *fpin, void *ptr, size_t len) { - if (pending_write_requests != NIL) - return true; - - /* Everything was written recently */ - return false; + return fread(ptr, 1, len, fpin) == len; } -/* - * Checks whether stats for a particular DB need to be written to a file. - */ -static bool -pgstat_db_requested(Oid databaseid) -{ - /* - * If any requests are outstanding at all, we should write the stats for - * shared catalogs (the "database" with OID 0). This ensures that - * backends will see up-to-date stats for shared catalogs, even though - * they send inquiry messages mentioning only their own DB. - */ - if (databaseid == InvalidOid && pending_write_requests != NIL) - return true; - - /* Search to see if there's an open request to write this database. */ - if (list_member_oid(pending_write_requests, databaseid)) - return true; - - return false; -} - - -/* ------------------------------------------------------------ - * stats collector message processing functions - * ------------------------------------------------------------ - */ +#define read_chunk_s(fpin, ptr) read_chunk(fpin, ptr, sizeof(*ptr)) /* - * Process stat inquiry requests. + * Reads in existing statistics file into the shared stats hash. + * + * This function is called in the only process that is accessing the shared + * stats so locking is not required. */ static void -pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len) +pgstat_read_statsfile(void) { - PgStat_StatDBEntry *dbentry; - - elog(DEBUG2, "received inquiry for database %u", msg->databaseid); - - /* - * If there's already a write request for this DB, there's nothing to do. - * - * Note that if a request is found, we return early and skip the below - * check for clock skew. This is okay, since the only way for a DB - * request to be present in the list is that we have been here since the - * last write round. It seems sufficient to check for clock skew once per - * write round. - */ - if (list_member_oid(pending_write_requests, msg->databaseid)) - return; - - /* - * Check to see if we last wrote this database at a time >= the requested - * cutoff time. If so, this is a stale request that was generated before - * we updated the DB file, and we don't need to do so again. - * - * If the requestor's local clock time is older than stats_timestamp, we - * should suspect a clock glitch, ie system time going backwards; though - * the more likely explanation is just delayed message receipt. It is - * worth expending a GetCurrentTimestamp call to be sure, since a large - * retreat in the system clock reading could otherwise cause us to neglect - * to update the stats file for a long time. 
- */ - dbentry = pgstat_get_db_entry(msg->databaseid, false); - if (dbentry == NULL) - { - /* - * We have no data for this DB. Enter a write request anyway so that - * the global stats will get updated. This is needed to prevent - * backend_read_statsfile from waiting for data that we cannot supply, - * in the case of a new DB that nobody has yet reported any stats for. - * See the behavior of pgstat_read_db_statsfile_timestamp. - */ - } - else if (msg->clock_time < dbentry->stats_timestamp) - { - TimestampTz cur_ts = GetCurrentTimestamp(); - - if (cur_ts < dbentry->stats_timestamp) - { - /* - * Sure enough, time went backwards. Force a new stats file write - * to get back in sync; but first, log a complaint. - */ - char *writetime; - char *mytime; - - /* Copy because timestamptz_to_str returns a static buffer */ - writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp)); - mytime = pstrdup(timestamptz_to_str(cur_ts)); - ereport(LOG, - (errmsg("stats_timestamp %s is later than collector's time %s for database %u", - writetime, mytime, dbentry->databaseid))); - pfree(writetime); - pfree(mytime); - } - else - { - /* - * Nope, it's just an old request. Assuming msg's clock_time is - * >= its cutoff_time, it must be stale, so we can ignore it. - */ - return; - } - } - else if (msg->cutoff_time <= dbentry->stats_timestamp) - { - /* Stale request, ignore it */ - return; - } - - /* - * We need to write this DB, so create a request. - */ - pending_write_requests = lappend_oid(pending_write_requests, - msg->databaseid); -} - -/* - * Count what the backend has done. - */ -static void -pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - PgStat_StatTabEntry *tabentry; - int i; + FILE *fpin; + int32 format_id; bool found; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - /* - * Update database-wide stats. - */ - dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit); - dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback); - dbentry->n_block_read_time += msg->m_block_read_time; - dbentry->n_block_write_time += msg->m_block_write_time; - - dbentry->total_session_time += msg->m_session_time; - dbentry->total_active_time += msg->m_active_time; - dbentry->total_idle_in_xact_time += msg->m_idle_in_xact_time; - - /* - * Process all table entries in the message. - */ - for (i = 0; i < msg->m_nentries; i++) - { - PgStat_TableEntry *tabmsg = &(msg->m_entry[i]); - - tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, - (void *) &(tabmsg->t_id), - HASH_ENTER, &found); - - if (!found) - { - /* - * If it's a new table entry, initialize counters to the values we - * just got. 
- */ - tabentry->numscans = tabmsg->t_counts.t_numscans; - tabentry->tuples_returned = tabmsg->t_counts.t_tuples_returned; - tabentry->tuples_fetched = tabmsg->t_counts.t_tuples_fetched; - tabentry->tuples_inserted = tabmsg->t_counts.t_tuples_inserted; - tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated; - tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted; - tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated; - tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples; - tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples; - tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples; - tabentry->inserts_since_vacuum = tabmsg->t_counts.t_tuples_inserted; - tabentry->blocks_fetched = tabmsg->t_counts.t_blocks_fetched; - tabentry->blocks_hit = tabmsg->t_counts.t_blocks_hit; - - tabentry->vacuum_timestamp = 0; - tabentry->vacuum_count = 0; - tabentry->autovac_vacuum_timestamp = 0; - tabentry->autovac_vacuum_count = 0; - tabentry->analyze_timestamp = 0; - tabentry->analyze_count = 0; - tabentry->autovac_analyze_timestamp = 0; - tabentry->autovac_analyze_count = 0; - } - else - { - /* - * Otherwise add the values to the existing entry. - */ - tabentry->numscans += tabmsg->t_counts.t_numscans; - tabentry->tuples_returned += tabmsg->t_counts.t_tuples_returned; - tabentry->tuples_fetched += tabmsg->t_counts.t_tuples_fetched; - tabentry->tuples_inserted += tabmsg->t_counts.t_tuples_inserted; - tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated; - tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted; - tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated; - - /* - * If table was truncated/dropped, first reset the live/dead - * counters. - */ - if (tabmsg->t_counts.t_truncdropped) - { - tabentry->n_live_tuples = 0; - tabentry->n_dead_tuples = 0; - tabentry->inserts_since_vacuum = 0; - } - tabentry->n_live_tuples += tabmsg->t_counts.t_delta_live_tuples; - tabentry->n_dead_tuples += tabmsg->t_counts.t_delta_dead_tuples; - tabentry->changes_since_analyze += tabmsg->t_counts.t_changed_tuples; - tabentry->inserts_since_vacuum += tabmsg->t_counts.t_tuples_inserted; - tabentry->blocks_fetched += tabmsg->t_counts.t_blocks_fetched; - tabentry->blocks_hit += tabmsg->t_counts.t_blocks_hit; - } - - /* Clamp n_live_tuples in case of negative delta_live_tuples */ - tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); - /* Likewise for n_dead_tuples */ - tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); - - /* - * Add per-table stats to the per-database entry, too. - */ - dbentry->n_tuples_returned += tabmsg->t_counts.t_tuples_returned; - dbentry->n_tuples_fetched += tabmsg->t_counts.t_tuples_fetched; - dbentry->n_tuples_inserted += tabmsg->t_counts.t_tuples_inserted; - dbentry->n_tuples_updated += tabmsg->t_counts.t_tuples_updated; - dbentry->n_tuples_deleted += tabmsg->t_counts.t_tuples_deleted; - dbentry->n_blocks_fetched += tabmsg->t_counts.t_blocks_fetched; - dbentry->n_blocks_hit += tabmsg->t_counts.t_blocks_hit; - } -} - -/* - * Arrange for dead table removal. - */ -static void -pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - int i; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, false); - - /* - * No need to purge if we don't even know the database. - */ - if (!dbentry || !dbentry->tables) - return; - - /* - * Process all table entries in the message. 
- */ - for (i = 0; i < msg->m_nentries; i++) - { - /* Remove from hashtable if present; we don't care if it's not. */ - (void) hash_search(dbentry->tables, - (void *) &(msg->m_tableid[i]), - HASH_REMOVE, NULL); - } -} - -/* - * Arrange for dead database removal - */ -static void -pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len) -{ - Oid dbid = msg->m_databaseid; - PgStat_StatDBEntry *dbentry; - - /* - * Lookup the database in the hashtable. - */ - dbentry = pgstat_get_db_entry(dbid, false); - - /* - * If found, remove it (along with the db statfile). - */ - if (dbentry) - { - char statfile[MAXPGPATH]; - - get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH); - - elog(DEBUG2, "removing stats file \"%s\"", statfile); - unlink(statfile); - - if (dbentry->tables != NULL) - hash_destroy(dbentry->tables); - if (dbentry->functions != NULL) - hash_destroy(dbentry->functions); - - if (hash_search(pgStatDBHash, - (void *) &dbid, - HASH_REMOVE, NULL) == NULL) - ereport(ERROR, - (errmsg("database hash table corrupted during cleanup --- abort"))); - } -} - -/* - * Reset the statistics for the specified database. - */ -static void -pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - - /* - * Lookup the database in the hashtable. Nothing to do if not there. - */ - dbentry = pgstat_get_db_entry(msg->m_databaseid, false); - - if (!dbentry) - return; - - /* - * We simply throw away all the database's table entries by recreating a - * new hash table for them. - */ - if (dbentry->tables != NULL) - hash_destroy(dbentry->tables); - if (dbentry->functions != NULL) - hash_destroy(dbentry->functions); - - dbentry->tables = NULL; - dbentry->functions = NULL; - - /* - * Reset database-level stats, too. This creates empty hash tables for - * tables and functions. - */ - reset_dbentry_counters(dbentry); -} - -/* - * Reset some shared statistics of the cluster. - */ -static void -pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len) -{ - if (msg->m_resettarget == PGSTAT_KIND_BGWRITER || - msg->m_resettarget == PGSTAT_KIND_CHECKPOINTER) - { - /* - * Reset the global, bgwriter and checkpointer statistics for the - * cluster. - */ - memset(&globalStats, 0, sizeof(globalStats)); - globalStats.bgwriter.stat_reset_timestamp = GetCurrentTimestamp(); - } - else if (msg->m_resettarget == PGSTAT_KIND_ARCHIVER) - { - /* Reset the archiver statistics for the cluster. */ - memset(&archiverStats, 0, sizeof(archiverStats)); - archiverStats.stat_reset_timestamp = GetCurrentTimestamp(); - } - else if (msg->m_resettarget == PGSTAT_KIND_WAL) - { - /* Reset the WAL statistics for the cluster. */ - memset(&walStats, 0, sizeof(walStats)); - walStats.stat_reset_timestamp = GetCurrentTimestamp(); - } - - /* - * Presumably the sender of this message validated the target, don't - * complain here if it's not valid - */ -} - -/* - * Reset a statistics for a single object, which may be of current - * database or shared across all databases in the cluster. 
- */
-static void
-pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len)
-{
-    PgStat_StatDBEntry *dbentry;
-
-    if (IsSharedRelation(msg->m_objectid))
-        dbentry = pgstat_get_db_entry(InvalidOid, false);
-    else
-        dbentry = pgstat_get_db_entry(msg->m_databaseid, false);
-
-    if (!dbentry)
-        return;
-
-    /* Set the reset timestamp for the whole database */
-    dbentry->stat_reset_timestamp = GetCurrentTimestamp();
-
-    /* Remove object if it exists, ignore it if not */
-    if (msg->m_resettype == PGSTAT_KIND_RELATION)
-        (void) hash_search(dbentry->tables, (void *) &(msg->m_objectid),
-                           HASH_REMOVE, NULL);
-    else if (msg->m_resettype == PGSTAT_KIND_FUNCTION)
-        (void) hash_search(dbentry->functions, (void *) &(msg->m_objectid),
-                           HASH_REMOVE, NULL);
-}
-
-/*
- * Reset some SLRU statistics of the cluster.
- */
-static void
-pgstat_recv_resetslrucounter(PgStat_MsgResetslrucounter *msg, int len)
-{
-    int         i;
+    const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME;
+    PgStat_ShmemControl *shmem = pgStatLocal.shmem;
     TimestampTz ts = GetCurrentTimestamp();
 
-    for (i = 0; i < SLRU_NUM_ELEMENTS; i++)
+    /* shouldn't be called from postmaster */
+    Assert(IsUnderPostmaster || !IsPostmasterEnvironment);
+
+    elog(DEBUG2, "reading stats file \"%s\"", statfile);
+
+    /*
+     * Try to open the stats file. If it doesn't exist, the backends simply
+     * return zero for anything and statistics simply start from scratch
+     * with empty counters.
+     *
+     * ENOENT is a possibility if stats collection was previously disabled or
+     * has not yet written the stats file for the first time. Any other
+     * failure condition is suspicious.
+     */
+    if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
     {
-        /* reset entry with the given index, or all entries (index is -1) */
-        if ((msg->m_index == -1) || (msg->m_index == i))
-        {
-            memset(&slruStats[i], 0, sizeof(slruStats[i]));
-            slruStats[i].stat_reset_timestamp = ts;
-        }
-    }
-}
-
-/*
- * Reset some replication slot statistics of the cluster.
- */
-static void
-pgstat_recv_resetreplslotcounter(PgStat_MsgResetreplslotcounter *msg,
-                                 int len)
-{
-    PgStat_StatReplSlotEntry *slotent;
-    TimestampTz ts;
-
-    /* Return if we don't have replication slot statistics */
-    if (replSlotStatHash == NULL)
+        if (errno != ENOENT)
+            ereport(LOG,
+                    (errcode_for_file_access(),
+                     errmsg("could not open statistics file \"%s\": %m",
+                            statfile)));
+        pgstat_reset_after_failure(ts);
         return;
-
-    ts = GetCurrentTimestamp();
-    if (msg->clearall)
-    {
-        HASH_SEQ_STATUS sstat;
-
-        hash_seq_init(&sstat, replSlotStatHash);
-        while ((slotent = (PgStat_StatReplSlotEntry *) hash_seq_search(&sstat)) != NULL)
-            pgstat_reset_replslot_entry(slotent, ts);
     }
-    else
-    {
-        /* Get the slot statistics to reset */
-        slotent = pgstat_get_replslot_entry(msg->m_slotname, false);
-
-        /*
-         * Nothing to do if the given slot entry is not found. This could
-         * happen when the slot with the given name is removed and the
-         * corresponding statistics entry is also removed before receiving the
-         * reset message.
-         */
-        if (!slotent)
-            return;
-
-        /* Reset the stats for the requested replication slot */
-        pgstat_reset_replslot_entry(slotent, ts);
-    }
-}
-
-/*
- * Reset some subscription statistics of the cluster.
- */ -static void -pgstat_recv_resetsubcounter(PgStat_MsgResetsubcounter *msg, int len) -{ - PgStat_StatSubEntry *subentry; - TimestampTz ts; - - /* Return if we don't have replication subscription statistics */ - if (subscriptionStatHash == NULL) - return; - - ts = GetCurrentTimestamp(); - if (!OidIsValid(msg->m_subid)) - { - HASH_SEQ_STATUS sstat; - - /* Clear all subscription counters */ - hash_seq_init(&sstat, subscriptionStatHash); - while ((subentry = (PgStat_StatSubEntry *) hash_seq_search(&sstat)) != NULL) - pgstat_reset_subscription(subentry, ts); - } - else - { - /* Get the subscription statistics to reset */ - subentry = pgstat_get_subscription_entry(msg->m_subid, false); - - /* - * Nothing to do if the given subscription entry is not found. This - * could happen when the subscription with the subid is removed and - * the corresponding statistics entry is also removed before receiving - * the reset message. - */ - if (!subentry) - return; - - /* Reset the stats for the requested subscription */ - pgstat_reset_subscription(subentry, ts); - } -} - -/* - * Process an autovacuum signaling message. - */ -static void -pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len) -{ - PgStat_StatDBEntry *dbentry; /* - * Store the last autovacuum time in the database's hashtable entry. + * Verify it's of the expected format. */ - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - dbentry->last_autovac_time = msg->m_start_time; -} - -/* - * Process a VACUUM message. - */ -static void -pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - PgStat_StatTabEntry *tabentry; + if (!read_chunk_s(fpin, &format_id) || + format_id != PGSTAT_FILE_FORMAT_ID) + goto error; /* - * Store the data in the table's hashtable entry. + * XXX: The following could now be generalized to just iterate over + * pgstat_kind_infos instead of knowing about the different kinds of + * stats. */ - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); - - tabentry->n_live_tuples = msg->m_live_tuples; - tabentry->n_dead_tuples = msg->m_dead_tuples; /* - * It is quite possible that a non-aggressive VACUUM ended up skipping - * various pages, however, we'll zero the insert counter here regardless. - * It's currently used only to track when we need to perform an "insert" - * autovacuum, which are mainly intended to freeze newly inserted tuples. - * Zeroing this may just mean we'll not try to vacuum the table again - * until enough tuples have been inserted to trigger another insert - * autovacuum. An anti-wraparound autovacuum will catch any persistent - * stragglers. + * Read archiver stats struct */ - tabentry->inserts_since_vacuum = 0; - - if (msg->m_autovacuum) - { - tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime; - tabentry->autovac_vacuum_count++; - } - else - { - tabentry->vacuum_timestamp = msg->m_vacuumtime; - tabentry->vacuum_count++; - } -} - -/* - * Process an ANALYZE message. - */ -static void -pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - PgStat_StatTabEntry *tabentry; + if (!read_chunk_s(fpin, &shmem->archiver.stats)) + goto error; /* - * Store the data in the table's hashtable entry. 
+ * Read bgwriter stats struct */ - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); - - tabentry->n_live_tuples = msg->m_live_tuples; - tabentry->n_dead_tuples = msg->m_dead_tuples; + if (!read_chunk_s(fpin, &shmem->bgwriter.stats)) + goto error; /* - * If commanded, reset changes_since_analyze to zero. This forgets any - * changes that were committed while the ANALYZE was in progress, but we - * have no good way to estimate how many of those there were. + * Read checkpointer stats struct */ - if (msg->m_resetcounter) - tabentry->changes_since_analyze = 0; + if (!read_chunk_s(fpin, &shmem->checkpointer.stats)) + goto error; - if (msg->m_autovacuum) + /* + * Read SLRU stats struct + */ + if (!read_chunk_s(fpin, &shmem->slru.stats)) + goto error; + + /* + * Read WAL stats struct + */ + if (!read_chunk_s(fpin, &shmem->wal.stats)) + goto error; + + /* + * We found an existing statistics file. Read it and put all the hash + * table entries into place. + */ + for (;;) { - tabentry->autovac_analyze_timestamp = msg->m_analyzetime; - tabentry->autovac_analyze_count++; - } - else - { - tabentry->analyze_timestamp = msg->m_analyzetime; - tabentry->analyze_count++; - } -} + char t = fgetc(fpin); -/* - * Process a ARCHIVER message. - */ -static void -pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len) -{ - if (msg->m_failed) - { - /* Failed archival attempt */ - ++archiverStats.failed_count; - memcpy(archiverStats.last_failed_wal, msg->m_xlog, - sizeof(archiverStats.last_failed_wal)); - archiverStats.last_failed_timestamp = msg->m_timestamp; - } - else - { - /* Successful archival operation */ - ++archiverStats.archived_count; - memcpy(archiverStats.last_archived_wal, msg->m_xlog, - sizeof(archiverStats.last_archived_wal)); - archiverStats.last_archived_timestamp = msg->m_timestamp; - } -} - -/* - * Process a BGWRITER message. - */ -static void -pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len) -{ - globalStats.bgwriter.buf_written_clean += msg->m_buf_written_clean; - globalStats.bgwriter.maxwritten_clean += msg->m_maxwritten_clean; - globalStats.bgwriter.buf_alloc += msg->m_buf_alloc; -} - -/* - * Process a CHECKPOINTER message. - */ -static void -pgstat_recv_checkpointer(PgStat_MsgCheckpointer *msg, int len) -{ - globalStats.checkpointer.timed_checkpoints += msg->m_timed_checkpoints; - globalStats.checkpointer.requested_checkpoints += msg->m_requested_checkpoints; - globalStats.checkpointer.checkpoint_write_time += msg->m_checkpoint_write_time; - globalStats.checkpointer.checkpoint_sync_time += msg->m_checkpoint_sync_time; - globalStats.checkpointer.buf_written_checkpoints += msg->m_buf_written_checkpoints; - globalStats.checkpointer.buf_written_backend += msg->m_buf_written_backend; - globalStats.checkpointer.buf_fsync_backend += msg->m_buf_fsync_backend; -} - -/* - * Process a WAL message. - */ -static void -pgstat_recv_wal(PgStat_MsgWal *msg, int len) -{ - walStats.wal_records += msg->m_wal_records; - walStats.wal_fpi += msg->m_wal_fpi; - walStats.wal_bytes += msg->m_wal_bytes; - walStats.wal_buffers_full += msg->m_wal_buffers_full; - walStats.wal_write += msg->m_wal_write; - walStats.wal_sync += msg->m_wal_sync; - walStats.wal_write_time += msg->m_wal_write_time; - walStats.wal_sync_time += msg->m_wal_sync_time; -} - -/* - * Process a SLRU message. 
- */ -static void -pgstat_recv_slru(PgStat_MsgSLRU *msg, int len) -{ - slruStats[msg->m_index].blocks_zeroed += msg->m_blocks_zeroed; - slruStats[msg->m_index].blocks_hit += msg->m_blocks_hit; - slruStats[msg->m_index].blocks_read += msg->m_blocks_read; - slruStats[msg->m_index].blocks_written += msg->m_blocks_written; - slruStats[msg->m_index].blocks_exists += msg->m_blocks_exists; - slruStats[msg->m_index].flush += msg->m_flush; - slruStats[msg->m_index].truncate += msg->m_truncate; -} - -/* - * Process a RECOVERYCONFLICT message. - */ -static void -pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - switch (msg->m_reason) - { - case PROCSIG_RECOVERY_CONFLICT_DATABASE: - - /* - * Since we drop the information about the database as soon as it - * replicates, there is no point in counting these conflicts. - */ - break; - case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: - dbentry->n_conflict_tablespace++; - break; - case PROCSIG_RECOVERY_CONFLICT_LOCK: - dbentry->n_conflict_lock++; - break; - case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: - dbentry->n_conflict_snapshot++; - break; - case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: - dbentry->n_conflict_bufferpin++; - break; - case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: - dbentry->n_conflict_startup_deadlock++; - break; - } -} - -/* - * Process a DEADLOCK message. - */ -static void -pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - dbentry->n_deadlocks++; -} - -/* - * Process a CHECKSUMFAILURE message. - */ -static void -pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, true); - - dbentry->n_checksum_failures += msg->m_failurecount; - dbentry->last_checksum_failure = msg->m_failure_time; -} - -/* - * Process a REPLSLOT message. - */ -static void -pgstat_recv_replslot(PgStat_MsgReplSlot *msg, int len) -{ - if (msg->m_drop) - { - Assert(!msg->m_create); - - /* Remove the replication slot statistics with the given name */ - if (replSlotStatHash != NULL) - (void) hash_search(replSlotStatHash, - (void *) &(msg->m_slotname), - HASH_REMOVE, - NULL); - } - else - { - PgStat_StatReplSlotEntry *slotent; - - slotent = pgstat_get_replslot_entry(msg->m_slotname, true); - Assert(slotent); - - if (msg->m_create) + switch (t) { - /* - * If the message for dropping the slot with the same name gets - * lost, slotent has stats for the old slot. So we initialize all - * counters at slot creation. - */ - pgstat_reset_replslot_entry(slotent, 0); - } - else - { - /* Update the replication slot statistics */ - slotent->spill_txns += msg->m_spill_txns; - slotent->spill_count += msg->m_spill_count; - slotent->spill_bytes += msg->m_spill_bytes; - slotent->stream_txns += msg->m_stream_txns; - slotent->stream_count += msg->m_stream_count; - slotent->stream_bytes += msg->m_stream_bytes; - slotent->total_txns += msg->m_total_txns; - slotent->total_bytes += msg->m_total_bytes; + case 'S': + case 'N': + { + PgStat_HashKey key; + PgStatShared_HashEntry *p; + PgStatShared_Common *header; + + CHECK_FOR_INTERRUPTS(); + + if (t == 'S') + { + /* normal stats entry, identified by PgStat_HashKey */ + if (!read_chunk_s(fpin, &key)) + goto error; + + if (!pgstat_is_kind_valid(key.kind)) + goto error; + } + else + { + /* stats entry identified by name on disk (e.g. 
slots) */
+                        const PgStat_KindInfo *kind_info = NULL;
+                        PgStat_Kind kind;
+                        NameData    name;
+
+                        if (!read_chunk_s(fpin, &kind))
+                            goto error;
+                        if (!read_chunk_s(fpin, &name))
+                            goto error;
+                        if (!pgstat_is_kind_valid(kind))
+                            goto error;
+
+                        kind_info = pgstat_get_kind_info(kind);
+
+                        if (!kind_info->from_serialized_name)
+                            goto error;
+
+                        if (!kind_info->from_serialized_name(&name, &key))
+                        {
+                            /* skip over data for entry we don't care about */
+                            if (fseek(fpin, pgstat_get_entry_len(kind), SEEK_CUR) != 0)
+                                goto error;
+
+                            continue;
+                        }
+
+                        Assert(key.kind == kind);
+                    }
+
+                    /*
+                     * This intentionally doesn't use pgstat_get_entry_ref() -
+                     * putting all stats into checkpointer's
+                     * pgStatEntryRefHash would be wasted effort and memory.
+                     */
+                    p = dshash_find_or_insert(pgStatLocal.shared_hash, &key, &found);
+
+                    /* don't allow duplicate entries */
+                    if (found)
+                    {
+                        dshash_release_lock(pgStatLocal.shared_hash, p);
+                        elog(WARNING, "found duplicate stats entry %d/%u/%u",
+                             key.kind, key.dboid, key.objoid);
+                        goto error;
+                    }
+
+                    header = pgstat_init_entry(key.kind, p);
+                    dshash_release_lock(pgStatLocal.shared_hash, p);
+
+                    if (!read_chunk(fpin,
+                                    pgstat_get_entry_data(key.kind, header),
+                                    pgstat_get_entry_len(key.kind)))
+                        goto error;
+
+                    break;
+                }
+            case 'E':
+                goto done;
+
+            default:
+                goto error;
         }
     }
+
+done:
+    FreeFile(fpin);
+
+    elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
+    unlink(statfile);
+
+    return;
+
+error:
+    ereport(LOG,
+            (errmsg("corrupted statistics file \"%s\"", statfile)));
+
+    /* Set the current timestamp as reset timestamp */
+    pgstat_reset_after_failure(ts);
+
+    goto done;
 }
 
 /*
- * Process a CONNECT message.
+ * Helper to reset / drop stats after restoring stats from disk failed,
+ * potentially after already loading parts.
  */
 static void
-pgstat_recv_connect(PgStat_MsgConnect *msg, int len)
+pgstat_reset_after_failure(TimestampTz ts)
 {
-    PgStat_StatDBEntry *dbentry;
-
-    dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-    dbentry->n_sessions++;
-}
-
-/*
- * Process a DISCONNECT message.
- */
-static void
-pgstat_recv_disconnect(PgStat_MsgDisconnect *msg, int len)
-{
-    PgStat_StatDBEntry *dbentry;
-
-    dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
-    switch (msg->m_cause)
+    /* reset fixed-numbered stats */
+    for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++)
     {
-        case DISCONNECT_NOT_YET:
-        case DISCONNECT_NORMAL:
-            /* we don't collect these */
-            break;
-        case DISCONNECT_CLIENT_EOF:
-            dbentry->n_sessions_abandoned++;
-            break;
-        case DISCONNECT_FATAL:
-            dbentry->n_sessions_fatal++;
-            break;
-        case DISCONNECT_KILLED:
-            dbentry->n_sessions_killed++;
-            break;
+        const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);
+
+        if (!kind_info->fixed_amount)
+            continue;
+
+        kind_info->reset_all_cb(ts);
     }
-}
-
-/*
- * Process a TEMPFILE message.
- */
-static void
-pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len)
-{
-    PgStat_StatDBEntry *dbentry;
-
-    dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
-    dbentry->n_temp_bytes += msg->m_filesize;
-    dbentry->n_temp_files += 1;
-}
-
-/*
- * Count what the backend has done.
- */
-static void
-pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len)
-{
-    PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]);
-    PgStat_StatDBEntry *dbentry;
-    PgStat_StatFuncEntry *funcentry;
-    int         i;
-    bool        found;
-
-    dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
-
-    /*
-     * Process all function entries in the message.
- */ - for (i = 0; i < msg->m_nentries; i++, funcmsg++) - { - funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions, - (void *) &(funcmsg->f_id), - HASH_ENTER, &found); - - if (!found) - { - /* - * If it's a new function entry, initialize counters to the values - * we just got. - */ - funcentry->f_numcalls = funcmsg->f_numcalls; - funcentry->f_total_time = funcmsg->f_total_time; - funcentry->f_self_time = funcmsg->f_self_time; - } - else - { - /* - * Otherwise add the values to the existing entry. - */ - funcentry->f_numcalls += funcmsg->f_numcalls; - funcentry->f_total_time += funcmsg->f_total_time; - funcentry->f_self_time += funcmsg->f_self_time; - } - } -} - -/* - * Arrange for dead function removal. - */ -static void -pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len) -{ - PgStat_StatDBEntry *dbentry; - int i; - - dbentry = pgstat_get_db_entry(msg->m_databaseid, false); - - /* - * No need to purge if we don't even know the database. - */ - if (!dbentry || !dbentry->functions) - return; - - /* - * Process all function entries in the message. - */ - for (i = 0; i < msg->m_nentries; i++) - { - /* Remove from hashtable if present; we don't care if it's not. */ - (void) hash_search(dbentry->functions, - (void *) &(msg->m_functionid[i]), - HASH_REMOVE, NULL); - } -} - -/* - * Process a SUBSCRIPTIONDROP message. - */ -static void -pgstat_recv_subscription_drop(PgStat_MsgSubscriptionDrop *msg, int len) -{ - /* Return if we don't have replication subscription statistics */ - if (subscriptionStatHash == NULL) - return; - - /* Remove from hashtable if present; we don't care if it's not */ - (void) hash_search(subscriptionStatHash, (void *) &(msg->m_subid), - HASH_REMOVE, NULL); -} - -/* - * Process a SUBSCRIPTIONERROR message. - */ -static void -pgstat_recv_subscription_error(PgStat_MsgSubscriptionError *msg, int len) -{ - PgStat_StatSubEntry *subentry; - - /* Get the subscription stats */ - subentry = pgstat_get_subscription_entry(msg->m_subid, true); - Assert(subentry); - - if (msg->m_is_apply_error) - subentry->apply_error_count++; - else - subentry->sync_error_count++; + + /* and drop variable-numbered ones */ + pgstat_drop_all_entries(); } diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 9f7034df11..d5551e0af6 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -255,7 +255,6 @@ static pid_t StartupPID = 0, WalReceiverPID = 0, AutoVacPID = 0, PgArchPID = 0, - PgStatPID = 0, SysLoggerPID = 0; /* Startup process's status */ @@ -510,7 +509,6 @@ typedef struct PGPROC *AuxiliaryProcs; PGPROC *PreparedXactProcs; PMSignalData *PMSignalState; - InheritableSocket pgStatSock; pid_t PostmasterPid; TimestampTz PgStartTime; TimestampTz PgReloadTime; @@ -645,9 +643,8 @@ PostmasterMain(int argc, char *argv[]) * CAUTION: when changing this list, check for side-effects on the signal * handling setup of child processes. See tcop/postgres.c, * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c, - * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, - * postmaster/syslogger.c, postmaster/bgworker.c and - * postmaster/checkpointer.c. + * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/syslogger.c, + * postmaster/bgworker.c and postmaster/checkpointer.c. 
*/ pqinitmask(); PG_SETMASK(&BlockSig); @@ -1384,12 +1381,6 @@ PostmasterMain(int argc, char *argv[]) */ RemovePgTempFiles(); - /* - * Initialize stats collection subsystem (this does NOT start the - * collector process!) - */ - pgstat_init(); - /* * Initialize the autovacuum subsystem (again, no process start yet) */ @@ -1845,11 +1836,6 @@ ServerLoop(void) start_autovac_launcher = false; /* signal processed */ } - /* If we have lost the stats collector, try to start a new one */ - if (PgStatPID == 0 && - (pmState == PM_RUN || pmState == PM_HOT_STANDBY)) - PgStatPID = pgstat_start(); - /* If we have lost the archiver, try to start a new one. */ if (PgArchPID == 0 && PgArchStartupAllowed()) PgArchPID = StartArchiver(); @@ -2772,8 +2758,6 @@ SIGHUP_handler(SIGNAL_ARGS) signal_child(PgArchPID, SIGHUP); if (SysLoggerPID != 0) signal_child(SysLoggerPID, SIGHUP); - if (PgStatPID != 0) - signal_child(PgStatPID, SIGHUP); /* Reload authentication config files too */ if (!load_hba()) @@ -3097,8 +3081,6 @@ reaper(SIGNAL_ARGS) AutoVacPID = StartAutoVacLauncher(); if (PgArchStartupAllowed() && PgArchPID == 0) PgArchPID = StartArchiver(); - if (PgStatPID == 0) - PgStatPID = pgstat_start(); /* workers may be scheduled to start now */ maybe_start_bgworkers(); @@ -3165,13 +3147,6 @@ reaper(SIGNAL_ARGS) SignalChildren(SIGUSR2); pmState = PM_SHUTDOWN_2; - - /* - * We can also shut down the stats collector now; there's - * nothing left for it to do. - */ - if (PgStatPID != 0) - signal_child(PgStatPID, SIGQUIT); } else { @@ -3250,22 +3225,6 @@ reaper(SIGNAL_ARGS) continue; } - /* - * Was it the statistics collector? If so, just try to start a new - * one; no need to force reset of the rest of the system. (If fail, - * we'll try again in future cycles of the main loop.) - */ - if (pid == PgStatPID) - { - PgStatPID = 0; - if (!EXIT_STATUS_0(exitstatus)) - LogChildExit(LOG, _("statistics collector process"), - pid, exitstatus); - if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) - PgStatPID = pgstat_start(); - continue; - } - /* Was it the system logger? If so, try to start a new one */ if (pid == SysLoggerPID) { @@ -3707,22 +3666,6 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) signal_child(PgArchPID, (SendStop ? SIGSTOP : SIGQUIT)); } - /* - * Force a power-cycle of the pgstat process too. (This isn't absolutely - * necessary, but it seems like a good idea for robustness, and it - * simplifies the state-machine logic in the case where a shutdown request - * arrives during crash processing.) - */ - if (PgStatPID != 0 && take_action) - { - ereport(DEBUG2, - (errmsg_internal("sending %s to process %d", - "SIGQUIT", - (int) PgStatPID))); - signal_child(PgStatPID, SIGQUIT); - allow_immediate_pgstat_restart(); - } - /* We do NOT restart the syslogger */ if (Shutdown != ImmediateShutdown) @@ -3934,12 +3877,10 @@ PostmasterStateMachine(void) FatalError = true; pmState = PM_WAIT_DEAD_END; - /* Kill the walsenders, archiver and stats collector too */ + /* Kill the walsenders and archiver too */ SignalChildren(SIGQUIT); if (PgArchPID != 0) signal_child(PgArchPID, SIGQUIT); - if (PgStatPID != 0) - signal_child(PgStatPID, SIGQUIT); } } } @@ -3963,8 +3904,7 @@ PostmasterStateMachine(void) { /* * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty - * (ie, no dead_end children remain), and the archiver and stats - * collector are gone too. + * (ie, no dead_end children remain), and the archiver is gone too. 
*
 	 * The reason we wait for those two is to protect them against a new
 	 * postmaster starting conflicting subprocesses; this isn't an
@@ -3974,8 +3914,7 @@ PostmasterStateMachine(void)
 	 * normal state transition leading up to PM_WAIT_DEAD_END, or during
 	 * FatalError processing.
 	 */
-	if (dlist_is_empty(&BackendList) &&
-		PgArchPID == 0 && PgStatPID == 0)
+	if (dlist_is_empty(&BackendList) && PgArchPID == 0)
 	{
 		/* These other guys should be dead already */
 		Assert(StartupPID == 0);
@@ -4183,8 +4122,6 @@ TerminateChildren(int signal)
 		signal_child(AutoVacPID, signal);
 	if (PgArchPID != 0)
 		signal_child(PgArchPID, signal);
-	if (PgStatPID != 0)
-		signal_child(PgStatPID, signal);
 }
 
 /*
@@ -5115,12 +5052,6 @@ SubPostmasterMain(int argc, char *argv[])
 
 		StartBackgroundWorker();
 	}
-	if (strcmp(argv[1], "--forkcol") == 0)
-	{
-		/* Do not want to attach to shared memory */
-
-		PgstatCollectorMain(argc, argv);	/* does not return */
-	}
 	if (strcmp(argv[1], "--forklog") == 0)
 	{
 		/* Do not want to attach to shared memory */
@@ -5224,12 +5155,6 @@ sigusr1_handler(SIGNAL_ARGS)
 	if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) &&
 		pmState == PM_RECOVERY && Shutdown == NoShutdown)
 	{
-		/*
-		 * Likewise, start other special children as needed.
-		 */
-		Assert(PgStatPID == 0);
-		PgStatPID = pgstat_start();
-
 		ereport(LOG,
 				(errmsg("database system is ready to accept read-only connections")));
 
@@ -6145,7 +6070,6 @@ extern slock_t *ShmemLock;
 extern slock_t *ProcStructLock;
 extern PGPROC *AuxiliaryProcs;
 extern PMSignalData *PMSignalState;
-extern pgsocket pgStatSock;
 extern pg_time_t first_syslogger_file_time;
 
 #ifndef WIN32
@@ -6201,8 +6125,6 @@ save_backend_variables(BackendParameters *param, Port *port,
 	param->AuxiliaryProcs = AuxiliaryProcs;
 	param->PreparedXactProcs = PreparedXactProcs;
 	param->PMSignalState = PMSignalState;
-	if (!write_inheritable_socket(&param->pgStatSock, pgStatSock, childPid))
-		return false;
 
 	param->PostmasterPid = PostmasterPid;
 	param->PgStartTime = PgStartTime;
@@ -6436,7 +6358,6 @@ restore_backend_variables(BackendParameters *param, Port *port)
 	AuxiliaryProcs = param->AuxiliaryProcs;
 	PreparedXactProcs = param->PreparedXactProcs;
 	PMSignalState = param->PMSignalState;
-	read_inheritable_socket(&pgStatSock, &param->pgStatSock);
 
 	PostmasterPid = param->PostmasterPid;
 	PgStartTime = param->PgStartTime;
@@ -6475,8 +6396,6 @@ restore_backend_variables(BackendParameters *param, Port *port)
 	if (postmaster_alive_fds[1] >= 0)
 		ReserveExternalFD();
 #endif
-	if (pgStatSock != PGINVALID_SOCKET)
-		ReserveExternalFD();
 }
 diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
index 656ec8f555..30e33dace3 100644
--- a/src/backend/replication/logical/logical.c
+++ b/src/backend/replication/logical/logical.c
@@ -1911,7 +1911,6 @@ UpdateDecodingStats(LogicalDecodingContext *ctx)
 				(long long) rb->totalTxns,
 				(long long) rb->totalBytes);
 
-	namestrcpy(&repSlotStat.slotname, NameStr(ctx->slot->data.name));
 	repSlotStat.spill_txns = rb->spillTxns;
 	repSlotStat.spill_count = rb->spillCount;
 	repSlotStat.spill_bytes = rb->spillBytes;
diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c
index 697fb23634..b2cb31eaad 100644
--- a/src/backend/replication/logical/tablesync.c
+++ b/src/backend/replication/logical/tablesync.c
@@ -141,7 +141,7 @@ finish_sync_worker(void)
 	if (IsTransactionState())
 	{
 		CommitTransactionCommand();
-		pgstat_report_stat(false);
+		pgstat_report_stat(true);
 	}
 
 	/* And flush all writes.
*/ @@ -580,7 +580,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) if (started_tx) { CommitTransactionCommand(); - pgstat_report_stat(false); + pgstat_report_stat(true); } } @@ -1386,7 +1386,7 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) MyLogicalRepWorker->relstate, MyLogicalRepWorker->relstate_lsn); CommitTransactionCommand(); - pgstat_report_stat(false); + pgstat_report_stat(true); StartTransactionCommand(); @@ -1630,7 +1630,7 @@ AllTablesyncsReady(void) if (started_tx) { CommitTransactionCommand(); - pgstat_report_stat(false); + pgstat_report_stat(true); } /* diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index f3868b3e1f..7ade49652e 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -2937,6 +2937,12 @@ LogicalRepApplyLoop(XLogRecPtr last_received) } send_feedback(last_received, requestReply, requestReply); + + /* + * Force reporting to ensure long idle periods don't lead to + * arbitrarily delayed stats. + */ + pgstat_report_stat(true); } } diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 2217af70d4..c35ea7c35b 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -502,6 +502,14 @@ retry: /* We made this slot active, so it's ours now. */ MyReplicationSlot = s; + + /* + * The call to pgstat_acquire_replslot() protects against stats for + * a different slot, from before a restart or such, being present during + * pgstat_report_replslot(). + */ + if (SlotIsLogical(s)) + pgstat_acquire_replslot(s); } /* @@ -746,20 +754,10 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) elog(DEBUG3, "replication slot drop: %s: removed directory", NameStr(slot->data.name)); /* - * Send a message to drop the replication slot to the stats collector. - * Since there is no guarantee of the order of message transfer on a UDP - * connection, it's possible that a message for creating a new slot - * reaches before a message for removing the old slot. We send the drop - * and create messages while holding ReplicationSlotAllocationLock to - * reduce that possibility. If the messages reached in reverse, we would - * lose one statistics update message. But the next update message will - * create the statistics for the replication slot. - * - * XXX In case, the messages for creation and drop slot of the same name - * get lost and create happens before (auto)vacuum cleans up the dead - * slot, the stats will be accumulated into the old slot. One can imagine - * having OIDs for each slot to avoid the accumulation of stats but that - * doesn't seem worth doing as in practice this won't happen frequently. + * Drop the statistics entry for the replication slot. Do this while + * holding ReplicationSlotAllocationLock so that we don't drop a + * statistics entry for another slot with the same name just created in + * another session. 
*/ if (SlotIsLogical(slot)) pgstat_drop_replslot(slot); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index d73a40c1bc..f80f90ac3c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2151,7 +2151,7 @@ BufferSync(int flags) if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN) { TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id); - PendingCheckpointerStats.m_buf_written_checkpoints++; + PendingCheckpointerStats.buf_written_checkpoints++; num_written++; } } @@ -2261,7 +2261,7 @@ BgBufferSync(WritebackContext *wb_context) strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); /* Report buffer alloc counts to pgstat */ - PendingBgWriterStats.m_buf_alloc += recent_alloc; + PendingBgWriterStats.buf_alloc += recent_alloc; /* * If we're not running the LRU scan, just stop after doing the stats @@ -2451,7 +2451,7 @@ BgBufferSync(WritebackContext *wb_context) reusable_buffers++; if (++num_written >= bgwriter_lru_maxpages) { - PendingBgWriterStats.m_maxwritten_clean++; + PendingBgWriterStats.maxwritten_clean++; break; } } @@ -2459,7 +2459,7 @@ BgBufferSync(WritebackContext *wb_context) reusable_buffers++; } - PendingBgWriterStats.m_buf_written_clean += num_written; + PendingBgWriterStats.buf_written_clean += num_written; #ifdef BGW_DEBUG elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d", diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index cd4ebe2fc5..88ff59c568 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -145,6 +145,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, BTreeShmemSize()); size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); + size = add_size(size, StatsShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -296,6 +297,7 @@ CreateSharedMemoryAndSemaphores(void) BTreeShmemInit(); SyncScanShmemInit(); AsyncShmemInit(); + StatsShmemInit(); #ifdef EXEC_BACKEND diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 8f7f1b2f7c..c24779d0bb 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -176,7 +176,13 @@ static const char *const BuiltinTrancheNames[] = { /* LWTRANCHE_PARALLEL_APPEND: */ "ParallelAppend", /* LWTRANCHE_PER_XACT_PREDICATE_LIST: */ - "PerXactPredicateList" + "PerXactPredicateList", + /* LWTRANCHE_PGSTATS_DSA: */ + "PgStatsDSA", + /* LWTRANCHE_PGSTATS_HASH: */ + "PgStatsHash", + /* LWTRANCHE_PGSTATS_DATA: */ + "PgStatsData", }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 260b650f15..95dc2e2c83 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3372,6 +3372,14 @@ ProcessInterrupts(void) IdleSessionTimeoutPending = false; } + if (IdleStatsUpdateTimeoutPending) + { + /* timer should have been disarmed */ + Assert(!IsTransactionBlock()); + IdleStatsUpdateTimeoutPending = false; + pgstat_report_stat(true); + } + if (ProcSignalBarrierPending) ProcessProcSignalBarrier(); @@ -4044,6 +4052,7 @@ PostgresMain(const char *dbname, const char *username) volatile bool send_ready_for_query = true; bool idle_in_transaction_timeout_enabled = false; bool idle_session_timeout_enabled = false; + bool idle_stats_update_timeout_enabled = false; AssertArg(dbname != NULL); 
AssertArg(username != NULL); @@ -4407,6 +4416,8 @@ PostgresMain(const char *dbname, const char *username) } else { + long stats_timeout; + /* * Process incoming notifies (including self-notifies), if * any, and send relevant messages to the client. Doing it @@ -4417,7 +4428,14 @@ PostgresMain(const char *dbname, const char *username) if (notifyInterruptPending) ProcessNotifyInterrupt(false); - pgstat_report_stat(false); + /* Start the idle-stats-update timer */ + stats_timeout = pgstat_report_stat(false); + if (stats_timeout > 0) + { + idle_stats_update_timeout_enabled = true; + enable_timeout_after(IDLE_STATS_UPDATE_TIMEOUT, + stats_timeout); + } set_ps_display("idle"); pgstat_report_activity(STATE_IDLE, NULL); @@ -4452,9 +4470,9 @@ PostgresMain(const char *dbname, const char *username) firstchar = ReadCommand(&input_message); /* - * (4) turn off the idle-in-transaction and idle-session timeouts, if - * active. We do this before step (5) so that any last-moment timeout - * is certain to be detected in step (5). + * (4) turn off the idle-in-transaction, idle-session and + * idle-stats-update timeouts if active. We do this before step (5) so + * that any last-moment timeout is certain to be detected in step (5). * * At most one of these timeouts will be active, so there's no need to * worry about combining the timeout.c calls into one. @@ -4469,6 +4487,11 @@ PostgresMain(const char *dbname, const char *username) disable_timeout(IDLE_SESSION_TIMEOUT, false); idle_session_timeout_enabled = false; } + if (idle_stats_update_timeout_enabled) + { + disable_timeout(IDLE_STATS_UPDATE_TIMEOUT, false); + idle_stats_update_timeout_enabled = false; + } /* * (5) disable async signal conditions again. diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile index 791ba68e7e..690312308f 100644 --- a/src/backend/utils/activity/Makefile +++ b/src/backend/utils/activity/Makefile @@ -23,6 +23,7 @@ OBJS = \ pgstat_function.o \ pgstat_relation.o \ pgstat_replslot.o \ + pgstat_shmem.o \ pgstat_slru.o \ pgstat_subscription.o \ pgstat_wal.o \ diff --git a/src/backend/utils/activity/pgstat_archiver.c b/src/backend/utils/activity/pgstat_archiver.c index 09bc12070d..851726fd50 100644 --- a/src/backend/utils/activity/pgstat_archiver.c +++ b/src/backend/utils/activity/pgstat_archiver.c @@ -27,14 +27,85 @@ void pgstat_report_archiver(const char *xlog, bool failed) { - PgStat_MsgArchiver msg; + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + TimestampTz now = GetCurrentTimestamp(); - /* - * Prepare and send the message - */ - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ARCHIVER); - msg.m_failed = failed; - strlcpy(msg.m_xlog, xlog, sizeof(msg.m_xlog)); - msg.m_timestamp = GetCurrentTimestamp(); - pgstat_send(&msg, sizeof(msg)); + pgstat_begin_changecount_write(&stats_shmem->changecount); + + if (failed) + { + ++stats_shmem->stats.failed_count; + memcpy(&stats_shmem->stats.last_failed_wal, xlog, + sizeof(stats_shmem->stats.last_failed_wal)); + stats_shmem->stats.last_failed_timestamp = now; + } + else + { + ++stats_shmem->stats.archived_count; + memcpy(&stats_shmem->stats.last_archived_wal, xlog, + sizeof(stats_shmem->stats.last_archived_wal)); + stats_shmem->stats.last_archived_timestamp = now; + } + + pgstat_end_changecount_write(&stats_shmem->changecount); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the archiver statistics struct. 
+ */ +PgStat_ArchiverStats * +pgstat_fetch_stat_archiver(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_ARCHIVER); + + return &pgStatLocal.snapshot.archiver; +} + +void +pgstat_archiver_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + + /* see explanation above PgStatShared_Archiver for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_archiver_snapshot_cb(void) +{ + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + PgStat_ArchiverStats *stat_snap = &pgStatLocal.snapshot.archiver; + PgStat_ArchiverStats *reset_offset = &stats_shmem->reset_offset; + PgStat_ArchiverStats reset; + + pgstat_copy_changecounted_stats(stat_snap, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ + if (stat_snap->archived_count == reset.archived_count) + { + stat_snap->last_archived_wal[0] = 0; + stat_snap->last_archived_timestamp = 0; + } + stat_snap->archived_count -= reset.archived_count; + + if (stat_snap->failed_count == reset.failed_count) + { + stat_snap->last_failed_wal[0] = 0; + stat_snap->last_failed_timestamp = 0; + } + stat_snap->failed_count -= reset.failed_count; } diff --git a/src/backend/utils/activity/pgstat_bgwriter.c b/src/backend/utils/activity/pgstat_bgwriter.c index dfea88eca1..fbb1edc527 100644 --- a/src/backend/utils/activity/pgstat_bgwriter.c +++ b/src/backend/utils/activity/pgstat_bgwriter.c @@ -20,12 +20,7 @@ #include "utils/pgstat_internal.h" -/* - * BgWriter global statistics counters. Stored directly in a stats - * message structure so they can be sent without needing to copy things - * around. We assume this init to zeroes. - */ -PgStat_MsgBgWriter PendingBgWriterStats; +PgStat_BgWriterStats PendingBgWriterStats = {0}; /* @@ -34,27 +29,82 @@ PgStat_MsgBgWriter PendingBgWriterStats; void pgstat_report_bgwriter(void) { - /* We assume this initializes to zeroes */ - static const PgStat_MsgBgWriter all_zeroes; + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + static const PgStat_BgWriterStats all_zeroes; + Assert(!pgStatLocal.shmem->is_shutdown); pgstat_assert_is_up(); /* * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. + * this case, avoid unnecessarily modifying the stats entry. */ - if (memcmp(&PendingBgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0) + if (memcmp(&PendingBgWriterStats, &all_zeroes, sizeof(all_zeroes)) == 0) return; - /* - * Prepare and send the message - */ - pgstat_setheader(&PendingBgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER); - pgstat_send(&PendingBgWriterStats, sizeof(PendingBgWriterStats)); + pgstat_begin_changecount_write(&stats_shmem->changecount); + +#define BGWRITER_ACC(fld) stats_shmem->stats.fld += PendingBgWriterStats.fld + BGWRITER_ACC(buf_written_clean); + BGWRITER_ACC(maxwritten_clean); + BGWRITER_ACC(buf_alloc); +#undef BGWRITER_ACC + + pgstat_end_changecount_write(&stats_shmem->changecount); /* * Clear out the statistics buffer, so it can be re-used. 
*/ MemSet(&PendingBgWriterStats, 0, sizeof(PendingBgWriterStats)); } + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the bgwriter statistics struct. + */ +PgStat_BgWriterStats * +pgstat_fetch_stat_bgwriter(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_BGWRITER); + + return &pgStatLocal.snapshot.bgwriter; +} + +void +pgstat_bgwriter_reset_all_cb(TimestampTz ts) +{ + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + + /* see explanation above PgStatShared_BgWriter for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_bgwriter_snapshot_cb(void) +{ + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + PgStat_BgWriterStats *reset_offset = &stats_shmem->reset_offset; + PgStat_BgWriterStats reset; + + pgstat_copy_changecounted_stats(&pgStatLocal.snapshot.bgwriter, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ +#define BGWRITER_COMP(fld) pgStatLocal.snapshot.bgwriter.fld -= reset.fld; + BGWRITER_COMP(buf_written_clean); + BGWRITER_COMP(maxwritten_clean); + BGWRITER_COMP(buf_alloc); +#undef BGWRITER_COMP +} diff --git a/src/backend/utils/activity/pgstat_checkpointer.c b/src/backend/utils/activity/pgstat_checkpointer.c index 3f4e2054f5..af8d513e7b 100644 --- a/src/backend/utils/activity/pgstat_checkpointer.c +++ b/src/backend/utils/activity/pgstat_checkpointer.c @@ -20,12 +20,7 @@ #include "utils/pgstat_internal.h" -/* - * Checkpointer global statistics counters. Stored directly in a stats - * message structure so they can be sent without needing to copy things - * around. We assume this init to zeroes. - */ -PgStat_MsgCheckpointer PendingCheckpointerStats; +PgStat_CheckpointerStats PendingCheckpointerStats = {0}; /* @@ -35,24 +30,92 @@ void pgstat_report_checkpointer(void) { /* We assume this initializes to zeroes */ - static const PgStat_MsgCheckpointer all_zeroes; + static const PgStat_CheckpointerStats all_zeroes; + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + + Assert(!pgStatLocal.shmem->is_shutdown); + pgstat_assert_is_up(); /* * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. + * this case, avoid unnecessarily modifying the stats entry. 
*/ - if (memcmp(&PendingCheckpointerStats, &all_zeroes, sizeof(PgStat_MsgCheckpointer)) == 0) + if (memcmp(&PendingCheckpointerStats, &all_zeroes, + sizeof(all_zeroes)) == 0) return; - /* - * Prepare and send the message - */ - pgstat_setheader(&PendingCheckpointerStats.m_hdr, PGSTAT_MTYPE_CHECKPOINTER); - pgstat_send(&PendingCheckpointerStats, sizeof(PendingCheckpointerStats)); + pgstat_begin_changecount_write(&stats_shmem->changecount); + +#define CHECKPOINTER_ACC(fld) stats_shmem->stats.fld += PendingCheckpointerStats.fld + CHECKPOINTER_ACC(timed_checkpoints); + CHECKPOINTER_ACC(requested_checkpoints); + CHECKPOINTER_ACC(checkpoint_write_time); + CHECKPOINTER_ACC(checkpoint_sync_time); + CHECKPOINTER_ACC(buf_written_checkpoints); + CHECKPOINTER_ACC(buf_written_backend); + CHECKPOINTER_ACC(buf_fsync_backend); +#undef CHECKPOINTER_ACC + + pgstat_end_changecount_write(&stats_shmem->changecount); /* * Clear out the statistics buffer, so it can be re-used. */ MemSet(&PendingCheckpointerStats, 0, sizeof(PendingCheckpointerStats)); } + +/* + * pgstat_fetch_stat_checkpointer() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the checkpointer statistics struct. + */ +PgStat_CheckpointerStats * +pgstat_fetch_stat_checkpointer(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_CHECKPOINTER); + + return &pgStatLocal.snapshot.checkpointer; +} + +void +pgstat_checkpointer_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + + /* see explanation above PgStatShared_Checkpointer for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_checkpointer_snapshot_cb(void) +{ + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + PgStat_CheckpointerStats *reset_offset = &stats_shmem->reset_offset; + PgStat_CheckpointerStats reset; + + pgstat_copy_changecounted_stats(&pgStatLocal.snapshot.checkpointer, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ +#define CHECKPOINTER_COMP(fld) pgStatLocal.snapshot.checkpointer.fld -= reset.fld; + CHECKPOINTER_COMP(timed_checkpoints); + CHECKPOINTER_COMP(requested_checkpoints); + CHECKPOINTER_COMP(checkpoint_write_time); + CHECKPOINTER_COMP(checkpoint_sync_time); + CHECKPOINTER_COMP(buf_written_checkpoints); + CHECKPOINTER_COMP(buf_written_backend); + CHECKPOINTER_COMP(buf_fsync_backend); +#undef CHECKPOINTER_COMP +} diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c index 6d27657bdb..649d9c6960 100644 --- a/src/backend/utils/activity/pgstat_database.c +++ b/src/backend/utils/activity/pgstat_database.c @@ -19,13 +19,12 @@ #include "utils/pgstat_internal.h" #include "utils/timestamp.h" +#include "storage/procsignal.h" static bool pgstat_should_report_connstat(void); -int pgStatXactCommit = 0; -int pgStatXactRollback = 0; PgStat_Counter pgStatBlockReadTime = 0; PgStat_Counter pgStatBlockWriteTime = 0; PgStat_Counter pgStatActiveTime = 0; @@ -33,25 +32,18 @@ PgStat_Counter pgStatTransactionIdleTime = 0; SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL; +static int 
pgStatXactCommit = 0; +static int pgStatXactRollback = 0; static PgStat_Counter pgLastSessionReportTime = 0; /* - * Tell the collector that we just dropped a database. - * (If the message gets lost, we will still clean the dead DB eventually - * via future invocations of pgstat_vacuum_stat().) + * Remove entry for the database being dropped. */ void pgstat_drop_database(Oid databaseid) { - PgStat_MsgDropdb msg; - - if (pgStatSock == PGINVALID_SOCKET) - return; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB); - msg.m_databaseid = databaseid; - pgstat_send(&msg, sizeof(msg)); + pgstat_drop_transactional(PGSTAT_KIND_DATABASE, databaseid, InvalidOid); } /* @@ -62,16 +54,24 @@ pgstat_drop_database(Oid databaseid) void pgstat_report_autovac(Oid dboid) { - PgStat_MsgAutovacStart msg; + PgStat_EntryRef *entry_ref; + PgStatShared_Database *dbentry; - if (pgStatSock == PGINVALID_SOCKET) - return; + /* can't get here in single user mode */ + Assert(IsUnderPostmaster); - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START); - msg.m_databaseid = dboid; - msg.m_start_time = GetCurrentTimestamp(); + /* + * End-of-vacuum is reported instantly. Report the start the same way for + * consistency. Vacuum doesn't run frequently and is a long-lasting + * operation so it doesn't matter if we get blocked here a little. + */ + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, + dboid, InvalidOid, false); - pgstat_send(&msg, sizeof(msg)); + dbentry = (PgStatShared_Database *) entry_ref->shared_stats; + dbentry->stats.last_autovac_time = GetCurrentTimestamp(); + + pgstat_unlock_entry(entry_ref); } /* @@ -80,15 +80,39 @@ pgstat_report_autovac(Oid dboid) void pgstat_report_recovery_conflict(int reason) { - PgStat_MsgRecoveryConflict msg; + PgStat_StatDBEntry *dbentry; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + Assert(IsUnderPostmaster); + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYCONFLICT); - msg.m_databaseid = MyDatabaseId; - msg.m_reason = reason; - pgstat_send(&msg, sizeof(msg)); + dbentry = pgstat_prep_database_pending(MyDatabaseId); + + switch (reason) + { + case PROCSIG_RECOVERY_CONFLICT_DATABASE: + + /* + * Since we drop the information about the database as soon as it + * replicates, there is no point in counting these conflicts. 
+ */ + break; + case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + dbentry->n_conflict_tablespace++; + break; + case PROCSIG_RECOVERY_CONFLICT_LOCK: + dbentry->n_conflict_lock++; + break; + case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + dbentry->n_conflict_snapshot++; + break; + case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + dbentry->n_conflict_bufferpin++; + break; + case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + dbentry->n_conflict_startup_deadlock++; + break; + } } /* @@ -97,14 +121,13 @@ pgstat_report_recovery_conflict(int reason) void pgstat_report_deadlock(void) { - PgStat_MsgDeadlock msg; + PgStat_StatDBEntry *dbent; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DEADLOCK); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, sizeof(msg)); + dbent = pgstat_prep_database_pending(MyDatabaseId); + dbent->n_deadlocks++; } /* @@ -113,17 +136,24 @@ pgstat_report_deadlock(void) void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount) { - PgStat_MsgChecksumFailure msg; + PgStat_EntryRef *entry_ref; + PgStatShared_Database *sharedent; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE); - msg.m_databaseid = dboid; - msg.m_failurecount = failurecount; - msg.m_failure_time = GetCurrentTimestamp(); + /* + * Update the shared stats directly - checksum failures should never be + * common enough for that to be a problem. + */ + entry_ref = + pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, dboid, InvalidOid, false); - pgstat_send(&msg, sizeof(msg)); + sharedent = (PgStatShared_Database *) entry_ref->shared_stats; + sharedent->stats.n_checksum_failures += failurecount; + sharedent->stats.last_checksum_failure = GetCurrentTimestamp(); + + pgstat_unlock_entry(entry_ref); } /* @@ -141,15 +171,14 @@ pgstat_report_checksum_failure(void) void pgstat_report_tempfile(size_t filesize) { - PgStat_MsgTempFile msg; + PgStat_StatDBEntry *dbent; - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TEMPFILE); - msg.m_databaseid = MyDatabaseId; - msg.m_filesize = filesize; - pgstat_send(&msg, sizeof(msg)); + dbent = pgstat_prep_database_pending(MyDatabaseId); + dbent->n_temp_bytes += filesize; + dbent->n_temp_files++; } /* @@ -158,16 +187,15 @@ pgstat_report_tempfile(size_t filesize) void pgstat_report_connect(Oid dboid) { - PgStat_MsgConnect msg; + PgStat_StatDBEntry *dbentry; if (!pgstat_should_report_connstat()) return; pgLastSessionReportTime = MyStartTimestamp; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CONNECT); - msg.m_databaseid = MyDatabaseId; - pgstat_send(&msg, sizeof(PgStat_MsgConnect)); + dbentry = pgstat_prep_database_pending(MyDatabaseId); + dbentry->n_sessions++; } /* @@ -176,15 +204,42 @@ pgstat_report_connect(Oid dboid) void pgstat_report_disconnect(Oid dboid) { - PgStat_MsgDisconnect msg; + PgStat_StatDBEntry *dbentry; if (!pgstat_should_report_connstat()) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DISCONNECT); - msg.m_databaseid = MyDatabaseId; - msg.m_cause = pgStatSessionEndCause; - pgstat_send(&msg, sizeof(PgStat_MsgDisconnect)); + dbentry = pgstat_prep_database_pending(MyDatabaseId); + + switch (pgStatSessionEndCause) + { + case DISCONNECT_NOT_YET: + case DISCONNECT_NORMAL: + /* we don't collect these */ + break; + case DISCONNECT_CLIENT_EOF: + dbentry->n_sessions_abandoned++; + break; 
+ case DISCONNECT_FATAL: + dbentry->n_sessions_fatal++; + break; + case DISCONNECT_KILLED: + dbentry->n_sessions_killed++; + break; + } +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one database or NULL. NULL doesn't mean + * that the database doesn't exist, just that there are no statistics, so the + * caller is better off to report ZERO instead. + */ +PgStat_StatDBEntry * +pgstat_fetch_stat_dbentry(Oid dboid) +{ + return (PgStat_StatDBEntry *) + pgstat_fetch_entry(PGSTAT_KIND_DATABASE, dboid, InvalidOid); } void @@ -205,57 +260,47 @@ AtEOXact_PgStat_Database(bool isCommit, bool parallel) } /* - * Subroutine for pgstat_send_tabstat: Handle xact commit/rollback and I/O + * Subroutine for pgstat_report_stat(): Handle xact commit/rollback and I/O * timings. */ void -pgstat_update_dbstats(PgStat_MsgTabstat *tsmsg, TimestampTz now) +pgstat_update_dbstats(TimestampTz ts) { - if (OidIsValid(tsmsg->m_databaseid)) - { - tsmsg->m_xact_commit = pgStatXactCommit; - tsmsg->m_xact_rollback = pgStatXactRollback; - tsmsg->m_block_read_time = pgStatBlockReadTime; - tsmsg->m_block_write_time = pgStatBlockWriteTime; + PgStat_StatDBEntry *dbentry; - if (pgstat_should_report_connstat()) - { - long secs; - int usecs; + dbentry = pgstat_prep_database_pending(MyDatabaseId); - /* - * pgLastSessionReportTime is initialized to MyStartTimestamp by - * pgstat_report_connect(). - */ - TimestampDifference(pgLastSessionReportTime, now, &secs, &usecs); - pgLastSessionReportTime = now; - tsmsg->m_session_time = (PgStat_Counter) secs * 1000000 + usecs; - tsmsg->m_active_time = pgStatActiveTime; - tsmsg->m_idle_in_xact_time = pgStatTransactionIdleTime; - } - else - { - tsmsg->m_session_time = 0; - tsmsg->m_active_time = 0; - tsmsg->m_idle_in_xact_time = 0; - } - pgStatXactCommit = 0; - pgStatXactRollback = 0; - pgStatBlockReadTime = 0; - pgStatBlockWriteTime = 0; - pgStatActiveTime = 0; - pgStatTransactionIdleTime = 0; - } - else + /* + * Accumulate xact commit/rollback and I/O timings to stats entry of the + * current database. + */ + dbentry->n_xact_commit += pgStatXactCommit; + dbentry->n_xact_rollback += pgStatXactRollback; + dbentry->n_block_read_time += pgStatBlockReadTime; + dbentry->n_block_write_time += pgStatBlockWriteTime; + + if (pgstat_should_report_connstat()) { - tsmsg->m_xact_commit = 0; - tsmsg->m_xact_rollback = 0; - tsmsg->m_block_read_time = 0; - tsmsg->m_block_write_time = 0; - tsmsg->m_session_time = 0; - tsmsg->m_active_time = 0; - tsmsg->m_idle_in_xact_time = 0; + long secs; + int usecs; + + /* + * pgLastSessionReportTime is initialized to MyStartTimestamp by + * pgstat_report_connect(). + */ + TimestampDifference(pgLastSessionReportTime, ts, &secs, &usecs); + pgLastSessionReportTime = ts; + dbentry->total_session_time += (PgStat_Counter) secs * 1000000 + usecs; + dbentry->total_active_time += pgStatActiveTime; + dbentry->total_idle_in_xact_time += pgStatTransactionIdleTime; } + + pgStatXactCommit = 0; + pgStatXactRollback = 0; + pgStatBlockReadTime = 0; + pgStatBlockWriteTime = 0; + pgStatActiveTime = 0; + pgStatTransactionIdleTime = 0; } /* @@ -270,3 +315,111 @@ pgstat_should_report_connstat(void) { return MyBackendType == B_BACKEND; } + +/* + * Find or create a local PgStat_StatDBEntry entry for dboid. 
+ */ +PgStat_StatDBEntry * +pgstat_prep_database_pending(Oid dboid) +{ + PgStat_EntryRef *entry_ref; + + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_DATABASE, dboid, InvalidOid, + NULL); + + return entry_ref->pending; + +} + +/* + * Reset the database's reset timestamp, without resetting the contents of the + * database stats. + */ +void +pgstat_reset_database_timestamp(Oid dboid, TimestampTz ts) +{ + PgStat_EntryRef *dbref; + PgStatShared_Database *dbentry; + + dbref = pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, MyDatabaseId, InvalidOid, + false); + + dbentry = (PgStatShared_Database *) dbref->shared_stats; + dbentry->stats.stat_reset_timestamp = ts; + + pgstat_unlock_entry(dbref); +} + +/* + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if lock could not + * immediately acquired, otherwise true is returned. + */ +bool +pgstat_database_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) +{ + PgStatShared_Database *sharedent; + PgStat_StatDBEntry *pendingent; + + pendingent = (PgStat_StatDBEntry *) entry_ref->pending; + sharedent = (PgStatShared_Database *) entry_ref->shared_stats; + + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; + +#define PGSTAT_ACCUM_DBCOUNT(item) \ + (sharedent)->stats.item += (pendingent)->item + + PGSTAT_ACCUM_DBCOUNT(n_xact_commit); + PGSTAT_ACCUM_DBCOUNT(n_xact_rollback); + PGSTAT_ACCUM_DBCOUNT(n_blocks_fetched); + PGSTAT_ACCUM_DBCOUNT(n_blocks_hit); + + PGSTAT_ACCUM_DBCOUNT(n_tuples_returned); + PGSTAT_ACCUM_DBCOUNT(n_tuples_fetched); + PGSTAT_ACCUM_DBCOUNT(n_tuples_inserted); + PGSTAT_ACCUM_DBCOUNT(n_tuples_updated); + PGSTAT_ACCUM_DBCOUNT(n_tuples_deleted); + + /* last_autovac_time is reported immediately */ + Assert(pendingent->last_autovac_time == 0); + + PGSTAT_ACCUM_DBCOUNT(n_conflict_tablespace); + PGSTAT_ACCUM_DBCOUNT(n_conflict_lock); + PGSTAT_ACCUM_DBCOUNT(n_conflict_snapshot); + PGSTAT_ACCUM_DBCOUNT(n_conflict_bufferpin); + PGSTAT_ACCUM_DBCOUNT(n_conflict_startup_deadlock); + + PGSTAT_ACCUM_DBCOUNT(n_temp_bytes); + PGSTAT_ACCUM_DBCOUNT(n_temp_files); + PGSTAT_ACCUM_DBCOUNT(n_deadlocks); + + /* checksum failures are reported immediately */ + Assert(pendingent->n_checksum_failures == 0); + Assert(pendingent->last_checksum_failure == 0); + + PGSTAT_ACCUM_DBCOUNT(n_block_read_time); + PGSTAT_ACCUM_DBCOUNT(n_block_write_time); + + PGSTAT_ACCUM_DBCOUNT(n_sessions); + PGSTAT_ACCUM_DBCOUNT(total_session_time); + PGSTAT_ACCUM_DBCOUNT(total_active_time); + PGSTAT_ACCUM_DBCOUNT(total_idle_in_xact_time); + PGSTAT_ACCUM_DBCOUNT(n_sessions_abandoned); + PGSTAT_ACCUM_DBCOUNT(n_sessions_fatal); + PGSTAT_ACCUM_DBCOUNT(n_sessions_killed); +#undef PGSTAT_ACCUM_DBCOUNT + + pgstat_unlock_entry(entry_ref); + + memset(pendingent, 0, sizeof(*pendingent)); + + return true; +} + +void +pgstat_database_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_Database *) header)->stats.stat_reset_timestamp = ts; +} diff --git a/src/backend/utils/activity/pgstat_function.c b/src/backend/utils/activity/pgstat_function.c index ad9879afb2..427d8c47fc 100644 --- a/src/backend/utils/activity/pgstat_function.c +++ b/src/backend/utils/activity/pgstat_function.c @@ -17,8 +17,10 @@ #include "postgres.h" +#include "fmgr.h" +#include "utils/inval.h" #include "utils/pgstat_internal.h" -#include "utils/timestamp.h" +#include "utils/syscache.h" /* ---------- @@ -28,18 +30,6 @@ int pgstat_track_functions = TRACK_FUNC_OFF; -/* - * Indicates if backend has some function stats that it hasn't yet - * 
sent to the collector.
- */
-bool		have_function_stats = false;
-
-/*
- * Backends store per-function info that's waiting to be sent to the collector
- * in this hash table (indexed by function OID).
- */
-static HTAB *pgStatFunctions = NULL;
-
 /*
  * Total time charged to functions so far in the current backend.
  * We use this to help separate "self" and "other" time charges.
@@ -61,6 +51,10 @@ pgstat_create_function(Oid proid)
 
 /*
  * Ensure that stats are dropped if transaction commits.
+ *
+ * NB: This is only reliable because pgstat_init_function_usage() does some
+ * extra work. If other places start emitting function stats, they likely need
+ * similar logic.
  */
 void
 pgstat_drop_function(Oid proid)
@@ -78,8 +72,9 @@ void
 pgstat_init_function_usage(FunctionCallInfo fcinfo,
 						   PgStat_FunctionCallUsage *fcu)
 {
-	PgStat_BackendFunctionEntry *htabent;
-	bool		found;
+	PgStat_EntryRef *entry_ref;
+	PgStat_BackendFunctionEntry *pending;
+	bool		created_entry;
 
 	if (pgstat_track_functions <= fcinfo->flinfo->fn_stats)
 	{
@@ -88,29 +83,48 @@ pgstat_init_function_usage(FunctionCallInfo fcinfo,
 		return;
 	}
 
-	if (!pgStatFunctions)
-	{
-		/* First time through - initialize function stat table */
-		HASHCTL		hash_ctl;
+	entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_FUNCTION,
+										  MyDatabaseId,
+										  fcinfo->flinfo->fn_oid,
+										  &created_entry);
 
-		hash_ctl.keysize = sizeof(Oid);
-		hash_ctl.entrysize = sizeof(PgStat_BackendFunctionEntry);
-		pgStatFunctions = hash_create("Function stat entries",
-									  PGSTAT_FUNCTION_HASH_SIZE,
-									  &hash_ctl,
-									  HASH_ELEM | HASH_BLOBS);
+	/*
+	 * If no shared entry already exists, check if the function has been
+	 * deleted concurrently. This can go unnoticed until here because
+	 * executing a statement that just calls a function does not trigger
+	 * cache invalidation processing. The reason we care about this case is
+	 * that otherwise we could create a new stats entry for an already dropped
+	 * function (for relations etc this is not possible because emitting stats
+	 * requires a lock for the relation to already have been acquired).
+	 *
+	 * It's somewhat ugly to have a behavioral difference based on
+	 * track_functions being enabled/disabled. But it seems acceptable, given
+	 * that there are already behavioral differences depending on whether the
+	 * function is in the caches etc.
+	 *
+	 * For correctness it'd be sufficient to set ->dropped to true. However,
+	 * the accepted invalidation will commonly cause "low level" failures in
+	 * PL code, with an OID in the error message. Making this harder to
+	 * test...
+ */ + if (created_entry) + { + AcceptInvalidationMessages(); + if (!SearchSysCacheExists1(PROCOID, ObjectIdGetDatum(fcinfo->flinfo->fn_oid))) + { + pgstat_drop_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, + fcinfo->flinfo->fn_oid); + ereport(ERROR, errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("function call to dropped function")); + } } - /* Get the stats entry for this function, create if necessary */ - htabent = hash_search(pgStatFunctions, &fcinfo->flinfo->fn_oid, - HASH_ENTER, &found); - if (!found) - MemSet(&htabent->f_counts, 0, sizeof(PgStat_FunctionCounts)); + pending = entry_ref->pending; - fcu->fs = &htabent->f_counts; + fcu->fs = &pending->f_counts; /* save stats for this function, later used to compensate for recursion */ - fcu->save_f_total_time = htabent->f_counts.f_total_time; + fcu->save_f_total_time = pending->f_counts.f_total_time; /* save current backend-wide total time */ fcu->save_total = total_func_time; @@ -167,64 +181,37 @@ pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize) fs->f_numcalls++; fs->f_total_time = f_total; INSTR_TIME_ADD(fs->f_self_time, f_self); - - /* indicate that we have something to send */ - have_function_stats = true; } /* - * Subroutine for pgstat_report_stat: populate and send a function stat message + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if lock could not + * immediately acquired, otherwise true is returned. */ -void -pgstat_send_funcstats(void) +bool +pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) { - /* we assume this inits to all zeroes: */ - static const PgStat_FunctionCounts all_zeroes; + PgStat_BackendFunctionEntry *localent; + PgStatShared_Function *shfuncent; - PgStat_MsgFuncstat msg; - PgStat_BackendFunctionEntry *entry; - HASH_SEQ_STATUS fstat; + localent = (PgStat_BackendFunctionEntry *) entry_ref->pending; + shfuncent = (PgStatShared_Function *) entry_ref->shared_stats; - if (pgStatFunctions == NULL) - return; + /* localent always has non-zero content */ - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FUNCSTAT); - msg.m_databaseid = MyDatabaseId; - msg.m_nentries = 0; + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; - hash_seq_init(&fstat, pgStatFunctions); - while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL) - { - PgStat_FunctionEntry *m_ent; + shfuncent->stats.f_numcalls += localent->f_counts.f_numcalls; + shfuncent->stats.f_total_time += + INSTR_TIME_GET_MICROSEC(localent->f_counts.f_total_time); + shfuncent->stats.f_self_time += + INSTR_TIME_GET_MICROSEC(localent->f_counts.f_self_time); - /* Skip it if no counts accumulated since last time */ - if (memcmp(&entry->f_counts, &all_zeroes, - sizeof(PgStat_FunctionCounts)) == 0) - continue; + pgstat_unlock_entry(entry_ref); - /* need to convert format of time accumulators */ - m_ent = &msg.m_entry[msg.m_nentries]; - m_ent->f_id = entry->f_id; - m_ent->f_numcalls = entry->f_counts.f_numcalls; - m_ent->f_total_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_total_time); - m_ent->f_self_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_self_time); - - if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES) - { - pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) + - msg.m_nentries * sizeof(PgStat_FunctionEntry)); - msg.m_nentries = 0; - } - - /* reset the entry's counts */ - MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts)); - } - - if (msg.m_nentries > 0) - pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) + - msg.m_nentries * 
sizeof(PgStat_FunctionEntry)); - - have_function_stats = false; + return true; } /* @@ -235,12 +222,22 @@ pgstat_send_funcstats(void) PgStat_BackendFunctionEntry * find_funcstat_entry(Oid func_id) { - pgstat_assert_is_up(); + PgStat_EntryRef *entry_ref; - if (pgStatFunctions == NULL) - return NULL; + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, func_id); - return (PgStat_BackendFunctionEntry *) hash_search(pgStatFunctions, - (void *) &func_id, - HASH_FIND, NULL); + if (entry_ref) + return entry_ref->pending; + return NULL; +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one function or NULL. + */ +PgStat_StatFuncEntry * +pgstat_fetch_stat_funcentry(Oid func_id) +{ + return (PgStat_StatFuncEntry *) + pgstat_fetch_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, func_id); } diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index 51a87b6673..bec190c589 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -19,6 +19,7 @@ #include "access/twophase_rmgr.h" #include "access/xact.h" +#include "catalog/partition.h" #include "postmaster/autovacuum.h" #include "utils/memutils.h" #include "utils/pgstat_internal.h" @@ -26,38 +27,6 @@ #include "utils/timestamp.h" -/* - * Structures in which backends store per-table info that's waiting to be - * sent to the collector. - * - * NOTE: once allocated, TabStatusArray structures are never moved or deleted - * for the life of the backend. Also, we zero out the t_id fields of the - * contained PgStat_TableStatus structs whenever they are not actively in use. - * This allows relcache pgstat_info pointers to be treated as long-lived data, - * avoiding repeated searches in pgstat_init_relation() when a relation is - * repeatedly opened during a transaction. - */ -#define TABSTAT_QUANTUM 100 /* we alloc this many at a time */ - - -typedef struct TabStatusArray -{ - struct TabStatusArray *tsa_next; /* link to next array, if any */ - int tsa_used; /* # entries currently used */ - PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM]; /* per-table data */ -} TabStatusArray; - -static TabStatusArray *pgStatTabList = NULL; - -/* - * pgStatTabHash entry: map from relation OID to PgStat_TableStatus pointer - */ -typedef struct TabStatHashEntry -{ - Oid t_id; - PgStat_TableStatus *tsa_entry; -} TabStatHashEntry; - /* Record that's written to 2PC state file when pgstat state is persisted */ typedef struct TwoPhasePgStatRecord { @@ -74,27 +43,13 @@ typedef struct TwoPhasePgStatRecord } TwoPhasePgStatRecord; -static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared); -static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now); +static PgStat_TableStatus *pgstat_prep_relation_pending(Oid rel_id, bool isshared); static void add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level); static void ensure_tabstat_xact_level(PgStat_TableStatus *pgstat_info); static void save_truncdrop_counters(PgStat_TableXactStatus *trans, bool is_drop); static void restore_truncdrop_counters(PgStat_TableXactStatus *trans); -/* - * Indicates if backend has some relation stats that it hasn't yet - * sent to the collector. - */ -bool have_relation_stats; - - -/* - * Hash table for O(1) t_id -> tsa_entry lookup - */ -static HTAB *pgStatTabHash = NULL; - - /* * Copy stats between relations. This is used for things like REINDEX * CONCURRENTLY. 
@@ -103,43 +58,39 @@ void pgstat_copy_relation_stats(Relation dst, Relation src) { PgStat_StatTabEntry *srcstats; + PgStatShared_Relation *dstshstats; + PgStat_EntryRef *dst_ref; - srcstats = pgstat_fetch_stat_tabentry(RelationGetRelid(src)); - + srcstats = pgstat_fetch_stat_tabentry_ext(src->rd_rel->relisshared, + RelationGetRelid(src)); if (!srcstats) return; - if (pgstat_should_count_relation(dst)) - { - /* - * XXX: temporarily this does not actually quite do what the name - * says, and just copy index related fields. A subsequent commit will - * do more. - */ + dst_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, + dst->rd_rel->relisshared ? InvalidOid : MyDatabaseId, + RelationGetRelid(dst), + false); - dst->pgstat_info->t_counts.t_numscans = srcstats->numscans; - dst->pgstat_info->t_counts.t_tuples_returned = srcstats->tuples_returned; - dst->pgstat_info->t_counts.t_tuples_fetched = srcstats->tuples_fetched; - dst->pgstat_info->t_counts.t_blocks_fetched = srcstats->blocks_fetched; - dst->pgstat_info->t_counts.t_blocks_hit = srcstats->blocks_hit; + dstshstats = (PgStatShared_Relation *) dst_ref->shared_stats; + dstshstats->stats = *srcstats; - /* the data will be sent by the next pgstat_report_stat() call */ - } + pgstat_unlock_entry(dst_ref); } /* - * Initialize a relcache entry to count access statistics. - * Called whenever a relation is opened. + * Initialize a relcache entry to count access statistics. Called whenever a + * relation is opened. * - * We assume that a relcache entry's pgstat_info field is zeroed by - * relcache.c when the relcache entry is made; thereafter it is long-lived - * data. We can avoid repeated searches of the TabStatus arrays when the - * same relation is touched repeatedly within a transaction. + * We assume that a relcache entry's pgstat_info field is zeroed by relcache.c + * when the relcache entry is made; thereafter it is long-lived data. + * + * This does not create a reference to a stats entry in shared memory, nor + * allocate memory for the pending stats. That happens in + * pgstat_assoc_relation(). */ void pgstat_init_relation(Relation rel) { - Oid rel_id = rel->rd_id; char relkind = rel->rd_rel->relkind; /* @@ -147,27 +98,68 @@ pgstat_init_relation(Relation rel) */ if (!RELKIND_HAS_STORAGE(relkind) && relkind != RELKIND_PARTITIONED_TABLE) { + rel->pgstat_enabled = false; rel->pgstat_info = NULL; return; } - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) { + if (rel->pgstat_info) + pgstat_unlink_relation(rel); + /* We're not counting at all */ + rel->pgstat_enabled = false; rel->pgstat_info = NULL; return; } - /* - * If we already set up this relation in the current transaction, nothing - * to do. - */ - if (rel->pgstat_info != NULL && - rel->pgstat_info->t_id == rel_id) - return; + rel->pgstat_enabled = true; +} + +/* + * Prepare for statistics for this relation to be collected. + * + * This ensures we have a reference to the stats entry before stats can be + * generated. That is important because a relation drop in another connection + * could otherwise lead to the stats entry being dropped, which then later + * would get recreated when flushing stats. + * + * This is separate from pgstat_init_relation() as it is not uncommon for + * relcache entries to be opened without ever getting stats reported. 
+ */
+void
+pgstat_assoc_relation(Relation rel)
+{
+	Assert(rel->pgstat_enabled);
+	Assert(rel->pgstat_info == NULL);
 
 	/* Else find or make the PgStat_TableStatus entry, and update link */
-	rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
+	rel->pgstat_info = pgstat_prep_relation_pending(RelationGetRelid(rel),
+													rel->rd_rel->relisshared);
+
+	/* don't allow linking stats to multiple relcache entries */
+	Assert(rel->pgstat_info->relation == NULL);
+
+	/* mark this relation as the owner */
+	rel->pgstat_info->relation = rel;
+}
+
+/*
+ * Break the mutual link between a relcache entry and pending stats entry.
+ * This must be called whenever one end of the link is removed.
+ */
+void
+pgstat_unlink_relation(Relation rel)
+{
+	/* remove the link to stats info if any */
+	if (rel->pgstat_info == NULL)
+		return;
+
+	/* link sanity check */
+	Assert(rel->pgstat_info->relation == rel);
+	rel->pgstat_info->relation = NULL;
+	rel->pgstat_info = NULL;
 }
 
 /*
@@ -187,9 +179,26 @@ pgstat_create_relation(Relation rel)
 void
 pgstat_drop_relation(Relation rel)
 {
+	int			nest_level = GetCurrentTransactionNestLevel();
+	PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
 	pgstat_drop_transactional(PGSTAT_KIND_RELATION,
 							  rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId,
 							  RelationGetRelid(rel));
+
+	/*
+	 * Transactionally set counters to 0. That ensures that accesses to
+	 * pg_stat_xact_all_tables inside the transaction show 0.
+	 */
+	if (pgstat_info &&
+		pgstat_info->trans != NULL &&
+		pgstat_info->trans->nest_level == nest_level)
+	{
+		save_truncdrop_counters(pgstat_info->trans, true);
+		pgstat_info->trans->tuples_inserted = 0;
+		pgstat_info->trans->tuples_updated = 0;
+		pgstat_info->trans->tuples_deleted = 0;
+	}
 }
 
 /*
@@ -199,19 +208,52 @@ void
 pgstat_report_vacuum(Oid tableoid, bool shared,
 					 PgStat_Counter livetuples, PgStat_Counter deadtuples)
 {
-	PgStat_MsgVacuum msg;
+	PgStat_EntryRef *entry_ref;
+	PgStatShared_Relation *shtabentry;
+	PgStat_StatTabEntry *tabentry;
+	Oid			dboid = (shared ? InvalidOid : MyDatabaseId);
+	TimestampTz ts;
 
-	if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts)
+	if (!pgstat_track_counts)
 		return;
 
-	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM);
-	msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
-	msg.m_tableoid = tableoid;
-	msg.m_autovacuum = IsAutoVacuumWorkerProcess();
-	msg.m_vacuumtime = GetCurrentTimestamp();
-	msg.m_live_tuples = livetuples;
-	msg.m_dead_tuples = deadtuples;
-	pgstat_send(&msg, sizeof(msg));
+	/* Store the data in the table's hash table entry. */
+	ts = GetCurrentTimestamp();
+
+	/* block acquiring lock for the same reason as pgstat_report_autovac() */
+	entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION,
+											dboid, tableoid, false);
+
+	shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats;
+	tabentry = &shtabentry->stats;
+
+	tabentry->n_live_tuples = livetuples;
+	tabentry->n_dead_tuples = deadtuples;
+
+	/*
+	 * It is quite possible that a non-aggressive VACUUM ended up skipping
+	 * various pages; however, we'll zero the insert counter here regardless.
+	 * It's currently used only to track when we need to perform an "insert"
+	 * autovacuum, which is mainly intended to freeze newly inserted tuples.
+	 * Zeroing this may just mean we'll not try to vacuum the table again
+	 * until enough tuples have been inserted to trigger another insert
+	 * autovacuum. An anti-wraparound autovacuum will catch any persistent
+	 * stragglers.
+ */ + tabentry->inserts_since_vacuum = 0; + + if (IsAutoVacuumWorkerProcess()) + { + tabentry->autovac_vacuum_timestamp = ts; + tabentry->autovac_vacuum_count++; + } + else + { + tabentry->vacuum_timestamp = ts; + tabentry->vacuum_count++; + } + + pgstat_unlock_entry(entry_ref); } /* @@ -225,9 +267,12 @@ pgstat_report_analyze(Relation rel, PgStat_Counter livetuples, PgStat_Counter deadtuples, bool resetcounter) { - PgStat_MsgAnalyze msg; + PgStat_EntryRef *entry_ref; + PgStatShared_Relation *shtabentry; + PgStat_StatTabEntry *tabentry; + Oid dboid = (rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId); - if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + if (!pgstat_track_counts) return; /* @@ -259,15 +304,39 @@ pgstat_report_analyze(Relation rel, deadtuples = Max(deadtuples, 0); } - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE); - msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId; - msg.m_tableoid = RelationGetRelid(rel); - msg.m_autovacuum = IsAutoVacuumWorkerProcess(); - msg.m_resetcounter = resetcounter; - msg.m_analyzetime = GetCurrentTimestamp(); - msg.m_live_tuples = livetuples; - msg.m_dead_tuples = deadtuples; - pgstat_send(&msg, sizeof(msg)); + /* block acquiring lock for the same reason as pgstat_report_autovac() */ + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, dboid, + RelationGetRelid(rel), + false); + /* can't get dropped while accessed */ + Assert(entry_ref != NULL && entry_ref->shared_stats != NULL); + + shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats; + tabentry = &shtabentry->stats; + + tabentry->n_live_tuples = livetuples; + tabentry->n_dead_tuples = deadtuples; + + /* + * If commanded, reset changes_since_analyze to zero. This forgets any + * changes that were committed while the ANALYZE was in progress, but we + * have no good way to estimate how many of those there were. + */ + if (resetcounter) + tabentry->changes_since_analyze = 0; + + if (IsAutoVacuumWorkerProcess()) + { + tabentry->autovac_analyze_timestamp = GetCurrentTimestamp(); + tabentry->autovac_analyze_count++; + } + else + { + tabentry->analyze_timestamp = GetCurrentTimestamp(); + tabentry->analyze_count++; + } + + pgstat_unlock_entry(entry_ref); } /* @@ -356,30 +425,61 @@ pgstat_update_heap_dead_tuples(Relation rel, int delta) } } +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one table or NULL. NULL doesn't mean + * that the table doesn't exist, just that there are no statistics, so the + * caller is better off to report ZERO instead. + */ +PgStat_StatTabEntry * +pgstat_fetch_stat_tabentry(Oid relid) +{ + PgStat_StatTabEntry *tabentry; + + tabentry = pgstat_fetch_stat_tabentry_ext(false, relid); + if (tabentry != NULL) + return tabentry; + + /* + * If we didn't find it, maybe it's a shared table. + */ + tabentry = pgstat_fetch_stat_tabentry_ext(true, relid); + return tabentry; +} + +/* + * More efficient version of pgstat_fetch_stat_tabentry(), allowing to specify + * whether the to-be-accessed table is a shared relation or not. + */ +PgStat_StatTabEntry * +pgstat_fetch_stat_tabentry_ext(bool shared, Oid reloid) +{ + Oid dboid = (shared ? InvalidOid : MyDatabaseId); + + return (PgStat_StatTabEntry *) + pgstat_fetch_entry(PGSTAT_KIND_RELATION, dboid, reloid); +} + /* * find any existing PgStat_TableStatus entry for rel * - * If no entry, return NULL, don't create a new one + * Find any existing PgStat_TableStatus entry for rel_id in the current + * database. 
If not found, try finding from shared tables. * - * Note: if we got an error in the most recent execution of pgstat_report_stat, - * it's possible that an entry exists but there's no hashtable entry for it. - * That's okay, we'll treat this case as "doesn't exist". + * If no entry found, return NULL, don't create a new one */ PgStat_TableStatus * find_tabstat_entry(Oid rel_id) { - TabStatHashEntry *hash_entry; + PgStat_EntryRef *entry_ref; - /* If hashtable doesn't exist, there are no entries at all */ - if (!pgStatTabHash) - return NULL; + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, MyDatabaseId, rel_id); + if (!entry_ref) + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, InvalidOid, rel_id); - hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_FIND, NULL); - if (!hash_entry) - return NULL; - - /* Note that this step could also return NULL, but that's correct */ - return hash_entry->tsa_entry; + if (entry_ref) + return entry_ref->pending; + return NULL; } /* @@ -536,7 +636,7 @@ AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state) for (trans = xact_state->first; trans != NULL; trans = trans->next) { - PgStat_TableStatus *tabstat; + PgStat_TableStatus *tabstat PG_USED_FOR_ASSERTS_ONLY; TwoPhasePgStatRecord record; Assert(trans->nest_level == 1); @@ -594,7 +694,7 @@ pgstat_twophase_postcommit(TransactionId xid, uint16 info, PgStat_TableStatus *pgstat_info; /* Find or create a tabstat entry for the rel */ - pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared); + pgstat_info = pgstat_prep_relation_pending(rec->t_id, rec->t_shared); /* Same math as in AtEOXact_PgStat, commit case */ pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted; @@ -630,7 +730,7 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info, PgStat_TableStatus *pgstat_info; /* Find or create a tabstat entry for the rel */ - pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared); + pgstat_info = pgstat_prep_relation_pending(rec->t_id, rec->t_shared); /* Same math as in AtEOXact_PgStat, abort case */ if (rec->t_truncdropped) @@ -647,204 +747,116 @@ pgstat_twophase_postabort(TransactionId xid, uint16 info, } /* - * Subroutine for pgstat_report_stat: Send relation statistics + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if lock could not + * immediately acquired, otherwise true is returned. + * + * Some of the stats are copied to the corresponding pending database stats + * entry when successfully flushing. */ -void -pgstat_send_tabstats(TimestampTz now, bool disconnect) +bool +pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) { - /* we assume this inits to all zeroes: */ static const PgStat_TableCounts all_zeroes; - PgStat_MsgTabstat regular_msg; - PgStat_MsgTabstat shared_msg; - TabStatusArray *tsa; - int i; + Oid dboid; + PgStat_TableStatus *lstats; /* pending stats entry */ + PgStatShared_Relation *shtabstats; + PgStat_StatTabEntry *tabentry; /* table entry of shared stats */ + PgStat_StatDBEntry *dbentry; /* pending database entry */ + + dboid = entry_ref->shared_entry->key.dboid; + lstats = (PgStat_TableStatus *) entry_ref->pending; + shtabstats = (PgStatShared_Relation *) entry_ref->shared_stats; /* - * Destroy pgStatTabHash before we start invalidating PgStat_TableEntry - * entries it points to. 
(Should we fail partway through the loop below, - * it's okay to have removed the hashtable already --- the only - * consequence is we'd get multiple entries for the same table in the - * pgStatTabList, and that's safe.) + * Ignore entries that didn't accumulate any actual counts, such as + * indexes that were opened by the planner but not used. */ - if (pgStatTabHash) - hash_destroy(pgStatTabHash); - pgStatTabHash = NULL; - - /* - * Scan through the TabStatusArray struct(s) to find tables that actually - * have counts, and build messages to send. We have to separate shared - * relations from regular ones because the databaseid field in the message - * header has to depend on that. - */ - regular_msg.m_databaseid = MyDatabaseId; - shared_msg.m_databaseid = InvalidOid; - regular_msg.m_nentries = 0; - shared_msg.m_nentries = 0; - - for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next) + if (memcmp(&lstats->t_counts, &all_zeroes, + sizeof(PgStat_TableCounts)) == 0) { - for (i = 0; i < tsa->tsa_used; i++) - { - PgStat_TableStatus *entry = &tsa->tsa_entries[i]; - PgStat_MsgTabstat *this_msg; - PgStat_TableEntry *this_ent; - - /* Shouldn't have any pending transaction-dependent counts */ - Assert(entry->trans == NULL); - - /* - * Ignore entries that didn't accumulate any actual counts, such - * as indexes that were opened by the planner but not used. - */ - if (memcmp(&entry->t_counts, &all_zeroes, - sizeof(PgStat_TableCounts)) == 0) - continue; - - /* - * OK, insert data into the appropriate message, and send if full. - */ - this_msg = entry->t_shared ? &shared_msg : ®ular_msg; - this_ent = &this_msg->m_entry[this_msg->m_nentries]; - this_ent->t_id = entry->t_id; - memcpy(&this_ent->t_counts, &entry->t_counts, - sizeof(PgStat_TableCounts)); - if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES) - { - pgstat_send_tabstat(this_msg, now); - this_msg->m_nentries = 0; - } - } - /* zero out PgStat_TableStatus structs after use */ - MemSet(tsa->tsa_entries, 0, - tsa->tsa_used * sizeof(PgStat_TableStatus)); - tsa->tsa_used = 0; + return true; } - /* - * Send partial messages. Make sure that any pending xact commit/abort - * and connection stats get counted, even if there are no table stats to - * send. - */ - if (regular_msg.m_nentries > 0 || - pgStatXactCommit > 0 || pgStatXactRollback > 0 || disconnect) - pgstat_send_tabstat(®ular_msg, now); - if (shared_msg.m_nentries > 0) - pgstat_send_tabstat(&shared_msg, now); + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; - have_relation_stats = false; + /* add the values to the shared entry. */ + tabentry = &shtabstats->stats; + + tabentry->numscans += lstats->t_counts.t_numscans; + tabentry->tuples_returned += lstats->t_counts.t_tuples_returned; + tabentry->tuples_fetched += lstats->t_counts.t_tuples_fetched; + tabentry->tuples_inserted += lstats->t_counts.t_tuples_inserted; + tabentry->tuples_updated += lstats->t_counts.t_tuples_updated; + tabentry->tuples_deleted += lstats->t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated += lstats->t_counts.t_tuples_hot_updated; + + /* + * If table was truncated/dropped, first reset the live/dead counters. 
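+	 * A truncate or drop leaves no live or dead tuples behind, so the
+	 * previously accumulated totals no longer apply; restart from zero
+	 * before adding the pending deltas below.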
+ */ + if (lstats->t_counts.t_truncdropped) + { + tabentry->n_live_tuples = 0; + tabentry->n_dead_tuples = 0; + tabentry->inserts_since_vacuum = 0; + } + + tabentry->n_live_tuples += lstats->t_counts.t_delta_live_tuples; + tabentry->n_dead_tuples += lstats->t_counts.t_delta_dead_tuples; + tabentry->changes_since_analyze += lstats->t_counts.t_changed_tuples; + tabentry->inserts_since_vacuum += lstats->t_counts.t_tuples_inserted; + tabentry->blocks_fetched += lstats->t_counts.t_blocks_fetched; + tabentry->blocks_hit += lstats->t_counts.t_blocks_hit; + + /* Clamp n_live_tuples in case of negative delta_live_tuples */ + tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); + /* Likewise for n_dead_tuples */ + tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); + + pgstat_unlock_entry(entry_ref); + + /* The entry was successfully flushed, add the same to database stats */ + dbentry = pgstat_prep_database_pending(dboid); + dbentry->n_tuples_returned += lstats->t_counts.t_tuples_returned; + dbentry->n_tuples_fetched += lstats->t_counts.t_tuples_fetched; + dbentry->n_tuples_inserted += lstats->t_counts.t_tuples_inserted; + dbentry->n_tuples_updated += lstats->t_counts.t_tuples_updated; + dbentry->n_tuples_deleted += lstats->t_counts.t_tuples_deleted; + dbentry->n_blocks_fetched += lstats->t_counts.t_blocks_fetched; + dbentry->n_blocks_hit += lstats->t_counts.t_blocks_hit; + + return true; } -/* - * Subroutine for pgstat_send_tabstats: finish and send one tabstat message - */ -static void -pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now) +void +pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref) { - int n; - int len; + PgStat_TableStatus *pending = (PgStat_TableStatus *) entry_ref->pending; - /* It's unlikely we'd get here with no socket, but maybe not impossible */ - if (pgStatSock == PGINVALID_SOCKET) - return; - - /* - * Report and reset accumulated xact commit/rollback and I/O timings - * whenever we send a normal tabstat message - */ - pgstat_update_dbstats(tsmsg, now); - - n = tsmsg->m_nentries; - len = offsetof(PgStat_MsgTabstat, m_entry[0]) + - n * sizeof(PgStat_TableEntry); - - pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT); - pgstat_send(tsmsg, len); + if (pending->relation) + pgstat_unlink_relation(pending->relation); } /* - * find or create a PgStat_TableStatus entry for rel + * Find or create a PgStat_TableStatus entry for rel. New entry is created and + * initialized if not exists. */ static PgStat_TableStatus * -get_tabstat_entry(Oid rel_id, bool isshared) +pgstat_prep_relation_pending(Oid rel_id, bool isshared) { - TabStatHashEntry *hash_entry; - PgStat_TableStatus *entry; - TabStatusArray *tsa; - bool found; + PgStat_EntryRef *entry_ref; + PgStat_TableStatus *pending; - pgstat_assert_is_up(); + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_RELATION, + isshared ? InvalidOid : MyDatabaseId, + rel_id, NULL); + pending = entry_ref->pending; + pending->t_id = rel_id; + pending->t_shared = isshared; - have_relation_stats = true; - - /* - * Create hash table if we don't have it already. - */ - if (pgStatTabHash == NULL) - { - HASHCTL ctl; - - ctl.keysize = sizeof(Oid); - ctl.entrysize = sizeof(TabStatHashEntry); - - pgStatTabHash = hash_create("pgstat TabStatusArray lookup hash table", - TABSTAT_QUANTUM, - &ctl, - HASH_ELEM | HASH_BLOBS); - } - - /* - * Find an entry or create a new one. 
-	 */
-	hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_ENTER, &found);
-	if (!found)
-	{
-		/* initialize new entry with null pointer */
-		hash_entry->tsa_entry = NULL;
-	}
-
-	/*
-	 * If entry is already valid, we're done.
-	 */
-	if (hash_entry->tsa_entry)
-		return hash_entry->tsa_entry;
-
-	/*
-	 * Locate the first pgStatTabList entry with free space, making a new list
-	 * entry if needed. Note that we could get an OOM failure here, but if so
-	 * we have left the hashtable and the list in a consistent state.
-	 */
-	if (pgStatTabList == NULL)
-	{
-		/* Set up first pgStatTabList entry */
-		pgStatTabList = (TabStatusArray *)
-			MemoryContextAllocZero(TopMemoryContext,
-								   sizeof(TabStatusArray));
-	}
-
-	tsa = pgStatTabList;
-	while (tsa->tsa_used >= TABSTAT_QUANTUM)
-	{
-		if (tsa->tsa_next == NULL)
-			tsa->tsa_next = (TabStatusArray *)
-				MemoryContextAllocZero(TopMemoryContext,
-									   sizeof(TabStatusArray));
-		tsa = tsa->tsa_next;
-	}
-
-	/*
-	 * Allocate a PgStat_TableStatus entry within this list entry. We assume
-	 * the entry was already zeroed, either at creation or after last use.
-	 */
-	entry = &tsa->tsa_entries[tsa->tsa_used++];
-	entry->t_id = rel_id;
-	entry->t_shared = isshared;
-
-	/*
-	 * Now we can fill the entry in pgStatTabHash.
-	 */
-	hash_entry->tsa_entry = entry;
-
-	return entry;
+	return pending;
 }
 
 /*
diff --git a/src/backend/utils/activity/pgstat_replslot.c b/src/backend/utils/activity/pgstat_replslot.c
index ceefc5d59b..b77c05ab5f 100644
--- a/src/backend/utils/activity/pgstat_replslot.c
+++ b/src/backend/utils/activity/pgstat_replslot.c
@@ -8,6 +8,14 @@
  * storage implementation and the details about individual types of
  * statistics.
  *
+ * Replication slot stats work a bit differently than other variable-numbered
+ * stats. Slots do not have oids (so they can be created on physical
+ * replicas). Use the slot index as object id while running. However, the slot
+ * index can change when restarting. That is addressed by using the name when
+ * (de-)serializing. After a restart it is possible for slots to have been
+ * dropped while shut down, which is addressed by not restoring stats for
+ * slots that cannot be found by name when starting up.
+ *
  * Copyright (c) 2001-2022, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
@@ -22,6 +30,9 @@
 #include "utils/pgstat_internal.h"
 
 
+static int	get_replslot_index(const char *name);
+
+
 /*
  * Reset counters for a single replication slot.
  *
@@ -32,18 +43,10 @@
 void
 pgstat_reset_replslot(const char *name)
 {
 	ReplicationSlot *slot;
-	PgStat_MsgResetreplslotcounter msg;
 
 	AssertArg(name != NULL);
 
-	if (pgStatSock == PGINVALID_SOCKET)
-		return;
-
-	/*
-	 * Check if the slot exists with the given name. It is possible that by
-	 * the time this message is executed the slot is dropped but at least this
-	 * check will ensure that the given name is for a valid slot.
-	 */
+	/* Check if the slot exists with the given name.
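+	 * The stats entry is addressed by the slot's index (see the file header
+	 * comment), so the slot has to be looked up by name first.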
*/ slot = SearchNamedReplicationSlot(name, true); if (!slot) @@ -59,10 +62,9 @@ pgstat_reset_replslot(const char *name) if (SlotIsPhysical(slot)) return; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETREPLSLOTCOUNTER); - namestrcpy(&msg.m_slotname, name); - msg.clearall = false; - pgstat_send(&msg, sizeof(msg)); + /* reset this one entry */ + pgstat_reset(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot)); } /* @@ -71,24 +73,34 @@ pgstat_reset_replslot(const char *name) void pgstat_report_replslot(ReplicationSlot *slot, const PgStat_StatReplSlotEntry *repSlotStat) { - PgStat_MsgReplSlot msg; + PgStat_EntryRef *entry_ref; + PgStatShared_ReplSlot *shstatent; + PgStat_StatReplSlotEntry *statent; + + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot), false); + shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats; + statent = &shstatent->stats; /* - * Prepare and send the message + * Any mismatch should have been fixed in pgstat_create_replslot() or + * pgstat_acquire_replslot(). */ - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); - namestrcpy(&msg.m_slotname, NameStr(repSlotStat->slotname)); - msg.m_create = false; - msg.m_drop = false; - msg.m_spill_txns = repSlotStat->spill_txns; - msg.m_spill_count = repSlotStat->spill_count; - msg.m_spill_bytes = repSlotStat->spill_bytes; - msg.m_stream_txns = repSlotStat->stream_txns; - msg.m_stream_count = repSlotStat->stream_count; - msg.m_stream_bytes = repSlotStat->stream_bytes; - msg.m_total_txns = repSlotStat->total_txns; - msg.m_total_bytes = repSlotStat->total_bytes; - pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); + Assert(namestrcmp(&statent->slotname, NameStr(slot->data.name)) == 0); + + /* Update the replication slot statistics */ +#define REPLSLOT_ACC(fld) statent->fld += repSlotStat->fld + REPLSLOT_ACC(spill_txns); + REPLSLOT_ACC(spill_count); + REPLSLOT_ACC(spill_bytes); + REPLSLOT_ACC(stream_txns); + REPLSLOT_ACC(stream_count); + REPLSLOT_ACC(stream_bytes); + REPLSLOT_ACC(total_txns); + REPLSLOT_ACC(total_bytes); +#undef REPLSLOT_ACC + + pgstat_unlock_entry(entry_ref); } /* @@ -100,13 +112,50 @@ pgstat_report_replslot(ReplicationSlot *slot, const PgStat_StatReplSlotEntry *re void pgstat_create_replslot(ReplicationSlot *slot) { - PgStat_MsgReplSlot msg; + PgStat_EntryRef *entry_ref; + PgStatShared_ReplSlot *shstatent; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); - namestrcpy(&msg.m_slotname, NameStr(slot->data.name)); - msg.m_create = true; - msg.m_drop = false; - pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot), false); + shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats; + + /* + * NB: need to accept that there might be stats from an older slot, e.g. + * if we previously crashed after dropping a slot. + */ + memset(&shstatent->stats, 0, sizeof(shstatent->stats)); + namestrcpy(&shstatent->stats.slotname, NameStr(slot->data.name)); + + pgstat_unlock_entry(entry_ref); +} + +/* + * Report replication slot has been acquired. 
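+ *
+ * If the shared stats entry at this slot's index still carries data for a
+ * differently named slot (e.g. a slot dropped shortly before a crash), reset
+ * it so the acquired slot does not inherit those counters.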
+ */ +void +pgstat_acquire_replslot(ReplicationSlot *slot) +{ + PgStat_EntryRef *entry_ref; + PgStatShared_ReplSlot *shstatent; + PgStat_StatReplSlotEntry *statent; + + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot), false); + shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats; + statent = &shstatent->stats; + + /* + * NB: need to accept that there might be stats from an older slot, e.g. + * if we previously crashed after dropping a slot. + */ + if (NameStr(statent->slotname)[0] == 0 || + namestrcmp(&statent->slotname, NameStr(slot->data.name)) != 0) + { + memset(statent, 0, sizeof(*statent)); + namestrcpy(&statent->slotname, NameStr(slot->data.name)); + } + + pgstat_unlock_entry(entry_ref); } /* @@ -115,11 +164,65 @@ pgstat_create_replslot(ReplicationSlot *slot) void pgstat_drop_replslot(ReplicationSlot *slot) { - PgStat_MsgReplSlot msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); - namestrcpy(&msg.m_slotname, NameStr(slot->data.name)); - msg.m_create = false; - msg.m_drop = true; - pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); + pgstat_drop_entry(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot)); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the replication slot statistics struct. + */ +PgStat_StatReplSlotEntry * +pgstat_fetch_replslot(NameData slotname) +{ + int idx = get_replslot_index(NameStr(slotname)); + + if (idx == -1) + return NULL; + + return (PgStat_StatReplSlotEntry *) + pgstat_fetch_entry(PGSTAT_KIND_REPLSLOT, InvalidOid, idx); +} + +void +pgstat_replslot_to_serialized_name_cb(const PgStatShared_Common *header, NameData *name) +{ + namestrcpy(name, NameStr(((PgStatShared_ReplSlot *) header)->stats.slotname)); +} + +bool +pgstat_replslot_from_serialized_name_cb(const NameData *name, PgStat_HashKey *key) +{ + int idx = get_replslot_index(NameStr(*name)); + + /* slot might have been deleted */ + if (idx == -1) + return false; + + key->kind = PGSTAT_KIND_REPLSLOT; + key->dboid = InvalidOid; + key->objoid = idx; + + return true; +} + +void +pgstat_replslot_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_ReplSlot *) header)->stats.stat_reset_timestamp = ts; +} + +static int +get_replslot_index(const char *name) +{ + ReplicationSlot *slot; + + AssertArg(name != NULL); + + slot = SearchNamedReplicationSlot(name, true); + + if (!slot) + return -1; + + return ReplicationSlotIndex(slot); } diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c new file mode 100644 index 0000000000..a32740b2f6 --- /dev/null +++ b/src/backend/utils/activity/pgstat_shmem.c @@ -0,0 +1,987 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_shmem.c + * Storage of stats entries in shared memory + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_shmem.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgstat.h" +#include "storage/shmem.h" +#include "utils/memutils.h" +#include "utils/pgstat_internal.h" + + +#define PGSTAT_ENTRY_REF_HASH_SIZE 128 + +/* hash table entry for finding the PgStat_EntryRef for a key */ +typedef struct PgStat_EntryRefHashEntry +{ + PgStat_HashKey key; /* hash key */ + char status; /* for simplehash use */ + PgStat_EntryRef *entry_ref; +} PgStat_EntryRefHashEntry; + + +/* 
for references to shared statistics entries */
+#define SH_PREFIX pgstat_entry_ref_hash
+#define SH_ELEMENT_TYPE PgStat_EntryRefHashEntry
+#define SH_KEY_TYPE PgStat_HashKey
+#define SH_KEY key
+#define SH_HASH_KEY(tb, key) \
+	pgstat_hash_hash_key(&key, sizeof(PgStat_HashKey), NULL)
+#define SH_EQUAL(tb, a, b) \
+	pgstat_cmp_hash_key(&a, &b, sizeof(PgStat_HashKey), NULL) == 0
+#define SH_SCOPE static inline
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+
+static void pgstat_drop_database_and_contents(Oid dboid);
+
+static void pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat);
+
+static void pgstat_release_entry_ref(PgStat_HashKey key, PgStat_EntryRef *entry_ref, bool discard_pending);
+static bool pgstat_need_entry_refs_gc(void);
+static void pgstat_gc_entry_refs(void);
+static void pgstat_release_all_entry_refs(bool discard_pending);
+typedef bool (*ReleaseMatchCB) (PgStat_EntryRefHashEntry *, Datum data);
+static void pgstat_release_matching_entry_refs(bool discard_pending, ReleaseMatchCB match, Datum match_data);
+
+static void pgstat_setup_memcxt(void);
+
+
+/* parameter for the shared hash */
+static const dshash_parameters dsh_params = {
+	sizeof(PgStat_HashKey),
+	sizeof(PgStatShared_HashEntry),
+	pgstat_cmp_hash_key,
+	pgstat_hash_hash_key,
+	LWTRANCHE_PGSTATS_HASH
+};
+
+
+/*
+ * Backend local references to shared stats entries. If there are pending
+ * updates to a stats entry, the PgStat_EntryRef is added to the pgStatPending
+ * list.
+ *
+ * When a stats entry is dropped, each backend needs to release its reference
+ * to it before the memory can be released. To trigger that,
+ * pgStatLocal.shmem->gc_request_count is incremented - which each backend
+ * compares to its copy of pgStatSharedRefAge on a regular basis.
+ */
+static pgstat_entry_ref_hash_hash *pgStatEntryRefHash = NULL;
+static int	pgStatSharedRefAge = 0; /* cache age of pgStatEntryRefHash */
+
+/*
+ * Memory contexts containing the pgStatEntryRefHash table and the
+ * pgStatSharedRef entries respectively. Kept separate to make it easier to
+ * track / attribute memory usage.
+ */
+static MemoryContext pgStatSharedRefContext = NULL;
+static MemoryContext pgStatEntryRefHashContext = NULL;
+
+
+/* ------------------------------------------------------------
+ * Public functions called from postmaster follow
+ * ------------------------------------------------------------
+ */
+
+/*
+ * The size of the shared memory allocation for stats stored in the shared
+ * stats hash table. This allocation will be done as part of the main shared
+ * memory, rather than dynamic shared memory, allowing it to be initialized in
+ * postmaster.
+ */
+static Size
+pgstat_dsa_init_size(void)
+{
+	Size		sz;
+
+	/*
+	 * The dshash header / initial buckets array needs to fit into "plain"
+	 * shared memory, but it's beneficial to not need dsm segments
+	 * immediately. A size of 256kB seems to work well and is not
+	 * disproportional compared to other constant-sized shared memory
+	 * allocations. NB: To further avoid the need for DSMs, the user can
+	 * configure min_dynamic_shared_memory.
+ */ + sz = 256 * 1024; + Assert(dsa_minimum_size() <= sz); + return MAXALIGN(sz); +} + +/* + * Compute shared memory space needed for cumulative statistics + */ +Size +StatsShmemSize(void) +{ + Size sz; + + sz = MAXALIGN(sizeof(PgStat_ShmemControl)); + sz = add_size(sz, pgstat_dsa_init_size()); + + return sz; +} + +/* + * Initialize cumulative statistics system during startup + */ +void +StatsShmemInit(void) +{ + bool found; + Size sz; + + sz = StatsShmemSize(); + pgStatLocal.shmem = (PgStat_ShmemControl *) + ShmemInitStruct("Shared Memory Stats", sz, &found); + + if (!IsUnderPostmaster) + { + dsa_area *dsa; + dshash_table *dsh; + PgStat_ShmemControl *ctl = pgStatLocal.shmem; + char *p = (char *) ctl; + + Assert(!found); + + /* the allocation of pgStatLocal.shmem itself */ + p += MAXALIGN(sizeof(PgStat_ShmemControl)); + + /* + * Create a small dsa allocation in plain shared memory. This is + * required because postmaster cannot use dsm segments. It also + * provides a small efficiency win. + */ + ctl->raw_dsa_area = p; + p += MAXALIGN(pgstat_dsa_init_size()); + dsa = dsa_create_in_place(ctl->raw_dsa_area, + pgstat_dsa_init_size(), + LWTRANCHE_PGSTATS_DSA, 0); + dsa_pin(dsa); + + /* + * To ensure dshash is created in "plain" shared memory, temporarily + * limit size of dsa to the initial size of the dsa. + */ + dsa_set_size_limit(dsa, pgstat_dsa_init_size()); + + /* + * With the limit in place, create the dshash table. XXX: It'd be nice + * if there were dshash_create_in_place(). + */ + dsh = dshash_create(dsa, &dsh_params, 0); + ctl->hash_handle = dshash_get_hash_table_handle(dsh); + + /* lift limit set above */ + dsa_set_size_limit(dsa, -1); + + /* + * Postmaster will never access these again, thus free the local + * dsa/dshash references. + */ + dshash_detach(dsh); + dsa_detach(dsa); + + pg_atomic_init_u64(&ctl->gc_request_count, 1); + + + /* initialize fixed-numbered stats */ + LWLockInitialize(&ctl->archiver.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->bgwriter.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->checkpointer.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->slru.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->wal.lock, LWTRANCHE_PGSTATS_DATA); + } + else + { + Assert(found); + } +} + +void +pgstat_attach_shmem(void) +{ + MemoryContext oldcontext; + + Assert(pgStatLocal.dsa == NULL); + + /* stats shared memory persists for the backend lifetime */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + pgStatLocal.dsa = dsa_attach_in_place(pgStatLocal.shmem->raw_dsa_area, + NULL); + dsa_pin_mapping(pgStatLocal.dsa); + + pgStatLocal.shared_hash = dshash_attach(pgStatLocal.dsa, &dsh_params, + pgStatLocal.shmem->hash_handle, 0); + + MemoryContextSwitchTo(oldcontext); +} + +void +pgstat_detach_shmem(void) +{ + Assert(pgStatLocal.dsa); + + /* we shouldn't leave references to shared stats */ + pgstat_release_all_entry_refs(false); + + dshash_detach(pgStatLocal.shared_hash); + pgStatLocal.shared_hash = NULL; + + dsa_detach(pgStatLocal.dsa); + pgStatLocal.dsa = NULL; +} + + +/* ------------------------------------------------------------ + * Maintenance of shared memory stats entries + * ------------------------------------------------------------ + */ + +PgStatShared_Common * +pgstat_init_entry(PgStat_Kind kind, + PgStatShared_HashEntry *shhashent) +{ + /* Create new stats entry. */ + dsa_pointer chunk; + PgStatShared_Common *shheader; + + /* + * Initialize refcount to 1, marking it as valid / not dropped. 
The entry + * can't be freed before the initialization because it can't be found as + * long as we hold the dshash partition lock. Caller needs to increase + * further if a longer lived reference is needed. + */ + pg_atomic_init_u32(&shhashent->refcount, 1); + shhashent->dropped = false; + + chunk = dsa_allocate0(pgStatLocal.dsa, pgstat_get_kind_info(kind)->shared_size); + shheader = dsa_get_address(pgStatLocal.dsa, chunk); + shheader->magic = 0xdeadbeef; + + /* Link the new entry from the hash entry. */ + shhashent->body = chunk; + + LWLockInitialize(&shheader->lock, LWTRANCHE_PGSTATS_DATA); + + return shheader; +} + +static PgStatShared_Common * +pgstat_reinit_entry(PgStat_Kind kind, PgStatShared_HashEntry *shhashent) +{ + PgStatShared_Common *shheader; + + shheader = dsa_get_address(pgStatLocal.dsa, shhashent->body); + + /* mark as not dropped anymore */ + pg_atomic_fetch_add_u32(&shhashent->refcount, 1); + shhashent->dropped = false; + + /* reinitialize content */ + Assert(shheader->magic == 0xdeadbeef); + memset(shheader, 0, pgstat_get_kind_info(shhashent->key.kind)->shared_size); + shheader->magic = 0xdeadbeef; + + return shheader; +} + +static void +pgstat_setup_shared_refs(void) +{ + if (likely(pgStatEntryRefHash != NULL)) + return; + + pgStatEntryRefHash = + pgstat_entry_ref_hash_create(pgStatEntryRefHashContext, + PGSTAT_ENTRY_REF_HASH_SIZE, NULL); + pgStatSharedRefAge = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + Assert(pgStatSharedRefAge != 0); +} + +/* + * Helper function for pgstat_get_entry_ref(). + */ +static void +pgstat_acquire_entry_ref(PgStat_EntryRef *entry_ref, + PgStatShared_HashEntry *shhashent, + PgStatShared_Common *shheader) +{ + Assert(shheader->magic == 0xdeadbeef); + Assert(pg_atomic_read_u32(&shhashent->refcount) > 0); + + pg_atomic_fetch_add_u32(&shhashent->refcount, 1); + + dshash_release_lock(pgStatLocal.shared_hash, shhashent); + + entry_ref->shared_stats = shheader; + entry_ref->shared_entry = shhashent; +} + +/* + * Helper function for pgstat_get_entry_ref(). + */ +static bool +pgstat_get_entry_ref_cached(PgStat_HashKey key, PgStat_EntryRef **entry_ref_p) +{ + bool found; + PgStat_EntryRefHashEntry *cache_entry; + + /* + * We immediately insert a cache entry, because it avoids 1) multiple + * hashtable lookups in case of a cache miss 2) having to deal with + * out-of-memory errors after incrementing PgStatShared_Common->refcount. + */ + + cache_entry = pgstat_entry_ref_hash_insert(pgStatEntryRefHash, key, &found); + + if (!found || !cache_entry->entry_ref) + { + PgStat_EntryRef *entry_ref; + + cache_entry->entry_ref = entry_ref = + MemoryContextAlloc(pgStatSharedRefContext, + sizeof(PgStat_EntryRef)); + entry_ref->shared_stats = NULL; + entry_ref->shared_entry = NULL; + entry_ref->pending = NULL; + + found = false; + } + else if (cache_entry->entry_ref->shared_stats == NULL) + { + Assert(cache_entry->entry_ref->pending == NULL); + found = false; + } + else + { + PgStat_EntryRef *entry_ref PG_USED_FOR_ASSERTS_ONLY; + + entry_ref = cache_entry->entry_ref; + Assert(entry_ref->shared_entry != NULL); + Assert(entry_ref->shared_stats != NULL); + + Assert(entry_ref->shared_stats->magic == 0xdeadbeef); + /* should have at least our reference */ + Assert(pg_atomic_read_u32(&entry_ref->shared_entry->refcount) > 0); + } + + *entry_ref_p = cache_entry->entry_ref; + return found; +} + +/* + * Get a shared stats reference. If create is true, the shared stats object is + * created if it does not exist. 
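+ * If create is false and the entry does not exist (or has already been
+ * dropped), NULL is returned.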
+ *
+ * When create is true, and created_entry is non-NULL, it'll be set to true
+ * if the entry is newly created, false otherwise.
+ */
+PgStat_EntryRef *
+pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, Oid objoid, bool create,
+					 bool *created_entry)
+{
+	PgStat_HashKey key = {.kind = kind,.dboid = dboid,.objoid = objoid};
+	PgStatShared_HashEntry *shhashent;
+	PgStatShared_Common *shheader = NULL;
+	PgStat_EntryRef *entry_ref;
+
+	/*
+	 * Passing in created_entry only makes sense if we could possibly create
+	 * the entry.
+	 */
+	AssertArg(create || created_entry == NULL);
+	pgstat_assert_is_up();
+	Assert(pgStatLocal.shared_hash != NULL);
+	Assert(!pgStatLocal.shmem->is_shutdown);
+
+	pgstat_setup_memcxt();
+	pgstat_setup_shared_refs();
+
+	if (created_entry != NULL)
+		*created_entry = false;
+
+	/*
+	 * Check if other backends dropped stats that could not be deleted because
+	 * somebody held references to them. If so, check this backend's
+	 * references. This is not expected to happen often. The location of the
+	 * check is a bit random, but this is a relatively frequently called path,
+	 * so better than most.
+	 */
+	if (pgstat_need_entry_refs_gc())
+		pgstat_gc_entry_refs();
+
+	/*
+	 * First check the lookup cache hashtable in local memory. If we find a
+	 * match here we can avoid taking locks / causing contention.
+	 */
+	if (pgstat_get_entry_ref_cached(key, &entry_ref))
+		return entry_ref;
+
+	Assert(entry_ref != NULL);
+
+	/*
+	 * Do a lookup in the hash table first - it's quite likely that the entry
+	 * already exists, and that way we only need a shared lock.
+	 */
+	shhashent = dshash_find(pgStatLocal.shared_hash, &key, false);
+
+	if (create && !shhashent)
+	{
+		bool		shfound;
+
+		/*
+		 * It's possible that somebody created the entry since the above
+		 * lookup. If so, fall through to the same path we'd take if it had
+		 * already been created before the dshash_find() call.
+		 */
+		shhashent = dshash_find_or_insert(pgStatLocal.shared_hash, &key, &shfound);
+		if (!shfound)
+		{
+			shheader = pgstat_init_entry(kind, shhashent);
+			pgstat_acquire_entry_ref(entry_ref, shhashent, shheader);
+
+			if (created_entry != NULL)
+				*created_entry = true;
+
+			return entry_ref;
+		}
+	}
+
+	if (!shhashent)
+	{
+		/*
+		 * If we're not creating, delete the reference again. In all
+		 * likelihood it's just a stats lookup - no point wasting memory for a
+		 * shared ref to nothing...
+		 */
+		pgstat_release_entry_ref(key, entry_ref, false);
+
+		return NULL;
+	}
+	else
+	{
+		/*
+		 * Can get here either because dshash_find() found a match, or because
+		 * dshash_find_or_insert() found a concurrently inserted entry.
+		 */
+
+		if (shhashent->dropped && create)
+		{
+			/*
+			 * There are legitimate cases where the old stats entry might not
+			 * yet have been dropped by the time it's reused. The most obvious
+			 * case is replication slot stats, where a new slot can be created
+			 * with the same index just after dropping. But oid wraparound can
+			 * lead to other cases as well. We just reset the stats to their
+			 * plain state.
+ */ + shheader = pgstat_reinit_entry(kind, shhashent); + pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); + + if (created_entry != NULL) + *created_entry = true; + + return entry_ref; + } + else if (shhashent->dropped) + { + dshash_release_lock(pgStatLocal.shared_hash, shhashent); + pgstat_release_entry_ref(key, entry_ref, false); + + return NULL; + } + else + { + shheader = dsa_get_address(pgStatLocal.dsa, shhashent->body); + pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); + + return entry_ref; + } + } +} + +static void +pgstat_release_entry_ref(PgStat_HashKey key, PgStat_EntryRef *entry_ref, + bool discard_pending) +{ + if (entry_ref && entry_ref->pending) + { + if (discard_pending) + pgstat_delete_pending_entry(entry_ref); + else + elog(ERROR, "releasing ref with pending data"); + } + + if (entry_ref && entry_ref->shared_stats) + { + Assert(entry_ref->shared_stats->magic == 0xdeadbeef); + Assert(entry_ref->pending == NULL); + + /* + * This can't race with another backend looking up the stats entry and + * increasing the refcount because it is not "legal" to create + * additional references to dropped entries. + */ + if (pg_atomic_fetch_sub_u32(&entry_ref->shared_entry->refcount, 1) == 1) + { + PgStatShared_HashEntry *shent; + + /* + * We're the last referrer to this entry, try to drop the shared + * entry. + */ + + /* only dropped entries can reach a 0 refcount */ + Assert(entry_ref->shared_entry->dropped); + + shent = dshash_find(pgStatLocal.shared_hash, + &entry_ref->shared_entry->key, + true); + if (!shent) + elog(ERROR, "could not find just referenced shared stats entry"); + + Assert(pg_atomic_read_u32(&entry_ref->shared_entry->refcount) == 0); + Assert(entry_ref->shared_entry == shent); + + pgstat_free_entry(shent, NULL); + } + } + + if (!pgstat_entry_ref_hash_delete(pgStatEntryRefHash, key)) + elog(ERROR, "entry ref vanished before deletion"); + + if (entry_ref) + pfree(entry_ref); +} + +bool +pgstat_lock_entry(PgStat_EntryRef *entry_ref, bool nowait) +{ + LWLock *lock = &entry_ref->shared_stats->lock; + + if (nowait) + return LWLockConditionalAcquire(lock, LW_EXCLUSIVE); + + LWLockAcquire(lock, LW_EXCLUSIVE); + return true; +} + +void +pgstat_unlock_entry(PgStat_EntryRef *entry_ref) +{ + LWLockRelease(&entry_ref->shared_stats->lock); +} + +/* + * Helper function to fetch and lock shared stats. + */ +PgStat_EntryRef * +pgstat_get_entry_ref_locked(PgStat_Kind kind, Oid dboid, Oid objoid, + bool nowait) +{ + PgStat_EntryRef *entry_ref; + + /* find shared table stats entry corresponding to the local entry */ + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, true, NULL); + + /* lock the shared entry to protect the content, skip if failed */ + if (!pgstat_lock_entry(entry_ref, nowait)) + return NULL; + + return entry_ref; +} + +void +pgstat_request_entry_refs_gc(void) +{ + pg_atomic_fetch_add_u64(&pgStatLocal.shmem->gc_request_count, 1); +} + +static bool +pgstat_need_entry_refs_gc(void) +{ + uint64 curage; + + if (!pgStatEntryRefHash) + return false; + + /* should have been initialized when creating pgStatEntryRefHash */ + Assert(pgStatSharedRefAge != 0); + + curage = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + + return pgStatSharedRefAge != curage; +} + +static void +pgstat_gc_entry_refs(void) +{ + pgstat_entry_ref_hash_iterator i; + PgStat_EntryRefHashEntry *ent; + uint64 curage; + + curage = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + Assert(curage != 0); + + /* + * Some entries have been dropped. 
Invalidate cache pointer to them. + */ + pgstat_entry_ref_hash_start_iterate(pgStatEntryRefHash, &i); + while ((ent = pgstat_entry_ref_hash_iterate(pgStatEntryRefHash, &i)) != NULL) + { + PgStat_EntryRef *entry_ref = ent->entry_ref; + + Assert(!entry_ref->shared_stats || + entry_ref->shared_stats->magic == 0xdeadbeef); + + if (!entry_ref->shared_entry->dropped) + continue; + + /* cannot gc shared ref that has pending data */ + if (entry_ref->pending != NULL) + continue; + + pgstat_release_entry_ref(ent->key, entry_ref, false); + } + + pgStatSharedRefAge = curage; +} + +static void +pgstat_release_matching_entry_refs(bool discard_pending, ReleaseMatchCB match, + Datum match_data) +{ + pgstat_entry_ref_hash_iterator i; + PgStat_EntryRefHashEntry *ent; + + if (pgStatEntryRefHash == NULL) + return; + + pgstat_entry_ref_hash_start_iterate(pgStatEntryRefHash, &i); + + while ((ent = pgstat_entry_ref_hash_iterate(pgStatEntryRefHash, &i)) + != NULL) + { + Assert(ent->entry_ref != NULL); + + if (match && !match(ent, match_data)) + continue; + + pgstat_release_entry_ref(ent->key, ent->entry_ref, discard_pending); + } +} + +/* + * Release all local references to shared stats entries. + * + * When a process exits it cannot do so while still holding references onto + * stats entries, otherwise the shared stats entries could never be freed. + */ +static void +pgstat_release_all_entry_refs(bool discard_pending) +{ + if (pgStatEntryRefHash == NULL) + return; + + pgstat_release_matching_entry_refs(discard_pending, NULL, 0); + Assert(pgStatEntryRefHash->members == 0); + pgstat_entry_ref_hash_destroy(pgStatEntryRefHash); + pgStatEntryRefHash = NULL; +} + +static bool +match_db(PgStat_EntryRefHashEntry *ent, Datum match_data) +{ + Oid dboid = DatumGetObjectId(match_data); + + return ent->key.dboid == dboid; +} + +static void +pgstat_release_db_entry_refs(Oid dboid) +{ + pgstat_release_matching_entry_refs( /* discard pending = */ true, + match_db, + ObjectIdGetDatum(dboid)); +} + + +/* ------------------------------------------------------------ + * Dropping and resetting of stats entries + * ------------------------------------------------------------ + */ + +static void +pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat) +{ + dsa_pointer pdsa; + + /* + * Fetch dsa pointer before deleting entry - that way we can free the + * memory after releasing the lock. + */ + pdsa = shent->body; + + if (!hstat) + dshash_delete_entry(pgStatLocal.shared_hash, shent); + else + dshash_delete_current(hstat); + + dsa_free(pgStatLocal.dsa, pdsa); +} + +/* + * Helper for both pgstat_drop_database_and_contents() and + * pgstat_drop_entry(). If hstat is non-null delete the shared entry using + * dshash_delete_current(), otherwise use dshash_delete_entry(). In either + * case the entry needs to be already locked. + */ +static bool +pgstat_drop_entry_internal(PgStatShared_HashEntry *shent, + dshash_seq_status *hstat) +{ + Assert(shent->body != InvalidDsaPointer); + + /* should already have released local reference */ + if (pgStatEntryRefHash) + Assert(!pgstat_entry_ref_hash_lookup(pgStatEntryRefHash, shent->key)); + + /* + * Signal that the entry is dropped - this will eventually cause other + * backends to release their references. 
+ */ + if (shent->dropped) + elog(ERROR, "can only drop stats once"); + shent->dropped = true; + + /* release refcount marking entry as not dropped */ + if (pg_atomic_sub_fetch_u32(&shent->refcount, 1) == 0) + { + pgstat_free_entry(shent, hstat); + return true; + } + else + { + if (!hstat) + dshash_release_lock(pgStatLocal.shared_hash, shent); + return false; + } +} + +/* + * Drop stats for the database and all the objects inside that database. + */ +static void +pgstat_drop_database_and_contents(Oid dboid) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + uint64 not_freed_count = 0; + + Assert(OidIsValid(dboid)); + + Assert(pgStatLocal.shared_hash != NULL); + + /* + * This backend might very well be the only backend holding a reference to + * about-to-be-dropped entries. Ensure that we're not preventing it from + * being cleaned up till later. + * + * Doing this separately from the dshash iteration below avoids having to + * do so while holding a partition lock on the shared hashtable. + */ + pgstat_release_db_entry_refs(dboid); + + /* some of the dshash entries are to be removed, take exclusive lock. */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, true); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + if (p->dropped) + continue; + + if (p->key.dboid != dboid) + continue; + + if (!pgstat_drop_entry_internal(p, &hstat)) + { + /* + * Even statistics for a dropped database might currently be + * accessed (consider e.g. database stats for pg_stat_database). + */ + not_freed_count++; + } + } + dshash_seq_term(&hstat); + + /* + * If some of the stats data could not be freed, signal the reference + * holders to run garbage collection of their cached pgStatShmLookupCache. + */ + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); +} + +bool +pgstat_drop_entry(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + PgStat_HashKey key = {.kind = kind,.dboid = dboid,.objoid = objoid}; + PgStatShared_HashEntry *shent; + bool freed = true; + + /* delete local reference */ + if (pgStatEntryRefHash) + { + PgStat_EntryRefHashEntry *lohashent = + pgstat_entry_ref_hash_lookup(pgStatEntryRefHash, key); + + if (lohashent) + pgstat_release_entry_ref(lohashent->key, lohashent->entry_ref, + true); + } + + /* mark entry in shared hashtable as deleted, drop if possible */ + shent = dshash_find(pgStatLocal.shared_hash, &key, true); + if (shent) + { + freed = pgstat_drop_entry_internal(shent, NULL); + + /* + * Database stats contain other stats. Drop those as well when + * dropping the database. XXX: Perhaps this should be done in a + * slightly more principled way? But not obvious what that'd look + * like, and so far this is the only case... 
+ */ + if (key.kind == PGSTAT_KIND_DATABASE) + pgstat_drop_database_and_contents(key.dboid); + } + + return freed; +} + +void +pgstat_drop_all_entries(void) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *ps; + uint64 not_freed_count = 0; + + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((ps = dshash_seq_next(&hstat)) != NULL) + { + if (ps->dropped) + continue; + + if (!pgstat_drop_entry_internal(ps, &hstat)) + not_freed_count++; + } + dshash_seq_term(&hstat); + + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); +} + +static void +shared_stat_reset_contents(PgStat_Kind kind, PgStatShared_Common *header, + TimestampTz ts) +{ + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + memset(pgstat_get_entry_data(kind, header), 0, + pgstat_get_entry_len(kind)); + + if (kind_info->reset_timestamp_cb) + kind_info->reset_timestamp_cb(header, ts); +} + +/* + * Reset one variable-numbered stats entry. + */ +void +pgstat_reset_entry(PgStat_Kind kind, Oid dboid, Oid objoid, TimestampTz ts) +{ + PgStat_EntryRef *entry_ref; + + Assert(!pgstat_get_kind_info(kind)->fixed_amount); + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL); + if (!entry_ref || entry_ref->shared_entry->dropped) + return; + + pgstat_lock_entry(entry_ref, false); + shared_stat_reset_contents(kind, entry_ref->shared_stats, ts); + pgstat_unlock_entry(entry_ref); +} + +/* + * Scan through the shared hashtable of stats, resetting statistics if + * approved by the provided do_reset() function. + */ +void +pgstat_reset_matching_entries(bool (*do_reset) (PgStatShared_HashEntry *, Datum), + Datum match_data, TimestampTz ts) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + + /* dshash entry is not modified, take shared lock */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + PgStatShared_Common *header; + + if (p->dropped) + continue; + + if (!do_reset(p, match_data)) + continue; + + header = dsa_get_address(pgStatLocal.dsa, p->body); + + LWLockAcquire(&header->lock, LW_EXCLUSIVE); + + shared_stat_reset_contents(p->key.kind, header, ts); + + LWLockRelease(&header->lock); + } + dshash_seq_term(&hstat); +} + +static bool +match_kind(PgStatShared_HashEntry *p, Datum match_data) +{ + return p->key.kind == DatumGetInt32(match_data); +} + +void +pgstat_reset_entries_of_kind(PgStat_Kind kind, TimestampTz ts) +{ + pgstat_reset_matching_entries(match_kind, Int32GetDatum(kind), ts); +} + +static void +pgstat_setup_memcxt(void) +{ + if (unlikely(!pgStatSharedRefContext)) + pgStatSharedRefContext = + AllocSetContextCreate(CacheMemoryContext, + "PgStat Shared Ref", + ALLOCSET_SMALL_SIZES); + if (unlikely(!pgStatEntryRefHashContext)) + pgStatEntryRefHashContext = + AllocSetContextCreate(CacheMemoryContext, + "PgStat Shared Ref Hash", + ALLOCSET_SMALL_SIZES); +} diff --git a/src/backend/utils/activity/pgstat_slru.c b/src/backend/utils/activity/pgstat_slru.c index d932bc74e0..d0b85b62a5 100644 --- a/src/backend/utils/activity/pgstat_slru.c +++ b/src/backend/utils/activity/pgstat_slru.c @@ -18,18 +18,21 @@ #include "postgres.h" #include "utils/pgstat_internal.h" +#include "utils/timestamp.h" -static inline PgStat_MsgSLRU *get_slru_entry(int slru_idx); +static inline PgStat_SLRUStats *get_slru_entry(int slru_idx); +static void pgstat_reset_slru_counter_internal(int index, TimestampTz ts); /* - * SLRU statistics counts waiting to be sent to the collector. 
These are - * stored directly in stats message format so they can be sent without needing - * to copy things around. We assume this variable inits to zeroes. Entries - * are one-to-one with slru_names[]. + * SLRU statistics counts waiting to be flushed out. We assume this variable + * inits to zeroes. Entries are one-to-one with slru_names[]. Changes of + * SLRU counters are reported within critical sections so we use static memory + * in order to avoid memory allocation. */ -static PgStat_MsgSLRU SLRUStats[SLRU_NUM_ELEMENTS]; +static PgStat_SLRUStats pending_SLRUStats[SLRU_NUM_ELEMENTS]; +bool have_slrustats = false; /* @@ -41,17 +44,11 @@ static PgStat_MsgSLRU SLRUStats[SLRU_NUM_ELEMENTS]; void pgstat_reset_slru(const char *name) { - PgStat_MsgResetslrucounter msg; + TimestampTz ts = GetCurrentTimestamp(); AssertArg(name != NULL); - if (pgStatSock == PGINVALID_SOCKET) - return; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSLRUCOUNTER); - msg.m_index = pgstat_get_slru_index(name); - - pgstat_send(&msg, sizeof(msg)); + pgstat_reset_slru_counter_internal(pgstat_get_slru_index(name), ts); } /* @@ -61,43 +58,55 @@ pgstat_reset_slru(const char *name) void pgstat_count_slru_page_zeroed(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_zeroed += 1; + get_slru_entry(slru_idx)->blocks_zeroed += 1; } void pgstat_count_slru_page_hit(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_hit += 1; + get_slru_entry(slru_idx)->blocks_hit += 1; } void pgstat_count_slru_page_exists(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_exists += 1; + get_slru_entry(slru_idx)->blocks_exists += 1; } void pgstat_count_slru_page_read(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_read += 1; + get_slru_entry(slru_idx)->blocks_read += 1; } void pgstat_count_slru_page_written(int slru_idx) { - get_slru_entry(slru_idx)->m_blocks_written += 1; + get_slru_entry(slru_idx)->blocks_written += 1; } void pgstat_count_slru_flush(int slru_idx) { - get_slru_entry(slru_idx)->m_flush += 1; + get_slru_entry(slru_idx)->flush += 1; } void pgstat_count_slru_truncate(int slru_idx) { - get_slru_entry(slru_idx)->m_truncate += 1; + get_slru_entry(slru_idx)->truncate += 1; +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the slru statistics struct. + */ +PgStat_SLRUStats * +pgstat_fetch_slru(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_SLRU); + + return pgStatLocal.snapshot.slru; } /* @@ -135,45 +144,81 @@ pgstat_get_slru_index(const char *name) } /* - * Send SLRU statistics to the collector + * Flush out locally pending SLRU stats entries + * + * If nowait is true, this function returns false on lock failure. Otherwise + * this function always returns true. Writer processes are mutually excluded + * using LWLock, but readers are expected to use change-count protocol to avoid + * interference with writers. + * + * If nowait is true, this function returns true if the lock could not be + * acquired. Otherwise return false. 
*/ -void -pgstat_send_slru(void) +bool +pgstat_slru_flush(bool nowait) { - /* We assume this initializes to zeroes */ - static const PgStat_MsgSLRU all_zeroes; + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + int i; - for (int i = 0; i < SLRU_NUM_ELEMENTS; i++) + if (!have_slrustats) + return false; + + if (!nowait) + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + else if (!LWLockConditionalAcquire(&stats_shmem->lock, LW_EXCLUSIVE)) + return true; + + for (i = 0; i < SLRU_NUM_ELEMENTS; i++) { - /* - * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. - */ - if (memcmp(&SLRUStats[i], &all_zeroes, sizeof(PgStat_MsgSLRU)) == 0) - continue; + PgStat_SLRUStats *sharedent = &stats_shmem->stats[i]; + PgStat_SLRUStats *pendingent = &pending_SLRUStats[i]; - /* set the SLRU type before each send */ - SLRUStats[i].m_index = i; - - /* - * Prepare and send the message - */ - pgstat_setheader(&SLRUStats[i].m_hdr, PGSTAT_MTYPE_SLRU); - pgstat_send(&SLRUStats[i], sizeof(PgStat_MsgSLRU)); - - /* - * Clear out the statistics buffer, so it can be re-used. - */ - MemSet(&SLRUStats[i], 0, sizeof(PgStat_MsgSLRU)); +#define SLRU_ACC(fld) sharedent->fld += pendingent->fld + SLRU_ACC(blocks_zeroed); + SLRU_ACC(blocks_hit); + SLRU_ACC(blocks_read); + SLRU_ACC(blocks_written); + SLRU_ACC(blocks_exists); + SLRU_ACC(flush); + SLRU_ACC(truncate); +#undef SLRU_ACC } + + /* done, clear the pending entry */ + MemSet(pending_SLRUStats, 0, sizeof(pending_SLRUStats)); + + LWLockRelease(&stats_shmem->lock); + + have_slrustats = false; + + return false; +} + +void +pgstat_slru_reset_all_cb(TimestampTz ts) +{ + for (int i = 0; i < SLRU_NUM_ELEMENTS; i++) + pgstat_reset_slru_counter_internal(i, ts); +} + +void +pgstat_slru_snapshot_cb(void) +{ + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + + memcpy(pgStatLocal.snapshot.slru, &stats_shmem->stats, + sizeof(stats_shmem->stats)); + + LWLockRelease(&stats_shmem->lock); } /* * Returns pointer to entry with counters for given SLRU (based on the name * stored in SlruCtl as lwlock tranche name). 
*/ -static inline PgStat_MsgSLRU * +static inline PgStat_SLRUStats * get_slru_entry(int slru_idx) { pgstat_assert_is_up(); @@ -186,5 +231,20 @@ get_slru_entry(int slru_idx) Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS)); - return &SLRUStats[slru_idx]; + have_slrustats = true; + + return &pending_SLRUStats[slru_idx]; +} + +static void +pgstat_reset_slru_counter_internal(int index, TimestampTz ts) +{ + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + + memset(&stats_shmem->stats[index], 0, sizeof(PgStat_SLRUStats)); + stats_shmem->stats[index].stat_reset_timestamp = ts; + + LWLockRelease(&stats_shmem->lock); } diff --git a/src/backend/utils/activity/pgstat_subscription.c b/src/backend/utils/activity/pgstat_subscription.c index 689029b30a..e1072bd5ba 100644 --- a/src/backend/utils/activity/pgstat_subscription.c +++ b/src/backend/utils/activity/pgstat_subscription.c @@ -26,12 +26,17 @@ void pgstat_report_subscription_error(Oid subid, bool is_apply_error) { - PgStat_MsgSubscriptionError msg; + PgStat_EntryRef *entry_ref; + PgStat_BackendSubEntry *pending; - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_SUBSCRIPTIONERROR); - msg.m_subid = subid; - msg.m_is_apply_error = is_apply_error; - pgstat_send(&msg, sizeof(PgStat_MsgSubscriptionError)); + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_SUBSCRIPTION, + InvalidOid, subid, NULL); + pending = entry_ref->pending; + + if (is_apply_error) + pending->apply_error_count++; + else + pending->sync_error_count++; } /* @@ -54,12 +59,52 @@ pgstat_create_subscription(Oid subid) void pgstat_drop_subscription(Oid subid) { - PgStat_MsgSubscriptionDrop msg; - - pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_SUBSCRIPTIONDROP); - msg.m_subid = subid; - pgstat_send(&msg, sizeof(PgStat_MsgSubscriptionDrop)); - pgstat_drop_transactional(PGSTAT_KIND_SUBSCRIPTION, InvalidOid, subid); } + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one subscription or NULL. + */ +PgStat_StatSubEntry * +pgstat_fetch_stat_subscription(Oid subid) +{ + return (PgStat_StatSubEntry *) + pgstat_fetch_entry(PGSTAT_KIND_SUBSCRIPTION, InvalidOid, subid); +} + +/* + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if lock could not + * immediately acquired, otherwise true is returned. + */ +bool +pgstat_subscription_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) +{ + PgStat_BackendSubEntry *localent; + PgStatShared_Subscription *shsubent; + + localent = (PgStat_BackendSubEntry *) entry_ref->pending; + shsubent = (PgStatShared_Subscription *) entry_ref->shared_stats; + + /* localent always has non-zero content */ + + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; + +#define SUB_ACC(fld) shsubent->stats.fld += localent->fld + SUB_ACC(apply_error_count); + SUB_ACC(sync_error_count); +#undef SUB_ACC + + pgstat_unlock_entry(entry_ref); + return true; +} + +void +pgstat_subscription_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_Subscription *) header)->stats.stat_reset_timestamp = ts; +} diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c index 8855598f52..5a878bd115 100644 --- a/src/backend/utils/activity/pgstat_wal.c +++ b/src/backend/utils/activity/pgstat_wal.c @@ -21,13 +21,7 @@ #include "executor/instrument.h" -/* - * WAL global statistics counters. 
Stored directly in a stats message - * structure so they can be sent without needing to copy things around. We - * assume these init to zeroes. - */ -PgStat_MsgWal WalStats; - +PgStat_WalStats PendingWalStats = {0}; /* * WAL usage counters saved from pgWALUsage at the previous call to @@ -39,101 +33,100 @@ static WalUsage prevWalUsage; /* - * Send WAL statistics to the collector. + * Calculate how much WAL usage counters have increased and update + * shared statistics. * - * If 'force' is not set, WAL stats message is only sent if enough time has - * passed since last one was sent to reach PGSTAT_STAT_INTERVAL. + * Must be called by processes that generate WAL, that do not call + * pgstat_report_stat(), like walwriter. */ void pgstat_report_wal(bool force) { - static TimestampTz sendTime = 0; + pgstat_flush_wal(force); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the WAL statistics struct. + */ +PgStat_WalStats * +pgstat_fetch_stat_wal(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_WAL); + + return &pgStatLocal.snapshot.wal; +} + +/* + * Calculate how much WAL usage counters have increased by subtracting the + * previous counters from the current ones. + * + * If nowait is true, this function returns true if the lock could not be + * acquired. Otherwise return false. + */ +bool +pgstat_flush_wal(bool nowait) +{ + PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal; + WalUsage diff = {0}; + + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + Assert(pgStatLocal.shmem != NULL && + !pgStatLocal.shmem->is_shutdown); /* - * This function can be called even if nothing at all has happened. In - * this case, avoid sending a completely empty message to the stats - * collector. - * - * Check wal_records counter to determine whether any WAL activity has - * happened since last time. Note that other WalUsage counters don't need - * to be checked because they are incremented always together with - * wal_records counter. - * - * m_wal_buffers_full also doesn't need to be checked because it's - * incremented only when at least one WAL record is generated (i.e., - * wal_records counter is incremented). But for safely, we assert that - * m_wal_buffers_full is always zero when no WAL record is generated - * - * This function can be called by a process like walwriter that normally - * generates no WAL records. To determine whether any WAL activity has - * happened at that process since the last time, the numbers of WAL writes - * and syncs are also checked. + * This function can be called even if nothing at all has happened. Avoid + * taking lock for nothing in that case. */ - if (pgWalUsage.wal_records == prevWalUsage.wal_records && - WalStats.m_wal_write == 0 && WalStats.m_wal_sync == 0) - { - Assert(WalStats.m_wal_buffers_full == 0); - return; - } - - if (!force) - { - TimestampTz now = GetCurrentTimestamp(); - - /* - * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL - * msec since we last sent one to avoid overloading the stats - * collector. - */ - if (!TimestampDifferenceExceeds(sendTime, now, PGSTAT_STAT_INTERVAL)) - return; - sendTime = now; - } + if (!pgstat_have_pending_wal()) + return false; /* - * Set the counters related to generated WAL data if the counters were - * updated. + * We don't update the WAL usage portion of the local WalStats elsewhere. + * Calculate how much WAL usage counters were increased by subtracting the + * previous counters from the current ones. 
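+	 * WalUsageAccumDiff() below computes that difference field by field into
+	 * "diff", which then seeds the wal_records, wal_fpi and wal_bytes members
+	 * of PendingWalStats.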
*/ - if (pgWalUsage.wal_records != prevWalUsage.wal_records) - { - WalUsage walusage; + WalUsageAccumDiff(&diff, &pgWalUsage, &prevWalUsage); + PendingWalStats.wal_records = diff.wal_records; + PendingWalStats.wal_fpi = diff.wal_fpi; + PendingWalStats.wal_bytes = diff.wal_bytes; - /* - * Calculate how much WAL usage counters were increased by subtracting - * the previous counters from the current ones. Fill the results in - * WAL stats message. - */ - MemSet(&walusage, 0, sizeof(WalUsage)); - WalUsageAccumDiff(&walusage, &pgWalUsage, &prevWalUsage); + if (!nowait) + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + else if (!LWLockConditionalAcquire(&stats_shmem->lock, LW_EXCLUSIVE)) + return true; - WalStats.m_wal_records = walusage.wal_records; - WalStats.m_wal_fpi = walusage.wal_fpi; - WalStats.m_wal_bytes = walusage.wal_bytes; +#define WALSTAT_ACC(fld) stats_shmem->stats.fld += PendingWalStats.fld + WALSTAT_ACC(wal_records); + WALSTAT_ACC(wal_fpi); + WALSTAT_ACC(wal_bytes); + WALSTAT_ACC(wal_buffers_full); + WALSTAT_ACC(wal_write); + WALSTAT_ACC(wal_sync); + WALSTAT_ACC(wal_write_time); + WALSTAT_ACC(wal_sync_time); +#undef WALSTAT_ACC - /* - * Save the current counters for the subsequent calculation of WAL - * usage. - */ - prevWalUsage = pgWalUsage; - } + LWLockRelease(&stats_shmem->lock); /* - * Prepare and send the message + * Save the current counters for the subsequent calculation of WAL usage. */ - pgstat_setheader(&WalStats.m_hdr, PGSTAT_MTYPE_WAL); - pgstat_send(&WalStats, sizeof(WalStats)); + prevWalUsage = pgWalUsage; /* * Clear out the statistics buffer, so it can be re-used. */ - MemSet(&WalStats, 0, sizeof(WalStats)); + MemSet(&PendingWalStats, 0, sizeof(PendingWalStats)); + + return false; } void pgstat_init_wal(void) { /* - * Initialize prevWalUsage with pgWalUsage so that pgstat_report_wal() can + * Initialize prevWalUsage with pgWalUsage so that pgstat_flush_wal() can * calculate how much pgWalUsage counters are increased by subtracting * prevWalUsage from pgWalUsage. 
*/ @@ -151,6 +144,28 @@ bool pgstat_have_pending_wal(void) { return pgWalUsage.wal_records != prevWalUsage.wal_records || - WalStats.m_wal_write != 0 || - WalStats.m_wal_sync != 0; + PendingWalStats.wal_write != 0 || + PendingWalStats.wal_sync != 0; +} + +void +pgstat_wal_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal; + + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + memset(&stats_shmem->stats, 0, sizeof(stats_shmem->stats)); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_wal_snapshot_cb(void) +{ + PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal; + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&pgStatLocal.snapshot.wal, &stats_shmem->stats, + sizeof(pgStatLocal.snapshot.wal)); + LWLockRelease(&stats_shmem->lock); } diff --git a/src/backend/utils/activity/pgstat_xact.c b/src/backend/utils/activity/pgstat_xact.c index 3f33087378..230ffa5afc 100644 --- a/src/backend/utils/activity/pgstat_xact.c +++ b/src/backend/utils/activity/pgstat_xact.c @@ -68,6 +68,7 @@ static void AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) { dlist_mutable_iter iter; + int not_freed_count = 0; if (xact_state->pending_drops_count == 0) { @@ -79,6 +80,7 @@ AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) { PgStat_PendingDroppedStatsItem *pending = dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur); + xl_xact_stats_item *it = &pending->item; if (isCommit && !pending->is_create) { @@ -86,7 +88,8 @@ AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) * Transaction that dropped an object committed. Drop the stats * too. */ - /* will do work in subsequent commit */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; } else if (!isCommit && pending->is_create) { @@ -94,13 +97,17 @@ AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) * Transaction that created an object aborted. Drop the stats * associated with the object. */ - /* will do work in subsequent commit */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; } dlist_delete(&pending->node); xact_state->pending_drops_count--; pfree(pending); } + + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); } /* @@ -135,6 +142,7 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, { PgStat_SubXactStatus *parent_xact_state; dlist_mutable_iter iter; + int not_freed_count = 0; if (xact_state->pending_drops_count == 0) return; @@ -145,6 +153,7 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, { PgStat_PendingDroppedStatsItem *pending = dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur); + xl_xact_stats_item *it = &pending->item; dlist_delete(&pending->node); xact_state->pending_drops_count--; @@ -155,7 +164,8 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, * Subtransaction creating a new stats object aborted. Drop the * stats object. 
*/ - /* will do work in subsequent commit */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; pfree(pending); } else if (isCommit) @@ -175,6 +185,8 @@ AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, } Assert(xact_state->pending_drops_count == 0); + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); } /* @@ -307,13 +319,21 @@ pgstat_get_transactional_drops(bool isCommit, xl_xact_stats_item **items) void pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_item *items, bool is_redo) { + int not_freed_count = 0; + if (ndrops == 0) return; for (int i = 0; i < ndrops; i++) { - /* will do work in subsequent commit */ + xl_xact_stats_item *it = &items[i]; + + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; } + + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); } static void @@ -345,6 +365,15 @@ create_drop_transactional_internal(PgStat_Kind kind, Oid dboid, Oid objoid, bool void pgstat_create_transactional(PgStat_Kind kind, Oid dboid, Oid objoid) { + if (pgstat_get_entry_ref(kind, dboid, objoid, false, NULL)) + { + ereport(WARNING, + errmsg("resetting existing stats for type %s, db=%d, oid=%d", + (pgstat_get_kind_info(kind))->name, dboid, objoid)); + + pgstat_reset(kind, dboid, objoid); + } + create_drop_transactional_internal(kind, dboid, objoid, /* create */ true); } diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 1c8aba4925..87c15b9c6f 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -230,9 +230,6 @@ pgstat_get_wait_activity(WaitEventActivity w) case WAIT_EVENT_LOGICAL_LAUNCHER_MAIN: event_name = "LogicalLauncherMain"; break; - case WAIT_EVENT_PGSTAT_MAIN: - event_name = "PgStatMain"; - break; case WAIT_EVENT_RECOVERY_WAL_STREAM: event_name = "RecoveryWalStream"; break; diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index be5470a107..248d318f86 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -2046,7 +2046,15 @@ pg_stat_get_xact_function_self_time(PG_FUNCTION_ARGS) Datum pg_stat_get_snapshot_timestamp(PG_FUNCTION_ARGS) { - PG_RETURN_TIMESTAMPTZ(pgstat_fetch_global()->stats_timestamp); + bool have_snapshot; + TimestampTz ts; + + ts = pgstat_get_stat_snapshot_timestamp(&have_snapshot); + + if (!have_snapshot) + PG_RETURN_NULL(); + + PG_RETURN_TIMESTAMPTZ(ts); } /* Discard the active statistics snapshot */ diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index a15ce9edb1..1f29670a13 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -73,6 +73,7 @@ #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "optimizer/optimizer.h" +#include "pgstat.h" #include "rewrite/rewriteDefine.h" #include "rewrite/rowsecurity.h" #include "storage/lmgr.h" @@ -2409,6 +2410,9 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) */ RelationCloseSmgr(relation); + /* break mutual link with stats entry */ + pgstat_unlink_relation(relation); + /* * Free all the subsidiary data structures of the relcache entry, then the * entry itself. 
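
Together with the rel->pgstat_enabled flag added in rel.h, the pgstat_unlink_relation() call above makes the Relation/stats link explicit in both directions: pgstat_assoc_relation() is expected to establish ->pgstat_info lazily at the first counted access (see the reworked pgstat_should_count_relation() macro in pgstat.h), and destroying the relcache entry breaks the link again. A minimal sketch of a counting site under that scheme; the function name is hypothetical and the t_numscans counter is taken from the existing PgStat_TableCounts definition, which is outside the hunks shown here:

    #include "postgres.h"
    #include "pgstat.h"
    #include "utils/rel.h"

    /* hypothetical counting site; the real ones are the pgstat_count_* macros */
    static inline void
    example_count_scan(Relation rel)
    {
        if (pgstat_should_count_relation(rel))
            rel->pgstat_info->t_counts.t_numscans++;
    }
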
@@ -2716,8 +2720,9 @@ RelationClearRelation(Relation relation, bool rebuild) SWAPFIELD(RowSecurityDesc *, rd_rsdesc); /* toast OID override must be preserved */ SWAPFIELD(Oid, rd_toastoid); - /* pgstat_info must be preserved */ + /* pgstat_info / enabled must be preserved */ SWAPFIELD(struct PgStat_TableStatus *, pgstat_info); + SWAPFIELD(bool, pgstat_enabled); /* preserve old partition key if we have one */ if (keep_partkey) { diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 3419c099b2..1a5d29ac9b 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -36,6 +36,7 @@ volatile sig_atomic_t IdleInTransactionSessionTimeoutPending = false; volatile sig_atomic_t IdleSessionTimeoutPending = false; volatile sig_atomic_t ProcSignalBarrierPending = false; volatile sig_atomic_t LogMemoryContextPending = false; +volatile sig_atomic_t IdleStatsUpdateTimeoutPending = false; volatile uint32 InterruptHoldoffCount = 0; volatile uint32 QueryCancelHoldoffCount = 0; volatile uint32 CritSectionCount = 0; diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index bdc77af719..0d3cfe8240 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -288,9 +288,6 @@ GetBackendTypeDesc(BackendType backendType) case B_ARCHIVER: backendDesc = "archiver"; break; - case B_STATS_COLLECTOR: - backendDesc = "stats collector"; - break; case B_LOGGER: backendDesc = "logger"; break; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 342169b195..a85c2e0260 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -80,6 +80,7 @@ static void StatementTimeoutHandler(void); static void LockTimeoutHandler(void); static void IdleInTransactionSessionTimeoutHandler(void); static void IdleSessionTimeoutHandler(void); +static void IdleStatsUpdateTimeoutHandler(void); static void ClientCheckTimeoutHandler(void); static bool ThereIsAtLeastOneRole(void); static void process_startup_options(Port *port, bool am_superuser); @@ -725,6 +726,8 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, IdleInTransactionSessionTimeoutHandler); RegisterTimeout(IDLE_SESSION_TIMEOUT, IdleSessionTimeoutHandler); RegisterTimeout(CLIENT_CONNECTION_CHECK_TIMEOUT, ClientCheckTimeoutHandler); + RegisterTimeout(IDLE_STATS_UPDATE_TIMEOUT, + IdleStatsUpdateTimeoutHandler); } /* @@ -752,6 +755,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, * Use before_shmem_exit() so that ShutdownXLOG() can rely on DSM * segments etc to work (which in turn is required for pgstats). 
*/ + before_shmem_exit(pgstat_before_server_shutdown, 0); before_shmem_exit(ShutdownXLOG, 0); } @@ -1334,6 +1338,14 @@ IdleSessionTimeoutHandler(void) SetLatch(MyLatch); } +static void +IdleStatsUpdateTimeoutHandler(void) +{ + IdleStatsUpdateTimeoutPending = true; + InterruptPending = true; + SetLatch(MyLatch); +} + static void ClientCheckTimeoutHandler(void) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5538465d7d..f7758ea4a7 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -375,6 +375,16 @@ static const struct config_enum_entry track_function_options[] = { StaticAssertDecl(lengthof(track_function_options) == (TRACK_FUNC_ALL + 2), "array length mismatch"); +static const struct config_enum_entry stats_fetch_consistency[] = { + {"none", PGSTAT_FETCH_CONSISTENCY_NONE, false}, + {"cache", PGSTAT_FETCH_CONSISTENCY_CACHE, false}, + {"snapshot", PGSTAT_FETCH_CONSISTENCY_SNAPSHOT, false}, + {NULL, 0, false} +}; + +StaticAssertDecl(lengthof(stats_fetch_consistency) == (PGSTAT_FETCH_CONSISTENCY_SNAPSHOT + 2), + "array length mismatch"); + static const struct config_enum_entry xmlbinary_options[] = { {"base64", XMLBINARY_BASE64, false}, {"hex", XMLBINARY_HEX, false}, @@ -4918,6 +4928,17 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + + { + {"stats_fetch_consistency", PGC_USERSET, STATS_COLLECTOR, + gettext_noop("Sets the consistency of accesses to statistics data"), + NULL + }, + &pgstat_fetch_consistency, + PGSTAT_FETCH_CONSISTENCY_CACHE, stats_fetch_consistency, + NULL, NULL, NULL + }, + { {"wal_compression", PGC_SUSET, WAL_SETTINGS, gettext_noop("Compresses full-page writes written in WAL file with specified method."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 93d221a37b..5f9a37bed3 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -614,6 +614,7 @@ #track_wal_io_timing = off #track_functions = none # none, pl, all #stats_temp_directory = 'pg_stat_tmp' +#stats_fetch_consistency = none # - Monitoring - diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 9321d7f264..66c404c666 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -94,6 +94,7 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleInTransactionSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending; extern PGDLLIMPORT volatile sig_atomic_t LogMemoryContextPending; +extern PGDLLIMPORT volatile sig_atomic_t IdleStatsUpdateTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending; extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost; @@ -333,7 +334,6 @@ typedef enum BackendType B_WAL_SENDER, B_WAL_WRITER, B_ARCHIVER, - B_STATS_COLLECTOR, B_LOGGER, } BackendType; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 99115bacde..1d2d3de86c 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -14,10 +14,8 @@ #include "datatype/timestamp.h" #include "portability/instr_time.h" #include "postmaster/pgarch.h" /* for MAX_XFN_CHARS */ -#include "replication/logicalproto.h" #include "utils/backend_progress.h" /* for backward compatibility */ #include "utils/backend_status.h" /* for backward compatibility */ -#include "utils/hsearch.h" #include "utils/relcache.h" #include "utils/wait_event.h" /* for backward compatibility */ @@ -27,8 +25,8 @@ * 
---------- */ #define PGSTAT_STAT_PERMANENT_DIRECTORY "pg_stat" -#define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat" -#define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp" +#define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/pgstat.stat" +#define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/pgstat.tmp" /* Default directory to store temporary statistics data in */ #define PG_STAT_TMP_DIR "pg_stat_tmp" @@ -66,6 +64,13 @@ typedef enum TrackFunctionsLevel TRACK_FUNC_ALL } TrackFunctionsLevel; +typedef enum PgStat_FetchConsistency +{ + PGSTAT_FETCH_CONSISTENCY_NONE, + PGSTAT_FETCH_CONSISTENCY_CACHE, + PGSTAT_FETCH_CONSISTENCY_SNAPSHOT, +} PgStat_FetchConsistency; + /* Values to track the cause of session termination */ typedef enum SessionEndType { @@ -92,7 +97,7 @@ typedef int64 PgStat_Counter; * PgStat_FunctionCounts The actual per-function counts kept by a backend * * This struct should contain only actual event counters, because we memcmp - * it against zeroes to detect whether there are any counts to transmit. + * it against zeroes to detect whether there are any pending stats. * * Note that the time counters are in instr_time format here. We convert to * microseconds in PgStat_Counter format when flushing out pending statistics. @@ -106,12 +111,11 @@ typedef struct PgStat_FunctionCounts } PgStat_FunctionCounts; /* ---------- - * PgStat_BackendFunctionEntry Entry in backend's per-function hash table + * PgStat_BackendFunctionEntry Non-flushed function stats. * ---------- */ typedef struct PgStat_BackendFunctionEntry { - Oid f_id; PgStat_FunctionCounts f_counts; } PgStat_BackendFunctionEntry; @@ -131,13 +135,22 @@ typedef struct PgStat_FunctionCallUsage instr_time f_start; } PgStat_FunctionCallUsage; +/* ---------- + * PgStat_BackendSubEntry Non-flushed subscription stats. + * ---------- + */ +typedef struct PgStat_BackendSubEntry +{ + PgStat_Counter apply_error_count; + PgStat_Counter sync_error_count; +} PgStat_BackendSubEntry; + /* ---------- * PgStat_TableCounts The actual per-table counts kept by a backend * * This struct should contain only actual event counters, because we memcmp - * it against zeroes to detect whether there are any counts to transmit. - * It is a component of PgStat_TableStatus (within-backend state) and - * PgStat_TableEntry (the transmitted message format). + * it against zeroes to detect whether there are any stats updates to apply. + * It is a component of PgStat_TableStatus (within-backend state). * * Note: for a table, tuples_returned is the number of tuples successfully * fetched by heap_getnext, while tuples_fetched is the number of tuples @@ -194,6 +207,7 @@ typedef struct PgStat_TableStatus bool t_shared; /* is it a shared catalog? 
*/ struct PgStat_TableXactStatus *trans; /* lowest subxact's counts */ PgStat_TableCounts t_counts; /* event counts to be sent */ + Relation relation; /* rel that is using this entry */ } PgStat_TableStatus; /* ---------- @@ -221,569 +235,14 @@ typedef struct PgStat_TableXactStatus /* ------------------------------------------------------------ - * Message formats follow - * ------------------------------------------------------------ - */ - -/* ---------- - * The types of backend -> collector messages - * ---------- - */ -typedef enum StatMsgType -{ - PGSTAT_MTYPE_DUMMY, - PGSTAT_MTYPE_INQUIRY, - PGSTAT_MTYPE_TABSTAT, - PGSTAT_MTYPE_TABPURGE, - PGSTAT_MTYPE_DROPDB, - PGSTAT_MTYPE_RESETCOUNTER, - PGSTAT_MTYPE_RESETSHAREDCOUNTER, - PGSTAT_MTYPE_RESETSINGLECOUNTER, - PGSTAT_MTYPE_RESETSLRUCOUNTER, - PGSTAT_MTYPE_RESETREPLSLOTCOUNTER, - PGSTAT_MTYPE_RESETSUBCOUNTER, - PGSTAT_MTYPE_AUTOVAC_START, - PGSTAT_MTYPE_VACUUM, - PGSTAT_MTYPE_ANALYZE, - PGSTAT_MTYPE_ARCHIVER, - PGSTAT_MTYPE_BGWRITER, - PGSTAT_MTYPE_CHECKPOINTER, - PGSTAT_MTYPE_WAL, - PGSTAT_MTYPE_SLRU, - PGSTAT_MTYPE_FUNCSTAT, - PGSTAT_MTYPE_FUNCPURGE, - PGSTAT_MTYPE_RECOVERYCONFLICT, - PGSTAT_MTYPE_TEMPFILE, - PGSTAT_MTYPE_DEADLOCK, - PGSTAT_MTYPE_CHECKSUMFAILURE, - PGSTAT_MTYPE_REPLSLOT, - PGSTAT_MTYPE_CONNECT, - PGSTAT_MTYPE_DISCONNECT, - PGSTAT_MTYPE_SUBSCRIPTIONDROP, - PGSTAT_MTYPE_SUBSCRIPTIONERROR, -} StatMsgType; - -/* ---------- - * PgStat_MsgHdr The common message header - * ---------- - */ -typedef struct PgStat_MsgHdr -{ - StatMsgType m_type; - int m_size; -} PgStat_MsgHdr; - -/* ---------- - * Space available in a message. This will keep the UDP packets below 1K, - * which should fit unfragmented into the MTU of the loopback interface. - * (Larger values of PGSTAT_MAX_MSG_SIZE would work for that on most - * platforms, but we're being conservative here.) - * ---------- - */ -#define PGSTAT_MAX_MSG_SIZE 1000 -#define PGSTAT_MSG_PAYLOAD (PGSTAT_MAX_MSG_SIZE - sizeof(PgStat_MsgHdr)) - - -/* ---------- - * PgStat_MsgDummy A dummy message, ignored by the collector - * ---------- - */ -typedef struct PgStat_MsgDummy -{ - PgStat_MsgHdr m_hdr; -} PgStat_MsgDummy; - -/* ---------- - * PgStat_MsgInquiry Sent by a backend to ask the collector - * to write the stats file(s). - * - * Ordinarily, an inquiry message prompts writing of the global stats file, - * the stats file for shared catalogs, and the stats file for the specified - * database. If databaseid is InvalidOid, only the first two are written. - * - * New file(s) will be written only if the existing file has a timestamp - * older than the specified cutoff_time; this prevents duplicated effort - * when multiple requests arrive at nearly the same time, assuming that - * backends send requests with cutoff_times a little bit in the past. - * - * clock_time should be the requestor's current local time; the collector - * uses this to check for the system clock going backward, but it has no - * effect unless that occurs. We assume clock_time >= cutoff_time, though. 
- * ---------- - */ -typedef struct PgStat_MsgInquiry -{ - PgStat_MsgHdr m_hdr; - TimestampTz clock_time; /* observed local clock time */ - TimestampTz cutoff_time; /* minimum acceptable file timestamp */ - Oid databaseid; /* requested DB (InvalidOid => shared only) */ -} PgStat_MsgInquiry; - -/* ---------- - * PgStat_TableEntry Per-table info in a MsgTabstat - * ---------- - */ -typedef struct PgStat_TableEntry -{ - Oid t_id; - PgStat_TableCounts t_counts; -} PgStat_TableEntry; - -/* ---------- - * PgStat_MsgTabstat Sent by the backend to report table - * and buffer access statistics. - * ---------- - */ -#define PGSTAT_NUM_TABENTRIES \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - 3 * sizeof(int) - 5 * sizeof(PgStat_Counter)) \ - / sizeof(PgStat_TableEntry)) - -typedef struct PgStat_MsgTabstat -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - int m_xact_commit; - int m_xact_rollback; - PgStat_Counter m_block_read_time; /* times in microseconds */ - PgStat_Counter m_block_write_time; - PgStat_Counter m_session_time; - PgStat_Counter m_active_time; - PgStat_Counter m_idle_in_xact_time; - PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES]; -} PgStat_MsgTabstat; - -/* ---------- - * PgStat_MsgTabpurge Sent by the backend to tell the collector - * about dead tables. - * ---------- - */ -#define PGSTAT_NUM_TABPURGE \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(Oid)) - -typedef struct PgStat_MsgTabpurge -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - Oid m_tableid[PGSTAT_NUM_TABPURGE]; -} PgStat_MsgTabpurge; - -/* ---------- - * PgStat_MsgDropdb Sent by the backend to tell the collector - * about a dropped database - * ---------- - */ -typedef struct PgStat_MsgDropdb -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgDropdb; - -/* ---------- - * PgStat_MsgResetcounter Sent by the backend to tell the collector - * to reset counters - * ---------- - */ -typedef struct PgStat_MsgResetcounter -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgResetcounter; - -/* ---------- - * PgStat_MsgResetsharedcounter Sent by the backend to tell the collector - * to reset a shared counter - * ---------- - */ -typedef struct PgStat_MsgResetsharedcounter -{ - PgStat_MsgHdr m_hdr; - PgStat_Kind m_resettarget; -} PgStat_MsgResetsharedcounter; - -/* ---------- - * PgStat_MsgResetsinglecounter Sent by the backend to tell the collector - * to reset a single counter - * ---------- - */ -typedef struct PgStat_MsgResetsinglecounter -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - PgStat_Kind m_resettype; - Oid m_objectid; -} PgStat_MsgResetsinglecounter; - -/* ---------- - * PgStat_MsgResetslrucounter Sent by the backend to tell the collector - * to reset a SLRU counter - * ---------- - */ -typedef struct PgStat_MsgResetslrucounter -{ - PgStat_MsgHdr m_hdr; - int m_index; -} PgStat_MsgResetslrucounter; - -/* ---------- - * PgStat_MsgResetreplslotcounter Sent by the backend to tell the collector - * to reset replication slot counter(s) - * ---------- - */ -typedef struct PgStat_MsgResetreplslotcounter -{ - PgStat_MsgHdr m_hdr; - NameData m_slotname; - bool clearall; -} PgStat_MsgResetreplslotcounter; - -/* ---------- - * PgStat_MsgResetsubcounter Sent by the backend to tell the collector - * to reset subscription counter(s) - * ---------- - */ -typedef struct PgStat_MsgResetsubcounter -{ - PgStat_MsgHdr m_hdr; - Oid m_subid; /* InvalidOid means reset all subscription - * stats */ -} PgStat_MsgResetsubcounter; - -/* ---------- - * PgStat_MsgAutovacStart Sent by 
the autovacuum daemon to signal - * that a database is going to be processed - * ---------- - */ -typedef struct PgStat_MsgAutovacStart -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - TimestampTz m_start_time; -} PgStat_MsgAutovacStart; - -/* ---------- - * PgStat_MsgVacuum Sent by the backend or autovacuum daemon - * after VACUUM - * ---------- - */ -typedef struct PgStat_MsgVacuum -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - Oid m_tableoid; - bool m_autovacuum; - TimestampTz m_vacuumtime; - PgStat_Counter m_live_tuples; - PgStat_Counter m_dead_tuples; -} PgStat_MsgVacuum; - -/* ---------- - * PgStat_MsgAnalyze Sent by the backend or autovacuum daemon - * after ANALYZE - * ---------- - */ -typedef struct PgStat_MsgAnalyze -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - Oid m_tableoid; - bool m_autovacuum; - bool m_resetcounter; - TimestampTz m_analyzetime; - PgStat_Counter m_live_tuples; - PgStat_Counter m_dead_tuples; -} PgStat_MsgAnalyze; - -/* ---------- - * PgStat_MsgArchiver Sent by the archiver to update statistics. - * ---------- - */ -typedef struct PgStat_MsgArchiver -{ - PgStat_MsgHdr m_hdr; - bool m_failed; /* Failed attempt */ - char m_xlog[MAX_XFN_CHARS + 1]; - TimestampTz m_timestamp; -} PgStat_MsgArchiver; - -/* ---------- - * PgStat_MsgBgWriter Sent by the bgwriter to update statistics. - * ---------- - */ -typedef struct PgStat_MsgBgWriter -{ - PgStat_MsgHdr m_hdr; - - PgStat_Counter m_buf_written_clean; - PgStat_Counter m_maxwritten_clean; - PgStat_Counter m_buf_alloc; -} PgStat_MsgBgWriter; - -/* ---------- - * PgStat_MsgCheckpointer Sent by the checkpointer to update statistics. - * ---------- - */ -typedef struct PgStat_MsgCheckpointer -{ - PgStat_MsgHdr m_hdr; - - PgStat_Counter m_timed_checkpoints; - PgStat_Counter m_requested_checkpoints; - PgStat_Counter m_buf_written_checkpoints; - PgStat_Counter m_buf_written_backend; - PgStat_Counter m_buf_fsync_backend; - PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */ - PgStat_Counter m_checkpoint_sync_time; -} PgStat_MsgCheckpointer; - -/* ---------- - * PgStat_MsgWal Sent by backends and background processes to update WAL statistics. - * ---------- - */ -typedef struct PgStat_MsgWal -{ - PgStat_MsgHdr m_hdr; - PgStat_Counter m_wal_records; - PgStat_Counter m_wal_fpi; - uint64 m_wal_bytes; - PgStat_Counter m_wal_buffers_full; - PgStat_Counter m_wal_write; - PgStat_Counter m_wal_sync; - PgStat_Counter m_wal_write_time; /* time spent writing wal records in - * microseconds */ - PgStat_Counter m_wal_sync_time; /* time spent syncing wal records in - * microseconds */ -} PgStat_MsgWal; - -/* ---------- - * PgStat_MsgSLRU Sent by a backend to update SLRU statistics. - * ---------- - */ -typedef struct PgStat_MsgSLRU -{ - PgStat_MsgHdr m_hdr; - PgStat_Counter m_index; - PgStat_Counter m_blocks_zeroed; - PgStat_Counter m_blocks_hit; - PgStat_Counter m_blocks_read; - PgStat_Counter m_blocks_written; - PgStat_Counter m_blocks_exists; - PgStat_Counter m_flush; - PgStat_Counter m_truncate; -} PgStat_MsgSLRU; - -/* ---------- - * PgStat_MsgReplSlot Sent by a backend or a wal sender to update replication - * slot statistics. 
- * ---------- - */ -typedef struct PgStat_MsgReplSlot -{ - PgStat_MsgHdr m_hdr; - NameData m_slotname; - bool m_create; - bool m_drop; - PgStat_Counter m_spill_txns; - PgStat_Counter m_spill_count; - PgStat_Counter m_spill_bytes; - PgStat_Counter m_stream_txns; - PgStat_Counter m_stream_count; - PgStat_Counter m_stream_bytes; - PgStat_Counter m_total_txns; - PgStat_Counter m_total_bytes; -} PgStat_MsgReplSlot; - -/* ---------- - * PgStat_MsgSubscriptionDrop Sent by the backend and autovacuum to tell the - * collector about the dead subscription. - * ---------- - */ -typedef struct PgStat_MsgSubscriptionDrop -{ - PgStat_MsgHdr m_hdr; - Oid m_subid; -} PgStat_MsgSubscriptionDrop; - -/* ---------- - * PgStat_MsgSubscriptionError Sent by the apply worker or the table sync - * worker to report an error on the subscription. - * ---------- - */ -typedef struct PgStat_MsgSubscriptionError -{ - PgStat_MsgHdr m_hdr; - - Oid m_subid; - bool m_is_apply_error; -} PgStat_MsgSubscriptionError; - -/* ---------- - * PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict - * ---------- - */ -typedef struct PgStat_MsgRecoveryConflict -{ - PgStat_MsgHdr m_hdr; - - Oid m_databaseid; - int m_reason; -} PgStat_MsgRecoveryConflict; - -/* ---------- - * PgStat_MsgTempFile Sent by the backend upon creating a temp file - * ---------- - */ -typedef struct PgStat_MsgTempFile -{ - PgStat_MsgHdr m_hdr; - - Oid m_databaseid; - size_t m_filesize; -} PgStat_MsgTempFile; - -/* ---------- - * PgStat_FunctionEntry Per-function info in a MsgFuncstat - * ---------- - */ -typedef struct PgStat_FunctionEntry -{ - Oid f_id; - PgStat_Counter f_numcalls; - PgStat_Counter f_total_time; /* times in microseconds */ - PgStat_Counter f_self_time; -} PgStat_FunctionEntry; - -/* ---------- - * PgStat_MsgFuncstat Sent by the backend to report function - * usage statistics. - * ---------- - */ -#define PGSTAT_NUM_FUNCENTRIES \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(PgStat_FunctionEntry)) - -typedef struct PgStat_MsgFuncstat -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - PgStat_FunctionEntry m_entry[PGSTAT_NUM_FUNCENTRIES]; -} PgStat_MsgFuncstat; - -/* ---------- - * PgStat_MsgFuncpurge Sent by the backend to tell the collector - * about dead functions. - * ---------- - */ -#define PGSTAT_NUM_FUNCPURGE \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(Oid)) - -typedef struct PgStat_MsgFuncpurge -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - Oid m_functionid[PGSTAT_NUM_FUNCPURGE]; -} PgStat_MsgFuncpurge; - -/* ---------- - * PgStat_MsgDeadlock Sent by the backend to tell the collector - * about a deadlock that occurred. - * ---------- - */ -typedef struct PgStat_MsgDeadlock -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgDeadlock; - -/* ---------- - * PgStat_MsgChecksumFailure Sent by the backend to tell the collector - * about checksum failures noticed. 
- * ---------- - */ -typedef struct PgStat_MsgChecksumFailure -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_failurecount; - TimestampTz m_failure_time; -} PgStat_MsgChecksumFailure; - -/* ---------- - * PgStat_MsgConnect Sent by the backend upon connection - * establishment - * ---------- - */ -typedef struct PgStat_MsgConnect -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; -} PgStat_MsgConnect; - -/* ---------- - * PgStat_MsgDisconnect Sent by the backend when disconnecting - * ---------- - */ -typedef struct PgStat_MsgDisconnect -{ - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - SessionEndType m_cause; -} PgStat_MsgDisconnect; - -/* ---------- - * PgStat_Msg Union over all possible messages. - * ---------- - */ -typedef union PgStat_Msg -{ - PgStat_MsgHdr msg_hdr; - PgStat_MsgDummy msg_dummy; - PgStat_MsgInquiry msg_inquiry; - PgStat_MsgTabstat msg_tabstat; - PgStat_MsgTabpurge msg_tabpurge; - PgStat_MsgDropdb msg_dropdb; - PgStat_MsgResetcounter msg_resetcounter; - PgStat_MsgResetsharedcounter msg_resetsharedcounter; - PgStat_MsgResetsinglecounter msg_resetsinglecounter; - PgStat_MsgResetslrucounter msg_resetslrucounter; - PgStat_MsgResetreplslotcounter msg_resetreplslotcounter; - PgStat_MsgResetsubcounter msg_resetsubcounter; - PgStat_MsgAutovacStart msg_autovacuum_start; - PgStat_MsgVacuum msg_vacuum; - PgStat_MsgAnalyze msg_analyze; - PgStat_MsgArchiver msg_archiver; - PgStat_MsgBgWriter msg_bgwriter; - PgStat_MsgCheckpointer msg_checkpointer; - PgStat_MsgWal msg_wal; - PgStat_MsgSLRU msg_slru; - PgStat_MsgFuncstat msg_funcstat; - PgStat_MsgFuncpurge msg_funcpurge; - PgStat_MsgRecoveryConflict msg_recoveryconflict; - PgStat_MsgDeadlock msg_deadlock; - PgStat_MsgTempFile msg_tempfile; - PgStat_MsgChecksumFailure msg_checksumfailure; - PgStat_MsgReplSlot msg_replslot; - PgStat_MsgConnect msg_connect; - PgStat_MsgDisconnect msg_disconnect; - PgStat_MsgSubscriptionError msg_subscriptionerror; - PgStat_MsgSubscriptionDrop msg_subscriptiondrop; -} PgStat_Msg; - - -/* ------------------------------------------------------------ - * Statistic collector data structures follow + * Data structures on disk and in shared memory follow * * PGSTAT_FILE_FORMAT_ID should be changed whenever any of these * data structures change. * ------------------------------------------------------------ */ -#define PGSTAT_FILE_FORMAT_ID 0x01A5BCA6 +#define PGSTAT_FILE_FORMAT_ID 0x01A5BCA7 typedef struct PgStat_ArchiverStats { @@ -808,7 +267,6 @@ typedef struct PgStat_BgWriterStats typedef struct PgStat_CheckpointerStats { - TimestampTz stats_timestamp; /* time of stats file update */ PgStat_Counter timed_checkpoints; PgStat_Counter requested_checkpoints; PgStat_Counter checkpoint_write_time; /* times in milliseconds */ @@ -820,7 +278,6 @@ typedef struct PgStat_CheckpointerStats typedef struct PgStat_StatDBEntry { - Oid databaseid; PgStat_Counter n_xact_commit; PgStat_Counter n_xact_rollback; PgStat_Counter n_blocks_fetched; @@ -852,34 +309,16 @@ typedef struct PgStat_StatDBEntry PgStat_Counter n_sessions_killed; TimestampTz stat_reset_timestamp; - TimestampTz stats_timestamp; /* time of db stats file update */ - - /* - * tables and functions must be last in the struct, because we don't write - * the pointers out to the stats file. 
- */ - HTAB *tables; - HTAB *functions; } PgStat_StatDBEntry; typedef struct PgStat_StatFuncEntry { - Oid functionid; - PgStat_Counter f_numcalls; PgStat_Counter f_total_time; /* times in microseconds */ PgStat_Counter f_self_time; } PgStat_StatFuncEntry; -typedef struct PgStat_GlobalStats -{ - TimestampTz stats_timestamp; /* time of stats file update */ - - PgStat_CheckpointerStats checkpointer; - PgStat_BgWriterStats bgwriter; -} PgStat_GlobalStats; - typedef struct PgStat_StatReplSlotEntry { NameData slotname; @@ -908,8 +347,6 @@ typedef struct PgStat_SLRUStats typedef struct PgStat_StatSubEntry { - Oid subid; /* hash key (must be first) */ - PgStat_Counter apply_error_count; PgStat_Counter sync_error_count; TimestampTz stat_reset_timestamp; @@ -917,8 +354,6 @@ typedef struct PgStat_StatSubEntry typedef struct PgStat_StatTabEntry { - Oid tableid; - PgStat_Counter numscans; PgStat_Counter tuples_returned; @@ -966,22 +401,19 @@ typedef struct PgStat_WalStats */ /* functions called from postmaster */ -extern void pgstat_init(void); -extern void pgstat_reset_all(void); -extern int pgstat_start(void); -extern void allow_immediate_pgstat_restart(void); +extern Size StatsShmemSize(void); +extern void StatsShmemInit(void); -#ifdef EXEC_BACKEND -extern void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn(); -#endif +/* Functions called during server startup / shutdown */ +extern void pgstat_restore_stats(void); +extern void pgstat_discard_stats(void); +extern void pgstat_before_server_shutdown(int code, Datum arg); /* Functions for backend initialization */ extern void pgstat_initialize(void); /* Functions called from backends */ -extern void pgstat_report_stat(bool force); -extern void pgstat_vacuum_stat(void); -extern void pgstat_ping(void); +extern long pgstat_report_stat(bool force); extern void pgstat_reset_counters(void); extern void pgstat_reset(PgStat_Kind kind, Oid dboid, Oid objectid); @@ -989,24 +421,17 @@ extern void pgstat_reset_of_kind(PgStat_Kind kind); /* stats accessors */ extern void pgstat_clear_snapshot(void); -extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void); -extern PgStat_BgWriterStats *pgstat_fetch_stat_bgwriter(void); -extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void); -extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid); -extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid); -extern PgStat_GlobalStats *pgstat_fetch_global(void); -extern PgStat_StatReplSlotEntry *pgstat_fetch_replslot(NameData slotname); -extern PgStat_StatSubEntry *pgstat_fetch_stat_subscription(Oid subid); -extern PgStat_SLRUStats *pgstat_fetch_slru(void); -extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid); -extern PgStat_WalStats *pgstat_fetch_stat_wal(void); +extern TimestampTz pgstat_get_stat_snapshot_timestamp(bool *have_snapshot); +/* helpers */ +extern PgStat_Kind pgstat_get_kind_from_str(char *kind_str); /* * Functions in pgstat_archiver.c */ extern void pgstat_report_archiver(const char *xlog, bool failed); +extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void); /* @@ -1014,6 +439,7 @@ extern void pgstat_report_archiver(const char *xlog, bool failed); */ extern void pgstat_report_bgwriter(void); +extern PgStat_BgWriterStats *pgstat_fetch_stat_bgwriter(void); /* @@ -1021,6 +447,7 @@ extern void pgstat_report_bgwriter(void); */ extern void pgstat_report_checkpointer(void); +extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void); /* @@ -1044,6 +471,7 @@ extern void 
pgstat_report_connect(Oid dboid); #define pgstat_count_conn_txn_idle_time(n) \ (pgStatTransactionIdleTime += (n)) +extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid); /* * Functions in pgstat_function.c @@ -1058,6 +486,7 @@ extern void pgstat_init_function_usage(struct FunctionCallInfoBaseData *fcinfo, extern void pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize); +extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid); extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id); @@ -1070,6 +499,8 @@ extern void pgstat_drop_relation(Relation rel); extern void pgstat_copy_relation_stats(Relation dstrel, Relation srcrel); extern void pgstat_init_relation(Relation rel); +extern void pgstat_assoc_relation(Relation rel); +extern void pgstat_unlink_relation(Relation rel); extern void pgstat_report_vacuum(Oid tableoid, bool shared, PgStat_Counter livetuples, PgStat_Counter deadtuples); @@ -1077,8 +508,14 @@ extern void pgstat_report_analyze(Relation rel, PgStat_Counter livetuples, PgStat_Counter deadtuples, bool resetcounter); +/* + * If stats are enabled, but pending data hasn't been prepared yet, call + * pgstat_assoc_relation() to do so. See its comment for why this is done + * separately from pgstat_init_relation(). + */ #define pgstat_should_count_relation(rel) \ - (likely((rel)->pgstat_info != NULL)) + (likely((rel)->pgstat_info != NULL) ? true : \ + ((rel)->pgstat_enabled ? pgstat_assoc_relation(rel), true : false)) /* nontransactional event counts are simple enough to inline */ @@ -1129,6 +566,9 @@ extern void pgstat_twophase_postcommit(TransactionId xid, uint16 info, extern void pgstat_twophase_postabort(TransactionId xid, uint16 info, void *recdata, uint32 len); +extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid); +extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry_ext(bool shared, + Oid relid); extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id); @@ -1140,7 +580,9 @@ extern void pgstat_reset_replslot(const char *name); struct ReplicationSlot; extern void pgstat_report_replslot(struct ReplicationSlot *slot, const PgStat_StatReplSlotEntry *repSlotStat); extern void pgstat_create_replslot(struct ReplicationSlot *slot); +extern void pgstat_acquire_replslot(struct ReplicationSlot *slot); extern void pgstat_drop_replslot(struct ReplicationSlot *slot); +extern PgStat_StatReplSlotEntry *pgstat_fetch_replslot(NameData slotname); /* @@ -1157,6 +599,7 @@ extern void pgstat_count_slru_flush(int slru_idx); extern void pgstat_count_slru_truncate(int slru_idx); extern const char *pgstat_get_slru_name(int slru_idx); extern int pgstat_get_slru_index(const char *name); +extern PgStat_SLRUStats *pgstat_fetch_slru(void); /* @@ -1166,6 +609,7 @@ extern int pgstat_get_slru_index(const char *name); extern void pgstat_report_subscription_error(Oid subid, bool is_apply_error); extern void pgstat_create_subscription(Oid subid); extern void pgstat_drop_subscription(Oid subid); +extern PgStat_StatSubEntry *pgstat_fetch_stat_subscription(Oid subid); /* @@ -1186,6 +630,7 @@ extern void pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_ */ extern void pgstat_report_wal(bool force); +extern PgStat_WalStats *pgstat_fetch_stat_wal(void); /* @@ -1195,6 +640,8 @@ extern void pgstat_report_wal(bool force); /* GUC parameters */ extern PGDLLIMPORT bool pgstat_track_counts; extern PGDLLIMPORT int pgstat_track_functions; +extern PGDLLIMPORT int pgstat_fetch_consistency; + extern char *pgstat_stat_directory; extern char 
*pgstat_stat_tmpname; extern char *pgstat_stat_filename; @@ -1205,7 +652,7 @@ extern char *pgstat_stat_filename; */ /* updated directly by bgwriter and bufmgr */ -extern PgStat_MsgBgWriter PendingBgWriterStats; +extern PgStat_BgWriterStats PendingBgWriterStats; /* @@ -1216,7 +663,7 @@ extern PgStat_MsgBgWriter PendingBgWriterStats; * Checkpointer statistics counters are updated directly by checkpointer and * bufmgr. */ -extern PgStat_MsgCheckpointer PendingCheckpointerStats; +extern PgStat_CheckpointerStats PendingCheckpointerStats; /* @@ -1243,7 +690,7 @@ extern SessionEndType pgStatSessionEndCause; */ /* updated directly by backends and background processes */ -extern PgStat_MsgWal WalStats; +extern PgStat_WalStats PendingWalStats; #endif /* PGSTAT_H */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index c3d5889d7b..33eb4c1033 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -190,6 +190,9 @@ typedef enum BuiltinTrancheIds LWTRANCHE_SHARED_TIDBITMAP, LWTRANCHE_PARALLEL_APPEND, LWTRANCHE_PER_XACT_PREDICATE_LIST, + LWTRANCHE_PGSTATS_DSA, + LWTRANCHE_PGSTATS_HASH, + LWTRANCHE_PGSTATS_DATA, LWTRANCHE_FIRST_USER_DEFINED } BuiltinTrancheIds; diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h index c3f83c74c6..ab27bc47c5 100644 --- a/src/include/utils/pgstat_internal.h +++ b/src/include/utils/pgstat_internal.h @@ -14,21 +14,134 @@ #define PGSTAT_INTERNAL_H +#include "common/hashfn.h" +#include "lib/dshash.h" +#include "lib/ilist.h" #include "pgstat.h" +#include "storage/lwlock.h" +#include "utils/dsa.h" -#define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file - * updates; in milliseconds. */ - -/* ---------- - * The initial size hints for the hash tables used in the collector. - * ---------- +/* + * Types related to shared memory storage of statistics. + * + * Per-object statistics are stored in the "shared stats" hashtable. That + * table's entries (PgStatShared_HashEntry) contain a pointer to the actual stats + * data for the object (the size of the stats data varies depending on the + * kind of stats). The table is keyed by PgStat_HashKey. + * + * Once a backend has a reference to a shared stats entry, it increments the + * entry's refcount. Even after stats data is dropped (e.g., due to a DROP + * TABLE), the entry itself can only be deleted once all references have been + * released. + * + * These refcounts, in combination with a backend local hashtable + * (pgStatEntryRefHash, with entries pointing to PgStat_EntryRef) in front of + * the shared hash table, mean that most stats work can happen without + * touching the shared hash table, reducing contention. + * + * Once there are pending stats updates for a table PgStat_EntryRef->pending + * is allocated to contain a working space for as-of-yet-unapplied stats + * updates. Once the stats are flushed, PgStat_EntryRef->pending is freed. + * + * Each stat kind in the shared hash table has a fixed member + * PgStatShared_Common as the first element. */ -#define PGSTAT_DB_HASH_SIZE 16 -#define PGSTAT_TAB_HASH_SIZE 512 -#define PGSTAT_FUNCTION_HASH_SIZE 512 -#define PGSTAT_SUBSCRIPTION_HASH_SIZE 32 -#define PGSTAT_REPLSLOT_HASH_SIZE 32 + +/* struct for shared statistics hash entry key. */ +typedef struct PgStat_HashKey +{ + PgStat_Kind kind; /* statistics entry kind */ + Oid dboid; /* database ID. InvalidOid for shared objects. */ + Oid objoid; /* object ID, either table or function. 
*/ +} PgStat_HashKey; + +/* + * Shared statistics hash entry. Doesn't itself contain any stats, but points + * to them (with ->body). That allows the stats entries themselves to be of + * variable size. + */ +typedef struct PgStatShared_HashEntry +{ + PgStat_HashKey key; /* hash key */ + + /* + * If dropped is set, backends need to release their references so that + * the memory for the entry can be freed. No new references may be made + * once marked as dropped. + */ + bool dropped; + + /* + * Refcount managing lifetime of the entry itself (as opposed to the + * dshash entry pointing to it). The stats lifetime has to be separate + * from the hash table entry lifetime because we allow backends to point + * to a stats entry without holding a hash table lock (and some other + * reasons). + * + * As long as the entry is not dropped, 1 is added to the refcount + * representing that the entry should not be dropped. In addition each + * backend that has a reference to the entry needs to increment the + * refcount as long as it does. + * + * May only be incremented / decremented while holding at least a shared + * lock on the dshash partition containing the entry. It needs to be an + * atomic variable because multiple backends can increment the refcount + * with just a shared lock. + * + * When the refcount reaches 0 the entry needs to be freed. + */ + pg_atomic_uint32 refcount; + + /* + * Pointer to shared stats. The stats entry always starts with + * PgStatShared_Common, embedded in a larger struct containing the + * PgStat_Kind specific stats fields. + */ + dsa_pointer body; +} PgStatShared_HashEntry; + +/* + * Common header struct for PgStatShm_Stat*Entry. + */ +typedef struct PgStatShared_Common +{ + uint32 magic; /* just a validity cross-check */ + /* lock protecting stats contents (i.e. data following the header) */ + LWLock lock; +} PgStatShared_Common; + +/* + * A backend local reference to a shared stats entry. As long as at least one + * such reference exists, the shared stats entry will not be released. + * + * If there are pending stats update to the shared stats, these are stored in + * ->pending. + */ +typedef struct PgStat_EntryRef +{ + /* + * Pointer to the PgStatShared_HashEntry entry in the shared stats + * hashtable. + */ + PgStatShared_HashEntry *shared_entry; + + /* + * Pointer to the stats data (i.e. PgStatShared_HashEntry->body), resolved + * as a local pointer, to avoid repeated dsa_get_address() calls. + */ + PgStatShared_Common *shared_stats; + + /* + * Pending statistics data that will need to be flushed to shared memory + * stats eventually. Each stats kind utilizing pending data defines what + * format its pending data has and needs to provide a + * PgStat_KindInfo->flush_pending_cb callback to merge pending into shared + * stats. + */ + void *pending; + dlist_node pending_node; /* membership in pgStatPending list */ +} PgStat_EntryRef; /* @@ -43,11 +156,11 @@ typedef struct PgStat_SubXactStatus struct PgStat_SubXactStatus *prev; /* higher-level subxact if any */ /* - * Dropping the statistics for objects that dropped transactionally itself - * needs to be transactional. Therefore we collect the stats dropped in - * the current (sub-)transaction and only execute the stats drop when we - * know if the transaction commits/aborts. To handle replicas and crashes, - * stats drops are included in commit records. + * Statistics for transactionally dropped objects need to be + * transactionally dropped as well. 
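
Tying the entry-ref machinery above together: a kind's flush_pending_cb (part of PgStat_KindInfo, further down in this file) receives a PgStat_EntryRef whose ->pending holds the not-yet-applied counters and whose ->shared_stats points at the shared entry to lock. A hedged sketch of the usual callback shape, using the database stats types as the example; the function name is hypothetical (the real callback is pgstat_database_flush_cb()) and freeing ->pending afterwards is assumed to be the caller's job:

    #include "postgres.h"
    #include "utils/pgstat_internal.h"

    /* illustrative only -- the general shape of a flush_pending_cb */
    static bool
    example_flush_pending_cb(PgStat_EntryRef *entry_ref, bool nowait)
    {
        PgStatShared_Database *sharedent = (PgStatShared_Database *) entry_ref->shared_stats;
        PgStat_StatDBEntry *pendingent = (PgStat_StatDBEntry *) entry_ref->pending;

        if (!pgstat_lock_entry(entry_ref, nowait))
            return false;       /* lock busy; caller retries later */

        sharedent->stats.n_xact_commit += pendingent->n_xact_commit;
        /* ... remaining PgStat_StatDBEntry counters ... */

        pgstat_unlock_entry(entry_ref);

        return true;            /* flushed; caller may discard ->pending */
    }
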
Collect the stats dropped in the + * current (sub-)transaction and only execute the stats drop when we know + * if the transaction commits/aborts. To handle replicas and crashes, + * stats drops are included in commit / abort records. */ dlist_head pending_drops; int pending_drops_count; @@ -64,10 +177,96 @@ typedef struct PgStat_SubXactStatus } PgStat_SubXactStatus; +/* + * Metadata for a specific kind of statistics. + */ +typedef struct PgStat_KindInfo +{ + /* + * Do a fixed number of stats objects exist for this kind of stats (e.g. + * bgwriter stats) or not (e.g. tables). + */ + bool fixed_amount:1; + + /* + * Can stats of this kind be accessed from another database? Determines + * whether a stats object gets included in stats snapshots. + */ + bool accessed_across_databases:1; + + /* + * For variable-numbered stats: Identified on-disk using a name, rather + * than PgStat_HashKey. Probably only needed for replication slot stats. + */ + bool named_on_disk:1; + + /* + * The size of an entry in the shared stats hash table (pointed to by + * PgStatShared_HashEntry->body). + */ + uint32 shared_size; + + /* + * The offset/size of statistics inside the shared stats entry. Used when + * [de-]serializing statistics to / from disk respectively. Separate from + * shared_size because [de-]serialization may not include in-memory state + * like lwlocks. + */ + uint32 shared_data_off; + uint32 shared_data_len; + + /* + * The size of the pending data for this kind. E.g. how large + * PgStat_EntryRef->pending is. Used for allocations. + * + * 0 signals that an entry of this kind should never have a pending entry. + */ + uint32 pending_size; + + /* + * For variable-numbered stats: flush pending stats. Required if pending + * data is used. + */ + bool (*flush_pending_cb) (PgStat_EntryRef *sr, bool nowait); + + /* + * For variable-numbered stats: delete pending stats. Optional. + */ + void (*delete_pending_cb) (PgStat_EntryRef *sr); + + /* + * For variable-numbered stats: reset the reset timestamp. Optional. + */ + void (*reset_timestamp_cb) (PgStatShared_Common *header, TimestampTz ts); + + /* + * For variable-numbered stats with named_on_disk. Optional. + */ + void (*to_serialized_name) (const PgStatShared_Common *header, NameData *name); + bool (*from_serialized_name) (const NameData *name, PgStat_HashKey *key); + + /* + * For fixed-numbered statistics: Reset All. + */ + void (*reset_all_cb) (TimestampTz ts); + + /* + * For fixed-numbered statistics: Build snapshot for entry + */ + void (*snapshot_cb) (void); + + /* name of the kind of stats */ + const char *const name; +} PgStat_KindInfo; + + /* * List of SLRU names that we keep stats for. There is no central registry of * SLRUs, so we use this fixed list instead. The "other" entry is used for * all SLRUs without an explicit entry (e.g. SLRUs in extensions). + * + * This is only defined here so that SLRU_NUM_ELEMENTS is known for later type + * definitions. */ static const char *const slru_names[] = { "CommitTs", @@ -83,33 +282,271 @@ static const char *const slru_names[] = { #define SLRU_NUM_ELEMENTS lengthof(slru_names) +/* ---------- + * Types and definitions for different kinds of fixed-amount stats. + * + * Single-writer stats use the changecount mechanism to achieve low-overhead + * writes - they're obviously more performance critical than reads. Check the + * definition of struct PgBackendStatus for some explanation of the + * changecount mechanism. 
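
In practice the changecount discipline described above comes down to the inline helpers declared and defined further down in this file. A hedged sketch of the writer/reader pairing, using the checkpointer stats as the example; both function names are hypothetical, the real users are in pgstat_checkpointer.c:

    #include "postgres.h"
    #include "utils/pgstat_internal.h"

    /* single writer: fold pending counters in under the changecount */
    static void
    example_flush_checkpointer(PgStatShared_Checkpointer *shared)
    {
        pgstat_begin_changecount_write(&shared->changecount);
        shared->stats.timed_checkpoints += PendingCheckpointerStats.timed_checkpoints;
        /* ... remaining PgStat_CheckpointerStats counters ... */
        pgstat_end_changecount_write(&shared->changecount);
    }

    /* any reader: copy a consistent view without blocking the writer */
    static void
    example_read_checkpointer(PgStatShared_Checkpointer *shared,
                              PgStat_CheckpointerStats *dst)
    {
        pgstat_copy_changecounted_stats(dst, &shared->stats, sizeof(*dst),
                                        &shared->changecount);
    }
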
+ * + * Because the obvious implementation of resetting single-writer stats isn't + * compatible with that (another backend needs to write), we don't scribble on + * shared stats while resetting. Instead, just record the current counter + * values in a copy of the stats data, which is protected by ->lock. See + * pgstat_fetch_stat_(archiver|bgwriter|checkpointer) for the reader side. + * + * The only exception to that is the the stat_reset_timestamp in these + * structs, which is protected by ->lock, because it has to be written by + * another backend while resetting + * ---------- + */ + +typedef struct PgStatShared_Archiver +{ + /* lock protects ->reset_offset as well as stats->stat_reset_timestamp */ + LWLock lock; + uint32 changecount; + PgStat_ArchiverStats stats; + PgStat_ArchiverStats reset_offset; +} PgStatShared_Archiver; + +typedef struct PgStatShared_BgWriter +{ + /* lock protects ->reset_offset as well as stats->stat_reset_timestamp */ + LWLock lock; + uint32 changecount; + PgStat_BgWriterStats stats; + PgStat_BgWriterStats reset_offset; +} PgStatShared_BgWriter; + +typedef struct PgStatShared_Checkpointer +{ + /* lock protects ->reset_offset as well as stats->stat_reset_timestamp */ + LWLock lock; + uint32 changecount; + PgStat_CheckpointerStats stats; + PgStat_CheckpointerStats reset_offset; +} PgStatShared_Checkpointer; + +typedef struct PgStatShared_SLRU +{ + /* lock protects ->stats */ + LWLock lock; + PgStat_SLRUStats stats[SLRU_NUM_ELEMENTS]; +} PgStatShared_SLRU; + +typedef struct PgStatShared_Wal +{ + /* lock protects ->stats */ + LWLock lock; + PgStat_WalStats stats; +} PgStatShared_Wal; + + + +/* ---------- + * Types and definitions for different kinds of variable-amount stats. + * + * Each struct has to start with PgStatShared_Common, containing information + * common across the different types of stats. Kind-specific data follows. + * ---------- + */ + +typedef struct PgStatShared_Database +{ + PgStatShared_Common header; + PgStat_StatDBEntry stats; +} PgStatShared_Database; + +typedef struct PgStatShared_Relation +{ + PgStatShared_Common header; + PgStat_StatTabEntry stats; +} PgStatShared_Relation; + +typedef struct PgStatShared_Function +{ + PgStatShared_Common header; + PgStat_StatFuncEntry stats; +} PgStatShared_Function; + +typedef struct PgStatShared_Subscription +{ + PgStatShared_Common header; + PgStat_StatSubEntry stats; +} PgStatShared_Subscription; + +typedef struct PgStatShared_ReplSlot +{ + PgStatShared_Common header; + PgStat_StatReplSlotEntry stats; +} PgStatShared_ReplSlot; + + +/* + * Central shared memory entry for the cumulative stats system. + * + * Fixed amount stats, the dynamic shared memory hash table for + * non-fixed-amount stats, as well as remaining bits and pieces are all + * reached from here. + */ +typedef struct PgStat_ShmemControl +{ + void *raw_dsa_area; + + /* + * Stats for variable-numbered objects are kept in this shared hash table. + * See comment above PgStat_Kind for details. + */ + dshash_table_handle hash_handle; /* shared dbstat hash */ + + /* Has the stats system already been shut down? Just a debugging check. */ + bool is_shutdown; + + /* + * Whenever statistics for dropped objects could not be freed - because + * backends still have references - the dropping backend calls + * pgstat_request_entry_refs_gc() incrementing this counter. Eventually + * that causes backends to run pgstat_gc_entry_refs(), allowing memory to + * be reclaimed. 
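
The producers of gc_request_count are the pgstat_drop_entry() call sites added earlier in this patch (for instance in pgstat_xact.c): a false return value means some backend still holds a reference, so the dropping backend requests garbage collection instead of freeing the entry itself. A condensed, illustrative restatement with a hypothetical function name:

    #include "postgres.h"
    #include "access/xact.h"
    #include "utils/pgstat_internal.h"

    /* illustrative only -- mirrors pgstat_execute_transactional_drops() */
    static void
    example_drop_stats_items(int ndrops, xl_xact_stats_item *items)
    {
        int         not_freed_count = 0;

        for (int i = 0; i < ndrops; i++)
        {
            xl_xact_stats_item *it = &items[i];

            if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid))
                not_freed_count++;      /* a backend still references the entry */
        }

        if (not_freed_count > 0)
            pgstat_request_entry_refs_gc();     /* bumps gc_request_count */
    }
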
+ */ + pg_atomic_uint64 gc_request_count; + + /* + * Stats data for fixed-numbered objects. + */ + PgStatShared_Archiver archiver; + PgStatShared_BgWriter bgwriter; + PgStatShared_Checkpointer checkpointer; + PgStatShared_SLRU slru; + PgStatShared_Wal wal; +} PgStat_ShmemControl; + + +/* + * Cached statistics snapshot + */ +typedef struct PgStat_Snapshot +{ + PgStat_FetchConsistency mode; + + /* time at which snapshot was taken */ + TimestampTz snapshot_timestamp; + + bool fixed_valid[PGSTAT_NUM_KINDS]; + + PgStat_ArchiverStats archiver; + + PgStat_BgWriterStats bgwriter; + + PgStat_CheckpointerStats checkpointer; + + PgStat_SLRUStats slru[SLRU_NUM_ELEMENTS]; + + PgStat_WalStats wal; + + /* to free snapshot in bulk */ + MemoryContext context; + struct pgstat_snapshot_hash *stats; +} PgStat_Snapshot; + + +/* + * Collection of backend-local stats state. + */ +typedef struct PgStat_LocalState +{ + PgStat_ShmemControl *shmem; + dsa_area *dsa; + dshash_table *shared_hash; + + /* the current statistics snapshot */ + PgStat_Snapshot snapshot; +} PgStat_LocalState; + + +/* + * Inline functions defined further below. + */ + +static inline void pgstat_begin_changecount_write(uint32 *cc); +static inline void pgstat_end_changecount_write(uint32 *cc); +static inline uint32 pgstat_begin_changecount_read(uint32 *cc); +static inline bool pgstat_end_changecount_read(uint32 *cc, uint32 cc_before); + +static inline void pgstat_copy_changecounted_stats(void *dst, void *src, size_t len, + uint32 *cc); + +static inline int pgstat_cmp_hash_key(const void *a, const void *b, size_t size, void *arg); +static inline uint32 pgstat_hash_hash_key(const void *d, size_t size, void *arg); +static inline size_t pgstat_get_entry_len(PgStat_Kind kind); +static inline void *pgstat_get_entry_data(PgStat_Kind kind, PgStatShared_Common *entry); + + /* * Functions in pgstat.c */ -extern void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype); -extern void pgstat_send(void *msg, int len); +const PgStat_KindInfo *pgstat_get_kind_info(PgStat_Kind kind); + #ifdef USE_ASSERT_CHECKING extern void pgstat_assert_is_up(void); #else #define pgstat_assert_is_up() ((void)true) #endif +extern void pgstat_delete_pending_entry(PgStat_EntryRef *entry_ref); +extern PgStat_EntryRef *pgstat_prep_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid, bool *created_entry); +extern PgStat_EntryRef *pgstat_fetch_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid); + +extern void *pgstat_fetch_entry(PgStat_Kind kind, Oid dboid, Oid objoid); +extern void pgstat_snapshot_fixed(PgStat_Kind kind); + + +/* + * Functions in pgstat_archiver.c + */ + +extern void pgstat_archiver_reset_all_cb(TimestampTz ts); +extern void pgstat_archiver_snapshot_cb(void); + + +/* + * Functions in pgstat_bgwriter.c + */ + +extern void pgstat_bgwriter_reset_all_cb(TimestampTz ts); +extern void pgstat_bgwriter_snapshot_cb(void); + + +/* + * Functions in pgstat_checkpointer.c + */ + +extern void pgstat_checkpointer_reset_all_cb(TimestampTz ts); +extern void pgstat_checkpointer_snapshot_cb(void); + /* * Functions in pgstat_database.c */ -extern void AtEOXact_PgStat_Database(bool isCommit, bool parallel); extern void pgstat_report_disconnect(Oid dboid); -extern void pgstat_update_dbstats(PgStat_MsgTabstat *tsmsg, TimestampTz now); +extern void pgstat_update_dbstats(TimestampTz ts); +extern void AtEOXact_PgStat_Database(bool isCommit, bool parallel); + +extern PgStat_StatDBEntry *pgstat_prep_database_pending(Oid dboid); +extern void pgstat_reset_database_timestamp(Oid dboid, 
TimestampTz ts); +extern bool pgstat_database_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_database_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); /* * Functions in pgstat_function.c */ -extern void pgstat_send_funcstats(void); +extern bool pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); /* @@ -120,23 +557,73 @@ extern void AtEOXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isC extern void AtEOSubXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit, int nestDepth); extern void AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state); extern void PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state); -extern void pgstat_send_tabstats(TimestampTz now, bool disconnect); + +extern bool pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref); + + +/* + * Functions in pgstat_replslot.c + */ + +extern void pgstat_replslot_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); +extern void pgstat_replslot_to_serialized_name_cb(const PgStatShared_Common *tmp, NameData *name); +extern bool pgstat_replslot_from_serialized_name_cb(const NameData *name, PgStat_HashKey *key); + + +/* + * Functions in pgstat_shmem.c + */ + +extern void pgstat_attach_shmem(void); +extern void pgstat_detach_shmem(void); + +extern PgStat_EntryRef *pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, Oid objoid, + bool create, bool *found); +extern bool pgstat_lock_entry(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_unlock_entry(PgStat_EntryRef *entry_ref); +extern bool pgstat_drop_entry(PgStat_Kind kind, Oid dboid, Oid objoid); +extern void pgstat_drop_all_entries(void); +extern PgStat_EntryRef *pgstat_get_entry_ref_locked(PgStat_Kind kind, Oid dboid, Oid objoid, + bool nowait); +extern void pgstat_reset_entry(PgStat_Kind kind, Oid dboid, Oid objoid, TimestampTz ts); +extern void pgstat_reset_entries_of_kind(PgStat_Kind kind, TimestampTz ts); +extern void pgstat_reset_matching_entries(bool (*do_reset) (PgStatShared_HashEntry *, Datum), + Datum match_data, + TimestampTz ts); + +extern void pgstat_request_entry_refs_gc(void); +extern PgStatShared_Common *pgstat_init_entry(PgStat_Kind kind, + PgStatShared_HashEntry *shhashent); /* * Functions in pgstat_slru.c */ -extern void pgstat_send_slru(void); +extern bool pgstat_slru_flush(bool nowait); +extern void pgstat_slru_reset_all_cb(TimestampTz ts); +extern void pgstat_slru_snapshot_cb(void); /* * Functions in pgstat_wal.c */ +extern bool pgstat_flush_wal(bool nowait); extern void pgstat_init_wal(void); extern bool pgstat_have_pending_wal(void); +extern void pgstat_wal_reset_all_cb(TimestampTz ts); +extern void pgstat_wal_snapshot_cb(void); + + +/* + * Functions in pgstat_subscription.c + */ + +extern bool pgstat_subscription_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); +extern void pgstat_subscription_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); /* * Functions in pgstat_xact.c @@ -151,29 +638,145 @@ extern void pgstat_create_transactional(PgStat_Kind kind, Oid dboid, Oid objoid) * Variables in pgstat.c */ -extern pgsocket pgStatSock; +extern PgStat_LocalState pgStatLocal; /* - * Variables in pgstat_database.c + * Variables in pgstat_slru.c */ -extern int pgStatXactCommit; -extern int pgStatXactRollback; +extern bool have_slrustats; /* - * Variables in pgstat_functions.c + * Implementation of inline functions declared above. 
*/ -extern bool have_function_stats; +/* + * Helpers for changecount manipulation. See comments around struct + * PgBackendStatus for details. + */ + +static inline void +pgstat_begin_changecount_write(uint32 *cc) +{ + Assert((*cc & 1) == 0); + + START_CRIT_SECTION(); + (*cc)++; + pg_write_barrier(); +} + +static inline void +pgstat_end_changecount_write(uint32 *cc) +{ + Assert((*cc & 1) == 1); + + pg_write_barrier(); + + (*cc)++; + + END_CRIT_SECTION(); +} + +static inline uint32 +pgstat_begin_changecount_read(uint32 *cc) +{ + uint32 before_cc = *cc; + + CHECK_FOR_INTERRUPTS(); + + pg_read_barrier(); + + return before_cc; +} + +/* + * Returns true if the read succeeded, false if it needs to be repeated. + */ +static inline bool +pgstat_end_changecount_read(uint32 *cc, uint32 before_cc) +{ + uint32 after_cc; + + pg_read_barrier(); + + after_cc = *cc; + + /* was a write in progress when we started? */ + if (before_cc & 1) + return false; + + /* did writes start and complete while we read? */ + return before_cc == after_cc; +} /* - * Variables in pgstat_relation.c + * helper function for PgStat_KindInfo->snapshot_cb + * PgStat_KindInfo->reset_all_cb callbacks. + * + * Copies out the specified memory area following change-count protocol. */ +static inline void +pgstat_copy_changecounted_stats(void *dst, void *src, size_t len, + uint32 *cc) +{ + uint32 cc_before; -extern bool have_relation_stats; + do + { + cc_before = pgstat_begin_changecount_read(cc); + memcpy(dst, src, len); + } + while (!pgstat_end_changecount_read(cc, cc_before)); +} + +/* helpers for dshash / simplehash hashtables */ +static inline int +pgstat_cmp_hash_key(const void *a, const void *b, size_t size, void *arg) +{ + AssertArg(size == sizeof(PgStat_HashKey) && arg == NULL); + return memcmp(a, b, sizeof(PgStat_HashKey)); +} + +static inline uint32 +pgstat_hash_hash_key(const void *d, size_t size, void *arg) +{ + const PgStat_HashKey *key = (PgStat_HashKey *) d; + uint32 hash; + + AssertArg(size == sizeof(PgStat_HashKey) && arg == NULL); + + hash = murmurhash32(key->kind); + hash = hash_combine(hash, murmurhash32(key->dboid)); + hash = hash_combine(hash, murmurhash32(key->objoid)); + + return hash; +} + +/* + * The length of the data portion of a shared memory stats entry (i.e. without + * transient data such as refcounts, lwlocks, ...). + */ +static inline size_t +pgstat_get_entry_len(PgStat_Kind kind) +{ + return pgstat_get_kind_info(kind)->shared_data_len; +} + +/* + * Returns a pointer to the data portion of a shared memory stats entry. 
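
shared_data_off and shared_data_len (see PgStat_KindInfo above) exist so that only an entry's stats payload, and not transient state such as its LWLock, is read from and written to the on-disk file (PGSTAT_STAT_PERMANENT_FILENAME). A purely illustrative sketch of what the two accessors defined here enable; the real (de)serialization code is in pgstat.c and not shown in this section:

    #include "postgres.h"
    #include "utils/pgstat_internal.h"

    /* illustrative only: write one variable-numbered entry's key and payload */
    static void
    example_write_entry(FILE *fpout, const PgStat_HashKey *key,
                        PgStatShared_Common *header)
    {
        fwrite(key, sizeof(*key), 1, fpout);
        fwrite(pgstat_get_entry_data(key->kind, header),
               pgstat_get_entry_len(key->kind), 1, fpout);
    }
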
+
+/*
+ * Returns a pointer to the data portion of a shared memory stats entry.
+ */
+static inline void *
+pgstat_get_entry_data(PgStat_Kind kind, PgStatShared_Common *entry)
+{
+	size_t		off = pgstat_get_kind_info(kind)->shared_data_off;
+
+	Assert(off != 0 && off < PG_UINT32_MAX);
+
+	return ((char *) (entry)) + off;
+}
 
 #endif							/* PGSTAT_INTERNAL_H */
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 121dbbc9a9..eadbd00904 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -246,6 +246,7 @@ typedef struct RelationData
 	 */
 	Oid			rd_toastoid;	/* Real TOAST table's OID, or InvalidOid */
 
+	bool		pgstat_enabled; /* should relation stats be counted */
 	/* use "struct" here to avoid needing to include pgstat.h: */
 	struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
 } RelationData;
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index 099f91c61d..c068986d09 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -32,6 +32,7 @@ typedef enum TimeoutId
 	STANDBY_LOCK_TIMEOUT,
 	IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
 	IDLE_SESSION_TIMEOUT,
+	IDLE_STATS_UPDATE_TIMEOUT,
 	CLIENT_CONNECTION_CHECK_TIMEOUT,
 	STARTUP_PROGRESS_TIMEOUT,
 	/* First user-definable timeout reason */
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index d870c59263..b578e2ec75 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -42,7 +42,6 @@ typedef enum
 	WAIT_EVENT_CHECKPOINTER_MAIN,
 	WAIT_EVENT_LOGICAL_APPLY_MAIN,
 	WAIT_EVENT_LOGICAL_LAUNCHER_MAIN,
-	WAIT_EVENT_PGSTAT_MAIN,
 	WAIT_EVENT_RECOVERY_WAL_STREAM,
 	WAIT_EVENT_SYSLOGGER_MAIN,
 	WAIT_EVENT_WAL_RECEIVER_MAIN,
diff --git a/src/test/modules/worker_spi/worker_spi.c b/src/test/modules/worker_spi/worker_spi.c
index 48829df29c..5b541ec47f 100644
--- a/src/test/modules/worker_spi/worker_spi.c
+++ b/src/test/modules/worker_spi/worker_spi.c
@@ -265,7 +265,7 @@ worker_spi_main(Datum main_arg)
 		PopActiveSnapshot();
 		CommitTransactionCommand();
 		debug_query_string = NULL;
-		pgstat_report_stat(false);
+		pgstat_report_stat(true);
 		pgstat_report_activity(STATE_IDLE, NULL);
 	}
diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out
index 494fb26237..64e2ff6b29 100644
--- a/src/test/regress/expected/stats.out
+++ b/src/test/regress/expected/stats.out
@@ -17,6 +17,8 @@ SET enable_indexscan TO on;
 -- for the moment, we don't want index-only scans here
 SET enable_indexonlyscan TO off;
 -- save counters
+BEGIN;
+SET LOCAL stats_fetch_consistency = snapshot;
 CREATE TABLE prevstats AS
 SELECT t.seq_scan, t.seq_tup_read, t.idx_scan, t.idx_tup_fetch,
        (b.heap_blks_read + b.heap_blks_hit) AS heap_blks,
@@ -25,6 +27,7 @@ SELECT t.seq_scan, t.seq_tup_read, t.idx_scan, t.idx_tup_fetch,
   FROM pg_catalog.pg_stat_user_tables AS t,
        pg_catalog.pg_statio_user_tables AS b
  WHERE t.relname='tenk2' AND b.relname='tenk2';
+COMMIT;
 -- function to wait for counters to advance
 create function wait_for_stats() returns void as $$
 declare
@@ -34,6 +37,8 @@ declare
   updated3 bool;
   updated4 bool;
 begin
+  SET LOCAL stats_fetch_consistency = snapshot;
+
   -- We don't want to wait forever. No timeout suffices if the OS drops our
   -- stats traffic because an earlier test file left a full UDP buffer.
   -- Hence, don't use PG_TEST_TIMEOUT_DEFAULT, which may be large for
@@ -163,6 +168,8 @@ SELECT wait_for_stats();
 (1 row)
 
 -- check effects
+BEGIN;
+SET LOCAL stats_fetch_consistency = snapshot;
 SELECT relname, n_tup_ins, n_tup_upd, n_tup_del, n_live_tup, n_dead_tup
   FROM pg_stat_user_tables
  WHERE relname like 'trunc_stats_test%' order by relname;
@@ -202,6 +209,7 @@ FROM prevstats AS pr;
  t
 (1 row)
 
+COMMIT;
 DROP TABLE trunc_stats_test, trunc_stats_test1, trunc_stats_test2, trunc_stats_test3, trunc_stats_test4;
 DROP TABLE prevstats;
 -- test BRIN index doesn't block HOT update - we include this test here, as it
diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql
index d0ba1f6d7b..85a253bcd4 100644
--- a/src/test/regress/sql/stats.sql
+++ b/src/test/regress/sql/stats.sql
@@ -15,6 +15,8 @@ SET enable_indexscan TO on;
 SET enable_indexonlyscan TO off;
 -- save counters
+BEGIN;
+SET LOCAL stats_fetch_consistency = snapshot;
 CREATE TABLE prevstats AS
 SELECT t.seq_scan, t.seq_tup_read, t.idx_scan, t.idx_tup_fetch,
        (b.heap_blks_read + b.heap_blks_hit) AS heap_blks,
@@ -23,6 +25,7 @@ SELECT t.seq_scan, t.seq_tup_read, t.idx_scan, t.idx_tup_fetch,
   FROM pg_catalog.pg_stat_user_tables AS t,
        pg_catalog.pg_statio_user_tables AS b
  WHERE t.relname='tenk2' AND b.relname='tenk2';
+COMMIT;
 
 -- function to wait for counters to advance
 create function wait_for_stats() returns void as $$
@@ -33,6 +36,8 @@ declare
   updated3 bool;
   updated4 bool;
 begin
+  SET LOCAL stats_fetch_consistency = snapshot;
+
   -- We don't want to wait forever. No timeout suffices if the OS drops our
   -- stats traffic because an earlier test file left a full UDP buffer.
   -- Hence, don't use PG_TEST_TIMEOUT_DEFAULT, which may be large for
@@ -158,6 +163,9 @@ RESET enable_bitmapscan;
 SELECT wait_for_stats();
 
 -- check effects
+BEGIN;
+SET LOCAL stats_fetch_consistency = snapshot;
+
 SELECT relname, n_tup_ins, n_tup_upd, n_tup_del, n_live_tup, n_dead_tup
   FROM pg_stat_user_tables
  WHERE relname like 'trunc_stats_test%' order by relname;
@@ -177,6 +185,8 @@ SELECT st.heap_blks_read + st.heap_blks_hit >= pr.heap_blks + cl.relpages,
 SELECT pr.snap_ts < pg_stat_get_snapshot_timestamp() as snapshot_newer
 FROM prevstats AS pr;
+COMMIT;
+
 DROP TABLE trunc_stats_test, trunc_stats_test1, trunc_stats_test2, trunc_stats_test3, trunc_stats_test4;
 DROP TABLE prevstats;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index dc38e16405..566ecbf091 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1933,51 +1933,39 @@ PgFdwPathExtraData
 PgFdwRelationInfo
 PgFdwScanState
 PgIfAddrCallback
+PgStatShared_Archiver
+PgStatShared_BgWriter
+PgStatShared_Checkpointer
+PgStatShared_Common
+PgStatShared_Database
+PgStatShared_Function
+PgStatShared_HashEntry
+PgStatShared_Relation
+PgStatShared_ReplSlot
+PgStatShared_SLRU
+PgStatShared_Subscription
+PgStatShared_Wal
 PgStat_ArchiverStats
 PgStat_BackendFunctionEntry
+PgStat_BackendSubEntry
 PgStat_BgWriterStats
 PgStat_CheckpointerStats
 PgStat_Counter
+PgStat_EntryRef
+PgStat_EntryRefHashEntry
+PgStat_FetchConsistency
 PgStat_FunctionCallUsage
 PgStat_FunctionCounts
-PgStat_FunctionEntry
-PgStat_GlobalStats
+PgStat_HashKey
 PgStat_Kind
-PgStat_Msg
-PgStat_MsgAnalyze
-PgStat_MsgAnlAncestors
-PgStat_MsgArchiver
-PgStat_MsgAutovacStart
-PgStat_MsgBgWriter
-PgStat_MsgCheckpointer
-PgStat_MsgChecksumFailure
-PgStat_MsgConnect
-PgStat_MsgDeadlock
-PgStat_MsgDisconnect
-PgStat_MsgDropdb
-PgStat_MsgDummy
-PgStat_MsgFuncpurge
-PgStat_MsgFuncstat
-PgStat_MsgHdr
-PgStat_MsgInquiry
-PgStat_MsgRecoveryConflict
-PgStat_MsgReplSlot
-PgStat_MsgResetcounter
-PgStat_MsgResetreplslotcounter
-PgStat_MsgResetsharedcounter
-PgStat_MsgResetsinglecounter
-PgStat_MsgResetslrucounter
-PgStat_MsgResetsubcounter
-PgStat_MsgSLRU
-PgStat_MsgSubscriptionDrop
-PgStat_MsgSubscriptionError
-PgStat_MsgTabpurge
-PgStat_MsgTabstat
-PgStat_MsgTempFile
-PgStat_MsgVacuum
-PgStat_MsgWal
+PgStat_KindInfo
+PgStat_LocalState
 PgStat_PendingDroppedStatsItem
+PgStat_ReplSlotStats
 PgStat_SLRUStats
+PgStat_ShmemControl
+PgStat_Snapshot
+PgStat_SnapshotEntry
 PgStat_StatDBEntry
 PgStat_StatFuncEntry
 PgStat_StatReplSlotEntry
@@ -1985,7 +1973,6 @@ PgStat_StatSubEntry
 PgStat_StatTabEntry
 PgStat_SubXactStatus
 PgStat_TableCounts
-PgStat_TableEntry
 PgStat_TableStatus
 PgStat_TableXactStatus
 PgStat_WalStats
@@ -2533,7 +2520,6 @@ StartReplicationCmd
 StartupStatusEnum
 StatEntry
 StatExtEntry
-StatMsgType
 StateFileChunk
 StatisticExtInfo
 Stats
@@ -2647,8 +2633,6 @@ TXNEntryFile
 TYPCATEGORY
 T_Action
 T_WorkerStatus
-TabStatHashEntry
-TabStatusArray
 TableAmRoutine
 TableAttachInfo
 TableDataInfo
@@ -3433,6 +3417,7 @@ pgssHashKey
 pgssSharedState
 pgssStoreKind
 pgssVersion
+pgstat_entry_ref_hash_hash
 pgstat_page
 pgstattuple_type
 pgthreadlock_t
diff --git a/src/tools/valgrind.supp b/src/tools/valgrind.supp
index e3a179d210..4e8c482757 100644
--- a/src/tools/valgrind.supp
+++ b/src/tools/valgrind.supp
@@ -14,24 +14,6 @@
 # These may contain uninitialized padding bytes. Since recipients also ignore
 # those bytes as padding, this is harmless.
 
-{
-	padding_pgstat_send
-	Memcheck:Param
-	socketcall.send(msg)
-
-	fun:*send*
-	fun:pgstat_send
-}
-
-{
-	padding_pgstat_sendto
-	Memcheck:Param
-	socketcall.sendto(msg)
-
-	fun:*send*
-	fun:pgstat_send
-}
-
 {
 	padding_pgstat_write
 	Memcheck:Param