/*------------------------------------------------------------------------- * * pg_stat_statements.c * Track statement execution times across a whole database cluster. * * Note about locking issues: to create or delete an entry in the shared * hashtable, one must hold pgss->lock exclusively. Modifying any field * in an entry except the counters requires the same. To look up an entry, * one must hold the lock shared. To read or update the counters within * an entry, one must hold the lock shared or exclusive (so the entry doesn't * disappear!) and also take the entry's mutex spinlock. * * * Copyright (c) 2008-2010, PostgreSQL Global Development Group * * IDENTIFICATION * $PostgreSQL: pgsql/contrib/pg_stat_statements/pg_stat_statements.c,v 1.14 2010/04/28 16:54:15 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" #include #include "access/hash.h" #include "catalog/pg_type.h" #include "executor/executor.h" #include "executor/instrument.h" #include "funcapi.h" #include "mb/pg_wchar.h" #include "miscadmin.h" #include "pgstat.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/spin.h" #include "tcop/utility.h" #include "utils/builtins.h" #include "utils/hsearch.h" #include "utils/guc.h" PG_MODULE_MAGIC; /* Location of stats file */ #define PGSS_DUMP_FILE "global/pg_stat_statements.stat" /* This constant defines the magic number in the stats file header */ static const uint32 PGSS_FILE_HEADER = 0x20100108; /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */ #define USAGE_EXEC(duration) (1.0) #define USAGE_INIT (1.0) /* including initial planning */ #define USAGE_DECREASE_FACTOR (0.99) /* decreased every entry_dealloc */ #define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */ /* * Hashtable key that defines the identity of a hashtable entry. The * hash comparators do not assume that the query string is null-terminated; * this lets us search for an mbcliplen'd string without copying it first. * * Presently, the query encoding is fully determined by the source database * and so we don't really need it to be in the key. But that might not always * be true. Anyway it's notationally convenient to pass it as part of the key. */ typedef struct pgssHashKey { Oid userid; /* user OID */ Oid dbid; /* database OID */ int encoding; /* query encoding */ int query_len; /* # of valid bytes in query string */ const char *query_ptr; /* query string proper */ } pgssHashKey; /* * The actual stats counters kept within pgssEntry. */ typedef struct Counters { int64 calls; /* # of times executed */ double total_time; /* total execution time in seconds */ int64 rows; /* total # of retrieved or affected rows */ int64 shared_blks_hit; /* # of shared buffer hits */ int64 shared_blks_read; /* # of shared disk blocks read */ int64 shared_blks_written; /* # of shared disk blocks written */ int64 local_blks_hit; /* # of local buffer hits */ int64 local_blks_read; /* # of local disk blocks read */ int64 local_blks_written; /* # of local disk blocks written */ int64 temp_blks_read; /* # of temp blocks read */ int64 temp_blks_written; /* # of temp blocks written */ double usage; /* usage factor */ } Counters; /* * Statistics per statement * * NB: see the file read/write code before changing field order here. */ typedef struct pgssEntry { pgssHashKey key; /* hash key of entry - MUST BE FIRST */ Counters counters; /* the statistics for this query */ slock_t mutex; /* protects the counters only */ char query[1]; /* VARIABLE LENGTH ARRAY - MUST BE LAST */ /* Note: the allocated length of query[] is actually pgss->query_size */ } pgssEntry; /* * Global shared state */ typedef struct pgssSharedState { LWLockId lock; /* protects hashtable search/modification */ int query_size; /* max query length in bytes */ } pgssSharedState; /*---- Local variables ----*/ /* Current nesting depth of ExecutorRun calls */ static int nested_level = 0; /* Saved hook values in case of unload */ static shmem_startup_hook_type prev_shmem_startup_hook = NULL; static ExecutorStart_hook_type prev_ExecutorStart = NULL; static ExecutorRun_hook_type prev_ExecutorRun = NULL; static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; static ProcessUtility_hook_type prev_ProcessUtility = NULL; /* Links to shared memory state */ static pgssSharedState *pgss = NULL; static HTAB *pgss_hash = NULL; /*---- GUC variables ----*/ typedef enum { PGSS_TRACK_NONE, /* track no statements */ PGSS_TRACK_TOP, /* only top level statements */ PGSS_TRACK_ALL /* all statements, including nested ones */ } PGSSTrackLevel; static const struct config_enum_entry track_options[] = { {"none", PGSS_TRACK_NONE, false}, {"top", PGSS_TRACK_TOP, false}, {"all", PGSS_TRACK_ALL, false}, {NULL, 0, false} }; static int pgss_max; /* max # statements to track */ static int pgss_track; /* tracking level */ static bool pgss_track_utility; /* whether to track utility commands */ static bool pgss_save; /* whether to save stats across shutdown */ #define pgss_enabled() \ (pgss_track == PGSS_TRACK_ALL || \ (pgss_track == PGSS_TRACK_TOP && nested_level == 0)) /*---- Function declarations ----*/ void _PG_init(void); void _PG_fini(void); Datum pg_stat_statements_reset(PG_FUNCTION_ARGS); Datum pg_stat_statements(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pg_stat_statements_reset); PG_FUNCTION_INFO_V1(pg_stat_statements); static void pgss_shmem_startup(void); static void pgss_shmem_shutdown(int code, Datum arg); static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags); static void pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count); static void pgss_ExecutorEnd(QueryDesc *queryDesc); static void pgss_ProcessUtility(Node *parsetree, const char *queryString, ParamListInfo params, bool isTopLevel, DestReceiver *dest, char *completionTag); static uint32 pgss_hash_fn(const void *key, Size keysize); static int pgss_match_fn(const void *key1, const void *key2, Size keysize); static void pgss_store(const char *query, double total_time, uint64 rows, const BufferUsage *bufusage); static Size pgss_memsize(void); static pgssEntry *entry_alloc(pgssHashKey *key); static void entry_dealloc(void); static void entry_reset(void); /* * Module load callback */ void _PG_init(void) { /* * In order to create our shared memory area, we have to be loaded via * shared_preload_libraries. If not, fall out without hooking into any of * the main system. (We don't throw error here because it seems useful to * allow the pg_stat_statements functions to be created even when the * module isn't active. The functions must protect themselves against * being called then, however.) */ if (!process_shared_preload_libraries_in_progress) return; /* * Define (or redefine) custom GUC variables. */ DefineCustomIntVariable("pg_stat_statements.max", "Sets the maximum number of statements tracked by pg_stat_statements.", NULL, &pgss_max, 1000, 100, INT_MAX, PGC_POSTMASTER, 0, NULL, NULL); DefineCustomEnumVariable("pg_stat_statements.track", "Selects which statements are tracked by pg_stat_statements.", NULL, &pgss_track, PGSS_TRACK_TOP, track_options, PGC_SUSET, 0, NULL, NULL); DefineCustomBoolVariable("pg_stat_statements.track_utility", "Selects whether utility commands are tracked by pg_stat_statements.", NULL, &pgss_track_utility, true, PGC_SUSET, 0, NULL, NULL); DefineCustomBoolVariable("pg_stat_statements.save", "Save pg_stat_statements statistics across server shutdowns.", NULL, &pgss_save, true, PGC_SIGHUP, 0, NULL, NULL); EmitWarningsOnPlaceholders("pg_stat_statements"); /* * Request additional shared resources. (These are no-ops if we're not in * the postmaster process.) We'll allocate or attach to the shared * resources in pgss_shmem_startup(). */ RequestAddinShmemSpace(pgss_memsize()); RequestAddinLWLocks(1); /* * Install hooks. */ prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = pgss_shmem_startup; prev_ExecutorStart = ExecutorStart_hook; ExecutorStart_hook = pgss_ExecutorStart; prev_ExecutorRun = ExecutorRun_hook; ExecutorRun_hook = pgss_ExecutorRun; prev_ExecutorEnd = ExecutorEnd_hook; ExecutorEnd_hook = pgss_ExecutorEnd; prev_ProcessUtility = ProcessUtility_hook; ProcessUtility_hook = pgss_ProcessUtility; } /* * Module unload callback */ void _PG_fini(void) { /* Uninstall hooks. */ shmem_startup_hook = prev_shmem_startup_hook; ExecutorStart_hook = prev_ExecutorStart; ExecutorRun_hook = prev_ExecutorRun; ExecutorEnd_hook = prev_ExecutorEnd; ProcessUtility_hook = prev_ProcessUtility; } /* * shmem_startup hook: allocate or attach to shared memory, * then load any pre-existing statistics from file. */ static void pgss_shmem_startup(void) { bool found; HASHCTL info; FILE *file; uint32 header; int32 num; int32 i; int query_size; int buffer_size; char *buffer = NULL; if (prev_shmem_startup_hook) prev_shmem_startup_hook(); /* reset in case this is a restart within the postmaster */ pgss = NULL; pgss_hash = NULL; /* * Create or attach to the shared memory state, including hash table */ LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); pgss = ShmemInitStruct("pg_stat_statements", sizeof(pgssSharedState), &found); if (!found) { /* First time through ... */ pgss->lock = LWLockAssign(); pgss->query_size = pgstat_track_activity_query_size; } /* Be sure everyone agrees on the hash table entry size */ query_size = pgss->query_size; memset(&info, 0, sizeof(info)); info.keysize = sizeof(pgssHashKey); info.entrysize = offsetof(pgssEntry, query) +query_size; info.hash = pgss_hash_fn; info.match = pgss_match_fn; pgss_hash = ShmemInitHash("pg_stat_statements hash", pgss_max, pgss_max, &info, HASH_ELEM | HASH_FUNCTION | HASH_COMPARE); LWLockRelease(AddinShmemInitLock); /* * If we're in the postmaster (or a standalone backend...), set up a shmem * exit hook to dump the statistics to disk. */ if (!IsUnderPostmaster) on_shmem_exit(pgss_shmem_shutdown, (Datum) 0); /* * Attempt to load old statistics from the dump file, if this is the first * time through and we weren't told not to. */ if (found || !pgss_save) return; /* * Note: we don't bother with locks here, because there should be no other * processes running when this code is reached. */ file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R); if (file == NULL) { if (errno == ENOENT) return; /* ignore not-found error */ goto error; } buffer_size = query_size; buffer = (char *) palloc(buffer_size); if (fread(&header, sizeof(uint32), 1, file) != 1 || header != PGSS_FILE_HEADER || fread(&num, sizeof(int32), 1, file) != 1) goto error; for (i = 0; i < num; i++) { pgssEntry temp; pgssEntry *entry; if (fread(&temp, offsetof(pgssEntry, mutex), 1, file) != 1) goto error; /* Encoding is the only field we can easily sanity-check */ if (!PG_VALID_BE_ENCODING(temp.key.encoding)) goto error; /* Previous incarnation might have had a larger query_size */ if (temp.key.query_len >= buffer_size) { buffer = (char *) repalloc(buffer, temp.key.query_len + 1); buffer_size = temp.key.query_len + 1; } if (fread(buffer, 1, temp.key.query_len, file) != temp.key.query_len) goto error; buffer[temp.key.query_len] = '\0'; /* Clip to available length if needed */ if (temp.key.query_len >= query_size) temp.key.query_len = pg_encoding_mbcliplen(temp.key.encoding, buffer, temp.key.query_len, query_size - 1); temp.key.query_ptr = buffer; /* make the hashtable entry (discards old entries if too many) */ entry = entry_alloc(&temp.key); /* copy in the actual stats */ entry->counters = temp.counters; } pfree(buffer); FreeFile(file); return; error: ereport(LOG, (errcode_for_file_access(), errmsg("could not read pg_stat_statement file \"%s\": %m", PGSS_DUMP_FILE))); if (buffer) pfree(buffer); if (file) FreeFile(file); /* If possible, throw away the bogus file; ignore any error */ unlink(PGSS_DUMP_FILE); } /* * shmem_shutdown hook: Dump statistics into file. * * Note: we don't bother with acquiring lock, because there should be no * other processes running when this is called. */ static void pgss_shmem_shutdown(int code, Datum arg) { FILE *file; HASH_SEQ_STATUS hash_seq; int32 num_entries; pgssEntry *entry; /* Don't try to dump during a crash. */ if (code) return; /* Safety check ... shouldn't get here unless shmem is set up. */ if (!pgss || !pgss_hash) return; /* Don't dump if told not to. */ if (!pgss_save) return; file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_W); if (file == NULL) goto error; if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1) goto error; num_entries = hash_get_num_entries(pgss_hash); if (fwrite(&num_entries, sizeof(int32), 1, file) != 1) goto error; hash_seq_init(&hash_seq, pgss_hash); while ((entry = hash_seq_search(&hash_seq)) != NULL) { int len = entry->key.query_len; if (fwrite(entry, offsetof(pgssEntry, mutex), 1, file) != 1 || fwrite(entry->query, 1, len, file) != len) goto error; } if (FreeFile(file)) { file = NULL; goto error; } return; error: ereport(LOG, (errcode_for_file_access(), errmsg("could not write pg_stat_statement file \"%s\": %m", PGSS_DUMP_FILE))); if (file) FreeFile(file); unlink(PGSS_DUMP_FILE); } /* * ExecutorStart hook: start up tracking if needed */ static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags) { if (prev_ExecutorStart) prev_ExecutorStart(queryDesc, eflags); else standard_ExecutorStart(queryDesc, eflags); if (pgss_enabled()) { /* * Set up to track total elapsed time in ExecutorRun. Make sure the * space is allocated in the per-query context so it will go away at * ExecutorEnd. */ if (queryDesc->totaltime == NULL) { MemoryContext oldcxt; oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL); MemoryContextSwitchTo(oldcxt); } } } /* * ExecutorRun hook: all we need do is track nesting depth */ static void pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count) { nested_level++; PG_TRY(); { if (prev_ExecutorRun) prev_ExecutorRun(queryDesc, direction, count); else standard_ExecutorRun(queryDesc, direction, count); nested_level--; } PG_CATCH(); { nested_level--; PG_RE_THROW(); } PG_END_TRY(); } /* * ExecutorEnd hook: store results if needed */ static void pgss_ExecutorEnd(QueryDesc *queryDesc) { if (queryDesc->totaltime && pgss_enabled()) { /* * Make sure stats accumulation is done. (Note: it's okay if several * levels of hook all do this.) */ InstrEndLoop(queryDesc->totaltime); pgss_store(queryDesc->sourceText, queryDesc->totaltime->total, queryDesc->estate->es_processed, &queryDesc->totaltime->bufusage); } if (prev_ExecutorEnd) prev_ExecutorEnd(queryDesc); else standard_ExecutorEnd(queryDesc); } /* * ProcessUtility hook */ static void pgss_ProcessUtility(Node *parsetree, const char *queryString, ParamListInfo params, bool isTopLevel, DestReceiver *dest, char *completionTag) { if (pgss_track_utility && pgss_enabled()) { instr_time start; instr_time duration; uint64 rows = 0; BufferUsage bufusage; bufusage = pgBufferUsage; INSTR_TIME_SET_CURRENT(start); nested_level++; PG_TRY(); { if (prev_ProcessUtility) prev_ProcessUtility(parsetree, queryString, params, isTopLevel, dest, completionTag); else standard_ProcessUtility(parsetree, queryString, params, isTopLevel, dest, completionTag); nested_level--; } PG_CATCH(); { nested_level--; PG_RE_THROW(); } PG_END_TRY(); INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); /* parse command tag to retrieve the number of affected rows. */ if (completionTag && sscanf(completionTag, "COPY " UINT64_FORMAT, &rows) != 1) rows = 0; /* calc differences of buffer counters. */ bufusage.shared_blks_hit = pgBufferUsage.shared_blks_hit - bufusage.shared_blks_hit; bufusage.shared_blks_read = pgBufferUsage.shared_blks_read - bufusage.shared_blks_read; bufusage.shared_blks_written = pgBufferUsage.shared_blks_written - bufusage.shared_blks_written; bufusage.local_blks_hit = pgBufferUsage.local_blks_hit - bufusage.local_blks_hit; bufusage.local_blks_read = pgBufferUsage.local_blks_read - bufusage.local_blks_read; bufusage.local_blks_written = pgBufferUsage.local_blks_written - bufusage.local_blks_written; bufusage.temp_blks_read = pgBufferUsage.temp_blks_read - bufusage.temp_blks_read; bufusage.temp_blks_written = pgBufferUsage.temp_blks_written - bufusage.temp_blks_written; pgss_store(queryString, INSTR_TIME_GET_DOUBLE(duration), rows, &bufusage); } else { if (prev_ProcessUtility) prev_ProcessUtility(parsetree, queryString, params, isTopLevel, dest, completionTag); else standard_ProcessUtility(parsetree, queryString, params, isTopLevel, dest, completionTag); } } /* * Calculate hash value for a key */ static uint32 pgss_hash_fn(const void *key, Size keysize) { const pgssHashKey *k = (const pgssHashKey *) key; /* we don't bother to include encoding in the hash */ return hash_uint32((uint32) k->userid) ^ hash_uint32((uint32) k->dbid) ^ DatumGetUInt32(hash_any((const unsigned char *) k->query_ptr, k->query_len)); } /* * Compare two keys - zero means match */ static int pgss_match_fn(const void *key1, const void *key2, Size keysize) { const pgssHashKey *k1 = (const pgssHashKey *) key1; const pgssHashKey *k2 = (const pgssHashKey *) key2; if (k1->userid == k2->userid && k1->dbid == k2->dbid && k1->encoding == k2->encoding && k1->query_len == k2->query_len && memcmp(k1->query_ptr, k2->query_ptr, k1->query_len) == 0) return 0; else return 1; } /* * Store some statistics for a statement. */ static void pgss_store(const char *query, double total_time, uint64 rows, const BufferUsage *bufusage) { pgssHashKey key; double usage; pgssEntry *entry; Assert(query != NULL); /* Safety check... */ if (!pgss || !pgss_hash) return; /* Set up key for hashtable search */ key.userid = GetUserId(); key.dbid = MyDatabaseId; key.encoding = GetDatabaseEncoding(); key.query_len = strlen(query); if (key.query_len >= pgss->query_size) key.query_len = pg_encoding_mbcliplen(key.encoding, query, key.query_len, pgss->query_size - 1); key.query_ptr = query; usage = USAGE_EXEC(duration); /* Lookup the hash table entry with shared lock. */ LWLockAcquire(pgss->lock, LW_SHARED); entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL); if (!entry) { /* Must acquire exclusive lock to add a new entry. */ LWLockRelease(pgss->lock); LWLockAcquire(pgss->lock, LW_EXCLUSIVE); entry = entry_alloc(&key); } /* Grab the spinlock while updating the counters. */ { volatile pgssEntry *e = (volatile pgssEntry *) entry; SpinLockAcquire(&e->mutex); e->counters.calls += 1; e->counters.total_time += total_time; e->counters.rows += rows; e->counters.shared_blks_hit += bufusage->shared_blks_hit; e->counters.shared_blks_read += bufusage->shared_blks_read; e->counters.shared_blks_written += bufusage->shared_blks_written; e->counters.local_blks_hit += bufusage->local_blks_hit; e->counters.local_blks_read += bufusage->local_blks_read; e->counters.local_blks_written += bufusage->local_blks_written; e->counters.temp_blks_read += bufusage->temp_blks_read; e->counters.temp_blks_written += bufusage->temp_blks_written; e->counters.usage += usage; SpinLockRelease(&e->mutex); } LWLockRelease(pgss->lock); } /* * Reset all statement statistics. */ Datum pg_stat_statements_reset(PG_FUNCTION_ARGS) { if (!pgss || !pgss_hash) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("pg_stat_statements must be loaded via shared_preload_libraries"))); entry_reset(); PG_RETURN_VOID(); } #define PG_STAT_STATEMENTS_COLS 14 /* * Retrieve statement statistics. */ Datum pg_stat_statements(PG_FUNCTION_ARGS) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; TupleDesc tupdesc; Tuplestorestate *tupstore; MemoryContext per_query_ctx; MemoryContext oldcontext; Oid userid = GetUserId(); bool is_superuser = superuser(); HASH_SEQ_STATUS hash_seq; pgssEntry *entry; if (!pgss || !pgss_hash) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("pg_stat_statements must be loaded via shared_preload_libraries"))); /* check to see if caller supports us returning a tuplestore */ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); if (!(rsinfo->allowedModes & SFRM_Materialize)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("materialize mode required, but it is not " \ "allowed in this context"))); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; oldcontext = MemoryContextSwitchTo(per_query_ctx); tupstore = tuplestore_begin_heap(true, false, work_mem); rsinfo->returnMode = SFRM_Materialize; rsinfo->setResult = tupstore; rsinfo->setDesc = tupdesc; MemoryContextSwitchTo(oldcontext); LWLockAcquire(pgss->lock, LW_SHARED); hash_seq_init(&hash_seq, pgss_hash); while ((entry = hash_seq_search(&hash_seq)) != NULL) { Datum values[PG_STAT_STATEMENTS_COLS]; bool nulls[PG_STAT_STATEMENTS_COLS]; int i = 0; Counters tmp; memset(values, 0, sizeof(values)); memset(nulls, 0, sizeof(nulls)); values[i++] = ObjectIdGetDatum(entry->key.userid); values[i++] = ObjectIdGetDatum(entry->key.dbid); if (is_superuser || entry->key.userid == userid) { char *qstr; qstr = (char *) pg_do_encoding_conversion((unsigned char *) entry->query, entry->key.query_len, entry->key.encoding, GetDatabaseEncoding()); values[i++] = CStringGetTextDatum(qstr); if (qstr != entry->query) pfree(qstr); } else values[i++] = CStringGetTextDatum(""); /* copy counters to a local variable to keep locking time short */ { volatile pgssEntry *e = (volatile pgssEntry *) entry; SpinLockAcquire(&e->mutex); tmp = e->counters; SpinLockRelease(&e->mutex); } values[i++] = Int64GetDatumFast(tmp.calls); values[i++] = Float8GetDatumFast(tmp.total_time); values[i++] = Int64GetDatumFast(tmp.rows); values[i++] = Int64GetDatumFast(tmp.shared_blks_hit); values[i++] = Int64GetDatumFast(tmp.shared_blks_read); values[i++] = Int64GetDatumFast(tmp.shared_blks_written); values[i++] = Int64GetDatumFast(tmp.local_blks_hit); values[i++] = Int64GetDatumFast(tmp.local_blks_read); values[i++] = Int64GetDatumFast(tmp.local_blks_written); values[i++] = Int64GetDatumFast(tmp.temp_blks_read); values[i++] = Int64GetDatumFast(tmp.temp_blks_written); Assert(i == PG_STAT_STATEMENTS_COLS); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } LWLockRelease(pgss->lock); /* clean up and return the tuplestore */ tuplestore_donestoring(tupstore); return (Datum) 0; } /* * Estimate shared memory space needed. */ static Size pgss_memsize(void) { Size size; Size entrysize; size = MAXALIGN(sizeof(pgssSharedState)); entrysize = offsetof(pgssEntry, query) +pgstat_track_activity_query_size; size = add_size(size, hash_estimate_size(pgss_max, entrysize)); return size; } /* * Allocate a new hashtable entry. * caller must hold an exclusive lock on pgss->lock * * Note: despite needing exclusive lock, it's not an error for the target * entry to already exist. This is because pgss_store releases and * reacquires lock after failing to find a match; so someone else could * have made the entry while we waited to get exclusive lock. */ static pgssEntry * entry_alloc(pgssHashKey *key) { pgssEntry *entry; bool found; /* Caller must have clipped query properly */ Assert(key->query_len < pgss->query_size); /* Make space if needed */ while (hash_get_num_entries(pgss_hash) >= pgss_max) entry_dealloc(); /* Find or create an entry with desired hash code */ entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found); if (!found) { /* New entry, initialize it */ /* dynahash tried to copy the key for us, but must fix query_ptr */ entry->key.query_ptr = entry->query; /* reset the statistics */ memset(&entry->counters, 0, sizeof(Counters)); entry->counters.usage = USAGE_INIT; /* re-initialize the mutex each time ... we assume no one using it */ SpinLockInit(&entry->mutex); /* ... and don't forget the query text */ memcpy(entry->query, key->query_ptr, key->query_len); entry->query[key->query_len] = '\0'; } return entry; } /* * qsort comparator for sorting into increasing usage order */ static int entry_cmp(const void *lhs, const void *rhs) { double l_usage = (*(const pgssEntry **) lhs)->counters.usage; double r_usage = (*(const pgssEntry **) rhs)->counters.usage; if (l_usage < r_usage) return -1; else if (l_usage > r_usage) return +1; else return 0; } /* * Deallocate least used entries. * Caller must hold an exclusive lock on pgss->lock. */ static void entry_dealloc(void) { HASH_SEQ_STATUS hash_seq; pgssEntry **entries; pgssEntry *entry; int nvictims; int i; /* Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them. */ entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *)); i = 0; hash_seq_init(&hash_seq, pgss_hash); while ((entry = hash_seq_search(&hash_seq)) != NULL) { entries[i++] = entry; entry->counters.usage *= USAGE_DECREASE_FACTOR; } qsort(entries, i, sizeof(pgssEntry *), entry_cmp); nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100); nvictims = Min(nvictims, i); for (i = 0; i < nvictims; i++) { hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL); } pfree(entries); } /* * Release all entries. */ static void entry_reset(void) { HASH_SEQ_STATUS hash_seq; pgssEntry *entry; LWLockAcquire(pgss->lock, LW_EXCLUSIVE); hash_seq_init(&hash_seq, pgss_hash); while ((entry = hash_seq_search(&hash_seq)) != NULL) { hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL); } LWLockRelease(pgss->lock); }