/* ----------
* pgstat.h
*
* Definitions for the PostgreSQL statistics collector daemon.
*
* Copyright (c) 2001-2015, PostgreSQL Global Development Group
*
* src/include/pgstat.h
* ----------
*/
#ifndef PGSTAT_H
#define PGSTAT_H
#include "datatype/timestamp.h"
#include "fmgr.h"
#include "libpq/pqcomm.h"
#include "portability/instr_time.h"
#include "postmaster/pgarch.h"
#include "storage/barrier.h"
#include "utils/hsearch.h"
#include "utils/relcache.h"
/* ----------
* Paths for the statistics files (relative to installation's $PGDATA).
* ----------
*/
#define PGSTAT_STAT_PERMANENT_DIRECTORY "pg_stat"
#define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat"
#define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp"
/* Default directory to store temporary statistics data in */
#define PG_STAT_TMP_DIR "pg_stat_tmp"
/* Values for track_functions GUC variable --- order is significant! */
typedef enum TrackFunctionsLevel
{
TRACK_FUNC_OFF,
TRACK_FUNC_PL,
TRACK_FUNC_ALL
} TrackFunctionsLevel;
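
/*
 * Illustrative sketch, not a definition used here: because the enum values
 * are ordered, fmgr can stamp each FmgrInfo with the minimum setting at
 * which the function should be tracked (TRACK_FUNC_OFF for PL functions,
 * TRACK_FUNC_PL for SQL/C functions, TRACK_FUNC_ALL for built-ins, i.e.
 * never), so the runtime gate reduces to a single comparison, roughly:
 *
 *		if (pgstat_track_functions <= fcinfo->flinfo->fn_stats)
 *			return;		... tracking not wanted for this call ...
 *
 * See fmgr.c and pgstat_init_function_usage() for the authoritative logic.
 */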
/* ----------
* The types of backend -> collector messages
* ----------
*/
typedef enum StatMsgType
{
PGSTAT_MTYPE_DUMMY,
PGSTAT_MTYPE_INQUIRY,
PGSTAT_MTYPE_TABSTAT,
PGSTAT_MTYPE_TABPURGE,
PGSTAT_MTYPE_DROPDB,
PGSTAT_MTYPE_RESETCOUNTER,
PGSTAT_MTYPE_RESETSHAREDCOUNTER,
PGSTAT_MTYPE_RESETSINGLECOUNTER,
PGSTAT_MTYPE_AUTOVAC_START,
PGSTAT_MTYPE_VACUUM,
PGSTAT_MTYPE_ANALYZE,
PGSTAT_MTYPE_ARCHIVER,
PGSTAT_MTYPE_BGWRITER,
PGSTAT_MTYPE_FUNCSTAT,
PGSTAT_MTYPE_FUNCPURGE,
PGSTAT_MTYPE_RECOVERYCONFLICT,
PGSTAT_MTYPE_TEMPFILE,
PGSTAT_MTYPE_DEADLOCK
} StatMsgType;
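
/*
 * For illustration only (the authoritative dispatch is the collector's main
 * loop in pgstat.c): the collector reads one message at a time and switches
 * on the header's type field, roughly:
 *
 *		switch (msg.msg_hdr.m_type)
 *		{
 *			case PGSTAT_MTYPE_TABSTAT:
 *				pgstat_recv_tabstat((PgStat_MsgTabstat *) &msg, len);
 *				break;
 *			case PGSTAT_MTYPE_VACUUM:
 *				pgstat_recv_vacuum((PgStat_MsgVacuum *) &msg, len);
 *				break;
 *			...
 *		}
 */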
/* ----------
* The data type used for counters.
* ----------
*/
typedef int64 PgStat_Counter;
/* ----------
* PgStat_TableCounts The actual per-table counts kept by a backend
*
* This struct should contain only actual event counters, because we memcmp
* it against zeroes to detect whether there are any counts to transmit.
* It is a component of PgStat_TableStatus (within-backend state) and
* PgStat_TableEntry (the transmitted message format).
*
* Note: for a table, tuples_returned is the number of tuples successfully
* fetched by heap_getnext, while tuples_fetched is the number of tuples
* successfully fetched by heap_fetch under the control of bitmap indexscans.
* For an index, tuples_returned is the number of index entries returned by
* the index AM, while tuples_fetched is the number of tuples successfully
* fetched by heap_fetch under the control of simple indexscans for this index.
*
* tuples_inserted/updated/deleted/hot_updated count attempted actions,
* regardless of whether the transaction committed. delta_live_tuples,
* delta_dead_tuples, and changed_tuples are set depending on commit or abort.
* Note that delta_live_tuples and delta_dead_tuples can be negative!
* ----------
*/
typedef struct PgStat_TableCounts
{
PgStat_Counter t_numscans;
PgStat_Counter t_tuples_returned;
PgStat_Counter t_tuples_fetched;
PgStat_Counter t_tuples_inserted;
PgStat_Counter t_tuples_updated;
PgStat_Counter t_tuples_deleted;
PgStat_Counter t_tuples_hot_updated;
PgStat_Counter t_delta_live_tuples;
PgStat_Counter t_delta_dead_tuples;
PgStat_Counter t_changed_tuples;
PgStat_Counter t_blocks_fetched;
PgStat_Counter t_blocks_hit;
} PgStat_TableCounts;
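
/*
 * A minimal sketch of the memcmp test described above, as used by
 * pgstat_report_stat() in pgstat.c:
 *
 *		static const PgStat_TableCounts all_zeroes;
 *
 *		if (memcmp(&entry->t_counts, &all_zeroes,
 *				   sizeof(PgStat_TableCounts)) == 0)
 *			continue;
 *
 * i.e., tables with no counts accumulated since the last report are skipped.
 */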
/* Possible targets for resetting cluster-wide shared values */
typedef enum PgStat_Shared_Reset_Target
{
RESET_ARCHIVER,
RESET_BGWRITER
} PgStat_Shared_Reset_Target;
/* Possible object types for resetting single counters */
typedef enum PgStat_Single_Reset_Type
{
RESET_TABLE,
RESET_FUNCTION
} PgStat_Single_Reset_Type;
/* ------------------------------------------------------------
* Structures kept in backend local memory while accumulating counts
* ------------------------------------------------------------
*/
/* ----------
* PgStat_TableStatus Per-table status within a backend
*
* Many of the event counters are nontransactional, ie, we count events
* in committed and aborted transactions alike. For these, we just count
* directly in the PgStat_TableStatus. However, delta_live_tuples,
* delta_dead_tuples, and changed_tuples must be derived from event counts
* with awareness of whether the transaction or subtransaction committed or
* aborted. Hence, we also keep a stack of per-(sub)transaction status
* records for every table modified in the current transaction. At commit
* or abort, we propagate tuples_inserted/updated/deleted up to the
* parent subtransaction level, or out to the parent PgStat_TableStatus,
* as appropriate.
* ----------
*/
typedef struct PgStat_TableStatus
{
Oid t_id; /* table's OID */
bool t_shared; /* is it a shared catalog? */
struct PgStat_TableXactStatus *trans; /* lowest subxact's counts */
PgStat_TableCounts t_counts; /* event counts to be sent */
} PgStat_TableStatus;
/* ----------
* PgStat_TableXactStatus Per-table, per-subtransaction status
* ----------
*/
typedef struct PgStat_TableXactStatus
{
PgStat_Counter tuples_inserted; /* tuples inserted in (sub)xact */
PgStat_Counter tuples_updated; /* tuples updated in (sub)xact */
PgStat_Counter tuples_deleted; /* tuples deleted in (sub)xact */
int nest_level; /* subtransaction nest level */
/* links to other structs for same relation: */
struct PgStat_TableXactStatus *upper; /* next higher subxact if any */
PgStat_TableStatus *parent; /* per-table status */
/* structs of same subxact level are linked here: */
struct PgStat_TableXactStatus *next; /* next of same subxact */
} PgStat_TableXactStatus;
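
/*
 * Rough sketch of the end-of-transaction bookkeeping described above (the
 * authoritative code is AtEOXact_PgStat()/AtEOSubXact_PgStat() in pgstat.c).
 * At top-level commit, each table's transaction record folds into the
 * transmitted counts approximately as:
 *
 *		tab->t_counts.t_delta_live_tuples +=
 *			trans->tuples_inserted - trans->tuples_deleted;
 *		tab->t_counts.t_delta_dead_tuples +=
 *			trans->tuples_updated + trans->tuples_deleted;
 *		tab->t_counts.t_changed_tuples +=
 *			trans->tuples_inserted + trans->tuples_updated +
 *			trans->tuples_deleted;
 *
 * while at subtransaction commit the three tuples_* counters are simply
 * added into the parent level's record.
 */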
/* ------------------------------------------------------------
* Message formats follow
* ------------------------------------------------------------
*/
/* ----------
* PgStat_MsgHdr The common message header
* ----------
*/
typedef struct PgStat_MsgHdr
{
StatMsgType m_type;
int m_size;
} PgStat_MsgHdr;
/* ----------
* Space available in a message. This will keep the UDP packets below 1K,
* which should fit unfragmented into the MTU of the loopback interface.
* (Larger values of PGSTAT_MAX_MSG_SIZE would work for that on most
* platforms, but we're being conservative here.)
* ----------
*/
#define PGSTAT_MAX_MSG_SIZE 1000
#define PGSTAT_MSG_PAYLOAD (PGSTAT_MAX_MSG_SIZE - sizeof(PgStat_MsgHdr))
/* ----------
* PgStat_MsgDummy A dummy message, ignored by the collector
* ----------
*/
typedef struct PgStat_MsgDummy
{
PgStat_MsgHdr m_hdr;
} PgStat_MsgDummy;
/* ----------
* PgStat_MsgInquiry Sent by a backend to ask the collector
* to write the stats file.
* ----------
*/
typedef struct PgStat_MsgInquiry
{
PgStat_MsgHdr m_hdr;
TimestampTz clock_time; /* observed local clock time */
TimestampTz cutoff_time; /* minimum acceptable file timestamp */
Oid databaseid; /* requested DB (InvalidOid => all DBs) */
} PgStat_MsgInquiry;
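
/*
 * For illustration (modeled on pgstat_send_inquiry() in pgstat.c), a backend
 * asks for a fresh stats file roughly like this:
 *
 *		PgStat_MsgInquiry msg;
 *
 *		pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
 *		msg.clock_time = clock_time;
 *		msg.cutoff_time = cutoff_time;
 *		msg.databaseid = databaseid;
 *		pgstat_send(&msg, sizeof(msg));
 *
 * If the collector sees a clock_time ahead of its own latest clock reading,
 * it rechecks the system clock, which lets it recover if the clock has been
 * set backwards.
 */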
/* ----------
* PgStat_TableEntry Per-table info in a MsgTabstat
* ----------
*/
typedef struct PgStat_TableEntry
{
Oid t_id;
PgStat_TableCounts t_counts;
} PgStat_TableEntry;
/* ----------
* PgStat_MsgTabstat Sent by the backend to report table
* and buffer access statistics.
* ----------
*/
#define PGSTAT_NUM_TABENTRIES \
((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - 3 * sizeof(int) - 2 * sizeof(PgStat_Counter)) \
/ sizeof(PgStat_TableEntry))
typedef struct PgStat_MsgTabstat
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
int m_nentries;
int m_xact_commit;
int m_xact_rollback;
PgStat_Counter m_block_read_time; /* times in microseconds */
PgStat_Counter m_block_write_time;
PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES];
} PgStat_MsgTabstat;
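
/*
 * Sizing note (a sketch; see pgstat_send_tabstat() in pgstat.c): a message
 * is sent with only as many entries as were actually filled in, so the wire
 * length is computed from the entry count rather than taken as sizeof():
 *
 *		len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
 *			n * sizeof(PgStat_TableEntry);
 *		pgstat_send(&msg, len);
 */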
/* ----------
* PgStat_MsgTabpurge Sent by the backend to tell the collector
* about dead tables.
* ----------
*/
#define PGSTAT_NUM_TABPURGE \
((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \
/ sizeof(Oid))
typedef struct PgStat_MsgTabpurge
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
int m_nentries;
Oid m_tableid[PGSTAT_NUM_TABPURGE];
} PgStat_MsgTabpurge;
/* ----------
* PgStat_MsgDropdb Sent by the backend to tell the collector
* about a dropped database
* ----------
*/
typedef struct PgStat_MsgDropdb
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
} PgStat_MsgDropdb;
/* ----------
* PgStat_MsgResetcounter Sent by the backend to tell the collector
* to reset counters
* ----------
*/
typedef struct PgStat_MsgResetcounter
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
} PgStat_MsgResetcounter;
/* ----------
* PgStat_MsgResetsharedcounter Sent by the backend to tell the collector
* to reset a shared counter
* ----------
*/
typedef struct PgStat_MsgResetsharedcounter
{
PgStat_MsgHdr m_hdr;
PgStat_Shared_Reset_Target m_resettarget;
} PgStat_MsgResetsharedcounter;
/* ----------
* PgStat_MsgResetsinglecounter Sent by the backend to tell the collector
* to reset a single counter
* ----------
*/
typedef struct PgStat_MsgResetsinglecounter
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
PgStat_Single_Reset_Type m_resettype;
Oid m_objectid;
} PgStat_MsgResetsinglecounter;
/* ----------
* PgStat_MsgAutovacStart Sent by the autovacuum daemon to signal
* that a database is going to be processed
* ----------
*/
typedef struct PgStat_MsgAutovacStart
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
TimestampTz m_start_time;
} PgStat_MsgAutovacStart;
/* ----------
* PgStat_MsgVacuum Sent by the backend or autovacuum daemon
* after VACUUM
* ----------
*/
typedef struct PgStat_MsgVacuum
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
Oid m_tableoid;
bool m_autovacuum;
TimestampTz m_vacuumtime;
PgStat_Counter m_live_tuples;
PgStat_Counter m_dead_tuples;
} PgStat_MsgVacuum;
/* ----------
* PgStat_MsgAnalyze Sent by the backend or autovacuum daemon
* after ANALYZE
* ----------
*/
typedef struct PgStat_MsgAnalyze
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
Oid m_tableoid;
bool m_autovacuum;
TimestampTz m_analyzetime;
PgStat_Counter m_live_tuples;
PgStat_Counter m_dead_tuples;
} PgStat_MsgAnalyze;
/* ----------
* PgStat_MsgArchiver Sent by the archiver to update statistics.
* ----------
*/
typedef struct PgStat_MsgArchiver
{
PgStat_MsgHdr m_hdr;
bool m_failed; /* Failed attempt */
char m_xlog[MAX_XFN_CHARS + 1];
TimestampTz m_timestamp;
} PgStat_MsgArchiver;
/* ----------
* PgStat_MsgBgWriter Sent by the bgwriter to update statistics.
* ----------
*/
typedef struct PgStat_MsgBgWriter
{
PgStat_MsgHdr m_hdr;
PgStat_Counter m_timed_checkpoints;
PgStat_Counter m_requested_checkpoints;
PgStat_Counter m_buf_written_checkpoints;
PgStat_Counter m_buf_written_clean;
PgStat_Counter m_maxwritten_clean;
PgStat_Counter m_buf_written_backend;
PgStat_Counter m_buf_fsync_backend;
PgStat_Counter m_buf_alloc;
PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */
PgStat_Counter m_checkpoint_sync_time;
} PgStat_MsgBgWriter;
/* ----------
* PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict
* ----------
*/
typedef struct PgStat_MsgRecoveryConflict
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
int m_reason;
} PgStat_MsgRecoveryConflict;
/* ----------
* PgStat_MsgTempFile Sent by the backend upon creating a temp file
* ----------
*/
typedef struct PgStat_MsgTempFile
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
size_t m_filesize;
} PgStat_MsgTempFile;
/* ----------
* PgStat_FunctionCounts The actual per-function counts kept by a backend
*
* This struct should contain only actual event counters, because we memcmp
* it against zeroes to detect whether there are any counts to transmit.
*
* Note that the time counters are in instr_time format here. We convert to
* microseconds in PgStat_Counter format when transmitting to the collector.
* ----------
*/
typedef struct PgStat_FunctionCounts
{
PgStat_Counter f_numcalls;
instr_time f_total_time;
instr_time f_self_time;
} PgStat_FunctionCounts;
/* ----------
* PgStat_BackendFunctionEntry Entry in backend's per-function hash table
* ----------
*/
typedef struct PgStat_BackendFunctionEntry
{
Oid f_id;
PgStat_FunctionCounts f_counts;
} PgStat_BackendFunctionEntry;
/* ----------
* PgStat_FunctionEntry Per-function info in a MsgFuncstat
* ----------
*/
typedef struct PgStat_FunctionEntry
{
Oid f_id;
PgStat_Counter f_numcalls;
PgStat_Counter f_total_time; /* times in microseconds */
PgStat_Counter f_self_time;
} PgStat_FunctionEntry;
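
/*
 * Conversion sketch (the authoritative code is pgstat_send_funcstats() in
 * pgstat.c): when a backend entry is packed into a message entry, the
 * instr_time accumulators become microsecond counters:
 *
 *		m_ent->f_numcalls = entry->f_counts.f_numcalls;
 *		m_ent->f_total_time =
 *			INSTR_TIME_GET_MICROSEC(entry->f_counts.f_total_time);
 *		m_ent->f_self_time =
 *			INSTR_TIME_GET_MICROSEC(entry->f_counts.f_self_time);
 */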
/* ----------
* PgStat_MsgFuncstat Sent by the backend to report function
* usage statistics.
* ----------
*/
#define PGSTAT_NUM_FUNCENTRIES \
((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \
/ sizeof(PgStat_FunctionEntry))
typedef struct PgStat_MsgFuncstat
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
int m_nentries;
PgStat_FunctionEntry m_entry[PGSTAT_NUM_FUNCENTRIES];
} PgStat_MsgFuncstat;
/* ----------
* PgStat_MsgFuncpurge Sent by the backend to tell the collector
* about dead functions.
* ----------
*/
#define PGSTAT_NUM_FUNCPURGE \
((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \
/ sizeof(Oid))
typedef struct PgStat_MsgFuncpurge
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
int m_nentries;
Oid m_functionid[PGSTAT_NUM_FUNCPURGE];
} PgStat_MsgFuncpurge;
/* ----------
* PgStat_MsgDeadlock Sent by the backend to tell the collector
* about a deadlock that occurred.
* ----------
*/
typedef struct PgStat_MsgDeadlock
{
PgStat_MsgHdr m_hdr;
Oid m_databaseid;
} PgStat_MsgDeadlock;
/* ----------
* PgStat_Msg Union over all possible messages.
* ----------
*/
typedef union PgStat_Msg
{
PgStat_MsgHdr msg_hdr;
PgStat_MsgDummy msg_dummy;
PgStat_MsgInquiry msg_inquiry;
PgStat_MsgTabstat msg_tabstat;
PgStat_MsgTabpurge msg_tabpurge;
PgStat_MsgDropdb msg_dropdb;
PgStat_MsgResetcounter msg_resetcounter;
PgStat_MsgResetsharedcounter msg_resetsharedcounter;
PgStat_MsgResetsinglecounter msg_resetsinglecounter;
PgStat_MsgAutovacStart msg_autovacuum;
PgStat_MsgVacuum msg_vacuum;
PgStat_MsgAnalyze msg_analyze;
PgStat_MsgArchiver msg_archiver;
PgStat_MsgBgWriter msg_bgwriter;
PgStat_MsgFuncstat msg_funcstat;
PgStat_MsgFuncpurge msg_funcpurge;
PgStat_MsgRecoveryConflict msg_recoveryconflict;
PgStat_MsgDeadlock msg_deadlock;
} PgStat_Msg;
/* ------------------------------------------------------------
* Statistic collector data structures follow
*
* PGSTAT_FILE_FORMAT_ID should be changed whenever any of these
* data structures change.
* ------------------------------------------------------------
*/
#define PGSTAT_FILE_FORMAT_ID 0x01A5BC9C
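
/*
 * Sketch of the reader-side check this rule implies (compare
 * pgstat_read_statsfiles() in pgstat.c): a stats file whose format ID does
 * not match was written by a different version of these structs and is
 * simply discarded:
 *
 *		int32		format_id;
 *
 *		if (fread(&format_id, 1, sizeof(format_id), fpin) !=
 *			sizeof(format_id) ||
 *			format_id != PGSTAT_FILE_FORMAT_ID)
 *			... treat the file as stale and fall back to empty stats ...
 */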
/* ----------
* PgStat_StatDBEntry The collector's data per database
* ----------
*/
typedef struct PgStat_StatDBEntry
{
Oid databaseid;
PgStat_Counter n_xact_commit;
PgStat_Counter n_xact_rollback;
PgStat_Counter n_blocks_fetched;
PgStat_Counter n_blocks_hit;
PgStat_Counter n_tuples_returned;
PgStat_Counter n_tuples_fetched;
PgStat_Counter n_tuples_inserted;
PgStat_Counter n_tuples_updated;
PgStat_Counter n_tuples_deleted;
TimestampTz last_autovac_time;
PgStat_Counter n_conflict_tablespace;
PgStat_Counter n_conflict_lock;
PgStat_Counter n_conflict_snapshot;
PgStat_Counter n_conflict_bufferpin;
PgStat_Counter n_conflict_startup_deadlock;
PgStat_Counter n_temp_files;
PgStat_Counter n_temp_bytes;
PgStat_Counter n_deadlocks;
PgStat_Counter n_block_read_time; /* times in microseconds */
PgStat_Counter n_block_write_time;
TimestampTz stat_reset_timestamp;
TimestampTz stats_timestamp; /* time of db stats file update */
/*
* tables and functions must be last in the struct, because we don't write
* the pointers out to the stats file.
*/
HTAB *tables;
HTAB *functions;
} PgStat_StatDBEntry;
/* ----------
* PgStat_StatTabEntry The collector's data per table (or index)
* ----------
*/
typedef struct PgStat_StatTabEntry
{
Oid tableid;
PgStat_Counter numscans;
PgStat_Counter tuples_returned;
PgStat_Counter tuples_fetched;
PgStat_Counter tuples_inserted;
PgStat_Counter tuples_updated;
PgStat_Counter tuples_deleted;
PgStat_Counter tuples_hot_updated;
PgStat_Counter n_live_tuples;
PgStat_Counter n_dead_tuples;
PgStat_Counter changes_since_analyze;
PgStat_Counter blocks_fetched;
PgStat_Counter blocks_hit;
TimestampTz vacuum_timestamp; /* user initiated vacuum */
PgStat_Counter vacuum_count;
TimestampTz autovac_vacuum_timestamp; /* autovacuum initiated */
PgStat_Counter autovac_vacuum_count;
TimestampTz analyze_timestamp; /* user initiated */
PgStat_Counter analyze_count;
TimestampTz autovac_analyze_timestamp; /* autovacuum initiated */
PgStat_Counter autovac_analyze_count;
} PgStat_StatTabEntry;
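
/*
 * changes_since_analyze feeds autovacuum's analyze decision; a sketch of the
 * comparison (see relation_needs_vacanalyze() in autovacuum.c for the real
 * logic):
 *
 *		anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples;
 *		doanalyze = (tabentry->changes_since_analyze > anlthresh);
 */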
/* ----------
* PgStat_StatFuncEntry The collector's data per function
* ----------
*/
typedef struct PgStat_StatFuncEntry
{
Oid functionid;
PgStat_Counter f_numcalls;
PgStat_Counter f_total_time; /* times in microseconds */
PgStat_Counter f_self_time;
} PgStat_StatFuncEntry;
/*
* Archiver statistics kept in the stats collector
*/
typedef struct PgStat_ArchiverStats
{
PgStat_Counter archived_count; /* archival successes */
char last_archived_wal[MAX_XFN_CHARS + 1]; /* last WAL file
* archived */
TimestampTz last_archived_timestamp; /* last archival success time */
PgStat_Counter failed_count; /* failed archival attempts */
char last_failed_wal[MAX_XFN_CHARS + 1]; /* WAL file involved in
* last failure */
TimestampTz last_failed_timestamp; /* last archival failure time */
TimestampTz stat_reset_timestamp;
} PgStat_ArchiverStats;
/*
* Global statistics kept in the stats collector
*/
typedef struct PgStat_GlobalStats
{
TimestampTz stats_timestamp; /* time of stats file update */
PgStat_Counter timed_checkpoints;
PgStat_Counter requested_checkpoints;
PgStat_Counter checkpoint_write_time; /* times in milliseconds */
PgStat_Counter checkpoint_sync_time;
PgStat_Counter buf_written_checkpoints;
PgStat_Counter buf_written_clean;
PgStat_Counter maxwritten_clean;
PgStat_Counter buf_written_backend;
PgStat_Counter buf_fsync_backend;
PgStat_Counter buf_alloc;
TimestampTz stat_reset_timestamp;
} PgStat_GlobalStats;
/* ----------
* Backend states
* ----------
*/
typedef enum BackendState
{
STATE_UNDEFINED,
STATE_IDLE,
STATE_RUNNING,
STATE_IDLEINTRANSACTION,
STATE_FASTPATH,
STATE_IDLEINTRANSACTION_ABORTED,
STATE_DISABLED
} BackendState;
/* ----------
* Shared-memory data structures
* ----------
*/
/* ----------
* PgBackendStatus
*
* Each live backend maintains a PgBackendStatus struct in shared memory
* showing its current activity. (The structs are allocated according to
* BackendId, but that is not critical.) Note that the collector process
* has no involvement in, or even access to, these structs.
* ----------
*/
typedef struct PgBackendStatus
{
/*
* To avoid locking overhead, we use the following protocol: a backend
* increments st_changecount before modifying its entry, and again after
* finishing a modification. A would-be reader should note the value of
* st_changecount, copy the entry into private memory, then check
* st_changecount again. If the value hasn't changed, and if it's even,
* the copy is valid; otherwise start over. This makes updates cheap
* while reads are potentially expensive, but that's the tradeoff we want.
*
* The above protocol needs memory barriers to ensure that the apparent
* order of execution matches what the protocol requires.  Otherwise, on a
* machine with weak memory ordering, the CPU could make both increments of
* st_changecount visible before the modification itself, so a reader could
* mistake a half-updated entry for a consistent copy.
*/
int st_changecount;
/* The entry is valid iff st_procpid > 0, unused if st_procpid == 0 */
int st_procpid;
/* Times when current backend, transaction, and activity started */
TimestampTz st_proc_start_timestamp;
TimestampTz st_xact_start_timestamp;
TimestampTz st_activity_start_timestamp;
TimestampTz st_state_start_timestamp;
/* Database OID, owning user's OID, connection client address */
Oid st_databaseid;
Oid st_userid;
SockAddr st_clientaddr;
char *st_clienthostname; /* MUST be null-terminated */
/* Is backend currently waiting on an lmgr lock? */
bool st_waiting;
/* current state */
BackendState st_state;
/* application name; MUST be null-terminated */
char *st_appname;
/* current command string; MUST be null-terminated */
char *st_activity;
} PgBackendStatus;
/*
* Macros to load and store st_changecount with the memory barriers.
*
* pgstat_increment_changecount_before() and
* pgstat_increment_changecount_after() need to be called before and after
* PgBackendStatus entries are modified, respectively. This makes sure that
* st_changecount is incremented around the modification.
*
* Also pgstat_save_changecount_before() and pgstat_save_changecount_after()
* need to be called before and after PgBackendStatus entries are copied into
* private memory, respectively.
*/
#define pgstat_increment_changecount_before(beentry) \
do { \
beentry->st_changecount++; \
pg_write_barrier(); \
} while (0)
#define pgstat_increment_changecount_after(beentry) \
do { \
pg_write_barrier(); \
beentry->st_changecount++; \
Assert((beentry->st_changecount & 1) == 0); \
} while (0)
#define pgstat_save_changecount_before(beentry, save_changecount) \
do { \
save_changecount = beentry->st_changecount; \
pg_read_barrier(); \
} while (0)
#define pgstat_save_changecount_after(beentry, save_changecount) \
do { \
pg_read_barrier(); \
save_changecount = beentry->st_changecount; \
} while (0)
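
/*
 * Putting the read-side macros together, a reader loop looks roughly like
 * this (compare pgstat_read_current_status() in pgstat.c):
 *
 *		for (;;)
 *		{
 *			int		before_changecount;
 *			int		after_changecount;
 *
 *			pgstat_save_changecount_before(beentry, before_changecount);
 *			memcpy(localentry, (char *) beentry, sizeof(PgBackendStatus));
 *			pgstat_save_changecount_after(beentry, after_changecount);
 *
 *			if (before_changecount == after_changecount &&
 *				(before_changecount & 1) == 0)
 *				break;
 *
 *			CHECK_FOR_INTERRUPTS();
 *		}
 */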
/* ----------
* LocalPgBackendStatus
*
* When we build the backend status array, we use LocalPgBackendStatus to be
* able to add new values to the struct when needed without adding new fields
* to the shared memory. It contains the backend status as a first member.
* ----------
*/
typedef struct LocalPgBackendStatus
{
/*
* Local version of the backend status entry.
*/
PgBackendStatus backendStatus;
/*
* The xid of the current transaction if available, InvalidTransactionId
* if not.
*/
TransactionId backend_xid;
/*
* The xmin of the current session if available, InvalidTransactionId if
* not.
*/
TransactionId backend_xmin;
} LocalPgBackendStatus;
/*
* Working state needed to accumulate per-function-call timing statistics.
*/
typedef struct PgStat_FunctionCallUsage
{
/* Link to function's hashtable entry (must still be there at exit!) */
/* NULL means we are not tracking the current function call */
PgStat_FunctionCounts *fs;
/* Total time previously charged to function, as of function start */
instr_time save_f_total_time;
/* Backend-wide total time as of function start */
instr_time save_total;
/* system clock as of function start */
instr_time f_start;
} PgStat_FunctionCallUsage;
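
/*
 * Typical call pattern (a sketch; fmgr and the executor wrap tracked
 * function calls this way):
 *
 *		PgStat_FunctionCallUsage fcusage;
 *
 *		pgstat_init_function_usage(fcinfo, &fcusage);
 *		result = FunctionCallInvoke(fcinfo);
 *		pgstat_end_function_usage(&fcusage, true);
 *
 * Passing finalize = false (as for a value-per-call set-returning function
 * that will be resumed) accumulates timing without counting the call as
 * completed.
 */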
/* ----------
* GUC parameters
* ----------
*/
extern bool pgstat_track_activities;
extern bool pgstat_track_counts;
extern int pgstat_track_functions;
extern PGDLLIMPORT int pgstat_track_activity_query_size;
extern char *pgstat_stat_directory;
extern char *pgstat_stat_tmpname;
extern char *pgstat_stat_filename;
/*
* BgWriter statistics counters are updated directly by bgwriter and bufmgr
*/
extern PgStat_MsgBgWriter BgWriterStats;
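
/*
 * For example (the actual increments are scattered through checkpointer.c
 * and bufmgr.c), a requested checkpoint is counted directly as:
 *
 *		BgWriterStats.m_requested_checkpoints++;
 *
 * and the accumulated struct is shipped to the collector periodically via
 * pgstat_send_bgwriter().
 */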
/*
* Updated by pgstat_count_buffer_*_time macros
*/
extern PgStat_Counter pgStatBlockReadTime;
extern PgStat_Counter pgStatBlockWriteTime;
/* ----------
* Functions called from postmaster
* ----------
*/
extern Size BackendStatusShmemSize(void);
extern void CreateSharedBackendStatus(void);
extern void pgstat_init(void);
extern int pgstat_start(void);
extern void pgstat_reset_all(void);
extern void allow_immediate_pgstat_restart(void);
#ifdef EXEC_BACKEND
extern void PgstatCollectorMain(int argc, char *argv[]) __attribute__((noreturn));
#endif
/* ----------
* Functions called from backends
* ----------
*/
extern void pgstat_ping(void);
extern void pgstat_report_stat(bool force);
extern void pgstat_vacuum_stat(void);
extern void pgstat_drop_database(Oid databaseid);
extern void pgstat_clear_snapshot(void);
extern void pgstat_reset_counters(void);
extern void pgstat_reset_shared_counters(const char *);
extern void pgstat_reset_single_counter(Oid objectid, PgStat_Single_Reset_Type type);
extern void pgstat_report_autovac(Oid dboid);
extern void pgstat_report_vacuum(Oid tableoid, bool shared,
PgStat_Counter livetuples, PgStat_Counter deadtuples);
extern void pgstat_report_analyze(Relation rel,
PgStat_Counter livetuples, PgStat_Counter deadtuples);
extern void pgstat_report_recovery_conflict(int reason);
extern void pgstat_report_deadlock(void);
extern void pgstat_initialize(void);
extern void pgstat_bestart(void);
extern void pgstat_report_activity(BackendState state, const char *cmd_str);
extern void pgstat_report_tempfile(size_t filesize);
extern void pgstat_report_appname(const char *appname);
extern void pgstat_report_xact_timestamp(TimestampTz tstamp);
extern void pgstat_report_waiting(bool waiting);
extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser);
extern const char *pgstat_get_crashed_backend_activity(int pid, char *buffer,
int buflen);
extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id);
extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id);
extern void pgstat_initstats(Relation rel);
/* nontransactional event counts are simple enough to inline */
#define pgstat_count_heap_scan(rel) \
do { \
if ((rel)->pgstat_info != NULL) \
(rel)->pgstat_info->t_counts.t_numscans++; \
} while (0)
#define pgstat_count_heap_getnext(rel) \
do { \
if ((rel)->pgstat_info != NULL) \
(rel)->pgstat_info->t_counts.t_tuples_returned++; \
} while (0)
#define pgstat_count_heap_fetch(rel) \
do { \
if ((rel)->pgstat_info != NULL) \
(rel)->pgstat_info->t_counts.t_tuples_fetched++; \
} while (0)
#define pgstat_count_index_scan(rel) \
do { \
if ((rel)->pgstat_info != NULL) \
(rel)->pgstat_info->t_counts.t_numscans++; \
} while (0)
#define pgstat_count_index_tuples(rel, n) \
do { \
if ((rel)->pgstat_info != NULL) \
(rel)->pgstat_info->t_counts.t_tuples_returned += (n); \
} while (0)
#define pgstat_count_buffer_read(rel) \
do { \
if ((rel)->pgstat_info != NULL) \
(rel)->pgstat_info->t_counts.t_blocks_fetched++; \
} while (0)
#define pgstat_count_buffer_hit(rel) \
do { \
if ((rel)->pgstat_info != NULL) \
(rel)->pgstat_info->t_counts.t_blocks_hit++; \
} while (0)
#define pgstat_count_buffer_read_time(n) \
(pgStatBlockReadTime += (n))
#define pgstat_count_buffer_write_time(n) \
(pgStatBlockWriteTime += (n))
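
/*
 * Timing sketch (compare the track_io_timing code in bufmgr.c): callers
 * measure an I/O with instr_time and feed the microsecond delta to these
 * macros:
 *
 *		instr_time	io_start,
 *					io_time;
 *
 *		INSTR_TIME_SET_CURRENT(io_start);
 *		... perform the read ...
 *		INSTR_TIME_SET_CURRENT(io_time);
 *		INSTR_TIME_SUBTRACT(io_time, io_start);
 *		pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
 */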
extern void pgstat_count_heap_insert(Relation rel, int n);
extern void pgstat_count_heap_update(Relation rel, bool hot);
extern void pgstat_count_heap_delete(Relation rel);
extern void pgstat_update_heap_dead_tuples(Relation rel, int delta);
extern void pgstat_init_function_usage(FunctionCallInfoData *fcinfo,
PgStat_FunctionCallUsage *fcu);
extern void pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu,
bool finalize);
extern void AtEOXact_PgStat(bool isCommit);
extern void AtEOSubXact_PgStat(bool isCommit, int nestDepth);
extern void AtPrepare_PgStat(void);
extern void PostPrepare_PgStat(void);
extern void pgstat_twophase_postcommit(TransactionId xid, uint16 info,
void *recdata, uint32 len);
extern void pgstat_twophase_postabort(TransactionId xid, uint16 info,
void *recdata, uint32 len);
extern void pgstat_send_archiver(const char *xlog, bool failed);
extern void pgstat_send_bgwriter(void);
/* ----------
* Support functions for the SQL-callable functions to
* generate the pgstat* views.
* ----------
*/
extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid);
extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid);
extern PgBackendStatus *pgstat_fetch_stat_beentry(int beid);
extern LocalPgBackendStatus *pgstat_fetch_stat_local_beentry(int beid);
extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
extern int pgstat_fetch_stat_numbackends(void);
extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
extern PgStat_GlobalStats *pgstat_fetch_global(void);
#endif /* PGSTAT_H */