/*-------------------------------------------------------------------------
*
* pg_stat_statements.c
* Track statement execution times across a whole database cluster.
*
* Execution costs are totalled for each distinct source query, and kept in
* a shared hashtable. (We track only as many distinct queries as will fit
* in the designated amount of shared memory.)
*
* As of Postgres 9.2, this module normalizes query entries. Normalization
* is a process whereby similar queries, typically differing only in their
* constants (though the exact rules are somewhat more subtle than that),
* are recognized as equivalent and are tracked as a single entry. This is
* particularly useful for non-prepared queries.
*
* Normalization is implemented by fingerprinting queries, selectively
* serializing those fields of each query tree's nodes that are judged to be
* essential to the query. This is referred to as a query jumble. This is
* distinct from a regular serialization in that various extraneous
* information is ignored as irrelevant or not essential to the query, such
* as the collations of Vars and, most notably, the values of constants.
*
* This jumble is acquired at the end of parse analysis of each query, and
* a 64-bit hash of it is stored into the query's Query.queryId field.
* The server then copies this value around, making it available in plan
* tree(s) generated from the query. The executor can then use this value
* to blame query costs on the proper queryId.
*
* To facilitate presenting entries to users, we create "representative" query
* strings in which constants are replaced with parameter symbols ($n), to
* make it clearer what a normalized entry can represent. To save on shared
* memory, and to avoid having to truncate oversized query strings, we store
* these strings in a temporary external query-texts file. Offsets into this
* file are kept in shared memory.
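*
* For example (an illustrative sketch only; "tab" and "col" are
* hypothetical names), the queries
*
*      SELECT * FROM tab WHERE col = 1;
*      SELECT * FROM tab WHERE col = 2;
*
* jumble identically and accumulate into a single entry, whose
* representative text is
*
*      SELECT * FROM tab WHERE col = $1;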
*
* Note about locking issues: to create or delete an entry in the shared
* hashtable, one must hold pgss->lock exclusively. Modifying any field
* in an entry except the counters requires the same. To look up an entry,
* one must hold the lock shared. To read or update the counters within
* an entry, one must hold the lock shared or exclusive (so the entry doesn't
* disappear!) and also take the entry's mutex spinlock.
* The shared state variable pgss->extent (the next free spot in the external
* query-text file) should be accessed only while holding either the
* pgss->mutex spinlock, or exclusive lock on pgss->lock. We use the mutex to
* allow reserving file space while holding only shared lock on pgss->lock.
* Rewriting the entire external query-text file, eg for garbage collection,
* requires holding pgss->lock exclusively; this allows individual entries
* in the file to be read or written while holding only shared lock.
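*
* Illustratively, a reader of an entry's counters follows that protocol
* like this (a sketch only, using names defined later in this file):
*
*      LWLockAcquire(pgss->lock, LW_SHARED);
*      entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
*      if (entry)
*      {
*          /* cast through volatile, then take the per-entry spinlock */
*          volatile pgssEntry *e = (volatile pgssEntry *) entry;
*
*          SpinLockAcquire(&e->mutex);
*          calls = e->counters.calls;  /* copy out what we need */
*          SpinLockRelease(&e->mutex);
*      }
*      LWLockRelease(pgss->lock);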
*
*
* Copyright (c) 2008-2020, PostgreSQL Global Development Group
*
* IDENTIFICATION
* contrib/pg_stat_statements/pg_stat_statements.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <math.h>
#include <sys/stat.h>
#include <unistd.h>
#include "catalog/pg_authid.h"
#include "common/hashfn.h"
#include "executor/instrument.h"
#include "funcapi.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "parser/analyze.h"
#include "parser/parsetree.h"
#include "parser/scanner.h"
#include "parser/scansup.h"
#include "pgstat.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/spin.h"
#include "tcop/utility.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
PG_MODULE_MAGIC;
/* Location of permanent stats file (valid when database is shut down) */
#define PGSS_DUMP_FILE PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_stat_statements.stat"
/*
* Location of external query text file. We don't keep it in the core
* system's stats_temp_directory. The core system can safely use that GUC
* setting, because the statistics collector temp file paths are set only once
* as part of changing the GUC, but pg_stat_statements has no way of avoiding
* race conditions. Besides, we only expect modest, infrequent I/O for query
* strings, so placing the file on a faster filesystem is not compelling.
*/
#define PGSS_TEXT_FILE PG_STAT_TMP_DIR "/pgss_query_texts.stat"
/* Magic number identifying the stats file format */
static const uint32 PGSS_FILE_HEADER = 0x20171004;
/* PostgreSQL major version number, changes in which invalidate all entries */
static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
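/*
* Illustrative sketch of the dump-file layout written by
* pgss_shmem_shutdown() and read back by pgss_shmem_startup() below:
*
*      uint32      PGSS_FILE_HEADER (magic number above)
*      uint32      PGSS_PG_MAJOR_VERSION
*      int32       number of entries
*
* then, per entry: the pgssEntry struct, followed immediately by its
* null-terminated query text.
*/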
/* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
#define USAGE_EXEC(duration) (1.0)
#define USAGE_INIT (1.0) /* including initial planning */
#define ASSUMED_MEDIAN_INIT (10.0) /* initial assumed median usage */
#define ASSUMED_LENGTH_INIT 1024 /* initial assumed mean query length */
#define USAGE_DECREASE_FACTOR (0.99) /* decreased every entry_dealloc */
#define STICKY_DECREASE_FACTOR (0.50) /* factor for sticky entries */
#define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */
#define JUMBLE_SIZE 1024 /* query serialization buffer size */
/*
* Extension version number, for supporting older extension versions' objects
*/
typedef enum pgssVersion
{
PGSS_V1_0 = 0,
PGSS_V1_1,
PGSS_V1_2,
PGSS_V1_3
} pgssVersion;
/*
* Hashtable key that defines the identity of a hashtable entry. We separate
* queries by user and by database even if they are otherwise identical.
*
* Right now, this structure contains no padding. If you add any, make sure
* to teach pgss_store() to zero the padding bytes. Otherwise, things will
* break, because pgss_hash is created using HASH_BLOBS, and thus tag_hash
* is used to hash this.
*/
typedef struct pgssHashKey
{
Oid userid; /* user OID */
Oid dbid; /* database OID */
uint64 queryid; /* query identifier */
} pgssHashKey;
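/*
* Illustrative sketch (hypothetical case): if padding were ever added to
* pgssHashKey, pgss_store() would need to build keys from zeroed storage
* so that tag_hash sees deterministic bytes, along these lines:
*
*      pgssHashKey key;
*
*      memset(&key, 0, sizeof(pgssHashKey));
*      key.userid = GetUserId();
*      key.dbid = MyDatabaseId;
*      key.queryid = queryId;
*/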
/*
* The actual stats counters kept within pgssEntry.
*/
typedef struct Counters
{
int64 calls; /* # of times executed */
double total_time; /* total execution time, in msec */
double min_time; /* minimum execution time in msec */
double max_time; /* maximum execution time in msec */
double mean_time; /* mean execution time in msec */
double sum_var_time; /* sum of squared deviations from the mean execution time (Welford's method), in msec */
int64 rows; /* total # of retrieved or affected rows */
int64 shared_blks_hit; /* # of shared buffer hits */
int64 shared_blks_read; /* # of shared disk blocks read */
int64 shared_blks_dirtied; /* # of shared disk blocks dirtied */
int64 shared_blks_written; /* # of shared disk blocks written */
int64 local_blks_hit; /* # of local buffer hits */
int64 local_blks_read; /* # of local disk blocks read */
int64 local_blks_dirtied; /* # of local disk blocks dirtied */
int64 local_blks_written; /* # of local disk blocks written */
int64 temp_blks_read; /* # of temp blocks read */
int64 temp_blks_written; /* # of temp blocks written */
double blk_read_time; /* time spent reading, in msec */
double blk_write_time; /* time spent writing, in msec */
double usage; /* usage factor */
} Counters;
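/*
* Note (illustrative sketch): mean_time and sum_var_time maintain Welford's
* online algorithm, so the standard deviation reported to users can be
* recovered roughly as:
*
*      if (calls > 1)
*          stddev_time = sqrt(sum_var_time / calls);
*/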
/*
* Statistics per statement
*
* Note: in event of a failure in garbage collection of the query text file,
* we reset query_offset to zero and query_len to -1. This will be seen as
* an invalid state by qtext_fetch().
*/
typedef struct pgssEntry
{
pgssHashKey key; /* hash key of entry - MUST BE FIRST */
Counters counters; /* the statistics for this query */
Size query_offset; /* query text offset in external file */
int query_len; /* # of valid bytes in query string, or -1 */
int encoding; /* query text encoding */
slock_t mutex; /* protects the counters only */
} pgssEntry;
/*
* Global shared state
*/
typedef struct pgssSharedState
{
LWLock *lock; /* protects hashtable search/modification */
double cur_median_usage; /* current median usage in hashtable */
Size mean_query_len; /* current mean entry text length */
slock_t mutex; /* protects following fields only: */
Size extent; /* current extent of query file */
int n_writers; /* number of active writers to query file */
int gc_count; /* query file garbage collection cycle count */
} pgssSharedState;
/*
* Struct for tracking locations/lengths of constants during normalization
*/
typedef struct pgssLocationLen
{
int location; /* start offset in query text */
int length; /* length in bytes, or -1 to ignore */
} pgssLocationLen;
/*
* Working state for computing a query jumble and producing a normalized
* query string
*/
typedef struct pgssJumbleState
{
/* Jumble of current query tree */
unsigned char *jumble;
/* Number of bytes used in jumble[] */
Size jumble_len;
/* Array of locations of constants that should be removed */
pgssLocationLen *clocations;
/* Allocated length of clocations array */
int clocations_buf_size;
/* Current number of valid entries in clocations array */
int clocations_count;
/* highest Param id we've seen, in order to start normalization correctly */
int highest_extern_param_id;
} pgssJumbleState;
/*---- Local variables ----*/
/* Current nesting depth of ExecutorRun+ProcessUtility calls */
static int nested_level = 0;
/* Saved hook values in case of unload */
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL;
static ExecutorStart_hook_type prev_ExecutorStart = NULL;
static ExecutorRun_hook_type prev_ExecutorRun = NULL;
static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
static ProcessUtility_hook_type prev_ProcessUtility = NULL;
/* Links to shared memory state */
static pgssSharedState *pgss = NULL;
static HTAB *pgss_hash = NULL;
/*---- GUC variables ----*/
typedef enum
{
PGSS_TRACK_NONE, /* track no statements */
PGSS_TRACK_TOP, /* only top level statements */
PGSS_TRACK_ALL /* all statements, including nested ones */
} PGSSTrackLevel;
static const struct config_enum_entry track_options[] =
{
{"none", PGSS_TRACK_NONE, false},
{"top", PGSS_TRACK_TOP, false},
{"all", PGSS_TRACK_ALL, false},
{NULL, 0, false}
};
static int pgss_max; /* max # statements to track */
static int pgss_track; /* tracking level */
static bool pgss_track_utility; /* whether to track utility commands */
static bool pgss_save; /* whether to save stats across shutdown */
#define pgss_enabled() \
(pgss_track == PGSS_TRACK_ALL || \
(pgss_track == PGSS_TRACK_TOP && nested_level == 0))
#define record_gc_qtexts() \
do { \
volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; \
SpinLockAcquire(&s->mutex); \
s->gc_count++; \
SpinLockRelease(&s->mutex); \
} while(0)
/*---- Function declarations ----*/
void _PG_init(void);
void _PG_fini(void);
PG_FUNCTION_INFO_V1(pg_stat_statements_reset);
PG_FUNCTION_INFO_V1(pg_stat_statements_reset_1_7);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_2);
PG_FUNCTION_INFO_V1(pg_stat_statements_1_3);
PG_FUNCTION_INFO_V1(pg_stat_statements);
static void pgss_shmem_startup(void);
static void pgss_shmem_shutdown(int code, Datum arg);
static void pgss_post_parse_analyze(ParseState *pstate, Query *query);
static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void pgss_ExecutorRun(QueryDesc *queryDesc,
ScanDirection direction,
uint64 count, bool execute_once);
static void pgss_ExecutorFinish(QueryDesc *queryDesc);
static void pgss_ExecutorEnd(QueryDesc *queryDesc);
static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
ProcessUtilityContext context, ParamListInfo params,
QueryEnvironment *queryEnv,
DestReceiver *dest, char *completionTag);
static uint64 pgss_hash_string(const char *str, int len);
static void pgss_store(const char *query, uint64 queryId,
int query_location, int query_len,
double total_time, uint64 rows,
const BufferUsage *bufusage,
pgssJumbleState *jstate);
static void pg_stat_statements_internal(FunctionCallInfo fcinfo,
pgssVersion api_version,
bool showtext);
static Size pgss_memsize(void);
static pgssEntry *entry_alloc(pgssHashKey *key, Size query_offset, int query_len,
int encoding, bool sticky);
static void entry_dealloc(void);
static bool qtext_store(const char *query, int query_len,
Size *query_offset, int *gc_count);
static char *qtext_load_file(Size *buffer_size);
static char *qtext_fetch(Size query_offset, int query_len,
char *buffer, Size buffer_size);
static bool need_gc_qtexts(void);
static void gc_qtexts(void);
static void entry_reset(Oid userid, Oid dbid, uint64 queryid);
static void AppendJumble(pgssJumbleState *jstate,
const unsigned char *item, Size size);
static void JumbleQuery(pgssJumbleState *jstate, Query *query);
static void JumbleRangeTable(pgssJumbleState *jstate, List *rtable);
static void JumbleRowMarks(pgssJumbleState *jstate, List *rowMarks);
static void JumbleExpr(pgssJumbleState *jstate, Node *node);
static void RecordConstLocation(pgssJumbleState *jstate, int location);
static char *generate_normalized_query(pgssJumbleState *jstate, const char *query,
int query_loc, int *query_len_p, int encoding);
static void fill_in_constant_lengths(pgssJumbleState *jstate, const char *query,
int query_loc);
static int comp_location(const void *a, const void *b);
/*
* Module load callback
*/
void
_PG_init(void)
{
/*
* In order to create our shared memory area, we have to be loaded via
* shared_preload_libraries. If not, fall out without hooking into any of
* the main system. (We don't throw error here because it seems useful to
* allow the pg_stat_statements functions to be created even when the
* module isn't active. The functions must protect themselves against
* being called then, however.)
*/
if (!process_shared_preload_libraries_in_progress)
return;
/*
* Define (or redefine) custom GUC variables.
*/
DefineCustomIntVariable("pg_stat_statements.max",
"Sets the maximum number of statements tracked by pg_stat_statements.",
NULL,
&pgss_max,
5000,
100,
INT_MAX,
PGC_POSTMASTER,
0,
NULL,
NULL,
NULL);
DefineCustomEnumVariable("pg_stat_statements.track",
"Selects which statements are tracked by pg_stat_statements.",
NULL,
&pgss_track,
PGSS_TRACK_TOP,
track_options,
PGC_SUSET,
0,
NULL,
NULL,
NULL);
DefineCustomBoolVariable("pg_stat_statements.track_utility",
"Selects whether utility commands are tracked by pg_stat_statements.",
NULL,
&pgss_track_utility,
true,
PGC_SUSET,
0,
NULL,
NULL,
NULL);
DefineCustomBoolVariable("pg_stat_statements.save",
"Save pg_stat_statements statistics across server shutdowns.",
NULL,
&pgss_save,
true,
PGC_SIGHUP,
0,
NULL,
NULL,
NULL);
EmitWarningsOnPlaceholders("pg_stat_statements");
/*
* Request additional shared resources. (These are no-ops if we're not in
* the postmaster process.) We'll allocate or attach to the shared
* resources in pgss_shmem_startup().
*/
RequestAddinShmemSpace(pgss_memsize());
RequestNamedLWLockTranche("pg_stat_statements", 1);
/*
* Install hooks.
*/
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = pgss_shmem_startup;
prev_post_parse_analyze_hook = post_parse_analyze_hook;
post_parse_analyze_hook = pgss_post_parse_analyze;
prev_ExecutorStart = ExecutorStart_hook;
ExecutorStart_hook = pgss_ExecutorStart;
prev_ExecutorRun = ExecutorRun_hook;
ExecutorRun_hook = pgss_ExecutorRun;
prev_ExecutorFinish = ExecutorFinish_hook;
ExecutorFinish_hook = pgss_ExecutorFinish;
prev_ExecutorEnd = ExecutorEnd_hook;
ExecutorEnd_hook = pgss_ExecutorEnd;
prev_ProcessUtility = ProcessUtility_hook;
ProcessUtility_hook = pgss_ProcessUtility;
}
/*
* Module unload callback
*/
void
_PG_fini(void)
{
/* Uninstall hooks. */
shmem_startup_hook = prev_shmem_startup_hook;
post_parse_analyze_hook = prev_post_parse_analyze_hook;
ExecutorStart_hook = prev_ExecutorStart;
ExecutorRun_hook = prev_ExecutorRun;
ExecutorFinish_hook = prev_ExecutorFinish;
ExecutorEnd_hook = prev_ExecutorEnd;
ProcessUtility_hook = prev_ProcessUtility;
}
/*
* shmem_startup hook: allocate or attach to shared memory,
* then load any pre-existing statistics from file.
* Also create and load the query-texts file, which is expected to exist
* (even if empty) while the module is enabled.
*/
static void
pgss_shmem_startup(void)
{
bool found;
HASHCTL info;
FILE *file = NULL;
FILE *qfile = NULL;
uint32 header;
int32 num;
int32 pgver;
int32 i;
int buffer_size;
char *buffer = NULL;
if (prev_shmem_startup_hook)
prev_shmem_startup_hook();
/* reset in case this is a restart within the postmaster */
pgss = NULL;
pgss_hash = NULL;
/*
* Create or attach to the shared memory state, including hash table
*/
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
pgss = ShmemInitStruct("pg_stat_statements",
sizeof(pgssSharedState),
&found);
if (!found)
{
/* First time through ... */
pgss->lock = &(GetNamedLWLockTranche("pg_stat_statements"))->lock;
pgss->cur_median_usage = ASSUMED_MEDIAN_INIT;
pgss->mean_query_len = ASSUMED_LENGTH_INIT;
SpinLockInit(&pgss->mutex);
pgss->extent = 0;
pgss->n_writers = 0;
pgss->gc_count = 0;
}
memset(&info, 0, sizeof(info));
info.keysize = sizeof(pgssHashKey);
info.entrysize = sizeof(pgssEntry);
pgss_hash = ShmemInitHash("pg_stat_statements hash",
pgss_max, pgss_max,
&info,
HASH_ELEM | HASH_BLOBS);
LWLockRelease(AddinShmemInitLock);
/*
* If we're in the postmaster (or a standalone backend...), set up a shmem
* exit hook to dump the statistics to disk.
*/
if (!IsUnderPostmaster)
on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);
/*
* Done if some other process already completed our initialization.
*/
if (found)
return;
/*
* Note: we don't bother with locks here, because there should be no other
* processes running when this code is reached.
*/
/* Unlink query text file possibly left over from crash */
unlink(PGSS_TEXT_FILE);
/* Allocate new query text temp file */
qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
if (qfile == NULL)
goto write_error;
/*
* If we were told not to load old statistics, we're done. (Note we do
* not try to unlink any old dump file in this case. This seems a bit
* questionable but it's the historical behavior.)
*/
if (!pgss_save)
{
FreeFile(qfile);
return;
}
/*
* Attempt to load old statistics from the dump file.
*/
file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
if (file == NULL)
{
if (errno != ENOENT)
goto read_error;
/* No existing persisted stats file, so we're done */
FreeFile(qfile);
return;
}
buffer_size = 2048;
buffer = (char *) palloc(buffer_size);
if (fread(&header, sizeof(uint32), 1, file) != 1 ||
fread(&pgver, sizeof(uint32), 1, file) != 1 ||
fread(&num, sizeof(int32), 1, file) != 1)
goto read_error;
if (header != PGSS_FILE_HEADER ||
pgver != PGSS_PG_MAJOR_VERSION)
goto data_error;
for (i = 0; i < num; i++)
{
pgssEntry temp;
pgssEntry *entry;
Size query_offset;
if (fread(&temp, sizeof(pgssEntry), 1, file) != 1)
goto read_error;
/* Encoding is the only field we can easily sanity-check */
if (!PG_VALID_BE_ENCODING(temp.encoding))
goto data_error;
/* Resize buffer as needed */
if (temp.query_len >= buffer_size)
{
buffer_size = Max(buffer_size * 2, temp.query_len + 1);
buffer = repalloc(buffer, buffer_size);
}
if (fread(buffer, 1, temp.query_len + 1, file) != temp.query_len + 1)
goto read_error;
/* Should have a trailing null, but let's make sure */
buffer[temp.query_len] = '\0';
/* Skip loading "sticky" entries */
if (temp.counters.calls == 0)
continue;
/* Store the query text */
query_offset = pgss->extent;
if (fwrite(buffer, 1, temp.query_len + 1, qfile) != temp.query_len + 1)
goto write_error;
pgss->extent += temp.query_len + 1;
/* make the hashtable entry (discards old entries if too many) */
entry = entry_alloc(&temp.key, query_offset, temp.query_len,
temp.encoding,
false);
/* copy in the actual stats */
entry->counters = temp.counters;
}
pfree(buffer);
FreeFile(file);
FreeFile(qfile);
/*
* Remove the persisted stats file so it's not included in
* backups/replication standbys, etc. A new file will be written on next
* shutdown.
*
* Note: it's okay if the PGSS_TEXT_FILE is included in a basebackup,
* because we remove that file on startup; it acts inversely to
* PGSS_DUMP_FILE, in that it is only supposed to be around when the
* server is running, whereas PGSS_DUMP_FILE is only supposed to be around
* when the server is not running. Leaving the file creates no danger of
* a newly restored database having a spurious record of execution costs,
* which is what we're really concerned about here.
*/
unlink(PGSS_DUMP_FILE);
return;
read_error:
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
PGSS_DUMP_FILE)));
goto fail;
data_error:
ereport(LOG,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("ignoring invalid data in file \"%s\"",
PGSS_DUMP_FILE)));
goto fail;
write_error:
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
PGSS_TEXT_FILE)));
fail:
if (buffer)
pfree(buffer);
if (file)
FreeFile(file);
if (qfile)
FreeFile(qfile);
/* If possible, throw away the bogus file; ignore any error */
unlink(PGSS_DUMP_FILE);
/*
* Don't unlink PGSS_TEXT_FILE here; it should always be around while the
* server is running with pg_stat_statements enabled
*/
}
/*
* shmem_shutdown hook: Dump statistics into file.
*
* Note: we don't bother with acquiring lock, because there should be no
* other processes running when this is called.
*/
static void
pgss_shmem_shutdown(int code, Datum arg)
{
FILE *file;
char *qbuffer = NULL;
Size qbuffer_size = 0;
HASH_SEQ_STATUS hash_seq;
int32 num_entries;
pgssEntry *entry;
/* Don't try to dump during a crash. */
if (code)
return;
/* Safety check ... shouldn't get here unless shmem is set up. */
if (!pgss || !pgss_hash)
return;
/* Don't dump if told not to. */
if (!pgss_save)
return;
file = AllocateFile(PGSS_DUMP_FILE ".tmp", PG_BINARY_W);
if (file == NULL)
goto error;
if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)
goto error;
if (fwrite(&PGSS_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
goto error;
num_entries = hash_get_num_entries(pgss_hash);
if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
goto error;
qbuffer = qtext_load_file(&qbuffer_size);
if (qbuffer == NULL)
goto error;
/*
* When serializing to disk, we store query texts immediately after their
* entry data. Any orphaned query texts are thereby excluded.
*/
hash_seq_init(&hash_seq, pgss_hash);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
{
int len = entry->query_len;
char *qstr = qtext_fetch(entry->query_offset, len,
qbuffer, qbuffer_size);
if (qstr == NULL)
continue; /* Ignore any entries with bogus texts */
if (fwrite(entry, sizeof(pgssEntry), 1, file) != 1 ||
fwrite(qstr, 1, len + 1, file) != len + 1)
{
/* note: we assume hash_seq_term won't change errno */
hash_seq_term(&hash_seq);
goto error;
}
}
free(qbuffer);
qbuffer = NULL;
if (FreeFile(file))
{
file = NULL;
goto error;
}
/*
* Rename file into place, so we atomically replace any old one.
*/
(void) durable_rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE, LOG);
/* Unlink query-texts file; it's not needed while the server is down */
unlink(PGSS_TEXT_FILE);
return;
error:
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
PGSS_DUMP_FILE ".tmp")));
if (qbuffer)
free(qbuffer);
if (file)
FreeFile(file);
unlink(PGSS_DUMP_FILE ".tmp");
unlink(PGSS_TEXT_FILE);
}
/*
* Post-parse-analysis hook: mark query with a queryId
*/
static void
pgss_post_parse_analyze(ParseState *pstate, Query *query)
{
pgssJumbleState jstate;
if (prev_post_parse_analyze_hook)
prev_post_parse_analyze_hook(pstate, query);
/* Assert we didn't do this already */
Assert(query->queryId == UINT64CONST(0));
/* Safety check... */
if (!pgss || !pgss_hash || !pgss_enabled())
return;
/*
* Utility statements get queryId zero. We do this even in cases where
* the statement contains an optimizable statement for which a queryId
* could be derived (such as EXPLAIN or DECLARE CURSOR). For such cases,
* runtime control will first go through ProcessUtility and then the
* executor, and we don't want the executor hooks to do anything, since we
* are already measuring the statement's costs at the utility level.
*/
if (query->utilityStmt)
{
query->queryId = UINT64CONST(0);
return;
}
/* Set up workspace for query jumbling */
jstate.jumble = (unsigned char *) palloc(JUMBLE_SIZE);
jstate.jumble_len = 0;
jstate.clocations_buf_size = 32;
jstate.clocations = (pgssLocationLen *)
palloc(jstate.clocations_buf_size * sizeof(pgssLocationLen));
jstate.clocations_count = 0;
jstate.highest_extern_param_id = 0;
/* Compute query ID and mark the Query node with it */
JumbleQuery(&jstate, query);
query->queryId =
DatumGetUInt64(hash_any_extended(jstate.jumble, jstate.jumble_len, 0));
/*
* If we are unlucky enough to get a hash of zero, use 1 instead, to
* prevent confusion with the utility-statement case.
*/
if (query->queryId == UINT64CONST(0))
query->queryId = UINT64CONST(1);
/*
* If we were able to identify any ignorable constants, we immediately
* create a hash table entry for the query, so that we can record the
* normalized form of the query string. If there were no such constants,
* the normalized string would be the same as the query text anyway, so
* there's no need for an early entry.
*/
if (jstate.clocations_count > 0)
pgss_store(pstate->p_sourcetext,
query->queryId,
query->stmt_location,
query->stmt_len,
0,
0,
NULL,
&jstate);
}
/*
* ExecutorStart hook: start up tracking if needed
*/
static void
pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
{
if (prev_ExecutorStart)
prev_ExecutorStart(queryDesc, eflags);
else
standard_ExecutorStart(queryDesc, eflags);
/*
* If query has queryId zero, don't track it. This prevents double
* counting of optimizable statements that are directly contained in
* utility statements.
*/
if (pgss_enabled() && queryDesc->plannedstmt->queryId != UINT64CONST(0))
{
/*
* Set up to track total elapsed time in ExecutorRun. Make sure the
* space is allocated in the per-query context so it will go away at
* ExecutorEnd.
*/
if (queryDesc->totaltime == NULL)
{
MemoryContext oldcxt;
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL);
MemoryContextSwitchTo(oldcxt);
}
}
}
/*
* ExecutorRun hook: all we need do is track nesting depth
*/
static void
pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
bool execute_once)
{
nested_level++;
PG_TRY();
{
if (prev_ExecutorRun)
prev_ExecutorRun(queryDesc, direction, count, execute_once);
else
standard_ExecutorRun(queryDesc, direction, count, execute_once);
}
PG_FINALLY();
{
nested_level--;
}
PG_END_TRY();
}
/*
* ExecutorFinish hook: all we need do is track nesting depth
*/
static void
pgss_ExecutorFinish(QueryDesc *queryDesc)
{
nested_level++;
PG_TRY();
{
if (prev_ExecutorFinish)
prev_ExecutorFinish(queryDesc);
else
standard_ExecutorFinish(queryDesc);
}
PG_FINALLY();
{
nested_level--;
}
PG_END_TRY();
}
/*
* ExecutorEnd hook: store results if needed
*/
static void
pgss_ExecutorEnd(QueryDesc *queryDesc)
{
uint64 queryId = queryDesc->plannedstmt->queryId;
if (queryId != UINT64CONST(0) && queryDesc->totaltime && pgss_enabled())
{
/*
* Make sure stats accumulation is done. (Note: it's okay if several
* levels of hook all do this.)
*/
InstrEndLoop(queryDesc->totaltime);
pgss_store(queryDesc->sourceText,
queryId,
queryDesc->plannedstmt->stmt_location,
queryDesc->plannedstmt->stmt_len,
queryDesc->totaltime->total * 1000.0, /* convert to msec */
queryDesc->estate->es_processed,
&queryDesc->totaltime->bufusage,
NULL);
}
if (prev_ExecutorEnd)
prev_ExecutorEnd(queryDesc);
else
standard_ExecutorEnd(queryDesc);
}
/*
* ProcessUtility hook
*/
static void
pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
ProcessUtilityContext context,
ParamListInfo params, QueryEnvironment *queryEnv,
DestReceiver *dest, char *completionTag)
{
Node *parsetree = pstmt->utilityStmt;
/*
* If it's an EXECUTE statement, we don't track it and don't increment the
* nesting level. This allows the cycles to be charged to the underlying
* PREPARE instead (by the Executor hooks), which is much more useful.
*
* We also don't track execution of PREPARE. If we did, we would get one
* hash table entry for the PREPARE (with hash calculated from the query
* string), and then a different one with the same query string (but hash
* calculated from the query tree) would be used to accumulate costs of
* ensuing EXECUTEs. This would be confusing, and inconsistent with other
* cases where planning time is not included at all.
*
* Likewise, we don't track execution of DEALLOCATE.
*/
if (pgss_track_utility && pgss_enabled() &&
!IsA(parsetree, ExecuteStmt) &&
!IsA(parsetree, PrepareStmt) &&
!IsA(parsetree, DeallocateStmt))
{
instr_time start;
instr_time duration;
uint64 rows;
BufferUsage bufusage_start,
bufusage;
bufusage_start = pgBufferUsage;
INSTR_TIME_SET_CURRENT(start);
nested_level++;
PG_TRY();
{
if (prev_ProcessUtility)
prev_ProcessUtility(pstmt, queryString,
context, params, queryEnv,
dest, completionTag);
else
standard_ProcessUtility(pstmt, queryString,
context, params, queryEnv,
dest, completionTag);
}
PG_FINALLY();
{
nested_level--;
}
PG_END_TRY();
INSTR_TIME_SET_CURRENT(duration);
INSTR_TIME_SUBTRACT(duration, start);
/* Parse the command tag to retrieve the number of affected rows. */
if (completionTag &&
strncmp(completionTag, "COPY ", 5) == 0)
rows = pg_strtouint64(completionTag + 5, NULL, 10);
else
rows = 0;
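/*
 * Worked example (a sketch, not an exhaustive list of tags): a tag of
 * "COPY 42" yields rows = 42 via the pg_strtouint64() call above, while
 * tags without a trailing count, such as "CREATE TABLE", leave rows at 0.
 */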
/* Calculate the differences in buffer counters. */
bufusage.shared_blks_hit =
pgBufferUsage.shared_blks_hit - bufusage_start.shared_blks_hit;
bufusage.shared_blks_read =
pgBufferUsage.shared_blks_read - bufusage_start.shared_blks_read;
bufusage.shared_blks_dirtied =
pgBufferUsage.shared_blks_dirtied - bufusage_start.shared_blks_dirtied;
bufusage.shared_blks_written =
pgBufferUsage.shared_blks_written - bufusage_start.shared_blks_written;
bufusage.local_blks_hit =
pgBufferUsage.local_blks_hit - bufusage_start.local_blks_hit;
bufusage.local_blks_read =
pgBufferUsage.local_blks_read - bufusage_start.local_blks_read;
bufusage.local_blks_dirtied =
pgBufferUsage.local_blks_dirtied - bufusage_start.local_blks_dirtied;
bufusage.local_blks_written =
pgBufferUsage.local_blks_written - bufusage_start.local_blks_written;
bufusage.temp_blks_read =
pgBufferUsage.temp_blks_read - bufusage_start.temp_blks_read;
bufusage.temp_blks_written =
pgBufferUsage.temp_blks_written - bufusage_start.temp_blks_written;
bufusage.blk_read_time = pgBufferUsage.blk_read_time;
INSTR_TIME_SUBTRACT(bufusage.blk_read_time, bufusage_start.blk_read_time);
bufusage.blk_write_time = pgBufferUsage.blk_write_time;
INSTR_TIME_SUBTRACT(bufusage.blk_write_time, bufusage_start.blk_write_time);
pgss_store(queryString,
0, /* signal that it's a utility stmt */
pstmt->stmt_location,
pstmt->stmt_len,
INSTR_TIME_GET_MILLISEC(duration),
rows,
&bufusage,
NULL);
}
else
{
if (prev_ProcessUtility)
prev_ProcessUtility(pstmt, queryString,
context, params, queryEnv,
dest, completionTag);
else
standard_ProcessUtility(pstmt, queryString,
context, params, queryEnv,
dest, completionTag);
}
}
/*
* Given an arbitrarily long query string, produce a hash for the purposes of
* identifying the query, without normalizing constants. Used when hashing
* utility statements.
*/
static uint64
pgss_hash_string(const char *str, int len)
{
return DatumGetUInt64(hash_any_extended((const unsigned char *) str,
len, 0));
}
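/*
 * For instance (a sketch): a utility statement such as "VACUUM pg_class"
 * gets queryId = pgss_hash_string("VACUUM pg_class", 15). Because the raw
 * text is hashed, textually identical utility statements share one entry,
 * while any textual difference (including constants) produces a new one.
 */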
/*
* Store some statistics for a statement.
*
* If queryId is 0 then this is a utility statement and we should compute
* a suitable queryId internally.
*
* If jstate is not NULL then we're trying to create an entry for which
* we have no statistics as yet; we just want to record the normalized
* query string. total_time, rows, bufusage are ignored in this case.
*/
static void
pgss_store(const char *query, uint64 queryId,
int query_location, int query_len,
double total_time, uint64 rows,
const BufferUsage *bufusage,
pgssJumbleState *jstate)
{
pgssHashKey key;
pgssEntry *entry;
char *norm_query = NULL;
int encoding = GetDatabaseEncoding();
Assert(query != NULL);
/* Safety check... */
if (!pgss || !pgss_hash)
return;
/*
* Confine our attention to the relevant part of the string, if the query
* is a portion of a multi-statement source string.
*
* First apply starting offset, unless it's -1 (unknown).
*/
if (query_location >= 0)
{
Assert(query_location <= strlen(query));
query += query_location;
/* Length of 0 (or -1) means "rest of string" */
if (query_len <= 0)
query_len = strlen(query);
else
Assert(query_len <= strlen(query));
}
else
{
/* If query location is unknown, distrust query_len as well */
query_location = 0;
query_len = strlen(query);
}
/*
* Discard leading and trailing whitespace, too. Use scanner_isspace()
* not libc's isspace(), because we want to match the lexer's behavior.
*/
while (query_len > 0 && scanner_isspace(query[0]))
query++, query_location++, query_len--;
while (query_len > 0 && scanner_isspace(query[query_len - 1]))
query_len--;
/*
* For utility statements, we just hash the query string to get an ID.
*/
if (queryId == UINT64CONST(0))
{
queryId = pgss_hash_string(query, query_len);
/*
* If we are unlucky enough to get a hash of zero (which is treated as
* invalid), use 2 as the queryId instead; queryId 1 is already reserved
* for normal statements.
*/
if (queryId == UINT64CONST(0))
queryId = UINT64CONST(2);
}
/* Set up key for hashtable search */
key.userid = GetUserId();
key.dbid = MyDatabaseId;
key.queryid = queryId;
/* Lookup the hash table entry with shared lock. */
LWLockAcquire(pgss->lock, LW_SHARED);
entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
/* Create new entry, if not present */
if (!entry)
{
Size query_offset;
int gc_count;
bool stored;
bool do_gc;
/*
* Create a new, normalized query string if caller asked. We don't
* need to hold the lock while doing this work. (Note: in any case,
* it's possible that someone else creates a duplicate hashtable entry
* in the interval where we don't hold the lock below. That case is
* handled by entry_alloc.)
*/
if (jstate)
{
LWLockRelease(pgss->lock);
norm_query = generate_normalized_query(jstate, query,
query_location,
&query_len,
encoding);
LWLockAcquire(pgss->lock, LW_SHARED);
}
/* Append new query text to file with only shared lock held */
stored = qtext_store(norm_query ? norm_query : query, query_len,
&query_offset, &gc_count);
/*
* Determine whether we need to garbage collect external query texts
* while the shared lock is still held. This micro-optimization
* avoids taking the time to decide this while holding exclusive lock.
*/
do_gc = need_gc_qtexts();
/* Need exclusive lock to make a new hashtable entry - promote */
LWLockRelease(pgss->lock);
LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
/*
* A garbage collection may have occurred while we weren't holding the
* lock. In the unlikely event that this happens, the query text we
* stored above will have been garbage collected, so write it again.
* This should be infrequent enough that doing it while holding
* exclusive lock isn't a performance problem.
*/
if (!stored || pgss->gc_count != gc_count)
stored = qtext_store(norm_query ? norm_query : query, query_len,
&query_offset, NULL);
/* If we failed to write to the text file, give up */
if (!stored)
goto done;
/* OK to create a new hashtable entry */
entry = entry_alloc(&key, query_offset, query_len, encoding,
jstate != NULL);
/* If needed, perform garbage collection while exclusive lock held */
if (do_gc)
gc_qtexts();
}
/* Increment the counts, except when jstate is not NULL */
if (!jstate)
{
/*
* Grab the spinlock while updating the counters (see comment about
* locking rules at the head of the file)
*/
volatile pgssEntry *e = (volatile pgssEntry *) entry;
SpinLockAcquire(&e->mutex);
/* "Unstick" entry if it was previously sticky */
if (e->counters.calls == 0)
e->counters.usage = USAGE_INIT;
e->counters.calls += 1;
e->counters.total_time += total_time;
if (e->counters.calls == 1)
{
e->counters.min_time = total_time;
e->counters.max_time = total_time;
e->counters.mean_time = total_time;
}
else
{
/*
* Welford's method for accurately computing variance. See
* <http://www.johndcook.com/blog/standard_deviation/>
*/
double old_mean = e->counters.mean_time;
e->counters.mean_time +=
(total_time - old_mean) / e->counters.calls;
e->counters.sum_var_time +=
(total_time - old_mean) * (total_time - e->counters.mean_time);
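/*
 * In recurrence form (a sketch; k is calls, x is this call's total_time):
 *   mean_k = mean_{k-1} + (x - mean_{k-1}) / k
 *   S_k    = S_{k-1} + (x - mean_{k-1}) * (x - mean_k)
 * sum_var_time accumulates S_k; the population variance S_k / k is only
 * derived later, when pg_stat_statements() computes stddev.
 */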
/* calculate min and max time */
if (e->counters.min_time > total_time)
e->counters.min_time = total_time;
if (e->counters.max_time < total_time)
e->counters.max_time = total_time;
}
e->counters.rows += rows;
e->counters.shared_blks_hit += bufusage->shared_blks_hit;
e->counters.shared_blks_read += bufusage->shared_blks_read;
e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
e->counters.shared_blks_written += bufusage->shared_blks_written;
e->counters.local_blks_hit += bufusage->local_blks_hit;
e->counters.local_blks_read += bufusage->local_blks_read;
e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
e->counters.local_blks_written += bufusage->local_blks_written;
e->counters.temp_blks_read += bufusage->temp_blks_read;
e->counters.temp_blks_written += bufusage->temp_blks_written;
e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
e->counters.usage += USAGE_EXEC(total_time);
SpinLockRelease(&e->mutex);
}
done:
LWLockRelease(pgss->lock);
/* We postpone this clean-up until we're out of the lock */
if (norm_query)
pfree(norm_query);
}
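/*
 * Lock-ordering recap for the entry-creation path above (a sketch of the
 * code, not an additional protocol): shared lock -> qtext_store() ->
 * release -> exclusive lock -> re-store the text if gc_count moved ->
 * entry_alloc() -> gc_qtexts() if needed.
 */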
/*
* Reset statement statistics corresponding to userid, dbid, and queryid.
*/
Datum
pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS)
{
Oid userid;
Oid dbid;
uint64 queryid;
userid = PG_GETARG_OID(0);
dbid = PG_GETARG_OID(1);
queryid = (uint64) PG_GETARG_INT64(2);
entry_reset(userid, dbid, queryid);
PG_RETURN_VOID();
}
/*
* Reset statement statistics.
*/
Datum
pg_stat_statements_reset(PG_FUNCTION_ARGS)
{
entry_reset(0, 0, 0);
PG_RETURN_VOID();
}
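/*
 * SQL-level usage sketch (assuming the extension's standard scripts, with
 * illustrative OIDs):
 *   SELECT pg_stat_statements_reset();             -- discard all entries
 *   SELECT pg_stat_statements_reset(10, 16384, 0); -- one user in one DB
 * A zero argument acts as a wildcard, as implemented by entry_reset().
 */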
/* Number of output arguments (columns) for various API versions */
#define PG_STAT_STATEMENTS_COLS_V1_0 14
#define PG_STAT_STATEMENTS_COLS_V1_1 18
#define PG_STAT_STATEMENTS_COLS_V1_2 19
#define PG_STAT_STATEMENTS_COLS_V1_3 23
#define PG_STAT_STATEMENTS_COLS 23 /* maximum of above */
/*
* Retrieve statement statistics.
*
* The SQL API of this function has changed multiple times, and will likely
* do so again in future. To support the case where a newer version of this
* loadable module is being used with an old SQL declaration of the function,
* we continue to support the older API versions. For 1.2 and later, the
* expected API version is identified by embedding it in the C name of the
* function. Unfortunately we weren't bright enough to do that for 1.1.
*/
Datum
pg_stat_statements_1_3(PG_FUNCTION_ARGS)
{
bool showtext = PG_GETARG_BOOL(0);
pg_stat_statements_internal(fcinfo, PGSS_V1_3, showtext);
return (Datum) 0;
}
Datum
pg_stat_statements_1_2(PG_FUNCTION_ARGS)
{
bool showtext = PG_GETARG_BOOL(0);
pg_stat_statements_internal(fcinfo, PGSS_V1_2, showtext);
return (Datum) 0;
}
/*
* Legacy entry point for pg_stat_statements() API versions 1.0 and 1.1.
* This can be removed someday, perhaps.
*/
Datum
pg_stat_statements(PG_FUNCTION_ARGS)
{
/* If it's really API 1.1, we'll figure that out below */
pg_stat_statements_internal(fcinfo, PGSS_V1_0, true);
return (Datum) 0;
}
/* Common code for all versions of pg_stat_statements() */
static void
pg_stat_statements_internal(FunctionCallInfo fcinfo,
pgssVersion api_version,
bool showtext)
{
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
TupleDesc tupdesc;
Tuplestorestate *tupstore;
MemoryContext per_query_ctx;
MemoryContext oldcontext;
Oid userid = GetUserId();
bool is_allowed_role = false;
char *qbuffer = NULL;
Size qbuffer_size = 0;
Size extent = 0;
int gc_count = 0;
HASH_SEQ_STATUS hash_seq;
pgssEntry *entry;
/* Superusers or members of pg_read_all_stats are allowed */
is_allowed_role = is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS);
/* hash table must exist already */
if (!pgss || !pgss_hash)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
/* check to see if caller supports us returning a tuplestore */
if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("set-valued function called in context that cannot accept a set")));
if (!(rsinfo->allowedModes & SFRM_Materialize))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("materialize mode required, but it is not allowed in this context")));
/* Switch into long-lived context to construct returned data structures */
per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
oldcontext = MemoryContextSwitchTo(per_query_ctx);
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
/*
* Check we have the expected number of output arguments. Aside from
* being a good safety check, we need a kluge here to detect API version
* 1.1, which was wedged into the code in an ill-considered way.
*/
switch (tupdesc->natts)
{
case PG_STAT_STATEMENTS_COLS_V1_0:
if (api_version != PGSS_V1_0)
elog(ERROR, "incorrect number of output arguments");
break;
case PG_STAT_STATEMENTS_COLS_V1_1:
/* pg_stat_statements() should have told us 1.0 */
if (api_version != PGSS_V1_0)
elog(ERROR, "incorrect number of output arguments");
api_version = PGSS_V1_1;
break;
case PG_STAT_STATEMENTS_COLS_V1_2:
if (api_version != PGSS_V1_2)
elog(ERROR, "incorrect number of output arguments");
break;
case PG_STAT_STATEMENTS_COLS_V1_3:
if (api_version != PGSS_V1_3)
elog(ERROR, "incorrect number of output arguments");
break;
default:
elog(ERROR, "incorrect number of output arguments");
}
tupstore = tuplestore_begin_heap(true, false, work_mem);
rsinfo->returnMode = SFRM_Materialize;
rsinfo->setResult = tupstore;
rsinfo->setDesc = tupdesc;
MemoryContextSwitchTo(oldcontext);
/*
* We'd like to load the query text file (if needed) while not holding any
* lock on pgss->lock. In the worst case we'll have to do this again
* after we have the lock, but it's unlikely enough to make this a win
* despite occasional duplicated work. We need to reload if anybody
* writes to the file (either a retail qtext_store(), or a garbage
* collection) between this point and where we've gotten shared lock. If
* a qtext_store is actually in progress when we look, we might as well
* skip the speculative load entirely.
*/
if (showtext)
{
int n_writers;
/* Take the mutex so we can examine variables */
{
volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
SpinLockAcquire(&s->mutex);
extent = s->extent;
n_writers = s->n_writers;
gc_count = s->gc_count;
SpinLockRelease(&s->mutex);
}
/* No point in loading file now if there are active writers */
if (n_writers == 0)
qbuffer = qtext_load_file(&qbuffer_size);
}
/*
* Get shared lock, load or reload the query text file if we must, and
* iterate over the hashtable entries.
*
* With a large hash table, we might be holding the lock rather longer
* than one could wish. However, this only blocks creation of new hash
* table entries, and the larger the hash table the less likely that is to
* be needed. So we can hope this is okay. Perhaps someday we'll decide
* we need to partition the hash table to limit the time spent holding any
* one lock.
*/
LWLockAcquire(pgss->lock, LW_SHARED);
if (showtext)
{
/*
* Here it is safe to examine extent and gc_count without taking the
* mutex. Note that although other processes might change
* pgss->extent just after we look at it, the strings they then write
* into the file cannot yet be referenced in the hashtable, so we
* don't care whether we see them or not.
*
* If qtext_load_file fails, we just press on; we'll return NULL for
* every query text.
*/
if (qbuffer == NULL ||
pgss->extent != extent ||
pgss->gc_count != gc_count)
{
if (qbuffer)
free(qbuffer);
qbuffer = qtext_load_file(&qbuffer_size);
}
}
hash_seq_init(&hash_seq, pgss_hash);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
{
Datum values[PG_STAT_STATEMENTS_COLS];
bool nulls[PG_STAT_STATEMENTS_COLS];
int i = 0;
Counters tmp;
double stddev;
int64 queryid = entry->key.queryid;
memset(values, 0, sizeof(values));
memset(nulls, 0, sizeof(nulls));
values[i++] = ObjectIdGetDatum(entry->key.userid);
values[i++] = ObjectIdGetDatum(entry->key.dbid);
if (is_allowed_role || entry->key.userid == userid)
{
if (api_version >= PGSS_V1_2)
values[i++] = Int64GetDatumFast(queryid);
if (showtext)
{
char *qstr = qtext_fetch(entry->query_offset,
entry->query_len,
qbuffer,
qbuffer_size);
if (qstr)
{
char *enc;
enc = pg_any_to_server(qstr,
entry->query_len,
entry->encoding);
values[i++] = CStringGetTextDatum(enc);
if (enc != qstr)
pfree(enc);
}
else
{
/* Just return a null if we fail to find the text */
nulls[i++] = true;
}
}
else
{
/* Query text not requested */
nulls[i++] = true;
}
}
else
{
/* Don't show queryid */
if (api_version >= PGSS_V1_2)
nulls[i++] = true;
/*
* Don't show query text, but hint as to the reason for not doing
* so if it was requested
*/
if (showtext)
values[i++] = CStringGetTextDatum("<insufficient privilege>");
else
nulls[i++] = true;
}
/* copy counters to a local variable to keep locking time short */
{
volatile pgssEntry *e = (volatile pgssEntry *) entry;
SpinLockAcquire(&e->mutex);
tmp = e->counters;
SpinLockRelease(&e->mutex);
}
/* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
if (tmp.calls == 0)
continue;
values[i++] = Int64GetDatumFast(tmp.calls);
values[i++] = Float8GetDatumFast(tmp.total_time);
if (api_version >= PGSS_V1_3)
{
values[i++] = Float8GetDatumFast(tmp.min_time);
values[i++] = Float8GetDatumFast(tmp.max_time);
values[i++] = Float8GetDatumFast(tmp.mean_time);
/*
* Note we are calculating the population variance here, not the
* sample variance, as we have data for the whole population, so
* Bessel's correction is not used, and we don't divide by
* tmp.calls - 1.
*/
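/*
 * Numeric check (a sketch): for three calls taking 10, 20 and 30 ms,
 * Welford's updates leave mean_time = 20 and sum_var_time = 200, so the
 * reported stddev is sqrt(200 / 3), about 8.16 ms.
 */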
if (tmp.calls > 1)
stddev = sqrt(tmp.sum_var_time / tmp.calls);
else
stddev = 0.0;
values[i++] = Float8GetDatumFast(stddev);
}
values[i++] = Int64GetDatumFast(tmp.rows);
values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
if (api_version >= PGSS_V1_1)
values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
values[i++] = Int64GetDatumFast(tmp.local_blks_read);
if (api_version >= PGSS_V1_1)
values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
values[i++] = Int64GetDatumFast(tmp.local_blks_written);
values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
if (api_version >= PGSS_V1_1)
{
values[i++] = Float8GetDatumFast(tmp.blk_read_time);
values[i++] = Float8GetDatumFast(tmp.blk_write_time);
}
Assert(i == (api_version == PGSS_V1_0 ? PG_STAT_STATEMENTS_COLS_V1_0 :
api_version == PGSS_V1_1 ? PG_STAT_STATEMENTS_COLS_V1_1 :
api_version == PGSS_V1_2 ? PG_STAT_STATEMENTS_COLS_V1_2 :
api_version == PGSS_V1_3 ? PG_STAT_STATEMENTS_COLS_V1_3 :
-1 /* fail if you forget to update this assert */ ));
tuplestore_putvalues(tupstore, tupdesc, values, nulls);
}
/* clean up and return the tuplestore */
LWLockRelease(pgss->lock);
if (qbuffer)
free(qbuffer);
tuplestore_donestoring(tupstore);
}
/*
* Estimate shared memory space needed.
*/
static Size
pgss_memsize(void)
{
Size size;
size = MAXALIGN(sizeof(pgssSharedState));
size = add_size(size, hash_estimate_size(pgss_max, sizeof(pgssEntry)));
return size;
}
/*
* Allocate a new hashtable entry.
* caller must hold an exclusive lock on pgss->lock
*
* "query" need not be null-terminated; we rely on query_len instead
*
* If "sticky" is true, make the new entry artificially sticky so that it will
* probably still be there when the query finishes execution. We do this by
* giving it a median usage value rather than the normal value. (Strictly
* speaking, query strings are normalized on a best effort basis, though it
* would be difficult to demonstrate this even under artificial conditions.)
*
* Note: despite needing exclusive lock, it's not an error for the target
* entry to already exist. This is because pgss_store releases and
* reacquires lock after failing to find a match; so someone else could
* have made the entry while we waited to get exclusive lock.
*/
static pgssEntry *
entry_alloc(pgssHashKey *key, Size query_offset, int query_len, int encoding,
bool sticky)
{
pgssEntry *entry;
bool found;
/* Make space if needed */
while (hash_get_num_entries(pgss_hash) >= pgss_max)
entry_dealloc();
/* Find or create an entry with desired hash code */
entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found);
if (!found)
{
/* New entry, initialize it */
/* reset the statistics */
memset(&entry->counters, 0, sizeof(Counters));
/* set the appropriate initial usage count */
entry->counters.usage = sticky ? pgss->cur_median_usage : USAGE_INIT;
/* re-initialize the mutex each time ... we assume no one is using it */
SpinLockInit(&entry->mutex);
/* ... and don't forget the query text metadata */
Assert(query_len >= 0);
entry->query_offset = query_offset;
entry->query_len = query_len;
entry->encoding = encoding;
}
return entry;
}
/*
* qsort comparator for sorting into increasing usage order
*/
static int
entry_cmp(const void *lhs, const void *rhs)
{
double l_usage = (*(pgssEntry *const *) lhs)->counters.usage;
double r_usage = (*(pgssEntry *const *) rhs)->counters.usage;
if (l_usage < r_usage)
return -1;
else if (l_usage > r_usage)
return +1;
else
return 0;
}
/*
* Deallocate least-used entries.
*
* Caller must hold an exclusive lock on pgss->lock.
*/
static void
entry_dealloc(void)
{
HASH_SEQ_STATUS hash_seq;
pgssEntry **entries;
pgssEntry *entry;
int nvictims;
int i;
Size tottextlen;
int nvalidtexts;
/*
* Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
* While we're scanning the table, apply the decay factor to the usage
* values, and update the mean query length.
*
* Note that the mean query length is almost immediately obsolete, since
* we compute it before, not after, discarding the least-used entries.
* Hopefully, that doesn't affect the mean too much; it doesn't seem worth
* making two passes to get a more current result. Likewise, the new
* cur_median_usage includes the entries we're about to zap.
*/
entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));
i = 0;
tottextlen = 0;
nvalidtexts = 0;
hash_seq_init(&hash_seq, pgss_hash);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
{
entries[i++] = entry;
/* "Sticky" entries get a different usage decay rate. */
if (entry->counters.calls == 0)
entry->counters.usage *= STICKY_DECREASE_FACTOR;
else
entry->counters.usage *= USAGE_DECREASE_FACTOR;
/* In the mean length computation, ignore dropped texts. */
if (entry->query_len >= 0)
{
tottextlen += entry->query_len + 1;
nvalidtexts++;
}
}
/* Sort into increasing order by usage */
qsort(entries, i, sizeof(pgssEntry *), entry_cmp);
/* Record the (approximate) median usage */
if (i > 0)
pgss->cur_median_usage = entries[i / 2]->counters.usage;
/* Record the mean query length */
if (nvalidtexts > 0)
pgss->mean_query_len = tottextlen / nvalidtexts;
else
pgss->mean_query_len = ASSUMED_LENGTH_INIT;
/* Now zap an appropriate fraction of lowest-usage entries */
nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
nvictims = Min(nvictims, i);
for (i = 0; i < nvictims; i++)
{
hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL);
}
pfree(entries);
}
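/*
 * Sizing sketch (assuming USAGE_DEALLOC_PERCENT is 5, its usual value):
 * with 1000 entries, nvictims = Max(10, 1000 * 5 / 100) = 50; the Max()
 * floor of 10 keeps tiny tables from re-entering this routine on every
 * new entry, and the Min() cap never removes more entries than exist.
 */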
/*
* Given a query string (not necessarily null-terminated), allocate a new
* entry in the external query text file and store the string there.
*
* If successful, returns true, and stores the new entry's offset in the file
* into *query_offset. Also, if gc_count isn't NULL, *gc_count is set to the
* number of garbage collections that have occurred so far.
*
* On failure, returns false.
*
* At least a shared lock on pgss->lock must be held by the caller, so as
* to prevent a concurrent garbage collection. Share-lock-holding callers
* should pass a gc_count pointer to obtain the number of garbage collections,
* so that they can recheck the count after obtaining exclusive lock to
* detect whether a garbage collection occurred (and removed this entry).
*/
static bool
qtext_store(const char *query, int query_len,
Size *query_offset, int *gc_count)
{
Size off;
int fd;
/*
* We use a spinlock to protect extent/n_writers/gc_count, so that
* multiple processes may execute this function concurrently.
*/
{
volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
SpinLockAcquire(&s->mutex);
off = s->extent;
s->extent += query_len + 1;
s->n_writers++;
if (gc_count)
*gc_count = s->gc_count;
SpinLockRelease(&s->mutex);
}
*query_offset = off;
/* Now write the data into the successfully-reserved part of the file */
fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
if (fd < 0)
goto error;
if (pg_pwrite(fd, query, query_len, off) != query_len)
goto error;
if (pg_pwrite(fd, "\0", 1, off + query_len) != 1)
goto error;
CloseTransientFile(fd);
/* Mark our write complete */
{
volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
SpinLockAcquire(&s->mutex);
s->n_writers--;
SpinLockRelease(&s->mutex);
}
return true;
error:
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
PGSS_TEXT_FILE)));
if (fd >= 0)
CloseTransientFile(fd);
/* Mark our write complete */
{
volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
SpinLockAcquire(&s->mutex);
s->n_writers--;
SpinLockRelease(&s->mutex);
}
return false;
}
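/*
 * File-layout sketch: texts are appended back to back, each followed by a
 * NUL. After storing "SELECT $1" (9 bytes) and then "VACUUM" (6 bytes),
 * the file contains "SELECT $1\0VACUUM\0"; the two entries record offsets
 * 0 and 10 with lengths 9 and 6, and the shared extent advances to 17.
 */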
/*
* Read the external query text file into a malloc'd buffer.
*
* Returns NULL (without throwing an error) if unable to read, eg
* file not there or insufficient memory.
*
* On success, the buffer size is also returned into *buffer_size.
*
* This can be called without any lock on pgss->lock, but in that case
* the caller is responsible for verifying that the result is sane.
*/
static char *
qtext_load_file(Size *buffer_size)
{
char *buf;
int fd;
struct stat stat;
fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDONLY | PG_BINARY);
if (fd < 0)
{
if (errno != ENOENT)
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
PGSS_TEXT_FILE)));
return NULL;
}
/* Get file length */
if (fstat(fd, &stat))
{
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not stat file \"%s\": %m",
PGSS_TEXT_FILE)));
CloseTransientFile(fd);
return NULL;
}
/* Allocate buffer; beware that off_t might be wider than size_t */
if (stat.st_size <= MaxAllocHugeSize)
buf = (char *) malloc(stat.st_size);
else
buf = NULL;
if (buf == NULL)
{
ereport(LOG,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory"),
errdetail("Could not allocate enough memory to read file \"%s\".",
PGSS_TEXT_FILE)));
CloseTransientFile(fd);
return NULL;
}
/*
* OK, slurp in the file. If we get a short read and errno doesn't get
* set, the reason is probably that garbage collection truncated the file
* since we did the fstat(), so we don't log a complaint --- but we don't
* return the data, either, since it's most likely corrupt due to
* concurrent writes from garbage collection.
*/
errno = 0;
if (read(fd, buf, stat.st_size) != stat.st_size)
{
if (errno)
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
PGSS_TEXT_FILE)));
free(buf);
CloseTransientFile(fd);
return NULL;
}
if (CloseTransientFile(fd) != 0)
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", PGSS_TEXT_FILE)));
*buffer_size = stat.st_size;
return buf;
}
/*
* Locate a query text in the file image previously read by qtext_load_file().
*
* We validate the given offset/length, and return NULL if bogus. Otherwise,
* the result points to a null-terminated string within the buffer.
*/
static char *
qtext_fetch(Size query_offset, int query_len,
char *buffer, Size buffer_size)
{
/* File read failed? */
if (buffer == NULL)
return NULL;
/* Bogus offset/length? */
if (query_len < 0 ||
query_offset + query_len >= buffer_size)
return NULL;
/* As a further sanity check, make sure there's a trailing null */
if (buffer[query_offset + query_len] != '\0')
return NULL;
/* Looks OK */
return buffer + query_offset;
}
/*
* Do we need to garbage-collect the external query text file?
*
* Caller should hold at least a shared lock on pgss->lock.
*/
static bool
need_gc_qtexts(void)
{
Size extent;
/* Read shared extent pointer */
{
volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
SpinLockAcquire(&s->mutex);
extent = s->extent;
SpinLockRelease(&s->mutex);
}
/* Don't proceed if file does not exceed 512 bytes per possible entry */
if (extent < 512 * pgss_max)
return false;
/*
* Don't proceed if file is less than about 50% bloat. Nothing can or
* should be done in the event of unusually large query texts accounting
* for file's large size. We go to the trouble of maintaining the mean
* query length in order to prevent garbage collection from thrashing
* uselessly.
*/
if (extent < pgss->mean_query_len * pgss_max * 2)
return false;
return true;
}
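/*
 * Worked thresholds (a sketch): with pgss_max = 5000 and mean_query_len =
 * 1024 bytes, the size test fires only once extent >= 2,560,000 bytes and
 * the bloat test only once extent >= 10,240,000 bytes; garbage collection
 * is considered when both hold.
 */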
/*
* Garbage-collect orphaned query texts in external file.
*
* This won't be called often in the typical case, since it's likely that
* there won't be too much churn, and besides, a similar compaction process
* occurs when serializing to disk at shutdown or as part of resetting.
* Despite this, it seems prudent to plan for the edge case where the file
* becomes unreasonably large, with no other method of compaction likely to
* occur in the foreseeable future.
*
* The caller must hold an exclusive lock on pgss->lock.
*
* At the first sign of trouble we unlink the query text file to get a clean
* slate (although existing statistics are retained), rather than risk
* thrashing by allowing the same problem case to recur indefinitely.
*/
static void
gc_qtexts(void)
{
char *qbuffer;
Size qbuffer_size;
FILE *qfile = NULL;
HASH_SEQ_STATUS hash_seq;
pgssEntry *entry;
Size extent;
int nentries;
/*
* When called from pgss_store, some other session might have proceeded
* with garbage collection in the no-lock-held interim of lock strength
* escalation. Check once more that this is actually necessary.
*/
if (!need_gc_qtexts())
return;
/*
* Load the old texts file. If we fail (out of memory, for instance),
* invalidate query texts. Hopefully this is rare. It might seem better
* to leave things alone on an OOM failure, but the problem is that the
* file is only going to get bigger; hoping for a future non-OOM result is
* risky and can easily lead to complete denial of service.
*/
qbuffer = qtext_load_file(&qbuffer_size);
if (qbuffer == NULL)
goto gc_fail;
/*
* We overwrite the query texts file in place, so as to reduce the risk of
* an out-of-disk-space failure. Since the file is guaranteed not to get
* larger, this should always work on traditional filesystems; though we
* could still lose on copy-on-write filesystems.
*/
qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
if (qfile == NULL)
{
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
PGSS_TEXT_FILE)));
goto gc_fail;
}
extent = 0;
nentries = 0;
hash_seq_init(&hash_seq, pgss_hash);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
{
int query_len = entry->query_len;
char *qry = qtext_fetch(entry->query_offset,
query_len,
qbuffer,
qbuffer_size);
if (qry == NULL)
{
/* Trouble ... drop the text */
entry->query_offset = 0;
entry->query_len = -1;
/* entry will not be counted in mean query length computation */
continue;
}
if (fwrite(qry, 1, query_len + 1, qfile) != query_len + 1)
{
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
PGSS_TEXT_FILE)));
hash_seq_term(&hash_seq);
goto gc_fail;
}
entry->query_offset = extent;
extent += query_len + 1;
nentries++;
}
/*
* Truncate away any now-unused space. If this fails for some odd reason,
* we log it, but there's no need to fail.
*/
if (ftruncate(fileno(qfile), extent) != 0)
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not truncate file \"%s\": %m",
PGSS_TEXT_FILE)));
if (FreeFile(qfile))
{
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
PGSS_TEXT_FILE)));
qfile = NULL;
goto gc_fail;
}
elog(DEBUG1, "pgss gc of queries file shrunk size from %zu to %zu",
pgss->extent, extent);
/* Reset the shared extent pointer */
pgss->extent = extent;
/*
* Also update the mean query length, to be sure that need_gc_qtexts()
* won't still think we have a problem.
*/
if (nentries > 0)
pgss->mean_query_len = extent / nentries;
else
pgss->mean_query_len = ASSUMED_LENGTH_INIT;
free(qbuffer);
/*
* OK, count a garbage collection cycle. (Note: even though we have
* exclusive lock on pgss->lock, we must take pgss->mutex for this, since
* other processes may examine gc_count while holding only the mutex.
* Also, we have to advance the count *after* we've rewritten the file,
* else other processes might not realize they read a stale file.)
*/
record_gc_qtexts();
return;
gc_fail:
/* clean up resources */
if (qfile)
FreeFile(qfile);
if (qbuffer)
free(qbuffer);
/*
* Since the contents of the external file are now uncertain, mark all
* hashtable entries as having invalid texts.
*/
hash_seq_init(&hash_seq, pgss_hash);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
{
entry->query_offset = 0;
entry->query_len = -1;
}
/*
* Destroy the query text file and create a new, empty one
*/
(void) unlink(PGSS_TEXT_FILE);
qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
if (qfile == NULL)
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not recreate file \"%s\": %m",
PGSS_TEXT_FILE)));
else
FreeFile(qfile);
/* Reset the shared extent pointer */
pgss->extent = 0;
/* Reset mean_query_len to match the new state */
pgss->mean_query_len = ASSUMED_LENGTH_INIT;
/*
* Bump the GC count even though we failed.
*
* This is needed so that concurrent readers of the file, holding no lock
* on pgss->lock, notice that a new version of the file exists. Once
* readers subsequently observe a change in GC count with pgss->lock
* held, that forces a safe reopen of the file. Writers also require
* that we bump here,
* of course. (As required by locking protocol, readers and writers don't
* trust earlier file contents until gc_count is found unchanged after
* pgss->lock acquired in shared or exclusive mode respectively.)
*/
record_gc_qtexts();
}
/*
* Release entries corresponding to parameters passed.
*/
static void
entry_reset(Oid userid, Oid dbid, uint64 queryid)
{
HASH_SEQ_STATUS hash_seq;
pgssEntry *entry;
FILE *qfile;
long num_entries;
long num_remove = 0;
pgssHashKey key;
if (!pgss || !pgss_hash)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
num_entries = hash_get_num_entries(pgss_hash);
if (userid != 0 && dbid != 0 && queryid != UINT64CONST(0))
{
/* If all the parameters are available, use the fast path. */
key.userid = userid;
key.dbid = dbid;
key.queryid = queryid;
/* Remove the key if it exists */
entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_REMOVE, NULL);
if (entry) /* found */
num_remove++;
}
else if (userid != 0 || dbid != 0 || queryid != UINT64CONST(0))
{
/* Remove entries corresponding to valid parameters. */
hash_seq_init(&hash_seq, pgss_hash);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
{
if ((!userid || entry->key.userid == userid) &&
(!dbid || entry->key.dbid == dbid) &&
(!queryid || entry->key.queryid == queryid))
{
hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
num_remove++;
}
}
}
else
{
/* Remove all entries. */
hash_seq_init(&hash_seq, pgss_hash);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
{
hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
num_remove++;
}
}
/* If not all entries were removed, the query text file must be kept */
if (num_entries != num_remove)
goto release_lock;
/*
* Write new empty query file, perhaps even creating a new one to recover
* if the file was missing.
*/
qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
if (qfile == NULL)
{
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not create file \"%s\": %m",
PGSS_TEXT_FILE)));
goto done;
}
/* If ftruncate fails, log it, but it's not a fatal problem */
if (ftruncate(fileno(qfile), 0) != 0)
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not truncate file \"%s\": %m",
PGSS_TEXT_FILE)));
FreeFile(qfile);
done:
pgss->extent = 0;
/* This counts as a query text garbage collection for our purposes */
record_gc_qtexts();
release_lock:
LWLockRelease(pgss->lock);
}
/*
* AppendJumble: Append a value that is substantive in a given query to
* the current jumble.
*/
static void
AppendJumble(pgssJumbleState *jstate, const unsigned char *item, Size size)
{
unsigned char *jumble = jstate->jumble;
Size jumble_len = jstate->jumble_len;
/*
* Whenever the jumble buffer is full, we hash the current contents and
* reset the buffer to contain just that hash value, thus relying on the
* hash to summarize everything so far.
*/
while (size > 0)
{
Size part_size;
if (jumble_len >= JUMBLE_SIZE)
{
uint64 start_hash;
start_hash = DatumGetUInt64(hash_any_extended(jumble,
JUMBLE_SIZE, 0));
memcpy(jumble, &start_hash, sizeof(start_hash));
jumble_len = sizeof(start_hash);
}
part_size = Min(size, JUMBLE_SIZE - jumble_len);
memcpy(jumble + jumble_len, item, part_size);
jumble_len += part_size;
item += part_size;
size -= part_size;
}
jstate->jumble_len = jumble_len;
}
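/*
 * Folding sketch (assuming JUMBLE_SIZE is 1024): appending a 10-byte item
 * when jumble_len is 1020 copies 4 bytes to fill the buffer, collapses
 * the full buffer into its 8-byte hash (jumble_len = 8), then copies the
 * remaining 6 bytes, leaving jumble_len = 14.
 */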
/*
* Wrappers around AppendJumble to encapsulate details of serialization
* of individual local variable elements.
*/
#define APP_JUMB(item) \
AppendJumble(jstate, (const unsigned char *) &(item), sizeof(item))
#define APP_JUMB_STRING(str) \
AppendJumble(jstate, (const unsigned char *) (str), strlen(str) + 1)
/*
* JumbleQuery: Selectively serialize the query tree, appending significant
* data to the "query jumble" while ignoring nonsignificant data.
*
* Rule of thumb for what to include is that we should ignore anything not
* semantically significant (such as alias names) as well as anything that can
* be deduced from child nodes (else we'd just be double-hashing that piece
* of information).
*/
static void
JumbleQuery(pgssJumbleState *jstate, Query *query)
{
Assert(IsA(query, Query));
Assert(query->utilityStmt == NULL);
APP_JUMB(query->commandType);
/* resultRelation is usually predictable from commandType */
JumbleExpr(jstate, (Node *) query->cteList);
JumbleRangeTable(jstate, query->rtable);
JumbleExpr(jstate, (Node *) query->jointree);
JumbleExpr(jstate, (Node *) query->targetList);
JumbleExpr(jstate, (Node *) query->onConflict);
JumbleExpr(jstate, (Node *) query->returningList);
JumbleExpr(jstate, (Node *) query->groupClause);
JumbleExpr(jstate, (Node *) query->groupingSets);
JumbleExpr(jstate, query->havingQual);
JumbleExpr(jstate, (Node *) query->windowClause);
JumbleExpr(jstate, (Node *) query->distinctClause);
JumbleExpr(jstate, (Node *) query->sortClause);
JumbleExpr(jstate, query->limitOffset);
JumbleExpr(jstate, query->limitCount);
JumbleRowMarks(jstate, query->rowMarks);
JumbleExpr(jstate, query->setOperations);
}
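/*
 * Net effect (a sketch): "SELECT * FROM t WHERE id = 1" and "SELECT *
 * FROM t WHERE id = 2" jumble identically, since a T_Const contributes
 * only its consttype (see JumbleExpr below), so both executions
 * accumulate into one entry displayed as "SELECT * FROM t WHERE id = $1".
 */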
/*
* Jumble a range table
*/
static void
JumbleRangeTable(pgssJumbleState *jstate, List *rtable)
{
ListCell *lc;
foreach(lc, rtable)
{
RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);
APP_JUMB(rte->rtekind);
switch (rte->rtekind)
{
case RTE_RELATION:
APP_JUMB(rte->relid);
JumbleExpr(jstate, (Node *) rte->tablesample);
break;
case RTE_SUBQUERY:
JumbleQuery(jstate, rte->subquery);
break;
case RTE_JOIN:
APP_JUMB(rte->jointype);
break;
case RTE_FUNCTION:
JumbleExpr(jstate, (Node *) rte->functions);
break;
case RTE_TABLEFUNC:
JumbleExpr(jstate, (Node *) rte->tablefunc);
break;
case RTE_VALUES:
JumbleExpr(jstate, (Node *) rte->values_lists);
break;
case RTE_CTE:
/*
* Depending on the CTE name here isn't ideal, but it's the
* only info we have to identify the referenced WITH item.
*/
APP_JUMB_STRING(rte->ctename);
APP_JUMB(rte->ctelevelsup);
break;
case RTE_NAMEDTUPLESTORE:
APP_JUMB_STRING(rte->enrname);
break;
case RTE_RESULT:
break;
default:
elog(ERROR, "unrecognized RTE kind: %d", (int) rte->rtekind);
break;
}
}
}
/*
* Jumble a rowMarks list
*/
static void
JumbleRowMarks(pgssJumbleState *jstate, List *rowMarks)
{
ListCell *lc;
foreach(lc, rowMarks)
{
RowMarkClause *rowmark = lfirst_node(RowMarkClause, lc);
if (!rowmark->pushedDown)
{
APP_JUMB(rowmark->rti);
APP_JUMB(rowmark->strength);
APP_JUMB(rowmark->waitPolicy);
}
}
}
/*
* Jumble an expression tree
*
* In general this function should handle all the same node types that
* expression_tree_walker() does, and therefore it's coded to be as parallel
* to that function as possible. However, since we are only invoked on
* queries immediately post-parse-analysis, we need not handle node types
* that only appear in planning.
*
* Note: the reason we don't simply use expression_tree_walker() is that the
* point of that function is to support tree walkers that don't care about
* most tree node types, but here we care about all types. We should complain
* about any unrecognized node type.
*/
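/*
 * Illustrative sketch only (T_MyNewExpr is hypothetical): support for an
 * additional expression node type would follow the same pattern as the
 * cases below --
 *
 * case T_MyNewExpr:
 * {
 * MyNewExpr *expr = (MyNewExpr *) node;
 * APP_JUMB(expr->some_significant_field);
 * JumbleExpr(jstate, (Node *) expr->args);
 * }
 * break;
 */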
static void
JumbleExpr(pgssJumbleState *jstate, Node *node)
{
ListCell *temp;
if (node == NULL)
return;
/* Guard against stack overflow due to overly complex expressions */
check_stack_depth();
/*
* We always emit the node's NodeTag, then any additional fields that are
* considered significant, and then we recurse to any child nodes.
*/
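/*
 * For example (illustrative): jumbling "a + 1" emits the T_OpExpr tag and
 * the operator's opno, then recurses into the arguments, emitting the
 * T_Var tag plus varno/varattno/varlevelsup for "a" and the T_Const tag
 * plus consttype (but not the value) for "1".
 */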
APP_JUMB(node->type);
switch (nodeTag(node))
{
case T_Var:
{
Var *var = (Var *) node;
APP_JUMB(var->varno);
APP_JUMB(var->varattno);
APP_JUMB(var->varlevelsup);
}
break;
case T_Const:
{
Const *c = (Const *) node;
/* We jumble only the constant's type, not its value */
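/*
 * (Illustratively: "x = 1" and "x = 2" therefore jumble identically,
 * while "x = 1" and "x = 1.5" do not, since their consttypes differ.)
 */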
APP_JUMB(c->consttype);
/* Also, record its parse location for query normalization */
RecordConstLocation(jstate, c->location);
}
break;
case T_Param:
{
Param *p = (Param *) node;
APP_JUMB(p->paramkind);
APP_JUMB(p->paramid);
APP_JUMB(p->paramtype);
/* Also, track the highest external Param id */
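/*
 * (Tracked so that generate_normalized_query can number its replacement
 * symbols after any $n parameters the query already contains; see the
 * sprintf there.)
 */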
if (p->paramkind == PARAM_EXTERN &&
p->paramid > jstate->highest_extern_param_id)
jstate->highest_extern_param_id = p->paramid;
}
break;
case T_Aggref:
{
Aggref *expr = (Aggref *) node;
APP_JUMB(expr->aggfnoid);
JumbleExpr(jstate, (Node *) expr->aggdirectargs);
JumbleExpr(jstate, (Node *) expr->args);
JumbleExpr(jstate, (Node *) expr->aggorder);
JumbleExpr(jstate, (Node *) expr->aggdistinct);
JumbleExpr(jstate, (Node *) expr->aggfilter);
}
break;
case T_GroupingFunc:
{
GroupingFunc *grpnode = (GroupingFunc *) node;
JumbleExpr(jstate, (Node *) grpnode->refs);
}
break;
case T_WindowFunc:
{
WindowFunc *expr = (WindowFunc *) node;
APP_JUMB(expr->winfnoid);
APP_JUMB(expr->winref);
JumbleExpr(jstate, (Node *) expr->args);
JumbleExpr(jstate, (Node *) expr->aggfilter);
}
break;
case T_SubscriptingRef:
{
SubscriptingRef *sbsref = (SubscriptingRef *) node;
JumbleExpr(jstate, (Node *) sbsref->refupperindexpr);
JumbleExpr(jstate, (Node *) sbsref->reflowerindexpr);
JumbleExpr(jstate, (Node *) sbsref->refexpr);
JumbleExpr(jstate, (Node *) sbsref->refassgnexpr);
}
break;
case T_FuncExpr:
{
FuncExpr *expr = (FuncExpr *) node;
APP_JUMB(expr->funcid);
JumbleExpr(jstate, (Node *) expr->args);
}
break;
case T_NamedArgExpr:
{
NamedArgExpr *nae = (NamedArgExpr *) node;
APP_JUMB(nae->argnumber);
JumbleExpr(jstate, (Node *) nae->arg);
}
break;
case T_OpExpr:
case T_DistinctExpr: /* struct-equivalent to OpExpr */
case T_NullIfExpr: /* struct-equivalent to OpExpr */
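/*
 * (The differing NodeTags were already mixed into the jumble above, so
 * "a = b", "a IS DISTINCT FROM b" and "NULLIF(a, b)" still hash
 * differently even though one arm serves all three node types.)
 */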
{
OpExpr *expr = (OpExpr *) node;
APP_JUMB(expr->opno);
JumbleExpr(jstate, (Node *) expr->args);
}
break;
case T_ScalarArrayOpExpr:
{
ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) node;
APP_JUMB(expr->opno);
APP_JUMB(expr->useOr);
JumbleExpr(jstate, (Node *) expr->args);
}
break;
case T_BoolExpr:
{
BoolExpr *expr = (BoolExpr *) node;
APP_JUMB(expr->boolop);
JumbleExpr(jstate, (Node *) expr->args);
}
break;
case T_SubLink:
{
SubLink *sublink = (SubLink *) node;
APP_JUMB(sublink->subLinkType);
APP_JUMB(sublink->subLinkId);
JumbleExpr(jstate, (Node *) sublink->testexpr);
JumbleQuery(jstate, castNode(Query, sublink->subselect));
}
break;
case T_FieldSelect:
{
FieldSelect *fs = (FieldSelect *) node;
APP_JUMB(fs->fieldnum);
JumbleExpr(jstate, (Node *) fs->arg);
}
break;
case T_FieldStore:
{
FieldStore *fstore = (FieldStore *) node;
JumbleExpr(jstate, (Node *) fstore->arg);
JumbleExpr(jstate, (Node *) fstore->newvals);
}
break;
case T_RelabelType:
{
RelabelType *rt = (RelabelType *) node;
APP_JUMB(rt->resulttype);
JumbleExpr(jstate, (Node *) rt->arg);
}
break;
case T_CoerceViaIO:
{
CoerceViaIO *cio = (CoerceViaIO *) node;
APP_JUMB(cio->resulttype);
JumbleExpr(jstate, (Node *) cio->arg);
}
break;
case T_ArrayCoerceExpr:
{
ArrayCoerceExpr *acexpr = (ArrayCoerceExpr *) node;
APP_JUMB(acexpr->resulttype);
JumbleExpr(jstate, (Node *) acexpr->arg);
JumbleExpr(jstate, (Node *) acexpr->elemexpr);
}
break;
case T_ConvertRowtypeExpr:
{
ConvertRowtypeExpr *crexpr = (ConvertRowtypeExpr *) node;
APP_JUMB(crexpr->resulttype);
JumbleExpr(jstate, (Node *) crexpr->arg);
}
break;
case T_CollateExpr:
{
CollateExpr *ce = (CollateExpr *) node;
APP_JUMB(ce->collOid);
JumbleExpr(jstate, (Node *) ce->arg);
}
break;
case T_CaseExpr:
{
CaseExpr *caseexpr = (CaseExpr *) node;
JumbleExpr(jstate, (Node *) caseexpr->arg);
foreach(temp, caseexpr->args)
{
CaseWhen *when = lfirst_node(CaseWhen, temp);
JumbleExpr(jstate, (Node *) when->expr);
JumbleExpr(jstate, (Node *) when->result);
}
JumbleExpr(jstate, (Node *) caseexpr->defresult);
}
break;
case T_CaseTestExpr:
{
CaseTestExpr *ct = (CaseTestExpr *) node;
APP_JUMB(ct->typeId);
}
break;
case T_ArrayExpr:
JumbleExpr(jstate, (Node *) ((ArrayExpr *) node)->elements);
break;
case T_RowExpr:
JumbleExpr(jstate, (Node *) ((RowExpr *) node)->args);
break;
case T_RowCompareExpr:
{
RowCompareExpr *rcexpr = (RowCompareExpr *) node;
APP_JUMB(rcexpr->rctype);
JumbleExpr(jstate, (Node *) rcexpr->largs);
JumbleExpr(jstate, (Node *) rcexpr->rargs);
}
break;
case T_CoalesceExpr:
JumbleExpr(jstate, (Node *) ((CoalesceExpr *) node)->args);
break;
case T_MinMaxExpr:
{
MinMaxExpr *mmexpr = (MinMaxExpr *) node;
APP_JUMB(mmexpr->op);
JumbleExpr(jstate, (Node *) mmexpr->args);
}
break;
case T_SQLValueFunction:
{
SQLValueFunction *svf = (SQLValueFunction *) node;
APP_JUMB(svf->op);
/* type is fully determined by op */
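/*
 * (typmod can still vary for a given op: e.g., illustratively,
 * CURRENT_TIME(2) vs CURRENT_TIME(3), hence it is jumbled too)
 */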
APP_JUMB(svf->typmod);
}
break;
case T_XmlExpr:
{
XmlExpr *xexpr = (XmlExpr *) node;
APP_JUMB(xexpr->op);
JumbleExpr(jstate, (Node *) xexpr->named_args);
JumbleExpr(jstate, (Node *) xexpr->args);
}
break;
case T_NullTest:
{
NullTest *nt = (NullTest *) node;
APP_JUMB(nt->nulltesttype);
JumbleExpr(jstate, (Node *) nt->arg);
}
break;
case T_BooleanTest:
{
BooleanTest *bt = (BooleanTest *) node;
APP_JUMB(bt->booltesttype);
JumbleExpr(jstate, (Node *) bt->arg);
}
break;
case T_CoerceToDomain:
{
CoerceToDomain *cd = (CoerceToDomain *) node;
APP_JUMB(cd->resulttype);
JumbleExpr(jstate, (Node *) cd->arg);
}
break;
case T_CoerceToDomainValue:
{
CoerceToDomainValue *cdv = (CoerceToDomainValue *) node;
APP_JUMB(cdv->typeId);
}
break;
case T_SetToDefault:
{
SetToDefault *sd = (SetToDefault *) node;
APP_JUMB(sd->typeId);
}
break;
case T_CurrentOfExpr:
{
CurrentOfExpr *ce = (CurrentOfExpr *) node;
APP_JUMB(ce->cvarno);
if (ce->cursor_name)
APP_JUMB_STRING(ce->cursor_name);
APP_JUMB(ce->cursor_param);
}
break;
case T_NextValueExpr:
{
NextValueExpr *nve = (NextValueExpr *) node;
APP_JUMB(nve->seqid);
APP_JUMB(nve->typeId);
}
break;
case T_InferenceElem:
{
InferenceElem *ie = (InferenceElem *) node;
APP_JUMB(ie->infercollid);
APP_JUMB(ie->inferopclass);
JumbleExpr(jstate, ie->expr);
}
break;
case T_TargetEntry:
{
TargetEntry *tle = (TargetEntry *) node;
APP_JUMB(tle->resno);
APP_JUMB(tle->ressortgroupref);
JumbleExpr(jstate, (Node *) tle->expr);
}
break;
case T_RangeTblRef:
{
RangeTblRef *rtr = (RangeTblRef *) node;
APP_JUMB(rtr->rtindex);
}
break;
case T_JoinExpr:
{
JoinExpr *join = (JoinExpr *) node;
APP_JUMB(join->jointype);
APP_JUMB(join->isNatural);
APP_JUMB(join->rtindex);
JumbleExpr(jstate, join->larg);
JumbleExpr(jstate, join->rarg);
JumbleExpr(jstate, join->quals);
}
break;
case T_FromExpr:
{
FromExpr *from = (FromExpr *) node;
JumbleExpr(jstate, (Node *) from->fromlist);
JumbleExpr(jstate, from->quals);
}
break;
case T_OnConflictExpr:
{
OnConflictExpr *conf = (OnConflictExpr *) node;
APP_JUMB(conf->action);
JumbleExpr(jstate, (Node *) conf->arbiterElems);
JumbleExpr(jstate, conf->arbiterWhere);
JumbleExpr(jstate, (Node *) conf->onConflictSet);
JumbleExpr(jstate, conf->onConflictWhere);
APP_JUMB(conf->constraint);
APP_JUMB(conf->exclRelIndex);
JumbleExpr(jstate, (Node *) conf->exclRelTlist);
}
break;
case T_List:
foreach(temp, (List *) node)
{
JumbleExpr(jstate, (Node *) lfirst(temp));
}
break;
case T_IntList:
foreach(temp, (List *) node)
{
APP_JUMB(lfirst_int(temp));
}
break;
case T_SortGroupClause:
{
SortGroupClause *sgc = (SortGroupClause *) node;
APP_JUMB(sgc->tleSortGroupRef);
APP_JUMB(sgc->eqop);
APP_JUMB(sgc->sortop);
APP_JUMB(sgc->nulls_first);
}
break;
case T_GroupingSet:
{
GroupingSet *gsnode = (GroupingSet *) node;
JumbleExpr(jstate, (Node *) gsnode->content);
}
break;
case T_WindowClause:
{
WindowClause *wc = (WindowClause *) node;
APP_JUMB(wc->winref);
APP_JUMB(wc->frameOptions);
JumbleExpr(jstate, (Node *) wc->partitionClause);
JumbleExpr(jstate, (Node *) wc->orderClause);
JumbleExpr(jstate, wc->startOffset);
JumbleExpr(jstate, wc->endOffset);
}
break;
case T_CommonTableExpr:
{
CommonTableExpr *cte = (CommonTableExpr *) node;
/* we store the string name because RTE_CTE RTEs need it */
APP_JUMB_STRING(cte->ctename);
APP_JUMB(cte->ctematerialized);
JumbleQuery(jstate, castNode(Query, cte->ctequery));
}
break;
case T_SetOperationStmt:
{
SetOperationStmt *setop = (SetOperationStmt *) node;
APP_JUMB(setop->op);
APP_JUMB(setop->all);
JumbleExpr(jstate, setop->larg);
JumbleExpr(jstate, setop->rarg);
}
break;
case T_RangeTblFunction:
{
RangeTblFunction *rtfunc = (RangeTblFunction *) node;
JumbleExpr(jstate, rtfunc->funcexpr);
}
break;
case T_TableFunc:
{
TableFunc *tablefunc = (TableFunc *) node;
JumbleExpr(jstate, tablefunc->docexpr);
JumbleExpr(jstate, tablefunc->rowexpr);
JumbleExpr(jstate, (Node *) tablefunc->colexprs);
}
break;
case T_TableSampleClause:
{
TableSampleClause *tsc = (TableSampleClause *) node;
APP_JUMB(tsc->tsmhandler);
JumbleExpr(jstate, (Node *) tsc->args);
JumbleExpr(jstate, (Node *) tsc->repeatable);
}
break;
default:
/* Only a warning, since we can stumble along anyway */
elog(WARNING, "unrecognized node type: %d",
(int) nodeTag(node));
break;
}
}
/*
* Record location of constant within query string of query tree
* that is currently being walked.
*/
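/*
 * (The clocations array grows by doubling below, so recording N
 * locations costs only O(log N) repallocs -- a sketch of the usual
 * amortized-growth pattern.)
 */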
static void
RecordConstLocation(pgssJumbleState *jstate, int location)
{
/* -1 indicates unknown or undefined location */
if (location >= 0)
{
/* enlarge array if needed */
if (jstate->clocations_count >= jstate->clocations_buf_size)
{
jstate->clocations_buf_size *= 2;
jstate->clocations = (pgssLocationLen *)
repalloc(jstate->clocations,
jstate->clocations_buf_size *
sizeof(pgssLocationLen));
}
jstate->clocations[jstate->clocations_count].location = location;
/* initialize lengths to -1 to simplify fill_in_constant_lengths */
jstate->clocations[jstate->clocations_count].length = -1;
jstate->clocations_count++;
}
}
/*
* Generate a normalized version of the query string that will be used to
* represent all similar queries.
*
* Note that the normalized representation may well vary depending on
* just which "equivalent" query is used to create the hashtable entry.
* We assume this is OK.
*
* If query_loc > 0, then "query" has been advanced by that much compared to
* the original string start, so we need to translate the provided locations
* to compensate. (This lets us avoid re-scanning statements before the one
* of interest, so it's worth doing.)
*
* *query_len_p contains the input string length, and is updated with
* the result string length on exit. The resulting string might be longer
* or shorter depending on what happens with replacement of constants.
*
* Returns a palloc'd string.
*/
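/*
 * For illustration (assuming no external $n parameters in the query):
 * given "SELECT * FROM foo WHERE bar = 10 AND baz = 'abc'" with both
 * constant locations recorded, the result would be
 * "SELECT * FROM foo WHERE bar = $1 AND baz = $2".
 */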
static char *
generate_normalized_query(pgssJumbleState *jstate, const char *query,
int query_loc, int *query_len_p, int encoding)
{
char *norm_query;
int query_len = *query_len_p;
int i,
norm_query_buflen, /* Space allowed for norm_query */
len_to_wrt, /* Length (in bytes) to write */
quer_loc = 0, /* Source query byte location */
n_quer_loc = 0, /* Normalized query byte location */
last_off = 0, /* Offset from start for previous tok */
last_tok_len = 0; /* Length (in bytes) of that tok */
/*
* Get constants' lengths (core system only gives us locations). Note
* this also ensures the items are sorted by location.
*/
fill_in_constant_lengths(jstate, query, query_loc);
/*
* Allow for $n symbols to be longer than the constants they replace.
* Constants must take at least one byte in text form, while a $n symbol
* certainly isn't more than 11 bytes, even if n reaches INT_MAX. We
* could refine that limit based on the max value of n for the current
* query, but it hardly seems worth any extra effort to do so.
*/
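/*
 * (Worst-case sketch: a one-byte constant such as "1" could become
 * "$2147483647", 11 bytes, for a net growth of 10 bytes per constant --
 * hence the "* 10" below.)
 */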
norm_query_buflen = query_len + jstate->clocations_count * 10;
/* Allocate result buffer */
norm_query = palloc(norm_query_buflen + 1);
for (i = 0; i < jstate->clocations_count; i++)
{
int off, /* Offset from start for cur tok */
tok_len; /* Length (in bytes) of that tok */
off = jstate->clocations[i].location;
/* Adjust recorded location if we're dealing with partial string */
off -= query_loc;
tok_len = jstate->clocations[i].length;
if (tok_len < 0)
continue; /* ignore any duplicates */
/* Copy next chunk (what precedes the next constant) */
len_to_wrt = off - last_off;
len_to_wrt -= last_tok_len;
Assert(len_to_wrt >= 0);
memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
n_quer_loc += len_to_wrt;
/* And insert a param symbol in place of the constant token */
n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d",
i + 1 + jstate->highest_extern_param_id);
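/*
 * (E.g., if the query already used $1 as an external parameter,
 * highest_extern_param_id is 1 and the first replaced constant is
 * rendered as "$2", avoiding a collision.)
 */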
quer_loc = off + tok_len;
last_off = off;
last_tok_len = tok_len;
}
/*
* We've copied up until the last ignorable constant. Copy over the
* remaining bytes of the original query string.
*/
len_to_wrt = query_len - quer_loc;
Assert(len_to_wrt >= 0);
memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
n_quer_loc += len_to_wrt;
Assert(n_quer_loc <= norm_query_buflen);
norm_query[n_quer_loc] = '\0';
*query_len_p = n_quer_loc;
return norm_query;
}
/*
* Given a valid SQL string and an array of constant-location records,
* fill in the textual lengths of those constants.
*
* The constants may use any allowed constant syntax, such as float literals,
* bit-strings, single-quoted strings and dollar-quoted strings. This is
* accomplished by using the public API for the core scanner.
*
* It is the caller's job to ensure that the string is a valid SQL statement
* with constants at the indicated locations. Since in practice the string
* has already been parsed, and the locations that the caller provides will
* have originated from within the authoritative parser, this should not be
* a problem.
*
* Duplicate constant pointers are possible, and will have their lengths
* marked as '-1', so that they are later ignored. (Actually, we assume the
* lengths were initialized as -1 to start with, and don't change them here.)
*
* If query_loc > 0, then "query" has been advanced by that much compared to
* the original string start, so we need to translate the provided locations
* to compensate. (This lets us avoid re-scanning statements before the one
* of interest, so it's worth doing.)
*
* N.B. There is an assumption that a '-' character at a Const location begins
* a negative numeric constant. This precludes there ever being another
* reason for a constant to start with a '-'.
*/
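/*
 * Illustrative sketch: for the text "WHERE x = $q$hi$q$", the scanner
 * returns the whole dollar-quoted literal as one token, so the recorded
 * length there would be strlen("$q$hi$q$") = 8.
 */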
static void
fill_in_constant_lengths(pgssJumbleState *jstate, const char *query,
int query_loc)
{
pgssLocationLen *locs;
core_yyscan_t yyscanner;
core_yy_extra_type yyextra;
core_YYSTYPE yylval;
YYLTYPE yylloc;
int last_loc = -1;
int i;
/*
* Sort the records by location so that we can process them in order while
* scanning the query text.
*/
if (jstate->clocations_count > 1)
qsort(jstate->clocations, jstate->clocations_count,
sizeof(pgssLocationLen), comp_location);
locs = jstate->clocations;
/* initialize the flex scanner --- should match raw_parser() */
yyscanner = scanner_init(query,
&yyextra,
&ScanKeywords,
ScanKeywordTokens);
/* we don't want to re-emit any escape string warnings */
yyextra.escape_string_warning = false;
/* Search for each constant, in sequence */
for (i = 0; i < jstate->clocations_count; i++)
{
int loc = locs[i].location;
int tok;
/* Adjust recorded location if we're dealing with partial string */
loc -= query_loc;
Assert(loc >= 0);
if (loc <= last_loc)
continue; /* Duplicate constant, ignore */
/* Lex tokens until we find the desired constant */
for (;;)
{
tok = core_yylex(&yylval, &yylloc, yyscanner);
/* We should not hit end-of-string, but if we do, behave sanely */
if (tok == 0)
break; /* out of inner for-loop */
/*
* We should find the token position exactly, but if we somehow
* run past it, work with that.
*/
if (yylloc >= loc)
{
if (query[loc] == '-')
{
/*
* It's a negative value - this is the one and only case
* where we replace more than a single token.
*
* Do not compensate for the core system's special-case
* adjustment of location to that of the leading '-'
* operator in the event of a negative constant. It is
* also useful for our purposes to start from the minus
* symbol. In this way, queries like "select * from foo
* where bar = 1" and "select * from foo where bar = -2"
* will have identical normalized query strings.
*/
tok = core_yylex(&yylval, &yylloc, yyscanner);
if (tok == 0)
break; /* out of inner for-loop */
}
/*
* We now rely on the assumption that flex has placed a zero
* byte after the text of the current token in scanbuf.
*/
locs[i].length = strlen(yyextra.scanbuf + loc);
break; /* out of inner for-loop */
}
}
/* If we hit end-of-string, give up, leaving remaining lengths -1 */
if (tok == 0)
break;
last_loc = loc;
}
scanner_finish(yyscanner);
}
/*
* comp_location: comparator for qsorting pgssLocationLen structs by location
*/
static int
comp_location(const void *a, const void *b)
{
int l = ((const pgssLocationLen *) a)->location;
int r = ((const pgssLocationLen *) b)->location;
if (l < r)
return -1;
else if (l > r)
return +1;
else
return 0;
}