diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 0f8703af5a..f103d914a6 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -7385,8 +7385,22 @@ SCRAM-SHA-256$<iteration count>:&l d for n-distinct statistics, f for functional dependency statistics, and m for most common values (MCV) list statistics + e for expression statistics + + + + stxexprs pg_node_tree + + + Expression trees (in nodeToString() + representation) for statistics object attributes that are not simple + column references. This is a list with one element per expression. + Null if all statistics object attributes are simple references. + + + @@ -7452,7 +7466,7 @@ SCRAM-SHA-256$<iteration count>:&l (references pg_statistic_ext.oid) - Extended statistic object containing the definition for this data + Extended statistics object containing the definition for this data @@ -7484,6 +7498,15 @@ SCRAM-SHA-256$<iteration count>:&l pg_mcv_list type + + + + stxexprs pg_node_tree + + + A list of any expressions covered by this statistics object. + + @@ -7637,6 +7660,16 @@ SCRAM-SHA-256$<iteration count>:&l see . + + + + stxdexpr pg_statistic[] + + + Per-expression statistics, serialized as an array of + pg_statistic type + + @@ -9444,6 +9477,11 @@ SCRAM-SHA-256$<iteration count>:&l extended planner statistics + + pg_stats_ext_exprs + extended planner statistics for expressions + + pg_tables tables @@ -12696,10 +12734,19 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx (references pg_attribute.attname) - Name of the column described by this row + Names of the columns included in the extended statistics object + + + exprs text[] + + + Expressions included in the extended statistics object + + + inherited bool @@ -12851,7 +12898,8 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx The view pg_stats_ext provides access to - the information stored in the pg_statistic_ext and pg_statistic_ext_data catalogs. This view allows access only to rows of @@ -12908,7 +12956,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx (references pg_namespace.nspname) - Name of schema containing extended statistic + Name of schema containing extended statistics object @@ -12918,7 +12966,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx (references pg_statistic_ext.stxname) - Name of extended statistics + Name of extended statistics object @@ -12928,7 +12976,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx (references pg_authid.rolname) - Owner of the extended statistics + Owner of the extended statistics object @@ -12938,7 +12986,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx (references pg_attribute.attname) - Names of the columns the extended statistics is defined on + Names of the columns the extended statistics object is defined on @@ -12947,7 +12995,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx kinds char[] - Types of extended statistics enabled for this record + Types of extended statistics object enabled for this record @@ -13032,6 +13080,237 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx + + <structname>pg_stats_ext_exprs</structname> + + + pg_stats_ext_exprs + + + + The view pg_stats_ext_exprs provides access to + information about all expressions included in extended statistics objects, + combining information stored in the pg_statistic_ext + and pg_statistic_ext_data + catalogs. This view allows access only to rows of + pg_statistic_ext and pg_statistic_ext_data + that correspond to tables the user has permission to read, and therefore + it is safe to allow public read access to this view. + + + + pg_stats_ext_exprs is also designed to present + the information in a more readable format than the underlying catalogs + — at the cost that its schema must be extended whenever the structure + of statistics in pg_statistic changes. + + + + <structname>pg_stats_ext_exprs</structname> Columns + + + + + Column Type + + + Description + + + + + + + + schemaname name + (references pg_namespace.nspname) + + + Name of schema containing table + + + + + + tablename name + (references pg_class.relname) + + + Name of table the statistics object is defined on + + + + + + statistics_schemaname name + (references pg_namespace.nspname) + + + Name of schema containing extended statistics object + + + + + + statistics_name name + (references pg_statistic_ext.stxname) + + + Name of extended statistics object + + + + + + statistics_owner name + (references pg_authid.rolname) + + + Owner of the extended statistics object + + + + + + expr text + + + Expression included in the extended statistics object + + + + + + null_frac float4 + + + Fraction of expression entries that are null + + + + + + avg_width int4 + + + Average width in bytes of expression's entries + + + + + + n_distinct float4 + + + If greater than zero, the estimated number of distinct values in the + expression. If less than zero, the negative of the number of distinct + values divided by the number of rows. (The negated form is used when + ANALYZE believes that the number of distinct values is + likely to increase as the table grows; the positive form is used when + the expression seems to have a fixed number of possible values.) For + example, -1 indicates a unique expression in which the number of distinct + values is the same as the number of rows. + + + + + + most_common_vals anyarray + + + A list of the most common values in the expression. (Null if + no values seem to be more common than any others.) + + + + + + most_common_freqs float4[] + + + A list of the frequencies of the most common values, + i.e., number of occurrences of each divided by total number of rows. + (Null when most_common_vals is.) + + + + + + histogram_bounds anyarray + + + A list of values that divide the expression's values into groups of + approximately equal population. The values in + most_common_vals, if present, are omitted from this + histogram calculation. (This expression is null if the expression data type + does not have a < operator or if the + most_common_vals list accounts for the entire + population.) + + + + + + correlation float4 + + + Statistical correlation between physical row ordering and + logical ordering of the expression values. This ranges from -1 to +1. + When the value is near -1 or +1, an index scan on the expression will + be estimated to be cheaper than when it is near zero, due to reduction + of random access to the disk. (This expression is null if the expression's + data type does not have a < operator.) + + + + + + most_common_elems anyarray + + + A list of non-null element values most often appearing within values of + the expression. (Null for scalar types.) + + + + + + most_common_elem_freqs float4[] + + + A list of the frequencies of the most common element values, i.e., the + fraction of rows containing at least one instance of the given value. + Two or three additional values follow the per-element frequencies; + these are the minimum and maximum of the preceding per-element + frequencies, and optionally the frequency of null elements. + (Null when most_common_elems is.) + + + + + + elem_count_histogram float4[] + + + A histogram of the counts of distinct non-null element values within the + values of the expression, followed by the average number of distinct + non-null elements. (Null for scalar types.) + + + + +
+ + + The maximum number of entries in the array fields can be controlled on a + column-by-column basis using the ALTER + TABLE SET STATISTICS command, or globally by setting the + run-time parameter. + + +
+ <structname>pg_tables</structname> diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml index 4363be50c3..988f4c573f 100644 --- a/doc/src/sgml/ref/create_statistics.sgml +++ b/doc/src/sgml/ref/create_statistics.sgml @@ -21,9 +21,13 @@ PostgreSQL documentation +CREATE STATISTICS [ IF NOT EXISTS ] statistics_name + ON ( expression ) + FROM table_name + CREATE STATISTICS [ IF NOT EXISTS ] statistics_name [ ( statistics_kind [, ... ] ) ] - ON column_name, column_name [, ...] + ON { column_name | ( expression ) }, { column_name | ( expression ) } [, ...] FROM table_name @@ -39,6 +43,19 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na database and will be owned by the user issuing the command. + + The CREATE STATISTICS command has two basic forms. The + first form allows univariate statistics for a single expression to be + collected, providing benefits similar to an expression index without the + overhead of index maintenance. This form does not allow the statistics + kind to be specified, since the various statistics kinds refer only to + multivariate statistics. The second form of the command allows + multivariate statistics on multiple columns and/or expressions to be + collected, optionally specifying which statistics kinds to include. This + form will also automatically cause univariate statistics to be collected on + any expressions included in the list. + + If a schema name is given (for example, CREATE STATISTICS myschema.mystat ...) then the statistics object is created in the @@ -79,14 +96,16 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na statistics_kind - A statistics kind to be computed in this statistics object. + A multivariate statistics kind to be computed in this statistics object. Currently supported kinds are ndistinct, which enables n-distinct statistics, dependencies, which enables functional dependency statistics, and mcv which enables most-common values lists. If this clause is omitted, all supported statistics kinds are - included in the statistics object. + included in the statistics object. Univariate expression statistics are + built automatically if the statistics definition includes any complex + expressions rather than just simple column references. For more information, see and . @@ -98,8 +117,22 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na The name of a table column to be covered by the computed statistics. - At least two column names must be given; the order of the column names - is insignificant. + This is only allowed when building multivariate statistics. At least + two column names or expressions must be specified, and their order is + not significant. + + + + + + expression + + + An expression to be covered by the computed statistics. This may be + used to build univariate statistics on a single expression, or as part + of a list of multiple column names and/or expressions to build + multivariate statistics. In the latter case, separate univariate + statistics are built automatically for each expression in the list. @@ -125,6 +158,13 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na reading it. Once created, however, the ownership of the statistics object is independent of the underlying table(s). + + + Expression statistics are per-expression and are similar to creating an + index on the expression, except that they avoid the overhead of index + maintenance. Expression statistics are built automatically for each + expression in the statistics object definition. + @@ -196,6 +236,72 @@ EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 2); in the table, allowing it to generate better estimates in both cases. + + Create table t3 with a single timestamp column, + and run queries using expressions on that column. Without extended + statistics, the planner has no information about the data distribution for + the expressions, and uses default estimates. The planner also does not + realize that the value of the date truncated to the month is fully + determined by the value of the date truncated to the day. Then expression + and ndistinct statistics are built on those two expressions: + + +CREATE TABLE t3 ( + a timestamp +); + +INSERT INTO t3 SELECT i FROM generate_series('2020-01-01'::timestamp, + '2020-12-31'::timestamp, + '1 minute'::interval) s(i); + +ANALYZE t3; + +-- the number of matching rows will be drastically underestimated: +EXPLAIN ANALYZE SELECT * FROM t3 + WHERE date_trunc('month', a) = '2020-01-01'::timestamp; + +EXPLAIN ANALYZE SELECT * FROM t3 + WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp + AND '2020-06-30'::timestamp; + +EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a) + FROM t3 GROUP BY 1, 2; + +-- build ndistinct statistics on the pair of expressions (per-expression +-- statistics are built automatically) +CREATE STATISTICS s3 (ndistinct) ON date_trunc('month', a), date_trunc('day', a) FROM t3; + +ANALYZE t3; + +-- now the row count estimates are more accurate: +EXPLAIN ANALYZE SELECT * FROM t3 + WHERE date_trunc('month', a) = '2020-01-01'::timestamp; + +EXPLAIN ANALYZE SELECT * FROM t3 + WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp + AND '2020-06-30'::timestamp; + +EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a) + FROM t3 GROUP BY 1, 2; + + + Without expression and ndistinct statistics, the planner has no information + about the number of distinct values for the expressions, and has to rely + on default estimates. The equality and range conditions are assumed to have + 0.5% selectivity, and the number of distinct values in the expression is + assumed to be the same as for the column (i.e. unique). This results in a + significant underestimate of the row count in the first two queries. Moreover, + the planner has no information about the relationship between the expressions, + so it assumes the two WHERE and GROUP BY + conditions are independent, and multiplies their selectivities together to + arrive at a severe overestimate of the group count in the aggregate query. + This is further exacerbated by the lack of accurate statistics for the + expressions, forcing the planner to use a default ndistinct estimate for the + expression derived from ndistinct for the column. With such statistics, the + planner recognizes that the conditions are correlated, and arrives at much + more accurate estimates. + + diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 70bc2123df..e36a9602c1 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -49,15 +49,15 @@ include $(top_srcdir)/src/backend/common.mk # Note: the order of this list determines the order in which the catalog # header files are assembled into postgres.bki. BKI_BOOTSTRAP catalogs -# must appear first, and there are reputedly other, undocumented ordering -# dependencies. +# must appear first, and pg_statistic before pg_statistic_ext_data, and +# there are reputedly other, undocumented ordering dependencies. CATALOG_HEADERS := \ pg_proc.h pg_type.h pg_attribute.h pg_class.h \ pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \ pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \ pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h \ - pg_statistic_ext.h pg_statistic_ext_data.h \ - pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \ + pg_statistic.h pg_statistic_ext.h pg_statistic_ext_data.h \ + pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \ pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \ pg_database.h pg_db_role_setting.h pg_tablespace.h \ pg_authid.h pg_auth_members.h pg_shdepend.h pg_shdescription.h \ diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 0dca65dc7b..6483563204 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -264,6 +264,7 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS JOIN pg_attribute a ON (a.attrelid = s.stxrelid AND a.attnum = k) ) AS attnames, + pg_get_statisticsobjdef_expressions(s.oid) as exprs, s.stxkind AS kinds, sd.stxdndistinct AS n_distinct, sd.stxddependencies AS dependencies, @@ -290,6 +291,74 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS WHERE NOT has_column_privilege(c.oid, a.attnum, 'select') ) AND (c.relrowsecurity = false OR NOT row_security_active(c.oid)); +CREATE VIEW pg_stats_ext_exprs WITH (security_barrier) AS + SELECT cn.nspname AS schemaname, + c.relname AS tablename, + sn.nspname AS statistics_schemaname, + s.stxname AS statistics_name, + pg_get_userbyid(s.stxowner) AS statistics_owner, + stat.expr, + (stat.a).stanullfrac AS null_frac, + (stat.a).stawidth AS avg_width, + (stat.a).stadistinct AS n_distinct, + (CASE + WHEN (stat.a).stakind1 = 1 THEN (stat.a).stavalues1 + WHEN (stat.a).stakind2 = 1 THEN (stat.a).stavalues2 + WHEN (stat.a).stakind3 = 1 THEN (stat.a).stavalues3 + WHEN (stat.a).stakind4 = 1 THEN (stat.a).stavalues4 + WHEN (stat.a).stakind5 = 1 THEN (stat.a).stavalues5 + END) AS most_common_vals, + (CASE + WHEN (stat.a).stakind1 = 1 THEN (stat.a).stanumbers1 + WHEN (stat.a).stakind2 = 1 THEN (stat.a).stanumbers2 + WHEN (stat.a).stakind3 = 1 THEN (stat.a).stanumbers3 + WHEN (stat.a).stakind4 = 1 THEN (stat.a).stanumbers4 + WHEN (stat.a).stakind5 = 1 THEN (stat.a).stanumbers5 + END) AS most_common_freqs, + (CASE + WHEN (stat.a).stakind1 = 2 THEN (stat.a).stavalues1 + WHEN (stat.a).stakind2 = 2 THEN (stat.a).stavalues2 + WHEN (stat.a).stakind3 = 2 THEN (stat.a).stavalues3 + WHEN (stat.a).stakind4 = 2 THEN (stat.a).stavalues4 + WHEN (stat.a).stakind5 = 2 THEN (stat.a).stavalues5 + END) AS histogram_bounds, + (CASE + WHEN (stat.a).stakind1 = 3 THEN (stat.a).stanumbers1[1] + WHEN (stat.a).stakind2 = 3 THEN (stat.a).stanumbers2[1] + WHEN (stat.a).stakind3 = 3 THEN (stat.a).stanumbers3[1] + WHEN (stat.a).stakind4 = 3 THEN (stat.a).stanumbers4[1] + WHEN (stat.a).stakind5 = 3 THEN (stat.a).stanumbers5[1] + END) correlation, + (CASE + WHEN (stat.a).stakind1 = 4 THEN (stat.a).stavalues1 + WHEN (stat.a).stakind2 = 4 THEN (stat.a).stavalues2 + WHEN (stat.a).stakind3 = 4 THEN (stat.a).stavalues3 + WHEN (stat.a).stakind4 = 4 THEN (stat.a).stavalues4 + WHEN (stat.a).stakind5 = 4 THEN (stat.a).stavalues5 + END) AS most_common_elems, + (CASE + WHEN (stat.a).stakind1 = 4 THEN (stat.a).stanumbers1 + WHEN (stat.a).stakind2 = 4 THEN (stat.a).stanumbers2 + WHEN (stat.a).stakind3 = 4 THEN (stat.a).stanumbers3 + WHEN (stat.a).stakind4 = 4 THEN (stat.a).stanumbers4 + WHEN (stat.a).stakind5 = 4 THEN (stat.a).stanumbers5 + END) AS most_common_elem_freqs, + (CASE + WHEN (stat.a).stakind1 = 5 THEN (stat.a).stanumbers1 + WHEN (stat.a).stakind2 = 5 THEN (stat.a).stanumbers2 + WHEN (stat.a).stakind3 = 5 THEN (stat.a).stanumbers3 + WHEN (stat.a).stakind4 = 5 THEN (stat.a).stanumbers4 + WHEN (stat.a).stakind5 = 5 THEN (stat.a).stanumbers5 + END) AS elem_count_histogram + FROM pg_statistic_ext s JOIN pg_class c ON (c.oid = s.stxrelid) + LEFT JOIN pg_statistic_ext_data sd ON (s.oid = sd.stxoid) + LEFT JOIN pg_namespace cn ON (cn.oid = c.relnamespace) + LEFT JOIN pg_namespace sn ON (sn.oid = s.stxnamespace) + JOIN LATERAL ( + SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr, + unnest(sd.stxdexpr)::pg_statistic AS a + ) stat ON (stat.expr IS NOT NULL); + -- unprivileged users may read pg_statistic_ext but not pg_statistic_ext_data REVOKE ALL on pg_statistic_ext_data FROM public; diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c index 2bae205845..df4768952d 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -29,6 +29,8 @@ #include "commands/comment.h" #include "commands/defrem.h" #include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/optimizer.h" #include "statistics/statistics.h" #include "utils/builtins.h" #include "utils/fmgroids.h" @@ -62,7 +64,8 @@ ObjectAddress CreateStatistics(CreateStatsStmt *stmt) { int16 attnums[STATS_MAX_DIMENSIONS]; - int numcols = 0; + int nattnums = 0; + int numcols; char *namestr; NameData stxname; Oid statoid; @@ -74,21 +77,25 @@ CreateStatistics(CreateStatsStmt *stmt) Datum datavalues[Natts_pg_statistic_ext_data]; bool datanulls[Natts_pg_statistic_ext_data]; int2vector *stxkeys; + List *stxexprs = NIL; + Datum exprsDatum; Relation statrel; Relation datarel; Relation rel = NULL; Oid relid; ObjectAddress parentobject, myself; - Datum types[3]; /* one for each possible type of statistic */ + Datum types[4]; /* one for each possible type of statistic */ int ntypes; ArrayType *stxkind; bool build_ndistinct; bool build_dependencies; bool build_mcv; + bool build_expressions; bool requested_type = false; int i; ListCell *cell; + ListCell *cell2; Assert(IsA(stmt, CreateStatsStmt)); @@ -190,101 +197,124 @@ CreateStatistics(CreateStatsStmt *stmt) } /* - * Currently, we only allow simple column references in the expression - * list. That will change someday, and again the grammar already supports - * it so we have to enforce restrictions here. For now, we can convert - * the expression list to a simple array of attnums. While at it, enforce - * some constraints. + * Make sure no more than STATS_MAX_DIMENSIONS columns are used. There + * might be duplicates and so on, but we'll deal with those later. + */ + numcols = list_length(stmt->exprs); + if (numcols > STATS_MAX_DIMENSIONS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("cannot have more than %d columns in statistics", + STATS_MAX_DIMENSIONS))); + + /* + * Convert the expression list to a simple array of attnums, but also keep + * a list of more complex expressions. While at it, enforce some + * constraints. + * + * XXX We do only the bare minimum to separate simple attribute and + * complex expressions - for example "(a)" will be treated as a complex + * expression. No matter how elaborate the check is, there'll always be a + * way around it, if the user is determined (consider e.g. "(a+0)"), so + * it's not worth protecting against it. */ foreach(cell, stmt->exprs) { Node *expr = (Node *) lfirst(cell); - ColumnRef *cref; - char *attname; + StatsElem *selem; HeapTuple atttuple; Form_pg_attribute attForm; TypeCacheEntry *type; - if (!IsA(expr, ColumnRef)) + /* + * We should not get anything else than StatsElem, given the grammar. + * But let's keep it as a safety. + */ + if (!IsA(expr, StatsElem)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only simple column references are allowed in CREATE STATISTICS"))); - cref = (ColumnRef *) expr; + errmsg("only simple column references and expressions are allowed in CREATE STATISTICS"))); - if (list_length(cref->fields) != 1) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only simple column references are allowed in CREATE STATISTICS"))); - attname = strVal((Value *) linitial(cref->fields)); + selem = (StatsElem *) expr; - atttuple = SearchSysCacheAttName(relid, attname); - if (!HeapTupleIsValid(atttuple)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column \"%s\" does not exist", - attname))); - attForm = (Form_pg_attribute) GETSTRUCT(atttuple); + if (selem->name) /* column reference */ + { + char *attname; - /* Disallow use of system attributes in extended stats */ - if (attForm->attnum <= 0) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("statistics creation on system columns is not supported"))); + attname = selem->name; - /* Disallow data types without a less-than operator */ - type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR); - if (type->lt_opr == InvalidOid) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class", - attname, format_type_be(attForm->atttypid)))); + atttuple = SearchSysCacheAttName(relid, attname); + if (!HeapTupleIsValid(atttuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" does not exist", + attname))); + attForm = (Form_pg_attribute) GETSTRUCT(atttuple); - /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */ - if (numcols >= STATS_MAX_DIMENSIONS) - ereport(ERROR, - (errcode(ERRCODE_TOO_MANY_COLUMNS), - errmsg("cannot have more than %d columns in statistics", - STATS_MAX_DIMENSIONS))); + /* Disallow use of system attributes in extended stats */ + if (attForm->attnum <= 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("statistics creation on system columns is not supported"))); - attnums[numcols] = attForm->attnum; - numcols++; - ReleaseSysCache(atttuple); + /* Disallow data types without a less-than operator */ + type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class", + attname, format_type_be(attForm->atttypid)))); + + attnums[nattnums] = attForm->attnum; + nattnums++; + ReleaseSysCache(atttuple); + } + else /* expression */ + { + Node *expr = selem->expr; + Oid atttype; + + Assert(expr != NULL); + + /* + * Disallow data types without a less-than operator. + * + * We ignore this for statistics on a single expression, in which + * case we'll build the regular statistics only (and that code can + * deal with such data types). + */ + if (list_length(stmt->exprs) > 1) + { + atttype = exprType(expr); + type = lookup_type_cache(atttype, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("expression cannot be used in multivariate statistics because its type %s has no default btree operator class", + format_type_be(atttype)))); + } + + stxexprs = lappend(stxexprs, expr); + } } - /* - * Check that at least two columns were specified in the statement. The - * upper bound was already checked in the loop above. - */ - if (numcols < 2) - ereport(ERROR, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("extended statistics require at least 2 columns"))); - - /* - * Sort the attnums, which makes detecting duplicates somewhat easier, and - * it does not hurt (it does not affect the efficiency, unlike for - * indexes, for example). - */ - qsort(attnums, numcols, sizeof(int16), compare_int16); - - /* - * Check for duplicates in the list of columns. The attnums are sorted so - * just check consecutive elements. - */ - for (i = 1; i < numcols; i++) - { - if (attnums[i] == attnums[i - 1]) - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_COLUMN), - errmsg("duplicate column name in statistics definition"))); - } - - /* Form an int2vector representation of the sorted column list */ - stxkeys = buildint2vector(attnums, numcols); - /* * Parse the statistics kinds. + * + * First check that if this is the case with a single expression, there + * are no statistics kinds specified (we don't allow that for the simple + * CREATE STATISTICS form). */ + if ((list_length(stmt->exprs) == 1) && (list_length(stxexprs) == 1)) + { + /* statistics kinds not specified */ + if (list_length(stmt->stat_types) > 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("when building statistics on a single expression, statistics kinds may not be specified"))); + } + + /* OK, let's check that we recognize the statistics kinds. */ build_ndistinct = false; build_dependencies = false; build_mcv = false; @@ -313,14 +343,91 @@ CreateStatistics(CreateStatsStmt *stmt) errmsg("unrecognized statistics kind \"%s\"", type))); } - /* If no statistic type was specified, build them all. */ - if (!requested_type) + + /* + * If no statistic type was specified, build them all (but only when the + * statistics is defined on more than one column/expression). + */ + if ((!requested_type) && (numcols >= 2)) { build_ndistinct = true; build_dependencies = true; build_mcv = true; } + /* + * When there are non-trivial expressions, build the expression stats + * automatically. This allows calculating good estimates for stats that + * consider per-clause estimates (e.g. functional dependencies). + */ + build_expressions = (list_length(stxexprs) > 0); + + /* + * Check that at least two columns were specified in the statement, or + * that we're building statistics on a single expression. + */ + if ((numcols < 2) && (list_length(stxexprs) != 1)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("extended statistics require at least 2 columns"))); + + /* + * Sort the attnums, which makes detecting duplicates somewhat easier, and + * it does not hurt (it does not matter for the contents, unlike for + * indexes, for example). + */ + qsort(attnums, nattnums, sizeof(int16), compare_int16); + + /* + * Check for duplicates in the list of columns. The attnums are sorted so + * just check consecutive elements. + */ + for (i = 1; i < nattnums; i++) + { + if (attnums[i] == attnums[i - 1]) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_COLUMN), + errmsg("duplicate column name in statistics definition"))); + } + + /* + * Check for duplicate expressions. We do two loops, counting the + * occurrences of each expression. This is O(N^2) but we only allow small + * number of expressions and it's not executed often. + * + * XXX We don't cross-check attributes and expressions, because it does + * not seem worth it. In principle we could check that expressions don't + * contain trivial attribute references like "(a)", but the reasoning is + * similar to why we don't bother with extracting columns from + * expressions. It's either expensive or very easy to defeat for + * determined user, and there's no risk if we allow such statistics (the + * statistics is useless, but harmless). + */ + foreach(cell, stxexprs) + { + Node *expr1 = (Node *) lfirst(cell); + int cnt = 0; + + foreach(cell2, stxexprs) + { + Node *expr2 = (Node *) lfirst(cell2); + + if (equal(expr1, expr2)) + cnt += 1; + } + + /* every expression should find at least itself */ + Assert(cnt >= 1); + + if (cnt > 1) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_COLUMN), + errmsg("duplicate expression in statistics definition"))); + } + + /* Form an int2vector representation of the sorted column list */ + stxkeys = buildint2vector(attnums, nattnums); + /* construct the char array of enabled statistic types */ ntypes = 0; if (build_ndistinct) @@ -329,9 +436,23 @@ CreateStatistics(CreateStatsStmt *stmt) types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES); if (build_mcv) types[ntypes++] = CharGetDatum(STATS_EXT_MCV); + if (build_expressions) + types[ntypes++] = CharGetDatum(STATS_EXT_EXPRESSIONS); Assert(ntypes > 0 && ntypes <= lengthof(types)); stxkind = construct_array(types, ntypes, CHAROID, 1, true, TYPALIGN_CHAR); + /* convert the expressions (if any) to a text datum */ + if (stxexprs != NIL) + { + char *exprsString; + + exprsString = nodeToString(stxexprs); + exprsDatum = CStringGetTextDatum(exprsString); + pfree(exprsString); + } + else + exprsDatum = (Datum) 0; + statrel = table_open(StatisticExtRelationId, RowExclusiveLock); /* @@ -351,6 +472,10 @@ CreateStatistics(CreateStatsStmt *stmt) values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys); values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind); + values[Anum_pg_statistic_ext_stxexprs - 1] = exprsDatum; + if (exprsDatum == (Datum) 0) + nulls[Anum_pg_statistic_ext_stxexprs - 1] = true; + /* insert it into pg_statistic_ext */ htup = heap_form_tuple(statrel->rd_att, values, nulls); CatalogTupleInsert(statrel, htup); @@ -373,6 +498,7 @@ CreateStatistics(CreateStatsStmt *stmt) datanulls[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true; datanulls[Anum_pg_statistic_ext_data_stxddependencies - 1] = true; datanulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true; + datanulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = true; /* insert it into pg_statistic_ext_data */ htup = heap_form_tuple(datarel->rd_att, datavalues, datanulls); @@ -396,12 +522,41 @@ CreateStatistics(CreateStatsStmt *stmt) */ ObjectAddressSet(myself, StatisticExtRelationId, statoid); - for (i = 0; i < numcols; i++) + /* add dependencies for plain column references */ + for (i = 0; i < nattnums; i++) { ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]); recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); } + /* + * If there are no dependencies on a column, give the statistics an auto + * dependency on the whole table. In most cases, this will be redundant, + * but it might not be if the statistics expressions contain no Vars + * (which might seem strange but possible). This is consistent with what + * we do for indexes in index_create. + * + * XXX We intentionally don't consider the expressions before adding this + * dependency, because recordDependencyOnSingleRelExpr may not create any + * dependencies for whole-row Vars. + */ + if (!nattnums) + { + ObjectAddressSet(parentobject, RelationRelationId, relid); + recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); + } + + /* + * Store dependencies on anything mentioned in statistics expressions, + * just like we do for index expressions. + */ + if (stxexprs) + recordDependencyOnSingleRelExpr(&myself, + (Node *) stxexprs, + relid, + DEPENDENCY_NORMAL, + DEPENDENCY_AUTO, false, true); + /* * Also add dependencies on namespace and owner. These are required * because the stats object might have a different namespace and/or owner @@ -582,87 +737,6 @@ RemoveStatisticsById(Oid statsOid) table_close(relation, RowExclusiveLock); } -/* - * Update a statistics object for ALTER COLUMN TYPE on a source column. - * - * This could throw an error if the type change can't be supported. - * If it can be supported, but the stats must be recomputed, a likely choice - * would be to set the relevant column(s) of the pg_statistic_ext_data tuple - * to null until the next ANALYZE. (Note that the type change hasn't actually - * happened yet, so one option that's *not* on the table is to recompute - * immediately.) - * - * For both ndistinct and functional-dependencies stats, the on-disk - * representation is independent of the source column data types, and it is - * plausible to assume that the old statistic values will still be good for - * the new column contents. (Obviously, if the ALTER COLUMN TYPE has a USING - * expression that substantially alters the semantic meaning of the column - * values, this assumption could fail. But that seems like a corner case - * that doesn't justify zapping the stats in common cases.) - * - * For MCV lists that's not the case, as those statistics store the datums - * internally. In this case we simply reset the statistics value to NULL. - * - * Note that "type change" includes collation change, which means we can rely - * on the MCV list being consistent with the collation info in pg_attribute - * during estimation. - */ -void -UpdateStatisticsForTypeChange(Oid statsOid, Oid relationOid, int attnum, - Oid oldColumnType, Oid newColumnType) -{ - HeapTuple stup, - oldtup; - - Relation rel; - - Datum values[Natts_pg_statistic_ext_data]; - bool nulls[Natts_pg_statistic_ext_data]; - bool replaces[Natts_pg_statistic_ext_data]; - - oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statsOid)); - if (!HeapTupleIsValid(oldtup)) - elog(ERROR, "cache lookup failed for statistics object %u", statsOid); - - /* - * When none of the defined statistics types contain datum values from the - * table's columns then there's no need to reset the stats. Functional - * dependencies and ndistinct stats should still hold true. - */ - if (!statext_is_kind_built(oldtup, STATS_EXT_MCV)) - { - ReleaseSysCache(oldtup); - return; - } - - /* - * OK, we need to reset some statistics. So let's build the new tuple, - * replacing the affected statistics types with NULL. - */ - memset(nulls, 0, Natts_pg_statistic_ext_data * sizeof(bool)); - memset(replaces, 0, Natts_pg_statistic_ext_data * sizeof(bool)); - memset(values, 0, Natts_pg_statistic_ext_data * sizeof(Datum)); - - replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true; - nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true; - - rel = table_open(StatisticExtDataRelationId, RowExclusiveLock); - - /* replace the old tuple */ - stup = heap_modify_tuple(oldtup, - RelationGetDescr(rel), - values, - nulls, - replaces); - - ReleaseSysCache(oldtup); - CatalogTupleUpdate(rel, &stup->t_self, stup); - - heap_freetuple(stup); - - table_close(rel, RowExclusiveLock); -} - /* * Select a nonconflicting name for a new statistics. * @@ -731,18 +805,27 @@ ChooseExtendedStatisticNameAddition(List *exprs) buf[0] = '\0'; foreach(lc, exprs) { - ColumnRef *cref = (ColumnRef *) lfirst(lc); + StatsElem *selem = (StatsElem *) lfirst(lc); const char *name; /* It should be one of these, but just skip if it happens not to be */ - if (!IsA(cref, ColumnRef)) + if (!IsA(selem, StatsElem)) continue; - name = strVal((Value *) linitial(cref->fields)); + name = selem->name; if (buflen > 0) buf[buflen++] = '_'; /* insert _ between names */ + /* + * We use fixed 'expr' for expressions, which have empty column names. + * For indexes this is handled in ChooseIndexColumnNames, but we have + * no such function for stats and it does not seem worth adding. If a + * better name is needed, the user can specify it explicitly. + */ + if (!name) + name = "expr"; + /* * At this point we have buflen <= NAMEDATALEN. name should be less * than NAMEDATALEN already, but use strlcpy for paranoia. @@ -754,3 +837,29 @@ ChooseExtendedStatisticNameAddition(List *exprs) } return pstrdup(buf); } + +/* + * StatisticsGetRelation: given a statistics's relation OID, get the OID of + * the relation it is an statistics on. Uses the system cache. + */ +Oid +StatisticsGetRelation(Oid statId, bool missing_ok) +{ + HeapTuple tuple; + Form_pg_statistic_ext stx; + Oid result; + + tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statId)); + if (!HeapTupleIsValid(tuple)) + { + if (missing_ok) + return InvalidOid; + elog(ERROR, "cache lookup failed for statistics object %u", statId); + } + stx = (Form_pg_statistic_ext) GETSTRUCT(tuple); + Assert(stx->oid == statId); + + result = stx->stxrelid; + ReleaseSysCache(tuple); + return result; +} diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index efac06f72c..88a68a4697 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -41,6 +41,7 @@ #include "catalog/pg_namespace.h" #include "catalog/pg_opclass.h" #include "catalog/pg_tablespace.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_trigger.h" #include "catalog/pg_type.h" #include "catalog/storage.h" @@ -188,6 +189,8 @@ typedef struct AlteredTableInfo List *changedIndexDefs; /* string definitions of same */ char *replicaIdentityIndex; /* index to reset as REPLICA IDENTITY */ char *clusterOnIndex; /* index to use for CLUSTER */ + List *changedStatisticsOids; /* OIDs of statistics to rebuild */ + List *changedStatisticsDefs; /* string definitions of same */ } AlteredTableInfo; /* Struct describing one new constraint to check in Phase 3 scan */ @@ -440,6 +443,8 @@ static ObjectAddress ATExecDropColumn(List **wqueue, Relation rel, const char *c ObjectAddresses *addrs); static ObjectAddress ATExecAddIndex(AlteredTableInfo *tab, Relation rel, IndexStmt *stmt, bool is_rebuild, LOCKMODE lockmode); +static ObjectAddress ATExecAddStatistics(AlteredTableInfo *tab, Relation rel, + CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode); static ObjectAddress ATExecAddConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, Constraint *newConstraint, bool recurse, bool is_readd, @@ -496,6 +501,7 @@ static ObjectAddress ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel, AlterTableCmd *cmd, LOCKMODE lockmode); static void RememberConstraintForRebuilding(Oid conoid, AlteredTableInfo *tab); static void RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab); +static void RememberStatisticsForRebuilding(Oid indoid, AlteredTableInfo *tab); static void ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode); static void ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, @@ -4756,6 +4762,10 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, address = ATExecAddIndex(tab, rel, (IndexStmt *) cmd->def, true, lockmode); break; + case AT_ReAddStatistics: /* ADD STATISTICS */ + address = ATExecAddStatistics(tab, rel, (CreateStatsStmt *) cmd->def, + true, lockmode); + break; case AT_AddConstraint: /* ADD CONSTRAINT */ /* Transform the command only during initial examination */ if (cur_pass == AT_PASS_ADD_CONSTR) @@ -8283,6 +8293,29 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, return address; } +/* + * ALTER TABLE ADD STATISTICS + * + * This is no such command in the grammar, but we use this internally to add + * AT_ReAddStatistics subcommands to rebuild extended statistics after a table + * column type change. + */ +static ObjectAddress +ATExecAddStatistics(AlteredTableInfo *tab, Relation rel, + CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode) +{ + ObjectAddress address; + + Assert(IsA(stmt, CreateStatsStmt)); + + /* The CreateStatsStmt has already been through transformStatsStmt */ + Assert(stmt->transformed); + + address = CreateStatistics(stmt); + + return address; +} + /* * ALTER TABLE ADD CONSTRAINT USING INDEX * @@ -11830,9 +11863,7 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel, * Give the extended-stats machinery a chance to fix anything * that this column type change would break. */ - UpdateStatisticsForTypeChange(foundObject.objectId, - RelationGetRelid(rel), attnum, - attTup->atttypid, targettype); + RememberStatisticsForRebuilding(foundObject.objectId, tab); break; case OCLASS_PROC: @@ -12202,6 +12233,32 @@ RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab) } } +/* + * Subroutine for ATExecAlterColumnType: remember that a statistics object + * needs to be rebuilt (which we might already know). + */ +static void +RememberStatisticsForRebuilding(Oid stxoid, AlteredTableInfo *tab) +{ + /* + * This de-duplication check is critical for two independent reasons: we + * mustn't try to recreate the same statistics object twice, and if the + * statistics depends on more than one column whose type is to be altered, + * we must capture its definition string before applying any of the type + * changes. ruleutils.c will get confused if we ask again later. + */ + if (!list_member_oid(tab->changedStatisticsOids, stxoid)) + { + /* OK, capture the index's existing definition string */ + char *defstring = pg_get_statisticsobjdef_string(stxoid); + + tab->changedStatisticsOids = lappend_oid(tab->changedStatisticsOids, + stxoid); + tab->changedStatisticsDefs = lappend(tab->changedStatisticsDefs, + defstring); + } +} + /* * Cleanup after we've finished all the ALTER TYPE operations for a * particular relation. We have to drop and recreate all the indexes @@ -12306,6 +12363,22 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) add_exact_object_address(&obj, objects); } + /* add dependencies for new statistics */ + forboth(oid_item, tab->changedStatisticsOids, + def_item, tab->changedStatisticsDefs) + { + Oid oldId = lfirst_oid(oid_item); + Oid relid; + + relid = StatisticsGetRelation(oldId, false); + ATPostAlterTypeParse(oldId, relid, InvalidOid, + (char *) lfirst(def_item), + wqueue, lockmode, tab->rewrite); + + ObjectAddressSet(obj, StatisticExtRelationId, oldId); + add_exact_object_address(&obj, objects); + } + /* * Queue up command to restore replica identity index marking */ @@ -12354,9 +12427,9 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) } /* - * Parse the previously-saved definition string for a constraint or index - * against the newly-established column data type(s), and queue up the - * resulting command parsetrees for execution. + * Parse the previously-saved definition string for a constraint, index or + * statistics object against the newly-established column data type(s), and + * queue up the resulting command parsetrees for execution. * * This might fail if, for example, you have a WHERE clause that uses an * operator that's not available for the new column type. @@ -12402,6 +12475,11 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, querytree_list = lappend(querytree_list, stmt); querytree_list = list_concat(querytree_list, afterStmts); } + else if (IsA(stmt, CreateStatsStmt)) + querytree_list = lappend(querytree_list, + transformStatsStmt(oldRelId, + (CreateStatsStmt *) stmt, + cmd)); else querytree_list = lappend(querytree_list, stmt); } @@ -12540,6 +12618,20 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, elog(ERROR, "unexpected statement subtype: %d", (int) stmt->subtype); } + else if (IsA(stm, CreateStatsStmt)) + { + CreateStatsStmt *stmt = (CreateStatsStmt *) stm; + AlterTableCmd *newcmd; + + /* keep the statistics object's comment */ + stmt->stxcomment = GetComment(oldId, StatisticExtRelationId, 0); + + newcmd = makeNode(AlterTableCmd); + newcmd->subtype = AT_ReAddStatistics; + newcmd->def = (Node *) stmt; + tab->subcmds[AT_PASS_MISC] = + lappend(tab->subcmds[AT_PASS_MISC], newcmd); + } else elog(ERROR, "unexpected statement type: %d", (int) nodeTag(stm)); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 38b56231b7..d5b1ad4567 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2980,6 +2980,17 @@ _copyIndexElem(const IndexElem *from) return newnode; } +static StatsElem * +_copyStatsElem(const StatsElem *from) +{ + StatsElem *newnode = makeNode(StatsElem); + + COPY_STRING_FIELD(name); + COPY_NODE_FIELD(expr); + + return newnode; +} + static ColumnDef * _copyColumnDef(const ColumnDef *from) { @@ -5699,6 +5710,9 @@ copyObjectImpl(const void *from) case T_IndexElem: retval = _copyIndexElem(from); break; + case T_StatsElem: + retval = _copyStatsElem(from); + break; case T_ColumnDef: retval = _copyColumnDef(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 3292dda342..d46909bbc4 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2596,6 +2596,16 @@ _equalIndexElem(const IndexElem *a, const IndexElem *b) return true; } + +static bool +_equalStatsElem(const StatsElem *a, const StatsElem *b) +{ + COMPARE_STRING_FIELD(name); + COMPARE_NODE_FIELD(expr); + + return true; +} + static bool _equalColumnDef(const ColumnDef *a, const ColumnDef *b) { @@ -3724,6 +3734,9 @@ equal(const void *a, const void *b) case T_IndexElem: retval = _equalIndexElem(a, b); break; + case T_StatsElem: + retval = _equalStatsElem(a, b); + break; case T_ColumnDef: retval = _equalColumnDef(a, b); break; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 9f7918c7e9..12561c4757 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2943,6 +2943,15 @@ _outIndexElem(StringInfo str, const IndexElem *node) WRITE_ENUM_FIELD(nulls_ordering, SortByNulls); } +static void +_outStatsElem(StringInfo str, const StatsElem *node) +{ + WRITE_NODE_TYPE("STATSELEM"); + + WRITE_STRING_FIELD(name); + WRITE_NODE_FIELD(expr); +} + static void _outQuery(StringInfo str, const Query *node) { @@ -4286,6 +4295,9 @@ outNode(StringInfo str, const void *obj) case T_IndexElem: _outIndexElem(str, obj); break; + case T_StatsElem: + _outStatsElem(str, obj); + break; case T_Query: _outQuery(str, obj); break; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 6c39bf893f..0fa8875f09 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -34,6 +34,7 @@ #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" #include "nodes/supportnodes.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" @@ -1308,6 +1309,7 @@ get_relation_constraints(PlannerInfo *root, static List * get_relation_statistics(RelOptInfo *rel, Relation relation) { + Index varno = rel->relid; List *statoidlist; List *stainfos = NIL; ListCell *l; @@ -1321,6 +1323,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) HeapTuple htup; HeapTuple dtup; Bitmapset *keys = NULL; + List *exprs = NIL; int i; htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); @@ -1340,6 +1343,49 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) for (i = 0; i < staForm->stxkeys.dim1; i++) keys = bms_add_member(keys, staForm->stxkeys.values[i]); + /* + * Preprocess expressions (if any). We read the expressions, run them + * through eval_const_expressions, and fix the varnos. + */ + { + bool isnull; + Datum datum; + + /* decode expression (if any) */ + datum = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_stxexprs, &isnull); + + if (!isnull) + { + char *exprsString; + + exprsString = TextDatumGetCString(datum); + exprs = (List *) stringToNode(exprsString); + pfree(exprsString); + + /* + * Run the expressions through eval_const_expressions. This is + * not just an optimization, but is necessary, because the + * planner will be comparing them to similarly-processed qual + * clauses, and may fail to detect valid matches without this. + * We must not use canonicalize_qual, however, since these + * aren't qual expressions. + */ + exprs = (List *) eval_const_expressions(NULL, (Node *) exprs); + + /* May as well fix opfuncids too */ + fix_opfuncids((Node *) exprs); + + /* + * Modify the copies we obtain from the relcache to have the + * correct varno for the parent relation, so that they match + * up correctly against qual clauses. + */ + if (varno != 1) + ChangeVarNodes((Node *) exprs, 1, varno, 0); + } + } + /* add one StatisticExtInfo for each kind built */ if (statext_is_kind_built(dtup, STATS_EXT_NDISTINCT)) { @@ -1349,6 +1395,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) info->rel = rel; info->kind = STATS_EXT_NDISTINCT; info->keys = bms_copy(keys); + info->exprs = exprs; stainfos = lappend(stainfos, info); } @@ -1361,6 +1408,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) info->rel = rel; info->kind = STATS_EXT_DEPENDENCIES; info->keys = bms_copy(keys); + info->exprs = exprs; stainfos = lappend(stainfos, info); } @@ -1373,6 +1421,20 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) info->rel = rel; info->kind = STATS_EXT_MCV; info->keys = bms_copy(keys); + info->exprs = exprs; + + stainfos = lappend(stainfos, info); + } + + if (statext_is_kind_built(dtup, STATS_EXT_EXPRESSIONS)) + { + StatisticExtInfo *info = makeNode(StatisticExtInfo); + + info->statOid = statOid; + info->rel = rel; + info->kind = STATS_EXT_EXPRESSIONS; + info->keys = bms_copy(keys); + info->exprs = exprs; stainfos = lappend(stainfos, info); } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 2132cf4d82..7ff36bc842 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -239,6 +239,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); WindowDef *windef; JoinExpr *jexpr; IndexElem *ielem; + StatsElem *selem; Alias *alias; RangeVar *range; IntoClause *into; @@ -405,7 +406,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); old_aggr_definition old_aggr_list oper_argtypes RuleActionList RuleActionMulti opt_column_list columnList opt_name_list - sort_clause opt_sort_clause sortby_list index_params + sort_clause opt_sort_clause sortby_list index_params stats_params opt_include opt_c_include index_including_params name_list role_list from_clause from_list opt_array_bounds qualified_name_list any_name any_name_list type_name_list @@ -512,6 +513,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type func_alias_clause %type sortby %type index_elem index_elem_options +%type stats_param %type table_ref %type joined_table %type relation_expr @@ -4097,7 +4099,7 @@ ExistingIndex: USING INDEX name { $$ = $3; } CreateStatsStmt: CREATE STATISTICS any_name - opt_name_list ON expr_list FROM from_list + opt_name_list ON stats_params FROM from_list { CreateStatsStmt *n = makeNode(CreateStatsStmt); n->defnames = $3; @@ -4109,7 +4111,7 @@ CreateStatsStmt: $$ = (Node *)n; } | CREATE STATISTICS IF_P NOT EXISTS any_name - opt_name_list ON expr_list FROM from_list + opt_name_list ON stats_params FROM from_list { CreateStatsStmt *n = makeNode(CreateStatsStmt); n->defnames = $6; @@ -4122,6 +4124,36 @@ CreateStatsStmt: } ; +/* + * Statistics attributes can be either simple column references, or arbitrary + * expressions in parens. For compatibility with index attributes permitted + * in CREATE INDEX, we allow an expression that's just a function call to be + * written without parens. + */ + +stats_params: stats_param { $$ = list_make1($1); } + | stats_params ',' stats_param { $$ = lappend($1, $3); } + ; + +stats_param: ColId + { + $$ = makeNode(StatsElem); + $$->name = $1; + $$->expr = NULL; + } + | func_expr_windowless + { + $$ = makeNode(StatsElem); + $$->name = NULL; + $$->expr = $1; + } + | '(' a_expr ')' + { + $$ = makeNode(StatsElem); + $$->name = NULL; + $$->expr = $2; + } + ; /***************************************************************************** * diff --git a/src/backend/parser/parse_agg.c b/src/backend/parser/parse_agg.c index 7c3e01aa22..ceb0bf597d 100644 --- a/src/backend/parser/parse_agg.c +++ b/src/backend/parser/parse_agg.c @@ -484,6 +484,13 @@ check_agglevels_and_constraints(ParseState *pstate, Node *expr) else err = _("grouping operations are not allowed in index predicates"); + break; + case EXPR_KIND_STATS_EXPRESSION: + if (isAgg) + err = _("aggregate functions are not allowed in statistics expressions"); + else + err = _("grouping operations are not allowed in statistics expressions"); + break; case EXPR_KIND_ALTER_COL_TRANSFORM: if (isAgg) @@ -910,6 +917,9 @@ transformWindowFuncCall(ParseState *pstate, WindowFunc *wfunc, case EXPR_KIND_INDEX_EXPRESSION: err = _("window functions are not allowed in index expressions"); break; + case EXPR_KIND_STATS_EXPRESSION: + err = _("window functions are not allowed in statistics expressions"); + break; case EXPR_KIND_INDEX_PREDICATE: err = _("window functions are not allowed in index predicates"); break; diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index f869e159d6..03373d551f 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -500,6 +500,7 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref) case EXPR_KIND_FUNCTION_DEFAULT: case EXPR_KIND_INDEX_EXPRESSION: case EXPR_KIND_INDEX_PREDICATE: + case EXPR_KIND_STATS_EXPRESSION: case EXPR_KIND_ALTER_COL_TRANSFORM: case EXPR_KIND_EXECUTE_PARAMETER: case EXPR_KIND_TRIGGER_WHEN: @@ -1741,6 +1742,9 @@ transformSubLink(ParseState *pstate, SubLink *sublink) case EXPR_KIND_INDEX_PREDICATE: err = _("cannot use subquery in index predicate"); break; + case EXPR_KIND_STATS_EXPRESSION: + err = _("cannot use subquery in statistics expression"); + break; case EXPR_KIND_ALTER_COL_TRANSFORM: err = _("cannot use subquery in transform expression"); break; @@ -3030,6 +3034,8 @@ ParseExprKindName(ParseExprKind exprKind) return "index expression"; case EXPR_KIND_INDEX_PREDICATE: return "index predicate"; + case EXPR_KIND_STATS_EXPRESSION: + return "statistics expression"; case EXPR_KIND_ALTER_COL_TRANSFORM: return "USING"; case EXPR_KIND_EXECUTE_PARAMETER: diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index 37cebc7d82..debef1d14f 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -2503,6 +2503,9 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location) case EXPR_KIND_INDEX_PREDICATE: err = _("set-returning functions are not allowed in index predicates"); break; + case EXPR_KIND_STATS_EXPRESSION: + err = _("set-returning functions are not allowed in statistics expressions"); + break; case EXPR_KIND_ALTER_COL_TRANSFORM: err = _("set-returning functions are not allowed in transform expressions"); break; diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index aa6c19adad..b968c25dd6 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1917,6 +1917,9 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid, stat_types = lappend(stat_types, makeString("dependencies")); else if (enabled[i] == STATS_EXT_MCV) stat_types = lappend(stat_types, makeString("mcv")); + else if (enabled[i] == STATS_EXT_EXPRESSIONS) + /* expression stats are not exposed to users */ + continue; else elog(ERROR, "unrecognized statistics kind %c", enabled[i]); } @@ -1924,14 +1927,47 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid, /* Determine which columns the statistics are on */ for (i = 0; i < statsrec->stxkeys.dim1; i++) { - ColumnRef *cref = makeNode(ColumnRef); + StatsElem *selem = makeNode(StatsElem); AttrNumber attnum = statsrec->stxkeys.values[i]; - cref->fields = list_make1(makeString(get_attname(heapRelid, - attnum, false))); - cref->location = -1; + selem->name = get_attname(heapRelid, attnum, false); + selem->expr = NULL; - def_names = lappend(def_names, cref); + def_names = lappend(def_names, selem); + } + + /* + * Now handle expressions, if there are any. The order (with respect to + * regular attributes) does not really matter for extended stats, so we + * simply append them after simple column references. + * + * XXX Some places during build/estimation treat expressions as if they + * are before atttibutes, but for the CREATE command that's entirely + * irrelevant. + */ + datum = SysCacheGetAttr(STATEXTOID, ht_stats, + Anum_pg_statistic_ext_stxexprs, &isnull); + + if (!isnull) + { + ListCell *lc; + List *exprs = NIL; + char *exprsString; + + exprsString = TextDatumGetCString(datum); + exprs = (List *) stringToNode(exprsString); + + foreach(lc, exprs) + { + StatsElem *selem = makeNode(StatsElem); + + selem->name = NULL; + selem->expr = (Node *) lfirst(lc); + + def_names = lappend(def_names, selem); + } + + pfree(exprsString); } /* finally, build the output node */ @@ -1942,6 +1978,7 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid, stats->relations = list_make1(heapRel); stats->stxcomment = NULL; stats->if_not_exists = false; + stats->transformed = true; /* don't need transformStatsStmt again */ /* Clean up */ ReleaseSysCache(ht_stats); @@ -2866,6 +2903,84 @@ transformIndexStmt(Oid relid, IndexStmt *stmt, const char *queryString) return stmt; } +/* + * transformStatsStmt - parse analysis for CREATE STATISTICS + * + * To avoid race conditions, it's important that this function rely only on + * the passed-in relid (and not on stmt->relation) to determine the target + * relation. + */ +CreateStatsStmt * +transformStatsStmt(Oid relid, CreateStatsStmt *stmt, const char *queryString) +{ + ParseState *pstate; + ParseNamespaceItem *nsitem; + ListCell *l; + Relation rel; + + /* Nothing to do if statement already transformed. */ + if (stmt->transformed) + return stmt; + + /* + * We must not scribble on the passed-in CreateStatsStmt, so copy it. + * (This is overkill, but easy.) + */ + stmt = copyObject(stmt); + + /* Set up pstate */ + pstate = make_parsestate(NULL); + pstate->p_sourcetext = queryString; + + /* + * Put the parent table into the rtable so that the expressions can refer + * to its fields without qualification. Caller is responsible for locking + * relation, but we still need to open it. + */ + rel = relation_open(relid, NoLock); + nsitem = addRangeTableEntryForRelation(pstate, rel, + AccessShareLock, + NULL, false, true); + + /* no to join list, yes to namespaces */ + addNSItemToQuery(pstate, nsitem, false, true, true); + + /* take care of any expressions */ + foreach(l, stmt->exprs) + { + StatsElem *selem = (StatsElem *) lfirst(l); + + if (selem->expr) + { + /* Now do parse transformation of the expression */ + selem->expr = transformExpr(pstate, selem->expr, + EXPR_KIND_STATS_EXPRESSION); + + /* We have to fix its collations too */ + assign_expr_collations(pstate, selem->expr); + } + } + + /* + * Check that only the base rel is mentioned. (This should be dead code + * now that add_missing_from is history.) + */ + if (list_length(pstate->p_rtable) != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("statistics expressions can refer only to the table being indexed"))); + + free_parsestate(pstate); + + /* Close relation */ + table_close(rel, NoLock); + + /* Mark statement as successfully transformed */ + stmt->transformed = true; + + return stmt; +} + /* * transformRuleStmt - diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c index eac9285165..cf8a6d5f68 100644 --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -70,15 +70,15 @@ static void generate_dependencies(DependencyGenerator state); static DependencyGenerator DependencyGenerator_init(int n, int k); static void DependencyGenerator_free(DependencyGenerator state); static AttrNumber *DependencyGenerator_next(DependencyGenerator state); -static double dependency_degree(int numrows, HeapTuple *rows, int k, - AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs); +static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency); static bool dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums); static bool dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum); +static bool dependency_is_compatible_expression(Node *clause, Index relid, + List *statlist, Node **expr); static MVDependency *find_strongest_dependency(MVDependencies **dependencies, - int ndependencies, - Bitmapset *attnums); + int ndependencies, Bitmapset *attnums); static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, @@ -219,16 +219,13 @@ DependencyGenerator_next(DependencyGenerator state) * the last one. */ static double -dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, - VacAttrStats **stats, Bitmapset *attrs) +dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency) { int i, nitems; MultiSortSupport mss; SortItem *items; - AttrNumber *attnums; AttrNumber *attnums_dep; - int numattrs; /* counters valid within a group */ int group_size = 0; @@ -244,15 +241,12 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, mss = multi_sort_init(k); /* - * Transform the attrs from bitmap to an array to make accessing the i-th - * member easier, and then construct a filtered version with only attnums - * referenced by the dependency we validate. + * Translate the array of indexes to regular attnums for the dependency (we + * will need this to identify the columns in StatsBuildData). */ - attnums = build_attnums_array(attrs, &numattrs); - attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber)); for (i = 0; i < k; i++) - attnums_dep[i] = attnums[dependency[i]]; + attnums_dep[i] = data->attnums[dependency[i]]; /* * Verify the dependency (a,b,...)->z, using a rather simple algorithm: @@ -270,7 +264,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, /* prepare the sort function for the dimensions */ for (i = 0; i < k; i++) { - VacAttrStats *colstat = stats[dependency[i]]; + VacAttrStats *colstat = data->stats[dependency[i]]; TypeCacheEntry *type; type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR); @@ -289,8 +283,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, * descriptor. For now that assumption holds, but it might change in the * future for example if we support statistics on multiple tables. */ - items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc, - mss, k, attnums_dep); + items = build_sorted_items(data, &nitems, mss, k, attnums_dep); /* * Walk through the sorted array, split it into rows according to the @@ -336,11 +329,10 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, pfree(items); pfree(mss); - pfree(attnums); pfree(attnums_dep); /* Compute the 'degree of validity' as (supporting/total). */ - return (n_supporting_rows * 1.0 / numrows); + return (n_supporting_rows * 1.0 / data->numrows); } /* @@ -360,23 +352,15 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, * (c) -> b */ MVDependencies * -statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, - VacAttrStats **stats) +statext_dependencies_build(StatsBuildData *data) { int i, k; - int numattrs; - AttrNumber *attnums; /* result */ MVDependencies *dependencies = NULL; - /* - * Transform the bms into an array, to make accessing i-th member easier. - */ - attnums = build_attnums_array(attrs, &numattrs); - - Assert(numattrs >= 2); + Assert(data->nattnums >= 2); /* * We'll try build functional dependencies starting from the smallest ones @@ -384,12 +368,12 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, * included in the statistics object. We start from the smallest ones * because we want to be able to skip already implied ones. */ - for (k = 2; k <= numattrs; k++) + for (k = 2; k <= data->nattnums; k++) { AttrNumber *dependency; /* array with k elements */ /* prepare a DependencyGenerator of variation */ - DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k); + DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k); /* generate all possible variations of k values (out of n) */ while ((dependency = DependencyGenerator_next(DependencyGenerator))) @@ -398,7 +382,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, MVDependency *d; /* compute how valid the dependency seems */ - degree = dependency_degree(numrows, rows, k, dependency, stats, attrs); + degree = dependency_degree(data, k, dependency); /* * if the dependency seems entirely invalid, don't store it @@ -413,7 +397,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, d->degree = degree; d->nattributes = k; for (i = 0; i < k; i++) - d->attributes[i] = attnums[dependency[i]]; + d->attributes[i] = data->attnums[dependency[i]]; /* initialize the list of dependencies */ if (dependencies == NULL) @@ -747,6 +731,7 @@ static bool dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) { Var *var; + Node *clause_expr; if (IsA(clause, RestrictInfo)) { @@ -774,9 +759,9 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) /* Make sure non-selected argument is a pseudoconstant. */ if (is_pseudo_constant_clause(lsecond(expr->args))) - var = linitial(expr->args); + clause_expr = linitial(expr->args); else if (is_pseudo_constant_clause(linitial(expr->args))) - var = lsecond(expr->args); + clause_expr = lsecond(expr->args); else return false; @@ -805,8 +790,8 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) /* * Reject ALL() variant, we only care about ANY/IN. * - * FIXME Maybe we should check if all the values are the same, and - * allow ALL in that case? Doesn't seem very practical, though. + * XXX Maybe we should check if all the values are the same, and allow + * ALL in that case? Doesn't seem very practical, though. */ if (!expr->useOr) return false; @@ -822,7 +807,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) if (!is_pseudo_constant_clause(lsecond(expr->args))) return false; - var = linitial(expr->args); + clause_expr = linitial(expr->args); /* * If it's not an "=" operator, just ignore the clause, as it's not @@ -838,13 +823,13 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) } else if (is_orclause(clause)) { - BoolExpr *expr = (BoolExpr *) clause; + BoolExpr *bool_expr = (BoolExpr *) clause; ListCell *lc; /* start with no attribute number */ *attnum = InvalidAttrNumber; - foreach(lc, expr->args) + foreach(lc, bool_expr->args) { AttrNumber clause_attnum; @@ -859,6 +844,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) if (*attnum == InvalidAttrNumber) *attnum = clause_attnum; + /* ensure all the variables are the same (same attnum) */ if (*attnum != clause_attnum) return false; } @@ -872,7 +858,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) * "NOT x" can be interpreted as "x = false", so get the argument and * proceed with seeing if it's a suitable Var. */ - var = (Var *) get_notclausearg(clause); + clause_expr = (Node *) get_notclausearg(clause); } else { @@ -880,20 +866,23 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) * A boolean expression "x" can be interpreted as "x = true", so * proceed with seeing if it's a suitable Var. */ - var = (Var *) clause; + clause_expr = (Node *) clause; } /* * We may ignore any RelabelType node above the operand. (There won't be * more than one, since eval_const_expressions has been applied already.) */ - if (IsA(var, RelabelType)) - var = (Var *) ((RelabelType *) var)->arg; + if (IsA(clause_expr, RelabelType)) + clause_expr = (Node *) ((RelabelType *) clause_expr)->arg; /* We only support plain Vars for now */ - if (!IsA(var, Var)) + if (!IsA(clause_expr, Var)) return false; + /* OK, we know we have a Var */ + var = (Var *) clause_expr; + /* Ensure Var is from the correct relation */ if (var->varno != relid) return false; @@ -1157,6 +1146,212 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses, return s1; } +/* + * dependency_is_compatible_expression + * Determines if the expression is compatible with functional dependencies + * + * Similar to dependency_is_compatible_clause, but doesn't enforce that the + * expression is a simple Var. OTOH we check that there's at least one + * statistics object matching the expression. + */ +static bool +dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr) +{ + List *vars; + ListCell *lc, + *lc2; + Node *clause_expr; + + if (IsA(clause, RestrictInfo)) + { + RestrictInfo *rinfo = (RestrictInfo *) clause; + + /* Pseudoconstants are not interesting (they couldn't contain a Var) */ + if (rinfo->pseudoconstant) + return false; + + /* Clauses referencing multiple, or no, varnos are incompatible */ + if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) + return false; + + clause = (Node *) rinfo->clause; + } + + if (is_opclause(clause)) + { + /* If it's an opclause, check for Var = Const or Const = Var. */ + OpExpr *expr = (OpExpr *) clause; + + /* Only expressions with two arguments are candidates. */ + if (list_length(expr->args) != 2) + return false; + + /* Make sure non-selected argument is a pseudoconstant. */ + if (is_pseudo_constant_clause(lsecond(expr->args))) + clause_expr = linitial(expr->args); + else if (is_pseudo_constant_clause(linitial(expr->args))) + clause_expr = lsecond(expr->args); + else + return false; + + /* + * If it's not an "=" operator, just ignore the clause, as it's not + * compatible with functional dependencies. + * + * This uses the function for estimating selectivity, not the operator + * directly (a bit awkward, but well ...). + * + * XXX this is pretty dubious; probably it'd be better to check btree + * or hash opclass membership, so as not to be fooled by custom + * selectivity functions, and to be more consistent with decisions + * elsewhere in the planner. + */ + if (get_oprrest(expr->opno) != F_EQSEL) + return false; + + /* OK to proceed with checking "var" */ + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + /* If it's an scalar array operator, check for Var IN Const. */ + ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause; + + /* + * Reject ALL() variant, we only care about ANY/IN. + * + * FIXME Maybe we should check if all the values are the same, and + * allow ALL in that case? Doesn't seem very practical, though. + */ + if (!expr->useOr) + return false; + + /* Only expressions with two arguments are candidates. */ + if (list_length(expr->args) != 2) + return false; + + /* + * We know it's always (Var IN Const), so we assume the var is the + * first argument, and pseudoconstant is the second one. + */ + if (!is_pseudo_constant_clause(lsecond(expr->args))) + return false; + + clause_expr = linitial(expr->args); + + /* + * If it's not an "=" operator, just ignore the clause, as it's not + * compatible with functional dependencies. The operator is identified + * simply by looking at which function it uses to estimate + * selectivity. That's a bit strange, but it's what other similar + * places do. + */ + if (get_oprrest(expr->opno) != F_EQSEL) + return false; + + /* OK to proceed with checking "var" */ + } + else if (is_orclause(clause)) + { + BoolExpr *bool_expr = (BoolExpr *) clause; + ListCell *lc; + + /* start with no expression (we'll use the first match) */ + *expr = NULL; + + foreach(lc, bool_expr->args) + { + Node *or_expr = NULL; + + /* + * Had we found incompatible expression in the arguments, treat + * the whole expression as incompatible. + */ + if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid, + statlist, &or_expr)) + return false; + + if (*expr == NULL) + *expr = or_expr; + + /* ensure all the expressions are the same */ + if (!equal(or_expr, *expr)) + return false; + } + + /* the expression is already checked by the recursive call */ + return true; + } + else if (is_notclause(clause)) + { + /* + * "NOT x" can be interpreted as "x = false", so get the argument and + * proceed with seeing if it's a suitable Var. + */ + clause_expr = (Node *) get_notclausearg(clause); + } + else + { + /* + * A boolean expression "x" can be interpreted as "x = true", so + * proceed with seeing if it's a suitable Var. + */ + clause_expr = (Node *) clause; + } + + /* + * We may ignore any RelabelType node above the operand. (There won't be + * more than one, since eval_const_expressions has been applied already.) + */ + if (IsA(clause_expr, RelabelType)) + clause_expr = (Node *) ((RelabelType *) clause_expr)->arg; + + vars = pull_var_clause(clause_expr, 0); + + foreach(lc, vars) + { + Var *var = (Var *) lfirst(lc); + + /* Ensure Var is from the correct relation */ + if (var->varno != relid) + return false; + + /* We also better ensure the Var is from the current level */ + if (var->varlevelsup != 0) + return false; + + /* Also ignore system attributes (we don't allow stats on those) */ + if (!AttrNumberIsForUserDefinedAttr(var->varattno)) + return false; + } + + /* + * Check if we actually have a matching statistics for the expression. + * + * XXX Maybe this is an overkill. We'll eliminate the expressions later. + */ + foreach(lc, statlist) + { + StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc); + + /* ignore stats without dependencies */ + if (info->kind != STATS_EXT_DEPENDENCIES) + continue; + + foreach(lc2, info->exprs) + { + Node *stat_expr = (Node *) lfirst(lc2); + + if (equal(clause_expr, stat_expr)) + { + *expr = stat_expr; + return true; + } + } + } + + return false; +} + /* * dependencies_clauselist_selectivity * Return the estimated selectivity of (a subset of) the given clauses @@ -1204,6 +1399,11 @@ dependencies_clauselist_selectivity(PlannerInfo *root, MVDependency **dependencies; int ndependencies; int i; + AttrNumber attnum_offset; + + /* unique expressions */ + Node **unique_exprs; + int unique_exprs_cnt; /* check if there's any stats that might be useful for us. */ if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) @@ -1212,6 +1412,15 @@ dependencies_clauselist_selectivity(PlannerInfo *root, list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * list_length(clauses)); + /* + * We allocate space as if every clause was a unique expression, although + * that's probably overkill. Some will be simple column references that + * we'll translate to attnums, and there might be duplicates. But it's + * easier and cheaper to just do one allocation than repalloc later. + */ + unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses)); + unique_exprs_cnt = 0; + /* * Pre-process the clauses list to extract the attnums seen in each item. * We need to determine if there's any clauses which will be useful for @@ -1222,29 +1431,127 @@ dependencies_clauselist_selectivity(PlannerInfo *root, * * We also skip clauses that we already estimated using different types of * statistics (we treat them as incompatible). + * + * To handle expressions, we assign them negative attnums, as if it was a + * system attribute (this is fine, as we only allow extended stats on user + * attributes). And then we offset everything by the number of + * expressions, so that we can store the values in a bitmapset. */ listidx = 0; foreach(l, clauses) { Node *clause = (Node *) lfirst(l); AttrNumber attnum; + Node *expr = NULL; - if (!bms_is_member(listidx, *estimatedclauses) && - dependency_is_compatible_clause(clause, rel->relid, &attnum)) + /* ignore clause by default */ + list_attnums[listidx] = InvalidAttrNumber; + + if (!bms_is_member(listidx, *estimatedclauses)) { - list_attnums[listidx] = attnum; - clauses_attnums = bms_add_member(clauses_attnums, attnum); + /* + * If it's a simple column refrence, just extract the attnum. If + * it's an expression, assign a negative attnum as if it was a + * system attribute. + */ + if (dependency_is_compatible_clause(clause, rel->relid, &attnum)) + { + list_attnums[listidx] = attnum; + } + else if (dependency_is_compatible_expression(clause, rel->relid, + rel->statlist, + &expr)) + { + /* special attnum assigned to this expression */ + attnum = InvalidAttrNumber; + + Assert(expr != NULL); + + /* If the expression is duplicate, use the same attnum. */ + for (i = 0; i < unique_exprs_cnt; i++) + { + if (equal(unique_exprs[i], expr)) + { + /* negative attribute number to expression */ + attnum = -(i + 1); + break; + } + } + + /* not found in the list, so add it */ + if (attnum == InvalidAttrNumber) + { + unique_exprs[unique_exprs_cnt++] = expr; + + /* after incrementing the value, to get -1, -2, ... */ + attnum = (-unique_exprs_cnt); + } + + /* remember which attnum was assigned to this clause */ + list_attnums[listidx] = attnum; + } } - else - list_attnums[listidx] = InvalidAttrNumber; listidx++; } + Assert(listidx == list_length(clauses)); + /* - * If there's not at least two distinct attnums then reject the whole list - * of clauses. We must return 1.0 so the calling function's selectivity is - * unaffected. + * How much we need to offset the attnums? If there are no expressions, + * then no offset is needed. Otherwise we need to offset enough for the + * lowest value (-unique_exprs_cnt) to become 1. + */ + if (unique_exprs_cnt > 0) + attnum_offset = (unique_exprs_cnt + 1); + else + attnum_offset = 0; + + /* + * Now that we know how many expressions there are, we can offset the + * values just enough to build the bitmapset. + */ + for (i = 0; i < list_length(clauses); i++) + { + AttrNumber attnum; + + /* ignore incompatible or already estimated clauses */ + if (list_attnums[i] == InvalidAttrNumber) + continue; + + /* make sure the attnum is in the expected range */ + Assert(list_attnums[i] >= (-unique_exprs_cnt)); + Assert(list_attnums[i] <= MaxHeapAttributeNumber); + + /* make sure the attnum is positive (valid AttrNumber) */ + attnum = list_attnums[i] + attnum_offset; + + /* + * Either it's a regular attribute, or it's an expression, in which + * case we must not have seen it before (expressions are unique). + * + * XXX Check whether it's a regular attribute has to be done using the + * original attnum, while the second check has to use the value with + * an offset. + */ + Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) || + !bms_is_member(attnum, clauses_attnums)); + + /* + * Remember the offset attnum, both for attributes and expressions. + * We'll pass list_attnums to clauselist_apply_dependencies, which + * uses it to identify clauses in a bitmap. We could also pass the + * offset, but this is more convenient. + */ + list_attnums[i] = attnum; + + clauses_attnums = bms_add_member(clauses_attnums, attnum); + } + + /* + * If there's not at least two distinct attnums and expressions, then + * reject the whole list of clauses. We must return 1.0 so the calling + * function's selectivity is unaffected. */ if (bms_membership(clauses_attnums) != BMS_MULTIPLE) { @@ -1272,26 +1579,203 @@ dependencies_clauselist_selectivity(PlannerInfo *root, foreach(l, rel->statlist) { StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l); - Bitmapset *matched; - BMS_Membership membership; + int nmatched; + int nexprs; + int k; + MVDependencies *deps; /* skip statistics that are not of the correct type */ if (stat->kind != STATS_EXT_DEPENDENCIES) continue; - matched = bms_intersect(clauses_attnums, stat->keys); - membership = bms_membership(matched); - bms_free(matched); + /* + * Count matching attributes - we have to undo the attnum offsets. The + * input attribute numbers are not offset (expressions are not + * included in stat->keys, so it's not necessary). But we need to + * offset it before checking against clauses_attnums. + */ + nmatched = 0; + k = -1; + while ((k = bms_next_member(stat->keys, k)) >= 0) + { + AttrNumber attnum = (AttrNumber) k; - /* skip objects matching fewer than two attributes from clauses */ - if (membership != BMS_MULTIPLE) + /* skip expressions */ + if (!AttrNumberIsForUserDefinedAttr(attnum)) + continue; + + /* apply the same offset as above */ + attnum += attnum_offset; + + if (bms_is_member(attnum, clauses_attnums)) + nmatched++; + } + + /* count matching expressions */ + nexprs = 0; + for (i = 0; i < unique_exprs_cnt; i++) + { + ListCell *lc; + + foreach(lc, stat->exprs) + { + Node *stat_expr = (Node *) lfirst(lc); + + /* try to match it */ + if (equal(stat_expr, unique_exprs[i])) + nexprs++; + } + } + + /* + * Skip objects matching fewer than two attributes/expressions from + * clauses. + */ + if (nmatched + nexprs < 2) continue; - func_dependencies[nfunc_dependencies] - = statext_dependencies_load(stat->statOid); + deps = statext_dependencies_load(stat->statOid); - total_ndeps += func_dependencies[nfunc_dependencies]->ndeps; - nfunc_dependencies++; + /* + * The expressions may be represented by different attnums in the + * stats, we need to remap them to be consistent with the clauses. + * That will make the later steps (e.g. picking the strongest item and + * so on) much simpler and cheaper, because it won't need to care + * about the offset at all. + * + * When we're at it, we can ignore dependencies that are not fully + * matched by clauses (i.e. referencing attributes or expressions that + * are not in the clauses). + * + * We have to do this for all statistics, as long as there are any + * expressions - we need to shift the attnums in all dependencies. + * + * XXX Maybe we should do this always, because it also eliminates some + * of the dependencies early. It might be cheaper than having to walk + * the longer list in find_strongest_dependency later, especially as + * we need to do that repeatedly? + * + * XXX We have to do this even when there are no expressions in + * clauses, otherwise find_strongest_dependency may fail for stats + * with expressions (due to lookup of negative value in bitmap). So we + * need to at least filter out those dependencies. Maybe we could do + * it in a cheaper way (if there are no expr clauses, we can just + * discard all negative attnums without any lookups). + */ + if (unique_exprs_cnt > 0 || stat->exprs != NIL) + { + int ndeps = 0; + + for (i = 0; i < deps->ndeps; i++) + { + bool skip = false; + MVDependency *dep = deps->deps[i]; + int j; + + for (j = 0; j < dep->nattributes; j++) + { + int idx; + Node *expr; + int k; + AttrNumber unique_attnum = InvalidAttrNumber; + AttrNumber attnum; + + /* undo the per-statistics offset */ + attnum = dep->attributes[j]; + + /* + * For regular attributes we can simply check if it + * matches any clause. If there's no matching clause, we + * can just ignore it. We need to offset the attnum + * though. + */ + if (AttrNumberIsForUserDefinedAttr(attnum)) + { + dep->attributes[j] = attnum + attnum_offset; + + if (!bms_is_member(dep->attributes[j], clauses_attnums)) + { + skip = true; + break; + } + + continue; + } + + /* + * the attnum should be a valid system attnum (-1, -2, + * ...) + */ + Assert(AttributeNumberIsValid(attnum)); + + /* + * For expressions, we need to do two translations. First + * we have to translate the negative attnum to index in + * the list of expressions (in the statistics object). + * Then we need to see if there's a matching clause. The + * index of the unique expression determines the attnum + * (and we offset it). + */ + idx = -(1 + attnum); + + /* Is the expression index is valid? */ + Assert((idx >= 0) && (idx < list_length(stat->exprs))); + + expr = (Node *) list_nth(stat->exprs, idx); + + /* try to find the expression in the unique list */ + for (k = 0; k < unique_exprs_cnt; k++) + { + /* + * found a matching unique expression, use the attnum + * (derived from index of the unique expression) + */ + if (equal(unique_exprs[k], expr)) + { + unique_attnum = -(k + 1) + attnum_offset; + break; + } + } + + /* + * Found no matching expression, so we can simply skip + * this dependency, because there's no chance it will be + * fully covered. + */ + if (unique_attnum == InvalidAttrNumber) + { + skip = true; + break; + } + + /* otherwise remap it to the new attnum */ + dep->attributes[j] = unique_attnum; + } + + /* if found a matching dependency, keep it */ + if (!skip) + { + /* maybe we've skipped something earlier, so move it */ + if (ndeps != i) + deps->deps[ndeps] = deps->deps[i]; + + ndeps++; + } + } + + deps->ndeps = ndeps; + } + + /* + * It's possible we've removed all dependencies, in which case we + * don't bother adding it to the list. + */ + if (deps->ndeps > 0) + { + func_dependencies[nfunc_dependencies] = deps; + total_ndeps += deps->ndeps; + nfunc_dependencies++; + } } /* if no matching stats could be found then we've nothing to do */ @@ -1300,6 +1784,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root, pfree(func_dependencies); bms_free(clauses_attnums); pfree(list_attnums); + pfree(unique_exprs); return 1.0; } @@ -1347,6 +1832,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root, pfree(func_dependencies); bms_free(clauses_attnums); pfree(list_attnums); + pfree(unique_exprs); return s1; } diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 7808c6a09c..8c75690fce 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -24,6 +24,7 @@ #include "catalog/pg_collation.h" #include "catalog/pg_statistic_ext.h" #include "catalog/pg_statistic_ext_data.h" +#include "executor/executor.h" #include "commands/progress.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" @@ -35,13 +36,16 @@ #include "statistics/statistics.h" #include "utils/acl.h" #include "utils/array.h" +#include "utils/attoptcache.h" #include "utils/builtins.h" +#include "utils/datum.h" #include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" #include "utils/selfuncs.h" #include "utils/syscache.h" +#include "utils/typcache.h" /* * To avoid consuming too much memory during analysis and/or too much space @@ -66,18 +70,38 @@ typedef struct StatExtEntry Bitmapset *columns; /* attribute numbers covered by the object */ List *types; /* 'char' list of enabled statistics kinds */ int stattarget; /* statistics target (-1 for default) */ + List *exprs; /* expressions */ } StatExtEntry; static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid); -static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, +static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs, int nvacatts, VacAttrStats **vacatts); -static void statext_store(Oid relid, +static void statext_store(Oid statOid, MVNDistinct *ndistinct, MVDependencies *dependencies, - MCVList *mcv, VacAttrStats **stats); + MCVList *mcv, Datum exprs, VacAttrStats **stats); static int statext_compute_stattarget(int stattarget, int natts, VacAttrStats **stats); +/* Information needed to analyze a single simple expression. */ +typedef struct AnlExprData +{ + Node *expr; /* expression to analyze */ + VacAttrStats *vacattrstat; /* statistics attrs to analyze */ +} AnlExprData; + +static void compute_expr_stats(Relation onerel, double totalrows, + AnlExprData * exprdata, int nexprs, + HeapTuple *rows, int numrows); +static Datum serialize_expr_stats(AnlExprData * exprdata, int nexprs); +static Datum expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); +static AnlExprData *build_expr_data(List *exprs, int stattarget); + +static StatsBuildData *make_build_data(Relation onerel, StatExtEntry *stat, + int numrows, HeapTuple *rows, + VacAttrStats **stats, int stattarget); + + /* * Compute requested extended stats, using the rows sampled for the plain * (single-column) stats. @@ -92,21 +116,25 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, { Relation pg_stext; ListCell *lc; - List *stats; + List *statslist; MemoryContext cxt; MemoryContext oldcxt; int64 ext_cnt; + /* Do nothing if there are no columns to analyze. */ + if (!natts) + return; + cxt = AllocSetContextCreate(CurrentMemoryContext, "BuildRelationExtStatistics", ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(cxt); pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock); - stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel)); + statslist = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel)); /* report this phase */ - if (stats != NIL) + if (statslist != NIL) { const int index[] = { PROGRESS_ANALYZE_PHASE, @@ -114,28 +142,30 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, }; const int64 val[] = { PROGRESS_ANALYZE_PHASE_COMPUTE_EXT_STATS, - list_length(stats) + list_length(statslist) }; pgstat_progress_update_multi_param(2, index, val); } ext_cnt = 0; - foreach(lc, stats) + foreach(lc, statslist) { StatExtEntry *stat = (StatExtEntry *) lfirst(lc); MVNDistinct *ndistinct = NULL; MVDependencies *dependencies = NULL; MCVList *mcv = NULL; + Datum exprstats = (Datum) 0; VacAttrStats **stats; ListCell *lc2; int stattarget; + StatsBuildData *data; /* * Check if we can build these stats based on the column analyzed. If * not, report this fact (except in autovacuum) and move on. */ - stats = lookup_var_attr_stats(onerel, stat->columns, + stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs, natts, vacattrstats); if (!stats) { @@ -150,10 +180,6 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, continue; } - /* check allowed number of dimensions */ - Assert(bms_num_members(stat->columns) >= 2 && - bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS); - /* compute statistics target for this statistics */ stattarget = statext_compute_stattarget(stat->stattarget, bms_num_members(stat->columns), @@ -167,28 +193,49 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, if (stattarget == 0) continue; + /* evaluate expressions (if the statistics has any) */ + data = make_build_data(onerel, stat, numrows, rows, stats, stattarget); + /* compute statistic of each requested type */ foreach(lc2, stat->types) { char t = (char) lfirst_int(lc2); if (t == STATS_EXT_NDISTINCT) - ndistinct = statext_ndistinct_build(totalrows, numrows, rows, - stat->columns, stats); + ndistinct = statext_ndistinct_build(totalrows, data); else if (t == STATS_EXT_DEPENDENCIES) - dependencies = statext_dependencies_build(numrows, rows, - stat->columns, stats); + dependencies = statext_dependencies_build(data); else if (t == STATS_EXT_MCV) - mcv = statext_mcv_build(numrows, rows, stat->columns, stats, - totalrows, stattarget); + mcv = statext_mcv_build(data, totalrows, stattarget); + else if (t == STATS_EXT_EXPRESSIONS) + { + AnlExprData *exprdata; + int nexprs; + + /* should not happen, thanks to checks when defining stats */ + if (!stat->exprs) + elog(ERROR, "requested expression stats, but there are no expressions"); + + exprdata = build_expr_data(stat->exprs, stattarget); + nexprs = list_length(stat->exprs); + + compute_expr_stats(onerel, totalrows, + exprdata, nexprs, + rows, numrows); + + exprstats = serialize_expr_stats(exprdata, nexprs); + } } /* store the statistics in the catalog */ - statext_store(stat->statOid, ndistinct, dependencies, mcv, stats); + statext_store(stat->statOid, ndistinct, dependencies, mcv, exprstats, stats); /* for reporting progress */ pgstat_progress_update_param(PROGRESS_ANALYZE_EXT_STATS_COMPUTED, ++ext_cnt); + + /* free the build data (allocated as a single chunk) */ + pfree(data); } table_close(pg_stext, RowExclusiveLock); @@ -221,6 +268,10 @@ ComputeExtStatisticsRows(Relation onerel, MemoryContext oldcxt; int result = 0; + /* If there are no columns to analyze, just return 0. */ + if (!natts) + return 0; + cxt = AllocSetContextCreate(CurrentMemoryContext, "ComputeExtStatisticsRows", ALLOCSET_DEFAULT_SIZES); @@ -241,7 +292,7 @@ ComputeExtStatisticsRows(Relation onerel, * analyzed. If not, ignore it (don't report anything, we'll do that * during the actual build BuildRelationExtStatistics). */ - stats = lookup_var_attr_stats(onerel, stat->columns, + stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs, natts, vacattrstats); if (!stats) @@ -349,6 +400,10 @@ statext_is_kind_built(HeapTuple htup, char type) attnum = Anum_pg_statistic_ext_data_stxdmcv; break; + case STATS_EXT_EXPRESSIONS: + attnum = Anum_pg_statistic_ext_data_stxdexpr; + break; + default: elog(ERROR, "unexpected statistics type requested: %d", type); } @@ -388,6 +443,7 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) ArrayType *arr; char *enabled; Form_pg_statistic_ext staForm; + List *exprs = NIL; entry = palloc0(sizeof(StatExtEntry)); staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); @@ -415,10 +471,40 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) { Assert((enabled[i] == STATS_EXT_NDISTINCT) || (enabled[i] == STATS_EXT_DEPENDENCIES) || - (enabled[i] == STATS_EXT_MCV)); + (enabled[i] == STATS_EXT_MCV) || + (enabled[i] == STATS_EXT_EXPRESSIONS)); entry->types = lappend_int(entry->types, (int) enabled[i]); } + /* decode expression (if any) */ + datum = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_stxexprs, &isnull); + + if (!isnull) + { + char *exprsString; + + exprsString = TextDatumGetCString(datum); + exprs = (List *) stringToNode(exprsString); + + pfree(exprsString); + + /* + * Run the expressions through eval_const_expressions. This is not + * just an optimization, but is necessary, because the planner + * will be comparing them to similarly-processed qual clauses, and + * may fail to detect valid matches without this. We must not use + * canonicalize_qual, however, since these aren't qual + * expressions. + */ + exprs = (List *) eval_const_expressions(NULL, (Node *) exprs); + + /* May as well fix opfuncids too */ + fix_opfuncids((Node *) exprs); + } + + entry->exprs = exprs; + result = lappend(result, entry); } @@ -427,6 +513,187 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) return result; } +/* + * examine_attribute -- pre-analysis of a single column + * + * Determine whether the column is analyzable; if so, create and initialize + * a VacAttrStats struct for it. If not, return NULL. + */ +static VacAttrStats * +examine_attribute(Node *expr) +{ + HeapTuple typtuple; + VacAttrStats *stats; + int i; + bool ok; + + /* + * Create the VacAttrStats struct. Note that we only have a copy of the + * fixed fields of the pg_attribute tuple. + */ + stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats)); + + /* fake the attribute */ + stats->attr = (Form_pg_attribute) palloc0(ATTRIBUTE_FIXED_PART_SIZE); + stats->attr->attstattarget = -1; + + /* + * When analyzing an expression, believe the expression tree's type not + * the column datatype --- the latter might be the opckeytype storage + * type of the opclass, which is not interesting for our purposes. (Note: + * if we did anything with non-expression statistics columns, we'd need to + * figure out where to get the correct type info from, but for now that's + * not a problem.) It's not clear whether anyone will care about the + * typmod, but we store that too just in case. + */ + stats->attrtypid = exprType(expr); + stats->attrtypmod = exprTypmod(expr); + stats->attrcollid = exprCollation(expr); + + typtuple = SearchSysCacheCopy1(TYPEOID, + ObjectIdGetDatum(stats->attrtypid)); + if (!HeapTupleIsValid(typtuple)) + elog(ERROR, "cache lookup failed for type %u", stats->attrtypid); + stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple); + + /* + * We don't actually analyze individual attributes, so no need to set the + * memory context. + */ + stats->anl_context = NULL; + stats->tupattnum = InvalidAttrNumber; + + /* + * The fields describing the stats->stavalues[n] element types default to + * the type of the data being analyzed, but the type-specific typanalyze + * function can change them if it wants to store something else. + */ + for (i = 0; i < STATISTIC_NUM_SLOTS; i++) + { + stats->statypid[i] = stats->attrtypid; + stats->statyplen[i] = stats->attrtype->typlen; + stats->statypbyval[i] = stats->attrtype->typbyval; + stats->statypalign[i] = stats->attrtype->typalign; + } + + /* + * Call the type-specific typanalyze function. If none is specified, use + * std_typanalyze(). + */ + if (OidIsValid(stats->attrtype->typanalyze)) + ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze, + PointerGetDatum(stats))); + else + ok = std_typanalyze(stats); + + if (!ok || stats->compute_stats == NULL || stats->minrows <= 0) + { + heap_freetuple(typtuple); + pfree(stats->attr); + pfree(stats); + return NULL; + } + + return stats; +} + +/* + * examine_expression -- pre-analysis of a single expression + * + * Determine whether the expression is analyzable; if so, create and initialize + * a VacAttrStats struct for it. If not, return NULL. + */ +static VacAttrStats * +examine_expression(Node *expr, int stattarget) +{ + HeapTuple typtuple; + VacAttrStats *stats; + int i; + bool ok; + + Assert(expr != NULL); + + /* + * Create the VacAttrStats struct. + */ + stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats)); + + /* + * When analyzing an expression, believe the expression tree's type. + */ + stats->attrtypid = exprType(expr); + stats->attrtypmod = exprTypmod(expr); + + /* + * We don't allow collation to be specified in CREATE STATISTICS, so we + * have to use the collation specified for the expression. It's possible + * to specify the collation in the expression "(col COLLATE "en_US")" in + * which case exprCollation() does the right thing. + */ + stats->attrcollid = exprCollation(expr); + + /* + * We don't have any pg_attribute for expressions, so let's fake something + * reasonable into attstattarget, which is the only thing std_typanalyze + * needs. + */ + stats->attr = (Form_pg_attribute) palloc(ATTRIBUTE_FIXED_PART_SIZE); + + /* + * We can't have statistics target specified for the expression, so we + * could use either the default_statistics_target, or the target computed + * for the extended statistics. The second option seems more reasonable. + */ + stats->attr->attstattarget = stattarget; + + /* initialize some basic fields */ + stats->attr->attrelid = InvalidOid; + stats->attr->attnum = InvalidAttrNumber; + stats->attr->atttypid = stats->attrtypid; + + typtuple = SearchSysCacheCopy1(TYPEOID, + ObjectIdGetDatum(stats->attrtypid)); + if (!HeapTupleIsValid(typtuple)) + elog(ERROR, "cache lookup failed for type %u", stats->attrtypid); + + stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple); + stats->anl_context = CurrentMemoryContext; /* XXX should be using + * something else? */ + stats->tupattnum = InvalidAttrNumber; + + /* + * The fields describing the stats->stavalues[n] element types default to + * the type of the data being analyzed, but the type-specific typanalyze + * function can change them if it wants to store something else. + */ + for (i = 0; i < STATISTIC_NUM_SLOTS; i++) + { + stats->statypid[i] = stats->attrtypid; + stats->statyplen[i] = stats->attrtype->typlen; + stats->statypbyval[i] = stats->attrtype->typbyval; + stats->statypalign[i] = stats->attrtype->typalign; + } + + /* + * Call the type-specific typanalyze function. If none is specified, use + * std_typanalyze(). + */ + if (OidIsValid(stats->attrtype->typanalyze)) + ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze, + PointerGetDatum(stats))); + else + ok = std_typanalyze(stats); + + if (!ok || stats->compute_stats == NULL || stats->minrows <= 0) + { + heap_freetuple(typtuple); + pfree(stats); + return NULL; + } + + return stats; +} + /* * Using 'vacatts' of size 'nvacatts' as input data, return a newly built * VacAttrStats array which includes only the items corresponding to @@ -435,15 +702,18 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) * to the caller that the stats should not be built. */ static VacAttrStats ** -lookup_var_attr_stats(Relation rel, Bitmapset *attrs, +lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs, int nvacatts, VacAttrStats **vacatts) { int i = 0; int x = -1; + int natts; VacAttrStats **stats; + ListCell *lc; - stats = (VacAttrStats **) - palloc(bms_num_members(attrs) * sizeof(VacAttrStats *)); + natts = bms_num_members(attrs) + list_length(exprs); + + stats = (VacAttrStats **) palloc(natts * sizeof(VacAttrStats *)); /* lookup VacAttrStats info for the requested columns (same attnum) */ while ((x = bms_next_member(attrs, x)) >= 0) @@ -480,6 +750,24 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs, i++; } + /* also add info for expressions */ + foreach(lc, exprs) + { + Node *expr = (Node *) lfirst(lc); + + stats[i] = examine_attribute(expr); + + /* + * XXX We need tuple descriptor later, and we just grab it from + * stats[0]->tupDesc (see e.g. statext_mcv_build). But as coded + * examine_attribute does not set that, so just grab it from the first + * vacatts element. + */ + stats[i]->tupDesc = vacatts[0]->tupDesc; + + i++; + } + return stats; } @@ -491,7 +779,7 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs, static void statext_store(Oid statOid, MVNDistinct *ndistinct, MVDependencies *dependencies, - MCVList *mcv, VacAttrStats **stats) + MCVList *mcv, Datum exprs, VacAttrStats **stats) { Relation pg_stextdata; HeapTuple stup, @@ -532,11 +820,17 @@ statext_store(Oid statOid, nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = (data == NULL); values[Anum_pg_statistic_ext_data_stxdmcv - 1] = PointerGetDatum(data); } + if (exprs != (Datum) 0) + { + nulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = false; + values[Anum_pg_statistic_ext_data_stxdexpr - 1] = exprs; + } /* always replace the value (either by bytea or NULL) */ replaces[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true; replaces[Anum_pg_statistic_ext_data_stxddependencies - 1] = true; replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true; + replaces[Anum_pg_statistic_ext_data_stxdexpr - 1] = true; /* there should already be a pg_statistic_ext_data tuple */ oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statOid)); @@ -668,7 +962,7 @@ compare_datums_simple(Datum a, Datum b, SortSupport ssup) * is not necessary here (and when querying the bitmap). */ AttrNumber * -build_attnums_array(Bitmapset *attrs, int *numattrs) +build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs) { int i, j; @@ -684,16 +978,19 @@ build_attnums_array(Bitmapset *attrs, int *numattrs) j = -1; while ((j = bms_next_member(attrs, j)) >= 0) { + AttrNumber attnum = (j - nexprs); + /* * Make sure the bitmap contains only user-defined attributes. As * bitmaps can't contain negative values, this can be violated in two * ways. Firstly, the bitmap might contain 0 as a member, and secondly * the integer value might be larger than MaxAttrNumber. */ - Assert(AttrNumberIsForUserDefinedAttr(j)); - Assert(j <= MaxAttrNumber); + Assert(AttributeNumberIsValid(attnum)); + Assert(attnum <= MaxAttrNumber); + Assert(attnum >= (-nexprs)); - attnums[i++] = (AttrNumber) j; + attnums[i++] = (AttrNumber) attnum; /* protect against overflows */ Assert(i <= num); @@ -710,29 +1007,31 @@ build_attnums_array(Bitmapset *attrs, int *numattrs) * can simply pfree the return value to release all of it. */ SortItem * -build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, - MultiSortSupport mss, int numattrs, AttrNumber *attnums) +build_sorted_items(StatsBuildData *data, int *nitems, + MultiSortSupport mss, + int numattrs, AttrNumber *attnums) { int i, j, len, - idx; - int nvalues = numrows * numattrs; + nrows; + int nvalues = data->numrows * numattrs; SortItem *items; Datum *values; bool *isnull; char *ptr; + int *typlen; /* Compute the total amount of memory we need (both items and values). */ - len = numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool)); + len = data->numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool)); /* Allocate the memory and split it into the pieces. */ ptr = palloc0(len); /* items to sort */ items = (SortItem *) ptr; - ptr += numrows * sizeof(SortItem); + ptr += data->numrows * sizeof(SortItem); /* values and null flags */ values = (Datum *) ptr; @@ -745,21 +1044,47 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, Assert((ptr - (char *) items) == len); /* fix the pointers to Datum and bool arrays */ - idx = 0; - for (i = 0; i < numrows; i++) + nrows = 0; + for (i = 0; i < data->numrows; i++) + { + items[nrows].values = &values[nrows * numattrs]; + items[nrows].isnull = &isnull[nrows * numattrs]; + + nrows++; + } + + /* build a local cache of typlen for all attributes */ + typlen = (int *) palloc(sizeof(int) * data->nattnums); + for (i = 0; i < data->nattnums; i++) + typlen[i] = get_typlen(data->stats[i]->attrtypid); + + nrows = 0; + for (i = 0; i < data->numrows; i++) { bool toowide = false; - items[idx].values = &values[idx * numattrs]; - items[idx].isnull = &isnull[idx * numattrs]; - /* load the values/null flags from sample rows */ for (j = 0; j < numattrs; j++) { Datum value; bool isnull; + int attlen; + AttrNumber attnum = attnums[j]; - value = heap_getattr(rows[i], attnums[j], tdesc, &isnull); + int idx; + + /* match attnum to the pre-calculated data */ + for (idx = 0; idx < data->nattnums; idx++) + { + if (attnum == data->attnums[idx]) + break; + } + + Assert(idx < data->nattnums); + + value = data->values[idx][i]; + isnull = data->nulls[idx][i]; + attlen = typlen[idx]; /* * If this is a varlena value, check if it's too wide and if yes @@ -770,8 +1095,7 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, * on the assumption that those are small (below WIDTH_THRESHOLD) * and will be discarded at the end of analyze. */ - if ((!isnull) && - (TupleDescAttr(tdesc, attnums[j] - 1)->attlen == -1)) + if ((!isnull) && (attlen == -1)) { if (toast_raw_datum_size(value) > WIDTH_THRESHOLD) { @@ -782,21 +1106,21 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, value = PointerGetDatum(PG_DETOAST_DATUM(value)); } - items[idx].values[j] = value; - items[idx].isnull[j] = isnull; + items[nrows].values[j] = value; + items[nrows].isnull[j] = isnull; } if (toowide) continue; - idx++; + nrows++; } /* store the actual number of items (ignoring the too-wide ones) */ - *nitems = idx; + *nitems = nrows; /* all items were too wide */ - if (idx == 0) + if (nrows == 0) { /* everything is allocated as a single chunk */ pfree(items); @@ -804,7 +1128,7 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, } /* do the sort, using the multi-sort */ - qsort_arg((void *) items, idx, sizeof(SortItem), + qsort_arg((void *) items, nrows, sizeof(SortItem), multi_sort_compare, mss); return items; @@ -830,6 +1154,63 @@ has_stats_of_kind(List *stats, char requiredkind) return false; } +/* + * stat_find_expression + * Search for an expression in statistics object's list of expressions. + * + * Returns the index of the expression in the statistics object's list of + * expressions, or -1 if not found. + */ +static int +stat_find_expression(StatisticExtInfo *stat, Node *expr) +{ + ListCell *lc; + int idx; + + idx = 0; + foreach(lc, stat->exprs) + { + Node *stat_expr = (Node *) lfirst(lc); + + if (equal(stat_expr, expr)) + return idx; + idx++; + } + + /* Expression not found */ + return -1; +} + +/* + * stat_covers_expressions + * Test whether a statistics object covers all expressions in a list. + * + * Returns true if all expressions are covered. If expr_idxs is non-NULL, it + * is populated with the indexes of the expressions found. + */ +static bool +stat_covers_expressions(StatisticExtInfo *stat, List *exprs, + Bitmapset **expr_idxs) +{ + ListCell *lc; + + foreach(lc, exprs) + { + Node *expr = (Node *) lfirst(lc); + int expr_idx; + + expr_idx = stat_find_expression(stat, expr); + if (expr_idx == -1) + return false; + + if (expr_idxs != NULL) + *expr_idxs = bms_add_member(*expr_idxs, expr_idx); + } + + /* If we reach here, all expressions are covered */ + return true; +} + /* * choose_best_statistics * Look for and return statistics with the specified 'requiredkind' which @@ -850,7 +1231,8 @@ has_stats_of_kind(List *stats, char requiredkind) */ StatisticExtInfo * choose_best_statistics(List *stats, char requiredkind, - Bitmapset **clause_attnums, int nclauses) + Bitmapset **clause_attnums, List **clause_exprs, + int nclauses) { ListCell *lc; StatisticExtInfo *best_match = NULL; @@ -861,7 +1243,8 @@ choose_best_statistics(List *stats, char requiredkind, { int i; StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc); - Bitmapset *matched = NULL; + Bitmapset *matched_attnums = NULL; + Bitmapset *matched_exprs = NULL; int num_matched; int numkeys; @@ -870,35 +1253,43 @@ choose_best_statistics(List *stats, char requiredkind, continue; /* - * Collect attributes in remaining (unestimated) clauses fully covered - * by this statistic object. + * Collect attributes and expressions in remaining (unestimated) + * clauses fully covered by this statistic object. */ for (i = 0; i < nclauses; i++) { + Bitmapset *expr_idxs = NULL; + /* ignore incompatible/estimated clauses */ - if (!clause_attnums[i]) + if (!clause_attnums[i] && !clause_exprs[i]) continue; /* ignore clauses that are not covered by this object */ - if (!bms_is_subset(clause_attnums[i], info->keys)) + if (!bms_is_subset(clause_attnums[i], info->keys) || + !stat_covers_expressions(info, clause_exprs[i], &expr_idxs)) continue; - matched = bms_add_members(matched, clause_attnums[i]); + /* record attnums and indexes of expressions covered */ + matched_attnums = bms_add_members(matched_attnums, clause_attnums[i]); + matched_exprs = bms_add_members(matched_exprs, expr_idxs); } - num_matched = bms_num_members(matched); - bms_free(matched); + num_matched = bms_num_members(matched_attnums) + bms_num_members(matched_exprs); + + bms_free(matched_attnums); + bms_free(matched_exprs); /* * save the actual number of keys in the stats so that we can choose * the narrowest stats with the most matching keys. */ - numkeys = bms_num_members(info->keys); + numkeys = bms_num_members(info->keys) + list_length(info->exprs); /* - * Use this object when it increases the number of matched clauses or - * when it matches the same number of attributes but these stats have - * fewer keys than any previous match. + * Use this object when it increases the number of matched attributes + * and expressions or when it matches the same number of attributes + * and expressions but these stats have fewer keys than any previous + * match. */ if (num_matched > best_num_matched || (num_matched == best_num_matched && numkeys < best_match_keys)) @@ -923,7 +1314,8 @@ choose_best_statistics(List *stats, char requiredkind, */ static bool statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, - Index relid, Bitmapset **attnums) + Index relid, Bitmapset **attnums, + List **exprs) { /* Look inside any binary-compatible relabeling (as in examine_variable) */ if (IsA(clause, RelabelType)) @@ -951,19 +1343,19 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, return true; } - /* (Var op Const) or (Const op Var) */ + /* (Var/Expr op Const) or (Const op Var/Expr) */ if (is_opclause(clause)) { RangeTblEntry *rte = root->simple_rte_array[relid]; OpExpr *expr = (OpExpr *) clause; - Var *var; + Node *clause_expr; /* Only expressions with two arguments are considered compatible. */ if (list_length(expr->args) != 2) return false; - /* Check if the expression has the right shape (one Var, one Const) */ - if (!examine_clause_args(expr->args, &var, NULL, NULL)) + /* Check if the expression has the right shape */ + if (!examine_opclause_args(expr->args, &clause_expr, NULL, NULL)) return false; /* @@ -981,7 +1373,7 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, case F_SCALARLESEL: case F_SCALARGTSEL: case F_SCALARGESEL: - /* supported, will continue with inspection of the Var */ + /* supported, will continue with inspection of the Var/Expr */ break; default: @@ -1003,23 +1395,29 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, !get_func_leakproof(get_opcode(expr->opno))) return false; - return statext_is_compatible_clause_internal(root, (Node *) var, - relid, attnums); + /* Check (Var op Const) or (Const op Var) clauses by recursing. */ + if (IsA(clause_expr, Var)) + return statext_is_compatible_clause_internal(root, clause_expr, + relid, attnums, exprs); + + /* Otherwise we have (Expr op Const) or (Const op Expr). */ + *exprs = lappend(*exprs, clause_expr); + return true; } - /* Var IN Array */ + /* Var/Expr IN Array */ if (IsA(clause, ScalarArrayOpExpr)) { RangeTblEntry *rte = root->simple_rte_array[relid]; ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause; - Var *var; + Node *clause_expr; /* Only expressions with two arguments are considered compatible. */ if (list_length(expr->args) != 2) return false; /* Check if the expression has the right shape (one Var, one Const) */ - if (!examine_clause_args(expr->args, &var, NULL, NULL)) + if (!examine_opclause_args(expr->args, &clause_expr, NULL, NULL)) return false; /* @@ -1037,7 +1435,7 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, case F_SCALARLESEL: case F_SCALARGTSEL: case F_SCALARGESEL: - /* supported, will continue with inspection of the Var */ + /* supported, will continue with inspection of the Var/Expr */ break; default: @@ -1059,8 +1457,14 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, !get_func_leakproof(get_opcode(expr->opno))) return false; - return statext_is_compatible_clause_internal(root, (Node *) var, - relid, attnums); + /* Check Var IN Array clauses by recursing. */ + if (IsA(clause_expr, Var)) + return statext_is_compatible_clause_internal(root, clause_expr, + relid, attnums, exprs); + + /* Otherwise we have Expr IN Array. */ + *exprs = lappend(*exprs, clause_expr); + return true; } /* AND/OR/NOT clause */ @@ -1093,54 +1497,62 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, */ if (!statext_is_compatible_clause_internal(root, (Node *) lfirst(lc), - relid, attnums)) + relid, attnums, exprs)) return false; } return true; } - /* Var IS NULL */ + /* Var/Expr IS NULL */ if (IsA(clause, NullTest)) { NullTest *nt = (NullTest *) clause; - /* - * Only simple (Var IS NULL) expressions supported for now. Maybe we - * could use examine_variable to fix this? - */ - if (!IsA(nt->arg, Var)) - return false; + /* Check Var IS NULL clauses by recursing. */ + if (IsA(nt->arg, Var)) + return statext_is_compatible_clause_internal(root, (Node *) (nt->arg), + relid, attnums, exprs); - return statext_is_compatible_clause_internal(root, (Node *) (nt->arg), - relid, attnums); + /* Otherwise we have Expr IS NULL. */ + *exprs = lappend(*exprs, nt->arg); + return true; } - return false; + /* + * Treat any other expressions as bare expressions to be matched against + * expressions in statistics objects. + */ + *exprs = lappend(*exprs, clause); + return true; } /* * statext_is_compatible_clause * Determines if the clause is compatible with MCV lists. * - * Currently, we only support three types of clauses: + * Currently, we only support the following types of clauses: * - * (a) OpExprs of the form (Var op Const), or (Const op Var), where the op - * is one of ("=", "<", ">", ">=", "<=") + * (a) OpExprs of the form (Var/Expr op Const), or (Const op Var/Expr), where + * the op is one of ("=", "<", ">", ">=", "<=") * - * (b) (Var IS [NOT] NULL) + * (b) (Var/Expr IS [NOT] NULL) * * (c) combinations using AND/OR/NOT * + * (d) ScalarArrayOpExprs of the form (Var/Expr op ANY (array)) or (Var/Expr + * op ALL (array)) + * * In the future, the range of supported clauses may be expanded to more * complex cases, for example (Var op Var). */ static bool statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, - Bitmapset **attnums) + Bitmapset **attnums, List **exprs) { RangeTblEntry *rte = root->simple_rte_array[relid]; RestrictInfo *rinfo = (RestrictInfo *) clause; + int clause_relid; Oid userid; /* @@ -1160,7 +1572,7 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, foreach(lc, expr->args) { if (!statext_is_compatible_clause(root, (Node *) lfirst(lc), - relid, attnums)) + relid, attnums, exprs)) return false; } @@ -1175,25 +1587,36 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, if (rinfo->pseudoconstant) return false; - /* clauses referencing multiple varnos are incompatible */ - if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) + /* Clauses referencing other varnos are incompatible. */ + if (!bms_get_singleton_member(rinfo->clause_relids, &clause_relid) || + clause_relid != relid) return false; /* Check the clause and determine what attributes it references. */ if (!statext_is_compatible_clause_internal(root, (Node *) rinfo->clause, - relid, attnums)) + relid, attnums, exprs)) return false; /* - * Check that the user has permission to read all these attributes. Use + * Check that the user has permission to read all required attributes. Use * checkAsUser if it's set, in case we're accessing the table via a view. */ userid = rte->checkAsUser ? rte->checkAsUser : GetUserId(); if (pg_class_aclcheck(rte->relid, userid, ACL_SELECT) != ACLCHECK_OK) { + Bitmapset *clause_attnums; + /* Don't have table privilege, must check individual columns */ - if (bms_is_member(InvalidAttrNumber, *attnums)) + if (*exprs != NIL) + { + pull_varattnos((Node *) exprs, relid, &clause_attnums); + clause_attnums = bms_add_members(clause_attnums, *attnums); + } + else + clause_attnums = *attnums; + + if (bms_is_member(InvalidAttrNumber, clause_attnums)) { /* Have a whole-row reference, must have access to all columns */ if (pg_attribute_aclcheck_all(rte->relid, userid, ACL_SELECT, @@ -1205,7 +1628,7 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, /* Check the columns referenced by the clause */ int attnum = -1; - while ((attnum = bms_next_member(*attnums, attnum)) >= 0) + while ((attnum = bms_next_member(clause_attnums, attnum)) >= 0) { if (pg_attribute_aclcheck(rte->relid, attnum, userid, ACL_SELECT) != ACLCHECK_OK) @@ -1259,7 +1682,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli bool is_or) { ListCell *l; - Bitmapset **list_attnums; + Bitmapset **list_attnums; /* attnums extracted from the clause */ + List **list_exprs; /* expressions matched to any statistic */ int listidx; Selectivity sel = (is_or) ? 0.0 : 1.0; @@ -1270,13 +1694,16 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) * list_length(clauses)); + /* expressions extracted from complex expressions */ + list_exprs = (List **) palloc(sizeof(Node *) * list_length(clauses)); + /* - * Pre-process the clauses list to extract the attnums seen in each item. - * We need to determine if there's any clauses which will be useful for - * selectivity estimations with extended stats. Along the way we'll record - * all of the attnums for each clause in a list which we'll reference - * later so we don't need to repeat the same work again. We'll also keep - * track of all attnums seen. + * Pre-process the clauses list to extract the attnums and expressions + * seen in each item. We need to determine if there are any clauses which + * will be useful for selectivity estimations with extended stats. Along + * the way we'll record all of the attnums and expressions for each clause + * in lists which we'll reference later so we don't need to repeat the + * same work again. * * We also skip clauses that we already estimated using different types of * statistics (we treat them as incompatible). @@ -1286,12 +1713,19 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli { Node *clause = (Node *) lfirst(l); Bitmapset *attnums = NULL; + List *exprs = NIL; if (!bms_is_member(listidx, *estimatedclauses) && - statext_is_compatible_clause(root, clause, rel->relid, &attnums)) + statext_is_compatible_clause(root, clause, rel->relid, &attnums, &exprs)) + { list_attnums[listidx] = attnums; + list_exprs[listidx] = exprs; + } else + { list_attnums[listidx] = NULL; + list_exprs[listidx] = NIL; + } listidx++; } @@ -1305,7 +1739,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli /* find the best suited statistics object for these attnums */ stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV, - list_attnums, list_length(clauses)); + list_attnums, list_exprs, + list_length(clauses)); /* * if no (additional) matching stats could be found then we've nothing @@ -1320,28 +1755,39 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli /* now filter the clauses to be estimated using the selected MCV */ stat_clauses = NIL; - /* record which clauses are simple (single column) */ + /* record which clauses are simple (single column or expression) */ simple_clauses = NULL; listidx = 0; foreach(l, clauses) { /* - * If the clause is compatible with the selected statistics, mark - * it as estimated and add it to the list to estimate. + * If the clause is not already estimated and is compatible with + * the selected statistics object (all attributes and expressions + * covered), mark it as estimated and add it to the list to + * estimate. */ - if (list_attnums[listidx] != NULL && - bms_is_subset(list_attnums[listidx], stat->keys)) + if (!bms_is_member(listidx, *estimatedclauses) && + bms_is_subset(list_attnums[listidx], stat->keys) && + stat_covers_expressions(stat, list_exprs[listidx], NULL)) { - if (bms_membership(list_attnums[listidx]) == BMS_SINGLETON) + /* record simple clauses (single column or expression) */ + if ((list_attnums[listidx] == NULL && + list_length(list_exprs[listidx]) == 1) || + (list_exprs[listidx] == NIL && + bms_membership(list_attnums[listidx]) == BMS_SINGLETON)) simple_clauses = bms_add_member(simple_clauses, list_length(stat_clauses)); + /* add clause to list and mark as estimated */ stat_clauses = lappend(stat_clauses, (Node *) lfirst(l)); *estimatedclauses = bms_add_member(*estimatedclauses, listidx); bms_free(list_attnums[listidx]); list_attnums[listidx] = NULL; + + list_free(list_exprs[listidx]); + list_exprs[listidx] = NULL; } listidx++; @@ -1530,23 +1976,24 @@ statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, } /* - * examine_opclause_expression - * Split expression into Var and Const parts. + * examine_opclause_args + * Split an operator expression's arguments into Expr and Const parts. * - * Attempts to match the arguments to either (Var op Const) or (Const op Var), - * possibly with a RelabelType on top. When the expression matches this form, - * returns true, otherwise returns false. + * Attempts to match the arguments to either (Expr op Const) or (Const op + * Expr), possibly with a RelabelType on top. When the expression matches this + * form, returns true, otherwise returns false. * - * Optionally returns pointers to the extracted Var/Const nodes, when passed - * non-null pointers (varp, cstp and varonleftp). The varonleftp flag specifies - * on which side of the operator we found the Var node. + * Optionally returns pointers to the extracted Expr/Const nodes, when passed + * non-null pointers (exprp, cstp and expronleftp). The expronleftp flag + * specifies on which side of the operator we found the expression node. */ bool -examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp) +examine_opclause_args(List *args, Node **exprp, Const **cstp, + bool *expronleftp) { - Var *var; + Node *expr; Const *cst; - bool varonleft; + bool expronleft; Node *leftop, *rightop; @@ -1563,30 +2010,564 @@ examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp) if (IsA(rightop, RelabelType)) rightop = (Node *) ((RelabelType *) rightop)->arg; - if (IsA(leftop, Var) && IsA(rightop, Const)) + if (IsA(rightop, Const)) { - var = (Var *) leftop; + expr = (Node *) leftop; cst = (Const *) rightop; - varonleft = true; + expronleft = true; } - else if (IsA(leftop, Const) && IsA(rightop, Var)) + else if (IsA(leftop, Const)) { - var = (Var *) rightop; + expr = (Node *) rightop; cst = (Const *) leftop; - varonleft = false; + expronleft = false; } else return false; /* return pointers to the extracted parts if requested */ - if (varp) - *varp = var; + if (exprp) + *exprp = expr; if (cstp) *cstp = cst; - if (varonleftp) - *varonleftp = varonleft; + if (expronleftp) + *expronleftp = expronleft; return true; } + + +/* + * Compute statistics about expressions of a relation. + */ +static void +compute_expr_stats(Relation onerel, double totalrows, + AnlExprData *exprdata, int nexprs, + HeapTuple *rows, int numrows) +{ + MemoryContext expr_context, + old_context; + int ind, + i; + + expr_context = AllocSetContextCreate(CurrentMemoryContext, + "Analyze Expression", + ALLOCSET_DEFAULT_SIZES); + old_context = MemoryContextSwitchTo(expr_context); + + for (ind = 0; ind < nexprs; ind++) + { + AnlExprData *thisdata = &exprdata[ind]; + VacAttrStats *stats = thisdata->vacattrstat; + Node *expr = thisdata->expr; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + Datum *exprvals; + bool *exprnulls; + ExprState *exprstate; + int tcnt; + + /* Are we still in the main context? */ + Assert(CurrentMemoryContext == expr_context); + + /* + * Need an EState for evaluation of expressions. Create it in the + * per-expression context to be sure it gets cleaned up at the bottom + * of the loop. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + + /* Set up expression evaluation state */ + exprstate = ExecPrepareExpr((Expr *) expr, estate); + + /* Need a slot to hold the current heap tuple, too */ + slot = MakeSingleTupleTableSlot(RelationGetDescr(onerel), + &TTSOpsHeapTuple); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Compute and save expression values */ + exprvals = (Datum *) palloc(numrows * sizeof(Datum)); + exprnulls = (bool *) palloc(numrows * sizeof(bool)); + + tcnt = 0; + for (i = 0; i < numrows; i++) + { + Datum datum; + bool isnull; + + /* + * Reset the per-tuple context each time, to reclaim any cruft + * left behind by evaluating the statistics expressions. + */ + ResetExprContext(econtext); + + /* Set up for expression evaluation */ + ExecStoreHeapTuple(rows[i], slot, false); + + /* + * Evaluate the expression. We do this in the per-tuple context so + * as not to leak memory, and then copy the result into the + * context created at the beginning of this function. + */ + datum = ExecEvalExprSwitchContext(exprstate, + GetPerTupleExprContext(estate), + &isnull); + if (isnull) + { + exprvals[tcnt] = (Datum) 0; + exprnulls[tcnt] = true; + } + else + { + /* Make sure we copy the data into the context. */ + Assert(CurrentMemoryContext == expr_context); + + exprvals[tcnt] = datumCopy(datum, + stats->attrtype->typbyval, + stats->attrtype->typlen); + exprnulls[tcnt] = false; + } + + tcnt++; + } + + /* + * Now we can compute the statistics for the expression columns. + * + * XXX Unlike compute_index_stats we don't need to switch and reset + * memory contexts here, because we're only computing stats for a + * single expression (and not iterating over many indexes), so we just + * do it in expr_context. Note that compute_stats copies the result + * into stats->anl_context, so it does not disappear. + */ + if (tcnt > 0) + { + AttributeOpts *aopt = + get_attribute_options(stats->attr->attrelid, + stats->attr->attnum); + + stats->exprvals = exprvals; + stats->exprnulls = exprnulls; + stats->rowstride = 1; + stats->compute_stats(stats, + expr_fetch_func, + tcnt, + tcnt); + + /* + * If the n_distinct option is specified, it overrides the above + * computation. + */ + if (aopt != NULL && aopt->n_distinct != 0.0) + stats->stadistinct = aopt->n_distinct; + } + + /* And clean up */ + MemoryContextSwitchTo(expr_context); + + ExecDropSingleTupleTableSlot(slot); + FreeExecutorState(estate); + MemoryContextResetAndDeleteChildren(expr_context); + } + + MemoryContextSwitchTo(old_context); + MemoryContextDelete(expr_context); +} + + +/* + * Fetch function for analyzing statistics object expressions. + * + * We have not bothered to construct tuples from the data, instead the data + * is just in Datum arrays. + */ +static Datum +expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull) +{ + int i; + + /* exprvals and exprnulls are already offset for proper column */ + i = rownum * stats->rowstride; + *isNull = stats->exprnulls[i]; + return stats->exprvals[i]; +} + +/* + * Build analyze data for a list of expressions. As this is not tied + * directly to a relation (table or index), we have to fake some of + * the fields in examine_expression(). + */ +static AnlExprData * +build_expr_data(List *exprs, int stattarget) +{ + int idx; + int nexprs = list_length(exprs); + AnlExprData *exprdata; + ListCell *lc; + + exprdata = (AnlExprData *) palloc0(nexprs * sizeof(AnlExprData)); + + idx = 0; + foreach(lc, exprs) + { + Node *expr = (Node *) lfirst(lc); + AnlExprData *thisdata = &exprdata[idx]; + + thisdata->expr = expr; + thisdata->vacattrstat = examine_expression(expr, stattarget); + idx++; + } + + return exprdata; +} + +/* form an array of pg_statistic rows (per update_attstats) */ +static Datum +serialize_expr_stats(AnlExprData *exprdata, int nexprs) +{ + int exprno; + Oid typOid; + Relation sd; + + ArrayBuildState *astate = NULL; + + sd = table_open(StatisticRelationId, RowExclusiveLock); + + /* lookup OID of composite type for pg_statistic */ + typOid = get_rel_type_id(StatisticRelationId); + if (!OidIsValid(typOid)) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("relation \"pg_statistic\" does not have a composite type"))); + + for (exprno = 0; exprno < nexprs; exprno++) + { + int i, + k; + VacAttrStats *stats = exprdata[exprno].vacattrstat; + + Datum values[Natts_pg_statistic]; + bool nulls[Natts_pg_statistic]; + HeapTuple stup; + + if (!stats->stats_valid) + { + astate = accumArrayResult(astate, + (Datum) 0, + true, + typOid, + CurrentMemoryContext); + continue; + } + + /* + * Construct a new pg_statistic tuple + */ + for (i = 0; i < Natts_pg_statistic; ++i) + { + nulls[i] = false; + } + + values[Anum_pg_statistic_starelid - 1] = ObjectIdGetDatum(InvalidOid); + values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(InvalidAttrNumber); + values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(false); + values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac); + values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth); + values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct); + i = Anum_pg_statistic_stakind1 - 1; + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + values[i++] = Int16GetDatum(stats->stakind[k]); /* stakindN */ + } + i = Anum_pg_statistic_staop1 - 1; + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + values[i++] = ObjectIdGetDatum(stats->staop[k]); /* staopN */ + } + i = Anum_pg_statistic_stacoll1 - 1; + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + values[i++] = ObjectIdGetDatum(stats->stacoll[k]); /* stacollN */ + } + i = Anum_pg_statistic_stanumbers1 - 1; + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + int nnum = stats->numnumbers[k]; + + if (nnum > 0) + { + int n; + Datum *numdatums = (Datum *) palloc(nnum * sizeof(Datum)); + ArrayType *arry; + + for (n = 0; n < nnum; n++) + numdatums[n] = Float4GetDatum(stats->stanumbers[k][n]); + /* XXX knows more than it should about type float4: */ + arry = construct_array(numdatums, nnum, + FLOAT4OID, + sizeof(float4), true, TYPALIGN_INT); + values[i++] = PointerGetDatum(arry); /* stanumbersN */ + } + else + { + nulls[i] = true; + values[i++] = (Datum) 0; + } + } + i = Anum_pg_statistic_stavalues1 - 1; + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + if (stats->numvalues[k] > 0) + { + ArrayType *arry; + + arry = construct_array(stats->stavalues[k], + stats->numvalues[k], + stats->statypid[k], + stats->statyplen[k], + stats->statypbyval[k], + stats->statypalign[k]); + values[i++] = PointerGetDatum(arry); /* stavaluesN */ + } + else + { + nulls[i] = true; + values[i++] = (Datum) 0; + } + } + + stup = heap_form_tuple(RelationGetDescr(sd), values, nulls); + + astate = accumArrayResult(astate, + heap_copy_tuple_as_datum(stup, RelationGetDescr(sd)), + false, + typOid, + CurrentMemoryContext); + } + + table_close(sd, RowExclusiveLock); + + return makeArrayResult(astate, CurrentMemoryContext); +} + +/* + * Loads pg_statistic record from expression statistics for expression + * identified by the supplied index. + */ +HeapTuple +statext_expressions_load(Oid stxoid, int idx) +{ + bool isnull; + Datum value; + HeapTuple htup; + ExpandedArrayHeader *eah; + HeapTupleHeader td; + HeapTupleData tmptup; + HeapTuple tup; + + htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(stxoid)); + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for statistics object %u", stxoid); + + value = SysCacheGetAttr(STATEXTDATASTXOID, htup, + Anum_pg_statistic_ext_data_stxdexpr, &isnull); + if (isnull) + elog(ERROR, + "requested statistics kind \"%c\" is not yet built for statistics object %u", + STATS_EXT_DEPENDENCIES, stxoid); + + eah = DatumGetExpandedArray(value); + + deconstruct_expanded_array(eah); + + td = DatumGetHeapTupleHeader(eah->dvalues[idx]); + + /* Build a temporary HeapTuple control structure */ + tmptup.t_len = HeapTupleHeaderGetDatumLength(td); + tmptup.t_data = td; + + tup = heap_copytuple(&tmptup); + + ReleaseSysCache(htup); + + return tup; +} + +/* + * Evaluate the expressions, so that we can use the results to build + * all the requested statistics types. This matters especially for + * expensive expressions, of course. + */ +static StatsBuildData * +make_build_data(Relation rel, StatExtEntry *stat, int numrows, HeapTuple *rows, + VacAttrStats **stats, int stattarget) +{ + /* evaluated expressions */ + StatsBuildData *result; + char *ptr; + Size len; + + int i; + int k; + int idx; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + List *exprstates = NIL; + int nkeys = bms_num_members(stat->columns) + list_length(stat->exprs); + ListCell *lc; + + /* allocate everything as a single chunk, so we can free it easily */ + len = MAXALIGN(sizeof(StatsBuildData)); + len += MAXALIGN(sizeof(AttrNumber) * nkeys); /* attnums */ + len += MAXALIGN(sizeof(VacAttrStats *) * nkeys); /* stats */ + + /* values */ + len += MAXALIGN(sizeof(Datum *) * nkeys); + len += nkeys * MAXALIGN(sizeof(Datum) * numrows); + + /* nulls */ + len += MAXALIGN(sizeof(bool *) * nkeys); + len += nkeys * MAXALIGN(sizeof(bool) * numrows); + + ptr = palloc(len); + + /* set the pointers */ + result = (StatsBuildData *) ptr; + ptr += MAXALIGN(sizeof(StatsBuildData)); + + /* attnums */ + result->attnums = (AttrNumber *) ptr; + ptr += MAXALIGN(sizeof(AttrNumber) * nkeys); + + /* stats */ + result->stats = (VacAttrStats **) ptr; + ptr += MAXALIGN(sizeof(VacAttrStats *) * nkeys); + + /* values */ + result->values = (Datum **) ptr; + ptr += MAXALIGN(sizeof(Datum *) * nkeys); + + /* nulls */ + result->nulls = (bool **) ptr; + ptr += MAXALIGN(sizeof(bool *) * nkeys); + + for (i = 0; i < nkeys; i++) + { + result->values[i] = (Datum *) ptr; + ptr += MAXALIGN(sizeof(Datum) * numrows); + + result->nulls[i] = (bool *) ptr; + ptr += MAXALIGN(sizeof(bool) * numrows); + } + + Assert((ptr - (char *) result) == len); + + /* we have it allocated, so let's fill the values */ + result->nattnums = nkeys; + result->numrows = numrows; + + /* fill the attribute info - first attributes, then expressions */ + idx = 0; + k = -1; + while ((k = bms_next_member(stat->columns, k)) >= 0) + { + result->attnums[idx] = k; + result->stats[idx] = stats[idx]; + + idx++; + } + + k = -1; + foreach(lc, stat->exprs) + { + Node *expr = (Node *) lfirst(lc); + + result->attnums[idx] = k; + result->stats[idx] = examine_expression(expr, stattarget); + + idx++; + k--; + } + + /* first extract values for all the regular attributes */ + for (i = 0; i < numrows; i++) + { + idx = 0; + k = -1; + while ((k = bms_next_member(stat->columns, k)) >= 0) + { + result->values[idx][i] = heap_getattr(rows[i], k, + result->stats[idx]->tupDesc, + &result->nulls[idx][i]); + + idx++; + } + } + + /* Need an EState for evaluation expressions. */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + + /* Need a slot to hold the current heap tuple, too */ + slot = MakeSingleTupleTableSlot(RelationGetDescr(rel), + &TTSOpsHeapTuple); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Set up expression evaluation state */ + exprstates = ExecPrepareExprList(stat->exprs, estate); + + for (i = 0; i < numrows; i++) + { + /* + * Reset the per-tuple context each time, to reclaim any cruft left + * behind by evaluating the statistics object expressions. + */ + ResetExprContext(econtext); + + /* Set up for expression evaluation */ + ExecStoreHeapTuple(rows[i], slot, false); + + idx = bms_num_members(stat->columns); + foreach(lc, exprstates) + { + Datum datum; + bool isnull; + ExprState *exprstate = (ExprState *) lfirst(lc); + + /* + * XXX This probably leaks memory. Maybe we should use + * ExecEvalExprSwitchContext but then we need to copy the result + * somewhere else. + */ + datum = ExecEvalExpr(exprstate, + GetPerTupleExprContext(estate), + &isnull); + if (isnull) + { + result->values[idx][i] = (Datum) 0; + result->nulls[idx][i] = true; + } + else + { + result->values[idx][i] = (Datum) datum; + result->nulls[idx][i] = false; + } + + idx++; + } + } + + ExecDropSingleTupleTableSlot(slot); + FreeExecutorState(estate); + + return result; +} diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index 8335dff241..2a00fb4848 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -74,7 +74,7 @@ ((ndims) * sizeof(DimensionInfo)) + \ ((nitems) * ITEM_SIZE(ndims))) -static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs); +static MultiSortSupport build_mss(StatsBuildData *data); static SortItem *build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss, int *ndistinct); @@ -181,32 +181,33 @@ get_mincount_for_mcv_list(int samplerows, double totalrows) * */ MCVList * -statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, - VacAttrStats **stats, double totalrows, int stattarget) +statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget) { int i, numattrs, + numrows, ngroups, nitems; - AttrNumber *attnums; double mincount; SortItem *items; SortItem *groups; MCVList *mcvlist = NULL; MultiSortSupport mss; - attnums = build_attnums_array(attrs, &numattrs); - /* comparator for all the columns */ - mss = build_mss(stats, numattrs); + mss = build_mss(data); /* sort the rows */ - items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc, - mss, numattrs, attnums); + items = build_sorted_items(data, &nitems, mss, + data->nattnums, data->attnums); if (!items) return NULL; + /* for convenience */ + numattrs = data->nattnums; + numrows = data->numrows; + /* transform the sorted rows into groups (sorted by frequency) */ groups = build_distinct_groups(nitems, items, mss, &ngroups); @@ -289,7 +290,7 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, /* store info about data type OIDs */ for (i = 0; i < numattrs; i++) - mcvlist->types[i] = stats[i]->attrtypid; + mcvlist->types[i] = data->stats[i]->attrtypid; /* Copy the first chunk of groups into the result. */ for (i = 0; i < nitems; i++) @@ -347,9 +348,10 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, * build MultiSortSupport for the attributes passed in attrs */ static MultiSortSupport -build_mss(VacAttrStats **stats, int numattrs) +build_mss(StatsBuildData *data) { int i; + int numattrs = data->nattnums; /* Sort by multiple columns (using array of SortSupport) */ MultiSortSupport mss = multi_sort_init(numattrs); @@ -357,7 +359,7 @@ build_mss(VacAttrStats **stats, int numattrs) /* prepare the sort functions for all the attributes */ for (i = 0; i < numattrs; i++) { - VacAttrStats *colstat = stats[i]; + VacAttrStats *colstat = data->stats[i]; TypeCacheEntry *type; type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR); @@ -1523,6 +1525,59 @@ pg_mcv_list_send(PG_FUNCTION_ARGS) return byteasend(fcinfo); } +/* + * match the attribute/expression to a dimension of the statistic + * + * Match the attribute/expression to statistics dimension. Optionally + * determine the collation. + */ +static int +mcv_match_expression(Node *expr, Bitmapset *keys, List *exprs, Oid *collid) +{ + int idx = -1; + + if (IsA(expr, Var)) + { + /* simple Var, so just lookup using varattno */ + Var *var = (Var *) expr; + + if (collid) + *collid = var->varcollid; + + idx = bms_member_index(keys, var->varattno); + + /* make sure the index is valid */ + Assert((idx >= 0) && (idx <= bms_num_members(keys))); + } + else + { + ListCell *lc; + + /* expressions are stored after the simple columns */ + idx = bms_num_members(keys); + + if (collid) + *collid = exprCollation(expr); + + /* expression - lookup in stats expressions */ + foreach(lc, exprs) + { + Node *stat_expr = (Node *) lfirst(lc); + + if (equal(expr, stat_expr)) + break; + + idx++; + } + + /* make sure the index is valid */ + Assert((idx >= bms_num_members(keys)) && + (idx <= bms_num_members(keys) + list_length(exprs))); + } + + return idx; +} + /* * mcv_get_match_bitmap * Evaluate clauses using the MCV list, and update the match bitmap. @@ -1544,7 +1599,8 @@ pg_mcv_list_send(PG_FUNCTION_ARGS) */ static bool * mcv_get_match_bitmap(PlannerInfo *root, List *clauses, - Bitmapset *keys, MCVList *mcvlist, bool is_or) + Bitmapset *keys, List *exprs, + MCVList *mcvlist, bool is_or) { int i; ListCell *l; @@ -1582,77 +1638,78 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, OpExpr *expr = (OpExpr *) clause; FmgrInfo opproc; - /* valid only after examine_clause_args returns true */ - Var *var; + /* valid only after examine_opclause_args returns true */ + Node *clause_expr; Const *cst; - bool varonleft; + bool expronleft; + int idx; + Oid collid; fmgr_info(get_opcode(expr->opno), &opproc); - /* extract the var and const from the expression */ - if (examine_clause_args(expr->args, &var, &cst, &varonleft)) - { - int idx; + /* extract the var/expr and const from the expression */ + if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft)) + elog(ERROR, "incompatible clause"); - /* match the attribute to a dimension of the statistic */ - idx = bms_member_index(keys, var->varattno); + /* match the attribute/expression to a dimension of the statistic */ + idx = mcv_match_expression(clause_expr, keys, exprs, &collid); + + /* + * Walk through the MCV items and evaluate the current clause. We + * can skip items that were already ruled out, and terminate if + * there are no remaining MCV items that might possibly match. + */ + for (i = 0; i < mcvlist->nitems; i++) + { + bool match = true; + MCVItem *item = &mcvlist->items[i]; + + Assert(idx >= 0); /* - * Walk through the MCV items and evaluate the current clause. - * We can skip items that were already ruled out, and - * terminate if there are no remaining MCV items that might - * possibly match. + * When the MCV item or the Const value is NULL we can treat + * this as a mismatch. We must not call the operator because + * of strictness. */ - for (i = 0; i < mcvlist->nitems; i++) + if (item->isnull[idx] || cst->constisnull) { - bool match = true; - MCVItem *item = &mcvlist->items[i]; - - /* - * When the MCV item or the Const value is NULL we can - * treat this as a mismatch. We must not call the operator - * because of strictness. - */ - if (item->isnull[idx] || cst->constisnull) - { - matches[i] = RESULT_MERGE(matches[i], is_or, false); - continue; - } - - /* - * Skip MCV items that can't change result in the bitmap. - * Once the value gets false for AND-lists, or true for - * OR-lists, we don't need to look at more clauses. - */ - if (RESULT_IS_FINAL(matches[i], is_or)) - continue; - - /* - * First check whether the constant is below the lower - * boundary (in that case we can skip the bucket, because - * there's no overlap). - * - * We don't store collations used to build the statistics, - * but we can use the collation for the attribute itself, - * as stored in varcollid. We do reset the statistics - * after a type change (including collation change), so - * this is OK. We may need to relax this after allowing - * extended statistics on expressions. - */ - if (varonleft) - match = DatumGetBool(FunctionCall2Coll(&opproc, - var->varcollid, - item->values[idx], - cst->constvalue)); - else - match = DatumGetBool(FunctionCall2Coll(&opproc, - var->varcollid, - cst->constvalue, - item->values[idx])); - - /* update the match bitmap with the result */ - matches[i] = RESULT_MERGE(matches[i], is_or, match); + matches[i] = RESULT_MERGE(matches[i], is_or, false); + continue; } + + /* + * Skip MCV items that can't change result in the bitmap. Once + * the value gets false for AND-lists, or true for OR-lists, + * we don't need to look at more clauses. + */ + if (RESULT_IS_FINAL(matches[i], is_or)) + continue; + + /* + * First check whether the constant is below the lower + * boundary (in that case we can skip the bucket, because + * there's no overlap). + * + * We don't store collations used to build the statistics, but + * we can use the collation for the attribute itself, as + * stored in varcollid. We do reset the statistics after a + * type change (including collation change), so this is OK. + * For expressions we use the collation extracted from the + * expression itself. + */ + if (expronleft) + match = DatumGetBool(FunctionCall2Coll(&opproc, + collid, + item->values[idx], + cst->constvalue)); + else + match = DatumGetBool(FunctionCall2Coll(&opproc, + collid, + cst->constvalue, + item->values[idx])); + + /* update the match bitmap with the result */ + matches[i] = RESULT_MERGE(matches[i], is_or, match); } } else if (IsA(clause, ScalarArrayOpExpr)) @@ -1660,115 +1717,116 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause; FmgrInfo opproc; - /* valid only after examine_clause_args returns true */ - Var *var; + /* valid only after examine_opclause_args returns true */ + Node *clause_expr; Const *cst; - bool varonleft; + bool expronleft; + Oid collid; + int idx; + + /* array evaluation */ + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int num_elems; + Datum *elem_values; + bool *elem_nulls; fmgr_info(get_opcode(expr->opno), &opproc); - /* extract the var and const from the expression */ - if (examine_clause_args(expr->args, &var, &cst, &varonleft)) + /* extract the var/expr and const from the expression */ + if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft)) + elog(ERROR, "incompatible clause"); + + /* ScalarArrayOpExpr has the Var always on the left */ + Assert(expronleft); + + /* XXX what if (cst->constisnull == NULL)? */ + if (!cst->constisnull) { - int idx; + arrayval = DatumGetArrayTypeP(cst->constvalue); + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elem_values, &elem_nulls, &num_elems); + } - ArrayType *arrayval; - int16 elmlen; - bool elmbyval; - char elmalign; - int num_elems; - Datum *elem_values; - bool *elem_nulls; + /* match the attribute/expression to a dimension of the statistic */ + idx = mcv_match_expression(clause_expr, keys, exprs, &collid); - /* ScalarArrayOpExpr has the Var always on the left */ - Assert(varonleft); - - if (!cst->constisnull) - { - arrayval = DatumGetArrayTypeP(cst->constvalue); - get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), - &elmlen, &elmbyval, &elmalign); - deconstruct_array(arrayval, - ARR_ELEMTYPE(arrayval), - elmlen, elmbyval, elmalign, - &elem_values, &elem_nulls, &num_elems); - } - - /* match the attribute to a dimension of the statistic */ - idx = bms_member_index(keys, var->varattno); + /* + * Walk through the MCV items and evaluate the current clause. We + * can skip items that were already ruled out, and terminate if + * there are no remaining MCV items that might possibly match. + */ + for (i = 0; i < mcvlist->nitems; i++) + { + int j; + bool match = (expr->useOr ? false : true); + MCVItem *item = &mcvlist->items[i]; /* - * Walk through the MCV items and evaluate the current clause. - * We can skip items that were already ruled out, and - * terminate if there are no remaining MCV items that might - * possibly match. + * When the MCV item or the Const value is NULL we can treat + * this as a mismatch. We must not call the operator because + * of strictness. */ - for (i = 0; i < mcvlist->nitems; i++) + if (item->isnull[idx] || cst->constisnull) { - int j; - bool match = (expr->useOr ? false : true); - MCVItem *item = &mcvlist->items[i]; - - /* - * When the MCV item or the Const value is NULL we can - * treat this as a mismatch. We must not call the operator - * because of strictness. - */ - if (item->isnull[idx] || cst->constisnull) - { - matches[i] = RESULT_MERGE(matches[i], is_or, false); - continue; - } - - /* - * Skip MCV items that can't change result in the bitmap. - * Once the value gets false for AND-lists, or true for - * OR-lists, we don't need to look at more clauses. - */ - if (RESULT_IS_FINAL(matches[i], is_or)) - continue; - - for (j = 0; j < num_elems; j++) - { - Datum elem_value = elem_values[j]; - bool elem_isnull = elem_nulls[j]; - bool elem_match; - - /* NULL values always evaluate as not matching. */ - if (elem_isnull) - { - match = RESULT_MERGE(match, expr->useOr, false); - continue; - } - - /* - * Stop evaluating the array elements once we reach - * match value that can't change - ALL() is the same - * as AND-list, ANY() is the same as OR-list. - */ - if (RESULT_IS_FINAL(match, expr->useOr)) - break; - - elem_match = DatumGetBool(FunctionCall2Coll(&opproc, - var->varcollid, - item->values[idx], - elem_value)); - - match = RESULT_MERGE(match, expr->useOr, elem_match); - } - - /* update the match bitmap with the result */ - matches[i] = RESULT_MERGE(matches[i], is_or, match); + matches[i] = RESULT_MERGE(matches[i], is_or, false); + continue; } + + /* + * Skip MCV items that can't change result in the bitmap. Once + * the value gets false for AND-lists, or true for OR-lists, + * we don't need to look at more clauses. + */ + if (RESULT_IS_FINAL(matches[i], is_or)) + continue; + + for (j = 0; j < num_elems; j++) + { + Datum elem_value = elem_values[j]; + bool elem_isnull = elem_nulls[j]; + bool elem_match; + + /* NULL values always evaluate as not matching. */ + if (elem_isnull) + { + match = RESULT_MERGE(match, expr->useOr, false); + continue; + } + + /* + * Stop evaluating the array elements once we reach match + * value that can't change - ALL() is the same as + * AND-list, ANY() is the same as OR-list. + */ + if (RESULT_IS_FINAL(match, expr->useOr)) + break; + + elem_match = DatumGetBool(FunctionCall2Coll(&opproc, + collid, + item->values[idx], + elem_value)); + + match = RESULT_MERGE(match, expr->useOr, elem_match); + } + + /* update the match bitmap with the result */ + matches[i] = RESULT_MERGE(matches[i], is_or, match); } } else if (IsA(clause, NullTest)) { NullTest *expr = (NullTest *) clause; - Var *var = (Var *) (expr->arg); + Node *clause_expr = (Node *) (expr->arg); - /* match the attribute to a dimension of the statistic */ - int idx = bms_member_index(keys, var->varattno); + /* match the attribute/expression to a dimension of the statistic */ + int idx = mcv_match_expression(clause_expr, keys, exprs, NULL); /* * Walk through the MCV items and evaluate the current clause. We @@ -1811,7 +1869,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, Assert(list_length(bool_clauses) >= 2); /* build the match bitmap for the OR-clauses */ - bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, + bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, exprs, mcvlist, is_orclause(clause)); /* @@ -1839,7 +1897,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses, Assert(list_length(not_args) == 1); /* build the match bitmap for the NOT-clause */ - not_matches = mcv_get_match_bitmap(root, not_args, keys, + not_matches = mcv_get_match_bitmap(root, not_args, keys, exprs, mcvlist, false); /* @@ -1982,7 +2040,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, mcv = statext_mcv_load(stat->statOid); /* build a match bitmap for the clauses */ - matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false); + matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs, + mcv, false); /* sum frequencies for all the matching MCV items */ *basesel = 0.0; @@ -2056,7 +2115,7 @@ mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat, /* build the match bitmap for the new clause */ new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys, - mcv, false); + stat->exprs, mcv, false); /* * Sum the frequencies for all the MCV items matching this clause and also diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c index e08c001e3f..4481312d61 100644 --- a/src/backend/statistics/mvdistinct.c +++ b/src/backend/statistics/mvdistinct.c @@ -36,8 +36,7 @@ #include "utils/syscache.h" #include "utils/typcache.h" -static double ndistinct_for_combination(double totalrows, int numrows, - HeapTuple *rows, VacAttrStats **stats, +static double ndistinct_for_combination(double totalrows, StatsBuildData *data, int k, int *combination); static double estimate_ndistinct(double totalrows, int numrows, int d, int f1); static int n_choose_k(int n, int k); @@ -81,15 +80,18 @@ static void generate_combinations(CombinationGenerator *state); * * This computes the ndistinct estimate using the same estimator used * in analyze.c and then computes the coefficient. + * + * To handle expressions easily, we treat them as system attributes with + * negative attnums, and offset everything by number of expressions to + * allow using Bitmapsets. */ MVNDistinct * -statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows, - Bitmapset *attrs, VacAttrStats **stats) +statext_ndistinct_build(double totalrows, StatsBuildData *data) { MVNDistinct *result; int k; int itemcnt; - int numattrs = bms_num_members(attrs); + int numattrs = data->nattnums; int numcombs = num_combinations(numattrs); result = palloc(offsetof(MVNDistinct, items) + @@ -112,13 +114,19 @@ statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows, MVNDistinctItem *item = &result->items[itemcnt]; int j; - item->attrs = NULL; + item->attributes = palloc(sizeof(AttrNumber) * k); + item->nattributes = k; + + /* translate the indexes to attnums */ for (j = 0; j < k; j++) - item->attrs = bms_add_member(item->attrs, - stats[combination[j]]->attr->attnum); + { + item->attributes[j] = data->attnums[combination[j]]; + + Assert(AttributeNumberIsValid(item->attributes[j])); + } + item->ndistinct = - ndistinct_for_combination(totalrows, numrows, rows, - stats, k, combination); + ndistinct_for_combination(totalrows, data, k, combination); itemcnt++; Assert(itemcnt <= result->nitems); @@ -189,7 +197,7 @@ statext_ndistinct_serialize(MVNDistinct *ndistinct) { int nmembers; - nmembers = bms_num_members(ndistinct->items[i].attrs); + nmembers = ndistinct->items[i].nattributes; Assert(nmembers >= 2); len += SizeOfItem(nmembers); @@ -214,22 +222,15 @@ statext_ndistinct_serialize(MVNDistinct *ndistinct) for (i = 0; i < ndistinct->nitems; i++) { MVNDistinctItem item = ndistinct->items[i]; - int nmembers = bms_num_members(item.attrs); - int x; + int nmembers = item.nattributes; memcpy(tmp, &item.ndistinct, sizeof(double)); tmp += sizeof(double); memcpy(tmp, &nmembers, sizeof(int)); tmp += sizeof(int); - x = -1; - while ((x = bms_next_member(item.attrs, x)) >= 0) - { - AttrNumber value = (AttrNumber) x; - - memcpy(tmp, &value, sizeof(AttrNumber)); - tmp += sizeof(AttrNumber); - } + memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers); + tmp += nmembers * sizeof(AttrNumber); /* protect against overflows */ Assert(tmp <= ((char *) output + len)); @@ -301,27 +302,21 @@ statext_ndistinct_deserialize(bytea *data) for (i = 0; i < ndistinct->nitems; i++) { MVNDistinctItem *item = &ndistinct->items[i]; - int nelems; - - item->attrs = NULL; /* ndistinct value */ memcpy(&item->ndistinct, tmp, sizeof(double)); tmp += sizeof(double); /* number of attributes */ - memcpy(&nelems, tmp, sizeof(int)); + memcpy(&item->nattributes, tmp, sizeof(int)); tmp += sizeof(int); - Assert((nelems >= 2) && (nelems <= STATS_MAX_DIMENSIONS)); + Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS)); - while (nelems-- > 0) - { - AttrNumber attno; + item->attributes + = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber)); - memcpy(&attno, tmp, sizeof(AttrNumber)); - tmp += sizeof(AttrNumber); - item->attrs = bms_add_member(item->attrs, attno); - } + memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes); + tmp += sizeof(AttrNumber) * item->nattributes; /* still within the bytea */ Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); @@ -369,17 +364,17 @@ pg_ndistinct_out(PG_FUNCTION_ARGS) for (i = 0; i < ndist->nitems; i++) { + int j; MVNDistinctItem item = ndist->items[i]; - int x = -1; - bool first = true; if (i > 0) appendStringInfoString(&str, ", "); - while ((x = bms_next_member(item.attrs, x)) >= 0) + for (j = 0; j < item.nattributes; j++) { - appendStringInfo(&str, "%s%d", first ? "\"" : ", ", x); - first = false; + AttrNumber attnum = item.attributes[j]; + + appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum); } appendStringInfo(&str, "\": %d", (int) item.ndistinct); } @@ -427,8 +422,8 @@ pg_ndistinct_send(PG_FUNCTION_ARGS) * combination of multiple columns. */ static double -ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows, - VacAttrStats **stats, int k, int *combination) +ndistinct_for_combination(double totalrows, StatsBuildData *data, + int k, int *combination) { int i, j; @@ -439,6 +434,7 @@ ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows, Datum *values; SortItem *items; MultiSortSupport mss; + int numrows = data->numrows; mss = multi_sort_init(k); @@ -467,25 +463,27 @@ ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows, */ for (i = 0; i < k; i++) { - VacAttrStats *colstat = stats[combination[i]]; + Oid typid; TypeCacheEntry *type; + Oid collid = InvalidOid; + VacAttrStats *colstat = data->stats[combination[i]]; - type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR); + typid = colstat->attrtypid; + collid = colstat->attrcollid; + + type = lookup_type_cache(typid, TYPECACHE_LT_OPR); if (type->lt_opr == InvalidOid) /* shouldn't happen */ elog(ERROR, "cache lookup failed for ordering operator for type %u", - colstat->attrtypid); + typid); /* prepare the sort function for this dimension */ - multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid); + multi_sort_add_dimension(mss, i, type->lt_opr, collid); /* accumulate all the data for this dimension into the arrays */ for (j = 0; j < numrows; j++) { - items[j].values[i] = - heap_getattr(rows[j], - colstat->attr->attnum, - colstat->tupDesc, - &items[j].isnull[i]); + items[j].values[i] = data->values[combination[i]][j]; + items[j].isnull[i] = data->nulls[combination[i]][j]; } } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 729274b330..16c6f17e23 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1816,7 +1816,34 @@ ProcessUtilitySlow(ParseState *pstate, break; case T_CreateStatsStmt: - address = CreateStatistics((CreateStatsStmt *) parsetree); + { + Oid relid; + CreateStatsStmt *stmt = (CreateStatsStmt *) parsetree; + RangeVar *rel = (RangeVar *) linitial(stmt->relations); + + if (!IsA(rel, RangeVar)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only a single relation is allowed in CREATE STATISTICS"))); + + /* + * CREATE STATISTICS will influence future execution plans + * but does not interfere with currently executing plans. + * So it should be enough to take ShareUpdateExclusiveLock + * on relation, conflicting with ANALYZE and other DDL + * that sets statistical information, but not with normal + * queries. + * + * XXX RangeVarCallbackOwnsRelation not needed here, to + * keep the same behavior as before. + */ + relid = RangeVarGetRelid(rel, ShareUpdateExclusiveLock, false); + + /* Run parse analysis ... */ + stmt = transformStatsStmt(relid, stmt, queryString); + + address = CreateStatistics(stmt); + } break; case T_AlterStatsStmt: diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index f0de2a25c9..3de98d2333 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -336,7 +336,8 @@ static char *pg_get_indexdef_worker(Oid indexrelid, int colno, bool attrsOnly, bool keysOnly, bool showTblSpc, bool inherits, int prettyFlags, bool missing_ok); -static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok); +static char *pg_get_statisticsobj_worker(Oid statextid, bool columns_only, + bool missing_ok); static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags, bool attrsOnly, bool missing_ok); static char *pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, @@ -1507,7 +1508,36 @@ pg_get_statisticsobjdef(PG_FUNCTION_ARGS) Oid statextid = PG_GETARG_OID(0); char *res; - res = pg_get_statisticsobj_worker(statextid, true); + res = pg_get_statisticsobj_worker(statextid, false, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* + * Internal version for use by ALTER TABLE. + * Includes a tablespace clause in the result. + * Returns a palloc'd C string; no pretty-printing. + */ +char * +pg_get_statisticsobjdef_string(Oid statextid) +{ + return pg_get_statisticsobj_worker(statextid, false, false); +} + +/* + * pg_get_statisticsobjdef_columns + * Get columns and expressions for an extended statistics object + */ +Datum +pg_get_statisticsobjdef_columns(PG_FUNCTION_ARGS) +{ + Oid statextid = PG_GETARG_OID(0); + char *res; + + res = pg_get_statisticsobj_worker(statextid, true, true); if (res == NULL) PG_RETURN_NULL(); @@ -1519,7 +1549,7 @@ pg_get_statisticsobjdef(PG_FUNCTION_ARGS) * Internal workhorse to decompile an extended statistics object. */ static char * -pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) +pg_get_statisticsobj_worker(Oid statextid, bool columns_only, bool missing_ok) { Form_pg_statistic_ext statextrec; HeapTuple statexttup; @@ -1534,6 +1564,11 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) bool dependencies_enabled; bool mcv_enabled; int i; + List *context; + ListCell *lc; + List *exprs = NIL; + bool has_exprs; + int ncolumns; statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid)); @@ -1544,75 +1579,114 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) elog(ERROR, "cache lookup failed for statistics object %u", statextid); } + /* has the statistics expressions? */ + has_exprs = !heap_attisnull(statexttup, Anum_pg_statistic_ext_stxexprs, NULL); + statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup); + /* + * Get the statistics expressions, if any. (NOTE: we do not use the + * relcache versions of the expressions, because we want to display + * non-const-folded expressions.) + */ + if (has_exprs) + { + Datum exprsDatum; + bool isnull; + char *exprsString; + + exprsDatum = SysCacheGetAttr(STATEXTOID, statexttup, + Anum_pg_statistic_ext_stxexprs, &isnull); + Assert(!isnull); + exprsString = TextDatumGetCString(exprsDatum); + exprs = (List *) stringToNode(exprsString); + pfree(exprsString); + } + else + exprs = NIL; + + /* count the number of columns (attributes and expressions) */ + ncolumns = statextrec->stxkeys.dim1 + list_length(exprs); + initStringInfo(&buf); - nsp = get_namespace_name(statextrec->stxnamespace); - appendStringInfo(&buf, "CREATE STATISTICS %s", - quote_qualified_identifier(nsp, - NameStr(statextrec->stxname))); - - /* - * Decode the stxkind column so that we know which stats types to print. - */ - datum = SysCacheGetAttr(STATEXTOID, statexttup, - Anum_pg_statistic_ext_stxkind, &isnull); - Assert(!isnull); - arr = DatumGetArrayTypeP(datum); - if (ARR_NDIM(arr) != 1 || - ARR_HASNULL(arr) || - ARR_ELEMTYPE(arr) != CHAROID) - elog(ERROR, "stxkind is not a 1-D char array"); - enabled = (char *) ARR_DATA_PTR(arr); - - ndistinct_enabled = false; - dependencies_enabled = false; - mcv_enabled = false; - - for (i = 0; i < ARR_DIMS(arr)[0]; i++) + if (!columns_only) { - if (enabled[i] == STATS_EXT_NDISTINCT) - ndistinct_enabled = true; - if (enabled[i] == STATS_EXT_DEPENDENCIES) - dependencies_enabled = true; - if (enabled[i] == STATS_EXT_MCV) - mcv_enabled = true; - } + nsp = get_namespace_name(statextrec->stxnamespace); + appendStringInfo(&buf, "CREATE STATISTICS %s", + quote_qualified_identifier(nsp, + NameStr(statextrec->stxname))); - /* - * If any option is disabled, then we'll need to append the types clause - * to show which options are enabled. We omit the types clause on purpose - * when all options are enabled, so a pg_dump/pg_restore will create all - * statistics types on a newer postgres version, if the statistics had all - * options enabled on the original version. - */ - if (!ndistinct_enabled || !dependencies_enabled || !mcv_enabled) - { - bool gotone = false; + /* + * Decode the stxkind column so that we know which stats types to + * print. + */ + datum = SysCacheGetAttr(STATEXTOID, statexttup, + Anum_pg_statistic_ext_stxkind, &isnull); + Assert(!isnull); + arr = DatumGetArrayTypeP(datum); + if (ARR_NDIM(arr) != 1 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != CHAROID) + elog(ERROR, "stxkind is not a 1-D char array"); + enabled = (char *) ARR_DATA_PTR(arr); - appendStringInfoString(&buf, " ("); + ndistinct_enabled = false; + dependencies_enabled = false; + mcv_enabled = false; - if (ndistinct_enabled) + for (i = 0; i < ARR_DIMS(arr)[0]; i++) { - appendStringInfoString(&buf, "ndistinct"); - gotone = true; + if (enabled[i] == STATS_EXT_NDISTINCT) + ndistinct_enabled = true; + else if (enabled[i] == STATS_EXT_DEPENDENCIES) + dependencies_enabled = true; + else if (enabled[i] == STATS_EXT_MCV) + mcv_enabled = true; + + /* ignore STATS_EXT_EXPRESSIONS (it's built automatically) */ } - if (dependencies_enabled) + /* + * If any option is disabled, then we'll need to append the types + * clause to show which options are enabled. We omit the types clause + * on purpose when all options are enabled, so a pg_dump/pg_restore + * will create all statistics types on a newer postgres version, if + * the statistics had all options enabled on the original version. + * + * But if the statistics is defined on just a single column, it has to + * be an expression statistics. In that case we don't need to specify + * kinds. + */ + if ((!ndistinct_enabled || !dependencies_enabled || !mcv_enabled) && + (ncolumns > 1)) { - appendStringInfo(&buf, "%sdependencies", gotone ? ", " : ""); - gotone = true; + bool gotone = false; + + appendStringInfoString(&buf, " ("); + + if (ndistinct_enabled) + { + appendStringInfoString(&buf, "ndistinct"); + gotone = true; + } + + if (dependencies_enabled) + { + appendStringInfo(&buf, "%sdependencies", gotone ? ", " : ""); + gotone = true; + } + + if (mcv_enabled) + appendStringInfo(&buf, "%smcv", gotone ? ", " : ""); + + appendStringInfoChar(&buf, ')'); } - if (mcv_enabled) - appendStringInfo(&buf, "%smcv", gotone ? ", " : ""); - - appendStringInfoChar(&buf, ')'); + appendStringInfoString(&buf, " ON "); } - appendStringInfoString(&buf, " ON "); - + /* decode simple column references */ for (colno = 0; colno < statextrec->stxkeys.dim1; colno++) { AttrNumber attnum = statextrec->stxkeys.values[colno]; @@ -1626,14 +1700,109 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) appendStringInfoString(&buf, quote_identifier(attname)); } - appendStringInfo(&buf, " FROM %s", - generate_relation_name(statextrec->stxrelid, NIL)); + context = deparse_context_for(get_relation_name(statextrec->stxrelid), + statextrec->stxrelid); + + foreach(lc, exprs) + { + Node *expr = (Node *) lfirst(lc); + char *str; + int prettyFlags = PRETTYFLAG_INDENT; + + str = deparse_expression_pretty(expr, context, false, false, + prettyFlags, 0); + + if (colno > 0) + appendStringInfoString(&buf, ", "); + + /* Need parens if it's not a bare function call */ + if (looks_like_function(expr)) + appendStringInfoString(&buf, str); + else + appendStringInfo(&buf, "(%s)", str); + + colno++; + } + + if (!columns_only) + appendStringInfo(&buf, " FROM %s", + generate_relation_name(statextrec->stxrelid, NIL)); ReleaseSysCache(statexttup); return buf.data; } +/* + * Generate text array of expressions for statistics object. + */ +Datum +pg_get_statisticsobjdef_expressions(PG_FUNCTION_ARGS) +{ + Oid statextid = PG_GETARG_OID(0); + Form_pg_statistic_ext statextrec; + HeapTuple statexttup; + Datum datum; + bool isnull; + List *context; + ListCell *lc; + List *exprs = NIL; + bool has_exprs; + char *tmp; + ArrayBuildState *astate = NULL; + + statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid)); + + if (!HeapTupleIsValid(statexttup)) + elog(ERROR, "cache lookup failed for statistics object %u", statextid); + + /* has the statistics expressions? */ + has_exprs = !heap_attisnull(statexttup, Anum_pg_statistic_ext_stxexprs, NULL); + + /* no expressions? we're done */ + if (!has_exprs) + { + ReleaseSysCache(statexttup); + PG_RETURN_NULL(); + } + + statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup); + + /* + * Get the statistics expressions, and deparse them into text values. + */ + datum = SysCacheGetAttr(STATEXTOID, statexttup, + Anum_pg_statistic_ext_stxexprs, &isnull); + + Assert(!isnull); + tmp = TextDatumGetCString(datum); + exprs = (List *) stringToNode(tmp); + pfree(tmp); + + context = deparse_context_for(get_relation_name(statextrec->stxrelid), + statextrec->stxrelid); + + foreach(lc, exprs) + { + Node *expr = (Node *) lfirst(lc); + char *str; + int prettyFlags = PRETTYFLAG_INDENT; + + str = deparse_expression_pretty(expr, context, false, false, + prettyFlags, 0); + + astate = accumArrayResult(astate, + PointerGetDatum(cstring_to_text(str)), + false, + TEXTOID, + CurrentMemoryContext); + } + + ReleaseSysCache(statexttup); + + PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext)); +} + /* * pg_get_partkeydef * diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 2348d4a772..7e41bc5641 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3430,6 +3430,14 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, * If examine_variable is able to deduce anything about the GROUP BY * expression, treat it as a single variable even if it's really more * complicated. + * + * XXX This has the consequence that if there's a statistics on the + * expression, we don't split it into individual Vars. This affects + * our selection of statistics in estimate_multivariate_ndistinct, + * because it's probably better to use more accurate estimate for + * each expression and treat them as independent, than to combine + * estimates for the extracted variables when we don't know how that + * relates to the expressions. */ examine_variable(root, groupexpr, 0, &vardata); if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique) @@ -3880,50 +3888,77 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel, List **varinfos, double *ndistinct) { ListCell *lc; - Bitmapset *attnums = NULL; - int nmatches; + int nmatches_vars; + int nmatches_exprs; Oid statOid = InvalidOid; MVNDistinct *stats; - Bitmapset *matched = NULL; + StatisticExtInfo *matched_info = NULL; /* bail out immediately if the table has no extended statistics */ if (!rel->statlist) return false; - /* Determine the attnums we're looking for */ - foreach(lc, *varinfos) - { - GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc); - AttrNumber attnum; - - Assert(varinfo->rel == rel); - - if (!IsA(varinfo->var, Var)) - continue; - - attnum = ((Var *) varinfo->var)->varattno; - - if (!AttrNumberIsForUserDefinedAttr(attnum)) - continue; - - attnums = bms_add_member(attnums, attnum); - } - /* look for the ndistinct statistics matching the most vars */ - nmatches = 1; /* we require at least two matches */ + nmatches_vars = 0; /* we require at least two matches */ + nmatches_exprs = 0; foreach(lc, rel->statlist) { + ListCell *lc2; StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc); - Bitmapset *shared; - int nshared; + int nshared_vars = 0; + int nshared_exprs = 0; /* skip statistics of other kinds */ if (info->kind != STATS_EXT_NDISTINCT) continue; - /* compute attnums shared by the vars and the statistics object */ - shared = bms_intersect(info->keys, attnums); - nshared = bms_num_members(shared); + /* + * Determine how many expressions (and variables in non-matched + * expressions) match. We'll then use these numbers to pick the + * statistics object that best matches the clauses. + */ + foreach(lc2, *varinfos) + { + ListCell *lc3; + GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2); + AttrNumber attnum; + + Assert(varinfo->rel == rel); + + /* simple Var, search in statistics keys directly */ + if (IsA(varinfo->var, Var)) + { + attnum = ((Var *) varinfo->var)->varattno; + + /* + * Ignore system attributes - we don't support statistics on + * them, so can't match them (and it'd fail as the values are + * negative). + */ + if (!AttrNumberIsForUserDefinedAttr(attnum)) + continue; + + if (bms_is_member(attnum, info->keys)) + nshared_vars++; + + continue; + } + + /* expression - see if it's in the statistics */ + foreach(lc3, info->exprs) + { + Node *expr = (Node *) lfirst(lc3); + + if (equal(varinfo->var, expr)) + { + nshared_exprs++; + break; + } + } + } + + if (nshared_vars + nshared_exprs < 2) + continue; /* * Does this statistics object match more columns than the currently @@ -3932,18 +3967,21 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel, * XXX This should break ties using name of the object, or something * like that, to make the outcome stable. */ - if (nshared > nmatches) + if ((nshared_exprs > nmatches_exprs) || + (((nshared_exprs == nmatches_exprs)) && (nshared_vars > nmatches_vars))) { statOid = info->statOid; - nmatches = nshared; - matched = shared; + nmatches_vars = nshared_vars; + nmatches_exprs = nshared_exprs; + matched_info = info; } } /* No match? */ if (statOid == InvalidOid) return false; - Assert(nmatches > 1 && matched != NULL); + + Assert(nmatches_vars + nmatches_exprs > 1); stats = statext_ndistinct_load(statOid); @@ -3956,20 +3994,135 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel, int i; List *newlist = NIL; MVNDistinctItem *item = NULL; + ListCell *lc2; + Bitmapset *matched = NULL; + AttrNumber attnum_offset; + + /* + * How much we need to offset the attnums? If there are no + * expressions, no offset is needed. Otherwise offset enough to move + * the lowest one (which is equal to number of expressions) to 1. + */ + if (matched_info->exprs) + attnum_offset = (list_length(matched_info->exprs) + 1); + else + attnum_offset = 0; + + /* see what actually matched */ + foreach(lc2, *varinfos) + { + ListCell *lc3; + int idx; + bool found = false; + + GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2); + + /* + * Process a simple Var expression, by matching it to keys + * directly. If there's a matchine expression, we'll try + * matching it later. + */ + if (IsA(varinfo->var, Var)) + { + AttrNumber attnum = ((Var *) varinfo->var)->varattno; + + /* + * Ignore expressions on system attributes. Can't rely on + * the bms check for negative values. + */ + if (!AttrNumberIsForUserDefinedAttr(attnum)) + continue; + + /* Is the variable covered by the statistics? */ + if (!bms_is_member(attnum, matched_info->keys)) + continue; + + attnum = attnum + attnum_offset; + + /* ensure sufficient offset */ + Assert(AttrNumberIsForUserDefinedAttr(attnum)); + + matched = bms_add_member(matched, attnum); + + found = true; + } + + /* + * XXX Maybe we should allow searching the expressions even if we + * found an attribute matching the expression? That would handle + * trivial expressions like "(a)" but it seems fairly useless. + */ + if (found) + continue; + + /* expression - see if it's in the statistics */ + idx = 0; + foreach(lc3, matched_info->exprs) + { + Node *expr = (Node *) lfirst(lc3); + + if (equal(varinfo->var, expr)) + { + AttrNumber attnum = -(idx + 1); + + attnum = attnum + attnum_offset; + + /* ensure sufficient offset */ + Assert(AttrNumberIsForUserDefinedAttr(attnum)); + + matched = bms_add_member(matched, attnum); + + /* there should be just one matching expression */ + break; + } + + idx++; + } + } /* Find the specific item that exactly matches the combination */ for (i = 0; i < stats->nitems; i++) { + int j; MVNDistinctItem *tmpitem = &stats->items[i]; - if (bms_subset_compare(tmpitem->attrs, matched) == BMS_EQUAL) + if (tmpitem->nattributes != bms_num_members(matched)) + continue; + + /* assume it's the right item */ + item = tmpitem; + + /* check that all item attributes/expressions fit the match */ + for (j = 0; j < tmpitem->nattributes; j++) { - item = tmpitem; - break; + AttrNumber attnum = tmpitem->attributes[j]; + + /* + * Thanks to how we constructed the matched bitmap above, we + * can just offset all attnums the same way. + */ + attnum = attnum + attnum_offset; + + if (!bms_is_member(attnum, matched)) + { + /* nah, it's not this item */ + item = NULL; + break; + } } + + /* + * If the item has all the matched attributes, we know it's the + * right one - there can't be a better one. matching more. + */ + if (item) + break; } - /* make sure we found an item */ + /* + * Make sure we found an item. There has to be one, because ndistinct + * statistics includes all combinations of attributes. + */ if (!item) elog(ERROR, "corrupt MVNDistinct entry"); @@ -3977,18 +4130,63 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel, foreach(lc, *varinfos) { GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc); - AttrNumber attnum; + ListCell *lc3; + bool found = false; - if (!IsA(varinfo->var, Var)) + /* + * Let's look at plain variables first, because it's the most + * common case and the check is quite cheap. We can simply get the + * attnum and check (with an offset) matched bitmap. + */ + if (IsA(varinfo->var, Var)) { - newlist = lappend(newlist, varinfo); + AttrNumber attnum = ((Var *) varinfo->var)->varattno; + + /* + * If it's a system attribute, we're done. We don't support + * extended statistics on system attributes, so it's clearly + * not matched. Just keep the expression and continue. + */ + if (!AttrNumberIsForUserDefinedAttr(attnum)) + { + newlist = lappend(newlist, varinfo); + continue; + } + + /* apply the same offset as above */ + attnum += attnum_offset; + + /* if it's not matched, keep the varinfo */ + if (!bms_is_member(attnum, matched)) + newlist = lappend(newlist, varinfo); + + /* The rest of the loop deals with complex expressions. */ continue; } - attnum = ((Var *) varinfo->var)->varattno; + /* + * Process complex expressions, not just simple Vars. + * + * First, we search for an exact match of an expression. If we + * find one, we can just discard the whole GroupExprInfo, with all + * the variables we extracted from it. + * + * Otherwise we inspect the individual vars, and try matching it + * to variables in the item. + */ + foreach(lc3, matched_info->exprs) + { + Node *expr = (Node *) lfirst(lc3); - if (AttrNumberIsForUserDefinedAttr(attnum) && - bms_is_member(attnum, matched)) + if (equal(varinfo->var, expr)) + { + found = true; + break; + } + } + + /* found exact match, skip */ + if (found) continue; newlist = lappend(newlist, varinfo); @@ -4690,6 +4888,13 @@ get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo, *join_is_reversed = false; } +/* statext_expressions_load copies the tuple, so just pfree it. */ +static void +ReleaseDummy(HeapTuple tuple) +{ + pfree(tuple); +} + /* * examine_variable * Try to look up statistical data about an expression. @@ -4830,6 +5035,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, * operator we are estimating for. FIXME later. */ ListCell *ilist; + ListCell *slist; foreach(ilist, onerel->indexlist) { @@ -4986,6 +5192,129 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid, if (vardata->statsTuple) break; } + + /* + * Search extended statistics for one with a matching expression. + * There might be multiple ones, so just grab the first one. In the + * future, we might consider the statistics target (and pick the most + * accurate statistics) and maybe some other parameters. + */ + foreach(slist, onerel->statlist) + { + StatisticExtInfo *info = (StatisticExtInfo *) lfirst(slist); + ListCell *expr_item; + int pos; + + /* + * Stop once we've found statistics for the expression (either + * from extended stats, or for an index in the preceding loop). + */ + if (vardata->statsTuple) + break; + + /* skip stats without per-expression stats */ + if (info->kind != STATS_EXT_EXPRESSIONS) + continue; + + pos = 0; + foreach(expr_item, info->exprs) + { + Node *expr = (Node *) lfirst(expr_item); + + Assert(expr); + + /* strip RelabelType before comparing it */ + if (expr && IsA(expr, RelabelType)) + expr = (Node *) ((RelabelType *) expr)->arg; + + /* found a match, see if we can extract pg_statistic row */ + if (equal(node, expr)) + { + HeapTuple t = statext_expressions_load(info->statOid, pos); + + /* Get index's table for permission check */ + RangeTblEntry *rte; + Oid userid; + + vardata->statsTuple = t; + + /* + * XXX Not sure if we should cache the tuple somewhere. + * Now we just create a new copy every time. + */ + vardata->freefunc = ReleaseDummy; + + rte = planner_rt_fetch(onerel->relid, root); + Assert(rte->rtekind == RTE_RELATION); + + /* + * Use checkAsUser if it's set, in case we're accessing + * the table via a view. + */ + userid = rte->checkAsUser ? rte->checkAsUser : GetUserId(); + + /* + * For simplicity, we insist on the whole table being + * selectable, rather than trying to identify which + * column(s) the statistics depends on. Also require all + * rows to be selectable --- there must be no + * securityQuals from security barrier views or RLS + * policies. + */ + vardata->acl_ok = + rte->securityQuals == NIL && + (pg_class_aclcheck(rte->relid, userid, + ACL_SELECT) == ACLCHECK_OK); + + /* + * If the user doesn't have permissions to access an + * inheritance child relation, check the permissions of + * the table actually mentioned in the query, since most + * likely the user does have that permission. Note that + * whole-table select privilege on the parent doesn't + * quite guarantee that the user could read all columns of + * the child. But in practice it's unlikely that any + * interesting security violation could result from + * allowing access to the expression stats, so we allow it + * anyway. See similar code in examine_simple_variable() + * for additional comments. + */ + if (!vardata->acl_ok && + root->append_rel_array != NULL) + { + AppendRelInfo *appinfo; + Index varno = onerel->relid; + + appinfo = root->append_rel_array[varno]; + while (appinfo && + planner_rt_fetch(appinfo->parent_relid, + root)->rtekind == RTE_RELATION) + { + varno = appinfo->parent_relid; + appinfo = root->append_rel_array[varno]; + } + if (varno != onerel->relid) + { + /* Repeat access check on this rel */ + rte = planner_rt_fetch(varno, root); + Assert(rte->rtekind == RTE_RELATION); + + userid = rte->checkAsUser ? rte->checkAsUser : GetUserId(); + + vardata->acl_ok = + rte->securityQuals == NIL && + (pg_class_aclcheck(rte->relid, + userid, + ACL_SELECT) == ACLCHECK_OK); + } + } + + break; + } + + pos++; + } + } } } diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index 737e46464a..86113df29c 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -2637,6 +2637,18 @@ my %tests = ( unlike => { exclude_dump_test_schema => 1, }, }, + 'CREATE STATISTICS extended_stats_expression' => { + create_order => 99, + create_sql => 'CREATE STATISTICS dump_test.test_ext_stats_expr + ON (2 * col1) FROM dump_test.test_fifth_table', + regexp => qr/^ + \QCREATE STATISTICS dump_test.test_ext_stats_expr ON ((2 * col1)) FROM dump_test.test_fifth_table;\E + /xms, + like => + { %full_runs, %dump_test_schema_runs, section_post_data => 1, }, + unlike => { exclude_dump_test_schema => 1, }, + }, + 'CREATE SEQUENCE test_table_col1_seq' => { regexp => qr/^ \QCREATE SEQUENCE dump_test.test_table_col1_seq\E diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index e56cc43e11..440249ff69 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -2712,7 +2712,104 @@ describeOneTableDetails(const char *schemaname, } /* print any extended statistics */ - if (pset.sversion >= 100000) + if (pset.sversion >= 140000) + { + printfPQExpBuffer(&buf, + "SELECT oid, " + "stxrelid::pg_catalog.regclass, " + "stxnamespace::pg_catalog.regnamespace AS nsp, " + "stxname,\n" + "pg_get_statisticsobjdef_columns(oid) AS columns,\n" + " 'd' = any(stxkind) AS ndist_enabled,\n" + " 'f' = any(stxkind) AS deps_enabled,\n" + " 'm' = any(stxkind) AS mcv_enabled,\n" + "stxstattarget\n" + "FROM pg_catalog.pg_statistic_ext stat\n" + "WHERE stxrelid = '%s'\n" + "ORDER BY 1;", + oid); + + result = PSQLexec(buf.data); + if (!result) + goto error_return; + else + tuples = PQntuples(result); + + if (tuples > 0) + { + printTableAddFooter(&cont, _("Statistics objects:")); + + for (i = 0; i < tuples; i++) + { + bool gotone = false; + bool has_ndistinct; + bool has_dependencies; + bool has_mcv; + bool has_all; + bool has_some; + + has_ndistinct = (strcmp(PQgetvalue(result, i, 5), "t") == 0); + has_dependencies = (strcmp(PQgetvalue(result, i, 6), "t") == 0); + has_mcv = (strcmp(PQgetvalue(result, i, 7), "t") == 0); + + printfPQExpBuffer(&buf, " "); + + /* statistics object name (qualified with namespace) */ + appendPQExpBuffer(&buf, "\"%s\".\"%s\"", + PQgetvalue(result, i, 2), + PQgetvalue(result, i, 3)); + + /* + * When printing kinds we ignore expression statistics, + * which is used only internally and can't be specified by + * user. We don't print the kinds when either none are + * specified (in which case it has to be statistics on a + * single expr) or when all are specified (in which case + * we assume it's expanded by CREATE STATISTICS). + */ + has_all = (has_ndistinct && has_dependencies && has_mcv); + has_some = (has_ndistinct || has_dependencies || has_mcv); + + if (has_some && !has_all) + { + appendPQExpBuffer(&buf, " ("); + + /* options */ + if (has_ndistinct) + { + appendPQExpBufferStr(&buf, "ndistinct"); + gotone = true; + } + + if (has_dependencies) + { + appendPQExpBuffer(&buf, "%sdependencies", gotone ? ", " : ""); + gotone = true; + } + + if (has_mcv) + { + appendPQExpBuffer(&buf, "%smcv", gotone ? ", " : ""); + } + + appendPQExpBuffer(&buf, ")"); + } + + appendPQExpBuffer(&buf, " ON %s FROM %s", + PQgetvalue(result, i, 4), + PQgetvalue(result, i, 1)); + + /* Show the stats target if it's not default */ + if (strcmp(PQgetvalue(result, i, 8), "-1") != 0) + appendPQExpBuffer(&buf, "; STATISTICS %s", + PQgetvalue(result, i, 8)); + + printTableAddFooter(&cont, buf.data); + } + } + PQclear(result); + } + else if (pset.sversion >= 100000) { printfPQExpBuffer(&buf, "SELECT oid, " @@ -4468,18 +4565,27 @@ listExtendedStats(const char *pattern) printfPQExpBuffer(&buf, "SELECT \n" "es.stxnamespace::pg_catalog.regnamespace::text AS \"%s\", \n" - "es.stxname AS \"%s\", \n" - "pg_catalog.format('%%s FROM %%s', \n" - " (SELECT pg_catalog.string_agg(pg_catalog.quote_ident(a.attname),', ') \n" - " FROM pg_catalog.unnest(es.stxkeys) s(attnum) \n" - " JOIN pg_catalog.pg_attribute a \n" - " ON (es.stxrelid = a.attrelid \n" - " AND a.attnum = s.attnum \n" - " AND NOT a.attisdropped)), \n" - "es.stxrelid::regclass) AS \"%s\"", + "es.stxname AS \"%s\", \n", gettext_noop("Schema"), - gettext_noop("Name"), - gettext_noop("Definition")); + gettext_noop("Name")); + + if (pset.sversion >= 140000) + appendPQExpBuffer(&buf, + "pg_catalog.format('%%s FROM %%s', \n" + " pg_get_statisticsobjdef_columns(es.oid), \n" + " es.stxrelid::regclass) AS \"%s\"", + gettext_noop("Definition")); + else + appendPQExpBuffer(&buf, + "pg_catalog.format('%%s FROM %%s', \n" + " (SELECT pg_catalog.string_agg(pg_catalog.quote_ident(a.attname),', ') \n" + " FROM pg_catalog.unnest(es.stxkeys) s(attnum) \n" + " JOIN pg_catalog.pg_attribute a \n" + " ON (es.stxrelid = a.attrelid \n" + " AND a.attnum = s.attnum \n" + " AND NOT a.attisdropped)), \n" + "es.stxrelid::regclass) AS \"%s\"", + gettext_noop("Definition")); appendPQExpBuffer(&buf, ",\nCASE WHEN 'd' = any(es.stxkind) THEN 'defined' \n" diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 474ee2982b..4a39da3c9d 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202103265 +#define CATALOG_VERSION_NO 202103266 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index ecf12f4639..cc7d90d2b0 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3658,6 +3658,14 @@ proname => 'pg_get_statisticsobjdef', provolatile => 's', prorettype => 'text', proargtypes => 'oid', prosrc => 'pg_get_statisticsobjdef' }, +{ oid => '8887', descr => 'extended statistics columns', + proname => 'pg_get_statisticsobjdef_columns', provolatile => 's', + prorettype => 'text', proargtypes => 'oid', + prosrc => 'pg_get_statisticsobjdef_columns' }, +{ oid => '8886', descr => 'extended statistics expressions', + proname => 'pg_get_statisticsobjdef_expressions', provolatile => 's', + prorettype => '_text', proargtypes => 'oid', + prosrc => 'pg_get_statisticsobjdef_expressions' }, { oid => '3352', descr => 'partition key description', proname => 'pg_get_partkeydef', provolatile => 's', prorettype => 'text', proargtypes => 'oid', prosrc => 'pg_get_partkeydef' }, diff --git a/src/include/catalog/pg_statistic_ext.h b/src/include/catalog/pg_statistic_ext.h index 29649f5814..36912ce528 100644 --- a/src/include/catalog/pg_statistic_ext.h +++ b/src/include/catalog/pg_statistic_ext.h @@ -54,6 +54,9 @@ CATALOG(pg_statistic_ext,3381,StatisticExtRelationId) #ifdef CATALOG_VARLEN char stxkind[1] BKI_FORCE_NOT_NULL; /* statistics kinds requested * to build */ + pg_node_tree stxexprs; /* A list of expression trees for stats + * attributes that are not simple column + * references. */ #endif } FormData_pg_statistic_ext; @@ -81,6 +84,7 @@ DECLARE_ARRAY_FOREIGN_KEY((stxrelid, stxkeys), pg_attribute, (attrelid, attnum)) #define STATS_EXT_NDISTINCT 'd' #define STATS_EXT_DEPENDENCIES 'f' #define STATS_EXT_MCV 'm' +#define STATS_EXT_EXPRESSIONS 'e' #endif /* EXPOSE_TO_CLIENT_CODE */ diff --git a/src/include/catalog/pg_statistic_ext_data.h b/src/include/catalog/pg_statistic_ext_data.h index 2f2577c218..5729154383 100644 --- a/src/include/catalog/pg_statistic_ext_data.h +++ b/src/include/catalog/pg_statistic_ext_data.h @@ -38,6 +38,7 @@ CATALOG(pg_statistic_ext_data,3429,StatisticExtDataRelationId) pg_ndistinct stxdndistinct; /* ndistinct coefficients (serialized) */ pg_dependencies stxddependencies; /* dependencies (serialized) */ pg_mcv_list stxdmcv; /* MCV (serialized) */ + pg_statistic stxdexpr[1]; /* stats for expressions */ #endif diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 1a79540c94..339f29f4c8 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -81,9 +81,7 @@ extern ObjectAddress AlterOperator(AlterOperatorStmt *stmt); extern ObjectAddress CreateStatistics(CreateStatsStmt *stmt); extern ObjectAddress AlterStatistics(AlterStatsStmt *stmt); extern void RemoveStatisticsById(Oid statsOid); -extern void UpdateStatisticsForTypeChange(Oid statsOid, - Oid relationOid, int attnum, - Oid oldColumnType, Oid newColumnType); +extern Oid StatisticsGetRelation(Oid statId, bool missing_ok); /* commands/aggregatecmds.c */ extern ObjectAddress DefineAggregate(ParseState *pstate, List *name, List *args, bool oldstyle, diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index e22df890ef..299956f329 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -454,6 +454,7 @@ typedef enum NodeTag T_TypeName, T_ColumnDef, T_IndexElem, + T_StatsElem, T_Constraint, T_DefElem, T_RangeTblEntry, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 0ce19d98ec..12e0e026dc 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1914,7 +1914,8 @@ typedef enum AlterTableType AT_AddIdentity, /* ADD IDENTITY */ AT_SetIdentity, /* SET identity column options */ AT_DropIdentity, /* DROP IDENTITY */ - AT_AlterCollationRefreshVersion /* ALTER COLLATION ... REFRESH VERSION */ + AT_AlterCollationRefreshVersion, /* ALTER COLLATION ... REFRESH VERSION */ + AT_ReAddStatistics /* internal to commands/tablecmds.c */ } AlterTableType; typedef struct ReplicaIdentityStmt @@ -2872,8 +2873,24 @@ typedef struct CreateStatsStmt List *relations; /* rels to build stats on (list of RangeVar) */ char *stxcomment; /* comment to apply to stats, or NULL */ bool if_not_exists; /* do nothing if stats name already exists */ + bool transformed; /* true when transformStatsStmt is finished */ } CreateStatsStmt; +/* + * StatsElem - statistics parameters (used in CREATE STATISTICS) + * + * For a plain attribute, 'name' is the name of the referenced table column + * and 'expr' is NULL. For an expression, 'name' is NULL and 'expr' is the + * expression tree. + */ +typedef struct StatsElem +{ + NodeTag type; + char *name; /* name of attribute to index, or NULL */ + Node *expr; /* expression to index, or NULL */ +} StatsElem; + + /* ---------------------- * Alter Statistics Statement * ---------------------- diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index c13642e35e..e4b554f811 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -923,6 +923,7 @@ typedef struct StatisticExtInfo RelOptInfo *rel; /* back-link to statistic's table */ char kind; /* statistics kind of this entry */ Bitmapset *keys; /* attnums of the columns covered */ + List *exprs; /* expressions */ } StatisticExtInfo; /* diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h index 176b9f37c1..a71d7e1f74 100644 --- a/src/include/parser/parse_node.h +++ b/src/include/parser/parse_node.h @@ -69,6 +69,7 @@ typedef enum ParseExprKind EXPR_KIND_FUNCTION_DEFAULT, /* default parameter value for function */ EXPR_KIND_INDEX_EXPRESSION, /* index expression */ EXPR_KIND_INDEX_PREDICATE, /* index predicate */ + EXPR_KIND_STATS_EXPRESSION, /* extended statistics expression */ EXPR_KIND_ALTER_COL_TRANSFORM, /* transform expr in ALTER COLUMN TYPE */ EXPR_KIND_EXECUTE_PARAMETER, /* parameter value in EXECUTE */ EXPR_KIND_TRIGGER_WHEN, /* WHEN condition in CREATE TRIGGER */ diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index bfa4a6b0f2..1056bf081b 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -26,6 +26,8 @@ extern AlterTableStmt *transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, List **afterStmts); extern IndexStmt *transformIndexStmt(Oid relid, IndexStmt *stmt, const char *queryString); +extern CreateStatsStmt *transformStatsStmt(Oid relid, CreateStatsStmt *stmt, + const char *queryString); extern void transformRuleStmt(RuleStmt *stmt, const char *queryString, List **actions, Node **whereClause); extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt); diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h index a0a3cf5b0f..55cd9252a5 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -57,19 +57,27 @@ typedef struct SortItem int count; } SortItem; -extern MVNDistinct *statext_ndistinct_build(double totalrows, - int numrows, HeapTuple *rows, - Bitmapset *attrs, VacAttrStats **stats); +/* a unified representation of the data the statistics is built on */ +typedef struct StatsBuildData +{ + int numrows; + int nattnums; + AttrNumber *attnums; + VacAttrStats **stats; + Datum **values; + bool **nulls; +} StatsBuildData; + + +extern MVNDistinct *statext_ndistinct_build(double totalrows, StatsBuildData *data); extern bytea *statext_ndistinct_serialize(MVNDistinct *ndistinct); extern MVNDistinct *statext_ndistinct_deserialize(bytea *data); -extern MVDependencies *statext_dependencies_build(int numrows, HeapTuple *rows, - Bitmapset *attrs, VacAttrStats **stats); +extern MVDependencies *statext_dependencies_build(StatsBuildData *data); extern bytea *statext_dependencies_serialize(MVDependencies *dependencies); extern MVDependencies *statext_dependencies_deserialize(bytea *data); -extern MCVList *statext_mcv_build(int numrows, HeapTuple *rows, - Bitmapset *attrs, VacAttrStats **stats, +extern MCVList *statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget); extern bytea *statext_mcv_serialize(MCVList *mcv, VacAttrStats **stats); extern MCVList *statext_mcv_deserialize(bytea *data); @@ -85,14 +93,14 @@ extern int multi_sort_compare_dims(int start, int end, const SortItem *a, extern int compare_scalars_simple(const void *a, const void *b, void *arg); extern int compare_datums_simple(Datum a, Datum b, SortSupport ssup); -extern AttrNumber *build_attnums_array(Bitmapset *attrs, int *numattrs); +extern AttrNumber *build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs); -extern SortItem *build_sorted_items(int numrows, int *nitems, HeapTuple *rows, - TupleDesc tdesc, MultiSortSupport mss, +extern SortItem *build_sorted_items(StatsBuildData *data, int *nitems, + MultiSortSupport mss, int numattrs, AttrNumber *attnums); -extern bool examine_clause_args(List *args, Var **varp, - Const **cstp, bool *varonleftp); +extern bool examine_opclause_args(List *args, Node **exprp, + Const **cstp, bool *expronleftp); extern Selectivity mcv_combine_selectivities(Selectivity simple_sel, Selectivity mcv_sel, diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h index fec50688ea..326cf26fea 100644 --- a/src/include/statistics/statistics.h +++ b/src/include/statistics/statistics.h @@ -26,7 +26,8 @@ typedef struct MVNDistinctItem { double ndistinct; /* ndistinct value for this combination */ - Bitmapset *attrs; /* attr numbers of items */ + int nattributes; /* number of attributes */ + AttrNumber *attributes; /* attribute numbers */ } MVNDistinctItem; /* A MVNDistinct object, comprising all possible combinations of columns */ @@ -121,6 +122,8 @@ extern Selectivity statext_clauselist_selectivity(PlannerInfo *root, extern bool has_stats_of_kind(List *stats, char requiredkind); extern StatisticExtInfo *choose_best_statistics(List *stats, char requiredkind, Bitmapset **clause_attnums, + List **clause_exprs, int nclauses); +extern HeapTuple statext_expressions_load(Oid stxoid, int idx); #endif /* STATISTICS_H */ diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h index ac3d0a6742..d333e5e8a5 100644 --- a/src/include/utils/ruleutils.h +++ b/src/include/utils/ruleutils.h @@ -41,4 +41,6 @@ extern char *generate_collation_name(Oid collid); extern char *generate_opclass_name(Oid opclass); extern char *get_range_partbound_string(List *bound_datums); +extern char *pg_get_statisticsobjdef_string(Oid statextid); + #endif /* RULEUTILS_H */ diff --git a/src/test/regress/expected/create_table_like.out b/src/test/regress/expected/create_table_like.out index 10d17be23c..4dc5e6aa5f 100644 --- a/src/test/regress/expected/create_table_like.out +++ b/src/test/regress/expected/create_table_like.out @@ -304,7 +304,9 @@ CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) PRIMARY KEY, b text); CREATE INDEX ctlt1_b_key ON ctlt1 (b); CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b)); CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1; +CREATE STATISTICS ctlt1_expr_stat ON (a || b) FROM ctlt1; COMMENT ON STATISTICS ctlt1_a_b_stat IS 'ab stats'; +COMMENT ON STATISTICS ctlt1_expr_stat IS 'ab expr stats'; COMMENT ON COLUMN ctlt1.a IS 'A'; COMMENT ON COLUMN ctlt1.b IS 'B'; COMMENT ON CONSTRAINT ctlt1_a_check ON ctlt1 IS 't1_a_check'; @@ -414,7 +416,8 @@ Indexes: Check constraints: "ctlt1_a_check" CHECK (length(a) > 2) Statistics objects: - "public"."ctlt_all_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM ctlt_all + "public"."ctlt_all_a_b_stat" ON a, b FROM ctlt_all + "public"."ctlt_all_expr_stat" ON ((a || b)) FROM ctlt_all SELECT c.relname, objsubid, description FROM pg_description, pg_index i, pg_class c WHERE classoid = 'pg_class'::regclass AND objoid = i.indexrelid AND c.oid = i.indexrelid AND i.indrelid = 'ctlt_all'::regclass ORDER BY c.relname, objsubid; relname | objsubid | description @@ -424,10 +427,11 @@ SELECT c.relname, objsubid, description FROM pg_description, pg_index i, pg_clas (2 rows) SELECT s.stxname, objsubid, description FROM pg_description, pg_statistic_ext s WHERE classoid = 'pg_statistic_ext'::regclass AND objoid = s.oid AND s.stxrelid = 'ctlt_all'::regclass ORDER BY s.stxname, objsubid; - stxname | objsubid | description --------------------+----------+------------- - ctlt_all_a_b_stat | 0 | ab stats -(1 row) + stxname | objsubid | description +--------------------+----------+--------------- + ctlt_all_a_b_stat | 0 | ab stats + ctlt_all_expr_stat | 0 | ab expr stats +(2 rows) CREATE TABLE inh_error1 () INHERITS (ctlt1, ctlt4); NOTICE: merging multiple inherited definitions of column "a" @@ -452,7 +456,8 @@ Indexes: Check constraints: "ctlt1_a_check" CHECK (length(a) > 2) Statistics objects: - "public"."pg_attrdef_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM public.pg_attrdef + "public"."pg_attrdef_a_b_stat" ON a, b FROM public.pg_attrdef + "public"."pg_attrdef_expr_stat" ON ((a || b)) FROM public.pg_attrdef DROP TABLE public.pg_attrdef; -- Check that LIKE isn't confused when new table masks the old, either @@ -473,7 +478,8 @@ Indexes: Check constraints: "ctlt1_a_check" CHECK (length(a) > 2) Statistics objects: - "ctl_schema"."ctlt1_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM ctlt1 + "ctl_schema"."ctlt1_a_b_stat" ON a, b FROM ctlt1 + "ctl_schema"."ctlt1_expr_stat" ON ((a || b)) FROM ctlt1 ROLLBACK; DROP TABLE ctlt1, ctlt2, ctlt3, ctlt4, ctlt12_storage, ctlt12_comments, ctlt1_inh, ctlt13_inh, ctlt13_like, ctlt_all, ctla, ctlb CASCADE; diff --git a/src/test/regress/expected/oidjoins.out b/src/test/regress/expected/oidjoins.out index 50d046d3ef..1461e947cd 100644 --- a/src/test/regress/expected/oidjoins.out +++ b/src/test/regress/expected/oidjoins.out @@ -151,11 +151,6 @@ NOTICE: checking pg_aggregate {aggmfinalfn} => pg_proc {oid} NOTICE: checking pg_aggregate {aggsortop} => pg_operator {oid} NOTICE: checking pg_aggregate {aggtranstype} => pg_type {oid} NOTICE: checking pg_aggregate {aggmtranstype} => pg_type {oid} -NOTICE: checking pg_statistic_ext {stxrelid} => pg_class {oid} -NOTICE: checking pg_statistic_ext {stxnamespace} => pg_namespace {oid} -NOTICE: checking pg_statistic_ext {stxowner} => pg_authid {oid} -NOTICE: checking pg_statistic_ext {stxrelid,stxkeys} => pg_attribute {attrelid,attnum} -NOTICE: checking pg_statistic_ext_data {stxoid} => pg_statistic_ext {oid} NOTICE: checking pg_statistic {starelid} => pg_class {oid} NOTICE: checking pg_statistic {staop1} => pg_operator {oid} NOTICE: checking pg_statistic {staop2} => pg_operator {oid} @@ -168,6 +163,11 @@ NOTICE: checking pg_statistic {stacoll3} => pg_collation {oid} NOTICE: checking pg_statistic {stacoll4} => pg_collation {oid} NOTICE: checking pg_statistic {stacoll5} => pg_collation {oid} NOTICE: checking pg_statistic {starelid,staattnum} => pg_attribute {attrelid,attnum} +NOTICE: checking pg_statistic_ext {stxrelid} => pg_class {oid} +NOTICE: checking pg_statistic_ext {stxnamespace} => pg_namespace {oid} +NOTICE: checking pg_statistic_ext {stxowner} => pg_authid {oid} +NOTICE: checking pg_statistic_ext {stxrelid,stxkeys} => pg_attribute {attrelid,attnum} +NOTICE: checking pg_statistic_ext_data {stxoid} => pg_statistic_ext {oid} NOTICE: checking pg_rewrite {ev_class} => pg_class {oid} NOTICE: checking pg_trigger {tgrelid} => pg_class {oid} NOTICE: checking pg_trigger {tgparentid} => pg_trigger {oid} diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 9b12cc122a..9b59a7b4a5 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2418,6 +2418,7 @@ pg_stats_ext| SELECT cn.nspname AS schemaname, ( SELECT array_agg(a.attname ORDER BY a.attnum) AS array_agg FROM (unnest(s.stxkeys) k(k) JOIN pg_attribute a ON (((a.attrelid = s.stxrelid) AND (a.attnum = k.k))))) AS attnames, + pg_get_statisticsobjdef_expressions(s.oid) AS exprs, s.stxkind AS kinds, sd.stxdndistinct AS n_distinct, sd.stxddependencies AS dependencies, @@ -2439,6 +2440,78 @@ pg_stats_ext| SELECT cn.nspname AS schemaname, FROM (unnest(s.stxkeys) k(k) JOIN pg_attribute a ON (((a.attrelid = s.stxrelid) AND (a.attnum = k.k)))) WHERE (NOT has_column_privilege(c.oid, a.attnum, 'select'::text))))) AND ((c.relrowsecurity = false) OR (NOT row_security_active(c.oid)))); +pg_stats_ext_exprs| SELECT cn.nspname AS schemaname, + c.relname AS tablename, + sn.nspname AS statistics_schemaname, + s.stxname AS statistics_name, + pg_get_userbyid(s.stxowner) AS statistics_owner, + stat.expr, + (stat.a).stanullfrac AS null_frac, + (stat.a).stawidth AS avg_width, + (stat.a).stadistinct AS n_distinct, + CASE + WHEN ((stat.a).stakind1 = 1) THEN (stat.a).stavalues1 + WHEN ((stat.a).stakind2 = 1) THEN (stat.a).stavalues2 + WHEN ((stat.a).stakind3 = 1) THEN (stat.a).stavalues3 + WHEN ((stat.a).stakind4 = 1) THEN (stat.a).stavalues4 + WHEN ((stat.a).stakind5 = 1) THEN (stat.a).stavalues5 + ELSE NULL::anyarray + END AS most_common_vals, + CASE + WHEN ((stat.a).stakind1 = 1) THEN (stat.a).stanumbers1 + WHEN ((stat.a).stakind2 = 1) THEN (stat.a).stanumbers2 + WHEN ((stat.a).stakind3 = 1) THEN (stat.a).stanumbers3 + WHEN ((stat.a).stakind4 = 1) THEN (stat.a).stanumbers4 + WHEN ((stat.a).stakind5 = 1) THEN (stat.a).stanumbers5 + ELSE NULL::real[] + END AS most_common_freqs, + CASE + WHEN ((stat.a).stakind1 = 2) THEN (stat.a).stavalues1 + WHEN ((stat.a).stakind2 = 2) THEN (stat.a).stavalues2 + WHEN ((stat.a).stakind3 = 2) THEN (stat.a).stavalues3 + WHEN ((stat.a).stakind4 = 2) THEN (stat.a).stavalues4 + WHEN ((stat.a).stakind5 = 2) THEN (stat.a).stavalues5 + ELSE NULL::anyarray + END AS histogram_bounds, + CASE + WHEN ((stat.a).stakind1 = 3) THEN (stat.a).stanumbers1[1] + WHEN ((stat.a).stakind2 = 3) THEN (stat.a).stanumbers2[1] + WHEN ((stat.a).stakind3 = 3) THEN (stat.a).stanumbers3[1] + WHEN ((stat.a).stakind4 = 3) THEN (stat.a).stanumbers4[1] + WHEN ((stat.a).stakind5 = 3) THEN (stat.a).stanumbers5[1] + ELSE NULL::real + END AS correlation, + CASE + WHEN ((stat.a).stakind1 = 4) THEN (stat.a).stavalues1 + WHEN ((stat.a).stakind2 = 4) THEN (stat.a).stavalues2 + WHEN ((stat.a).stakind3 = 4) THEN (stat.a).stavalues3 + WHEN ((stat.a).stakind4 = 4) THEN (stat.a).stavalues4 + WHEN ((stat.a).stakind5 = 4) THEN (stat.a).stavalues5 + ELSE NULL::anyarray + END AS most_common_elems, + CASE + WHEN ((stat.a).stakind1 = 4) THEN (stat.a).stanumbers1 + WHEN ((stat.a).stakind2 = 4) THEN (stat.a).stanumbers2 + WHEN ((stat.a).stakind3 = 4) THEN (stat.a).stanumbers3 + WHEN ((stat.a).stakind4 = 4) THEN (stat.a).stanumbers4 + WHEN ((stat.a).stakind5 = 4) THEN (stat.a).stanumbers5 + ELSE NULL::real[] + END AS most_common_elem_freqs, + CASE + WHEN ((stat.a).stakind1 = 5) THEN (stat.a).stanumbers1 + WHEN ((stat.a).stakind2 = 5) THEN (stat.a).stanumbers2 + WHEN ((stat.a).stakind3 = 5) THEN (stat.a).stanumbers3 + WHEN ((stat.a).stakind4 = 5) THEN (stat.a).stanumbers4 + WHEN ((stat.a).stakind5 = 5) THEN (stat.a).stanumbers5 + ELSE NULL::real[] + END AS elem_count_histogram + FROM (((((pg_statistic_ext s + JOIN pg_class c ON ((c.oid = s.stxrelid))) + LEFT JOIN pg_statistic_ext_data sd ON ((s.oid = sd.stxoid))) + LEFT JOIN pg_namespace cn ON ((cn.oid = c.relnamespace))) + LEFT JOIN pg_namespace sn ON ((sn.oid = s.stxnamespace))) + JOIN LATERAL ( SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr, + unnest(sd.stxdexpr) AS a) stat ON ((stat.expr IS NOT NULL))); pg_tables| SELECT n.nspname AS schemaname, c.relname AS tablename, pg_get_userbyid(c.relowner) AS tableowner, diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 07af1e29e3..60e9bfcd78 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -25,7 +25,7 @@ begin end; $$; -- Verify failures -CREATE TABLE ext_stats_test (x int, y int, z int); +CREATE TABLE ext_stats_test (x text, y int, z int); CREATE STATISTICS tst; ERROR: syntax error at or near ";" LINE 1: CREATE STATISTICS tst; @@ -44,12 +44,25 @@ CREATE STATISTICS tst ON a, b FROM ext_stats_test; ERROR: column "a" does not exist CREATE STATISTICS tst ON x, x, y FROM ext_stats_test; ERROR: duplicate column name in statistics definition -CREATE STATISTICS tst ON x + y FROM ext_stats_test; -ERROR: only simple column references are allowed in CREATE STATISTICS -CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -ERROR: only simple column references are allowed in CREATE STATISTICS +CREATE STATISTICS tst ON x, x, y, x, x, y, x, x, y FROM ext_stats_test; +ERROR: cannot have more than 8 columns in statistics +CREATE STATISTICS tst ON x, x, y, x, x, (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test; +ERROR: cannot have more than 8 columns in statistics +CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test; +ERROR: cannot have more than 8 columns in statistics +CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), y FROM ext_stats_test; +ERROR: duplicate expression in statistics definition CREATE STATISTICS tst (unrecognized) ON x, y FROM ext_stats_test; ERROR: unrecognized statistics kind "unrecognized" +-- incorrect expressions +CREATE STATISTICS tst ON y + z FROM ext_stats_test; -- missing parentheses +ERROR: syntax error at or near "+" +LINE 1: CREATE STATISTICS tst ON y + z FROM ext_stats_test; + ^ +CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -- tuple expression +ERROR: syntax error at or near "," +LINE 1: CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; + ^ DROP TABLE ext_stats_test; -- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER); @@ -79,7 +92,7 @@ ALTER TABLE ab1 DROP COLUMN a; b | integer | | | c | integer | | | Statistics objects: - "public"."ab1_b_c_stats" (ndistinct, dependencies, mcv) ON b, c FROM ab1 + "public"."ab1_b_c_stats" ON b, c FROM ab1 -- Ensure statistics are dropped when table is SELECT stxname FROM pg_statistic_ext WHERE stxname LIKE 'ab1%'; @@ -111,7 +124,7 @@ ALTER STATISTICS ab1_a_b_stats SET STATISTICS 0; a | integer | | | b | integer | | | Statistics objects: - "public"."ab1_a_b_stats" (ndistinct, dependencies, mcv) ON a, b FROM ab1; STATISTICS 0 + "public"."ab1_a_b_stats" ON a, b FROM ab1; STATISTICS 0 ANALYZE ab1; SELECT stxname, stxdndistinct, stxddependencies, stxdmcv @@ -131,7 +144,7 @@ ALTER STATISTICS ab1_a_b_stats SET STATISTICS -1; a | integer | | | | plain | | b | integer | | | | plain | | Statistics objects: - "public"."ab1_a_b_stats" (ndistinct, dependencies, mcv) ON a, b FROM ab1 + "public"."ab1_a_b_stats" ON a, b FROM ab1 -- partial analyze doesn't build stats either ANALYZE ab1 (a); @@ -150,6 +163,39 @@ CREATE STATISTICS ab1_a_b_stats ON a, b FROM ab1; ANALYZE ab1; DROP TABLE ab1 CASCADE; NOTICE: drop cascades to table ab1c +-- basic test for statistics on expressions +CREATE TABLE ab1 (a INTEGER, b INTEGER, c TIMESTAMP, d TIMESTAMPTZ); +-- expression stats may be built on a single expression column +CREATE STATISTICS ab1_exprstat_1 ON (a+b) FROM ab1; +-- with a single expression, we only enable expression statistics +CREATE STATISTICS ab1_exprstat_2 ON (a+b) FROM ab1; +SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_2'; + stxkind +--------- + {e} +(1 row) + +-- adding anything to the expression builds all statistics kinds +CREATE STATISTICS ab1_exprstat_3 ON (a+b), a FROM ab1; +SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_3'; + stxkind +----------- + {d,f,m,e} +(1 row) + +-- date_trunc on timestamptz is not immutable, but that should not matter +CREATE STATISTICS ab1_exprstat_4 ON date_trunc('day', d) FROM ab1; +-- date_trunc on timestamp is immutable +CREATE STATISTICS ab1_exprstat_5 ON date_trunc('day', c) FROM ab1; +-- insert some data and run analyze, to test that these cases build properly +INSERT INTO ab1 +SELECT + generate_series(1,10), + generate_series(1,10), + generate_series('2020-10-01'::timestamp, '2020-10-10'::timestamp, interval '1 day'), + generate_series('2020-10-01'::timestamptz, '2020-10-10'::timestamptz, interval '1 day'); +ANALYZE ab1; +DROP TABLE ab1; -- Verify supported object types for extended statistics CREATE schema tststats; CREATE TABLE tststats.t (a int, b int, c text); @@ -244,6 +290,30 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c 200 | 11 (1 row) +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)'); + estimated | actual +-----------+-------- + 100 | 11 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + estimated | actual +-----------+-------- + 100 | 11 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + estimated | actual +-----------+-------- + 100 | 11 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + estimated | actual +-----------+-------- + 100 | 11 +(1 row) + -- correct command CREATE STATISTICS s10 ON a, b, c FROM ndistinct; ANALYZE ndistinct; @@ -282,6 +352,32 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b 11 | 11 (1 row) +-- partial improvement (match on attributes) +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)'); + estimated | actual +-----------+-------- + 11 | 11 +(1 row) + +-- expressions - no improvement +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + estimated | actual +-----------+-------- + 11 | 11 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + estimated | actual +-----------+-------- + 11 | 11 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + estimated | actual +-----------+-------- + 11 | 11 +(1 row) + -- last two plans keep using Group Aggregate, because 'd' is not covered -- by the statistic and while it's NULL-only we assume 200 values for it SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d'); @@ -343,6 +439,30 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d 200 | 13 (1 row) +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)'); + estimated | actual +-----------+-------- + 221 | 221 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + estimated | actual +-----------+-------- + 221 | 221 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + estimated | actual +-----------+-------- + 1000 | 1000 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + estimated | actual +-----------+-------- + 221 | 221 +(1 row) + DROP STATISTICS s10; SELECT s.stxkind, d.stxdndistinct FROM pg_statistic_ext s, pg_statistic_ext_data d @@ -383,6 +503,413 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d 200 | 13 (1 row) +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)'); + estimated | actual +-----------+-------- + 100 | 221 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + estimated | actual +-----------+-------- + 100 | 221 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + estimated | actual +-----------+-------- + 100 | 1000 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + estimated | actual +-----------+-------- + 100 | 221 +(1 row) + +-- ndistinct estimates with statistics on expressions +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + estimated | actual +-----------+-------- + 100 | 221 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + estimated | actual +-----------+-------- + 100 | 1000 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + estimated | actual +-----------+-------- + 100 | 221 +(1 row) + +CREATE STATISTICS s10 (ndistinct) ON (a+1), (b+100), (2*c) FROM ndistinct; +ANALYZE ndistinct; +SELECT s.stxkind, d.stxdndistinct + FROM pg_statistic_ext s, pg_statistic_ext_data d + WHERE s.stxrelid = 'ndistinct'::regclass + AND d.stxoid = s.oid; + stxkind | stxdndistinct +---------+------------------------------------------------------------------- + {d,e} | {"-1, -2": 221, "-1, -3": 247, "-2, -3": 323, "-1, -2, -3": 1000} +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + estimated | actual +-----------+-------- + 221 | 221 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + estimated | actual +-----------+-------- + 1000 | 1000 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + estimated | actual +-----------+-------- + 221 | 221 +(1 row) + +DROP STATISTICS s10; +-- a mix of attributes and expressions +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + estimated | actual +-----------+-------- + 100 | 221 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)'); + estimated | actual +-----------+-------- + 100 | 247 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)'); + estimated | actual +-----------+-------- + 100 | 1000 +(1 row) + +CREATE STATISTICS s10 (ndistinct) ON a, b, (2*c) FROM ndistinct; +ANALYZE ndistinct; +SELECT s.stxkind, d.stxdndistinct + FROM pg_statistic_ext s, pg_statistic_ext_data d + WHERE s.stxrelid = 'ndistinct'::regclass + AND d.stxoid = s.oid; + stxkind | stxdndistinct +---------+------------------------------------------------------------- + {d,e} | {"3, 4": 221, "3, -1": 247, "4, -1": 323, "3, 4, -1": 1000} +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + estimated | actual +-----------+-------- + 221 | 221 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)'); + estimated | actual +-----------+-------- + 247 | 247 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)'); + estimated | actual +-----------+-------- + 1000 | 1000 +(1 row) + +DROP STATISTICS s10; +-- combination of multiple ndistinct statistics, with/without expressions +TRUNCATE ndistinct; +-- two mostly independent groups of columns +INSERT INTO ndistinct (a, b, c, d) + SELECT mod(i,3), mod(i,9), mod(i,5), mod(i,20) + FROM generate_series(1,1000) s(i); +ANALYZE ndistinct; +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + estimated | actual +-----------+-------- + 27 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + estimated | actual +-----------+-------- + 27 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + estimated | actual +-----------+-------- + 27 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + estimated | actual +-----------+-------- + 27 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + estimated | actual +-----------+-------- + 100 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + estimated | actual +-----------+-------- + 100 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + estimated | actual +-----------+-------- + 100 | 180 +(1 row) + +-- basic statistics on both attributes (no expressions) +CREATE STATISTICS s11 (ndistinct) ON a, b FROM ndistinct; +CREATE STATISTICS s12 (ndistinct) ON c, d FROM ndistinct; +ANALYZE ndistinct; +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + estimated | actual +-----------+-------- + 45 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + estimated | actual +-----------+-------- + 45 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + estimated | actual +-----------+-------- + 100 | 180 +(1 row) + +-- replace the second statistics by statistics on expressions +DROP STATISTICS s12; +CREATE STATISTICS s12 (ndistinct) ON (c * 10), (d - 1) FROM ndistinct; +ANALYZE ndistinct; +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + estimated | actual +-----------+-------- + 45 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + estimated | actual +-----------+-------- + 45 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + estimated | actual +-----------+-------- + 100 | 180 +(1 row) + +-- replace the second statistics by statistics on both attributes and expressions +DROP STATISTICS s12; +CREATE STATISTICS s12 (ndistinct) ON c, d, (c * 10), (d - 1) FROM ndistinct; +ANALYZE ndistinct; +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + estimated | actual +-----------+-------- + 45 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + estimated | actual +-----------+-------- + 45 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + estimated | actual +-----------+-------- + 100 | 180 +(1 row) + +-- replace the other statistics by statistics on both attributes and expressions +DROP STATISTICS s11; +CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct; +ANALYZE ndistinct; +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + estimated | actual +-----------+-------- + 45 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + estimated | actual +-----------+-------- + 45 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + estimated | actual +-----------+-------- + 100 | 180 +(1 row) + +-- replace statistics by somewhat overlapping ones (this expected to get worse estimate +-- because the first statistics shall be applied to 3 columns, and the second one can't +-- be really applied) +DROP STATISTICS s11; +DROP STATISTICS s12; +CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct; +CREATE STATISTICS s12 (ndistinct) ON a, (b+1), (c * 10) FROM ndistinct; +ANALYZE ndistinct; +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + estimated | actual +-----------+-------- + 9 | 9 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + estimated | actual +-----------+-------- + 45 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + estimated | actual +-----------+-------- + 100 | 45 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + estimated | actual +-----------+-------- + 100 | 180 +(1 row) + +DROP STATISTICS s11; +DROP STATISTICS s12; -- functional dependencies tests CREATE TABLE functional_dependencies ( filler1 TEXT, @@ -427,6 +954,40 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE 3 | 3 (1 row) +-- a => b, a => c, b => c +TRUNCATE functional_dependencies; +DROP STATISTICS func_deps_stat; +-- now do the same thing, but with expressions +INSERT INTO functional_dependencies (a, b, c, filler1) + SELECT i, i, i, i FROM generate_series(1,5000) s(i); +ANALYZE functional_dependencies; +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1'); + estimated | actual +-----------+-------- + 1 | 35 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1'); + estimated | actual +-----------+-------- + 1 | 5 +(1 row) + +-- create statistics +CREATE STATISTICS func_deps_stat (dependencies) ON (mod(a,11)), (mod(b::int, 13)), (mod(c, 7)) FROM functional_dependencies; +ANALYZE functional_dependencies; +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1'); + estimated | actual +-----------+-------- + 35 | 35 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1'); + estimated | actual +-----------+-------- + 5 | 5 +(1 row) + -- a => b, a => c, b => c TRUNCATE functional_dependencies; DROP STATISTICS func_deps_stat; @@ -755,14 +1316,14 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE 1 | 0 (1 row) --- changing the type of column c causes its single-column stats to be dropped, --- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple --- clauses estimated with functional dependencies does not exceed this +-- changing the type of column c causes all its stats to be dropped, reverting +-- to default estimates without any statistics, i.e. 0.5% selectivity for each +-- condition ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric; SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1'); estimated | actual -----------+-------- - 25 | 50 + 1 | 50 (1 row) ANALYZE functional_dependencies; @@ -772,6 +1333,332 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE 50 | 50 (1 row) +DROP STATISTICS func_deps_stat; +-- now try functional dependencies with expressions +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 1 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2'); + estimated | actual +-----------+-------- + 1 | 50 +(1 row) + +-- IN +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')'); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')'); + estimated | actual +-----------+-------- + 1 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2'); + estimated | actual +-----------+-------- + 1 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)'); + estimated | actual +-----------+-------- + 1 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)'); + estimated | actual +-----------+-------- + 1 | 400 +(1 row) + +-- OR clauses referencing the same attribute +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')'); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')'); + estimated | actual +-----------+-------- + 1 | 200 +(1 row) + +-- OR clauses referencing different attributes +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +-- ANY +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 1 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2'); + estimated | actual +-----------+-------- + 1 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])'); + estimated | actual +-----------+-------- + 1 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])'); + estimated | actual +-----------+-------- + 1 | 400 +(1 row) + +-- ANY with inequalities should not benefit from functional dependencies +-- the estimates however improve thanks to having expression statistics +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X'''); + estimated | actual +-----------+-------- + 926 | 1900 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 1543 | 2250 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 2229 | 2050 +(1 row) + +-- ALL (should not benefit from functional dependencies) +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])'); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +-- create statistics on expressions +CREATE STATISTICS func_deps_stat (dependencies) ON (a * 2), (b || 'X'), (c + 1) FROM functional_dependencies; +ANALYZE functional_dependencies; +-- print the detected dependencies +SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat'; + dependencies +------------------------------------------------------------------------------------------------------------------------ + {"-1 => -2": 1.000000, "-1 => -3": 1.000000, "-2 => -3": 1.000000, "-1, -2 => -3": 1.000000, "-1, -3 => -2": 1.000000} +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 50 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2'); + estimated | actual +-----------+-------- + 50 | 50 +(1 row) + +-- IN +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')'); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')'); + estimated | actual +-----------+-------- + 200 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2'); + estimated | actual +-----------+-------- + 200 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)'); + estimated | actual +-----------+-------- + 200 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)'); + estimated | actual +-----------+-------- + 400 | 400 +(1 row) + +-- OR clauses referencing the same attribute +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 99 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')'); + estimated | actual +-----------+-------- + 99 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')'); + estimated | actual +-----------+-------- + 197 | 200 +(1 row) + +-- OR clauses referencing different attributes +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 3 | 100 +(1 row) + +-- ANY +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X'''); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 200 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2'); + estimated | actual +-----------+-------- + 200 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])'); + estimated | actual +-----------+-------- + 200 | 200 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])'); + estimated | actual +-----------+-------- + 400 | 400 +(1 row) + +-- ANY with inequalities should not benefit from functional dependencies +-- the estimates however improve thanks to having expression statistics +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X'''); + estimated | actual +-----------+-------- + 1957 | 1900 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 2933 | 2250 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 3548 | 2050 +(1 row) + +-- ALL (should not benefit from functional dependencies) +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])'); + estimated | actual +-----------+-------- + 2 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + -- check the ability to use multiple functional dependencies CREATE TABLE functional_dependencies_multi ( a INTEGER, @@ -896,6 +1783,39 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = 1 | 1 (1 row) +TRUNCATE mcv_lists; +DROP STATISTICS mcv_lists_stats; +-- random data (no MCV list), but with expression +INSERT INTO mcv_lists (a, b, c, filler1) + SELECT i, i, i, i FROM generate_series(1,1000) s(i); +ANALYZE mcv_lists; +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1'); + estimated | actual +-----------+-------- + 1 | 13 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1'); + estimated | actual +-----------+-------- + 1 | 1 +(1 row) + +-- create statistics +CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,7)), (mod(b::int,11)), (mod(c,13)) FROM mcv_lists; +ANALYZE mcv_lists; +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1'); + estimated | actual +-----------+-------- + 13 | 13 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1'); + estimated | actual +-----------+-------- + 1 | 1 +(1 row) + -- 100 distinct combinations, all in the MCV list TRUNCATE mcv_lists; DROP STATISTICS mcv_lists_stats; @@ -1121,6 +2041,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ' 200 | 200 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); + estimated | actual +-----------+-------- + 200 | 200 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')'); estimated | actual -----------+-------- @@ -1207,6 +2133,212 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = 50 | 50 (1 row) +-- 100 distinct combinations, all in the MCV list, but with expressions +TRUNCATE mcv_lists; +DROP STATISTICS mcv_lists_stats; +INSERT INTO mcv_lists (a, b, c, filler1) + SELECT i, i, i, i FROM generate_series(1,1000) s(i); +ANALYZE mcv_lists; +-- without any stats on the expressions, we have to use default selectivities, which +-- is why the estimates here are different from the pre-computed case above +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1'); + estimated | actual +-----------+-------- + 1 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)'); + estimated | actual +-----------+-------- + 1 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1'); + estimated | actual +-----------+-------- + 111 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)'); + estimated | actual +-----------+-------- + 111 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1'); + estimated | actual +-----------+-------- + 1 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL'); + estimated | actual +-----------+-------- + 15 | 120 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)'); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])'); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)'); + estimated | actual +-----------+-------- + 11 | 150 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])'); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +-- create statistics with expressions only (we create three separate stats, in order not to build more complex extended stats) +CREATE STATISTICS mcv_lists_stats_1 ON (mod(a,20)) FROM mcv_lists; +CREATE STATISTICS mcv_lists_stats_2 ON (mod(b::int,10)) FROM mcv_lists; +CREATE STATISTICS mcv_lists_stats_3 ON (mod(c,5)) FROM mcv_lists; +ANALYZE mcv_lists; +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1'); + estimated | actual +-----------+-------- + 5 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)'); + estimated | actual +-----------+-------- + 5 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1'); + estimated | actual +-----------+-------- + 5 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)'); + estimated | actual +-----------+-------- + 5 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1'); + estimated | actual +-----------+-------- + 1 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL'); + estimated | actual +-----------+-------- + 149 | 120 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)'); + estimated | actual +-----------+-------- + 20 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])'); + estimated | actual +-----------+-------- + 20 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)'); + estimated | actual +-----------+-------- + 116 | 150 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])'); + estimated | actual +-----------+-------- + 12 | 100 +(1 row) + +DROP STATISTICS mcv_lists_stats_1; +DROP STATISTICS mcv_lists_stats_2; +DROP STATISTICS mcv_lists_stats_3; +-- create statistics with both MCV and expressions +CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,20)), (mod(b::int,10)), (mod(c,5)) FROM mcv_lists; +ANALYZE mcv_lists; +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1'); + estimated | actual +-----------+-------- + 50 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)'); + estimated | actual +-----------+-------- + 50 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1'); + estimated | actual +-----------+-------- + 50 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)'); + estimated | actual +-----------+-------- + 50 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1'); + estimated | actual +-----------+-------- + 50 | 50 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL'); + estimated | actual +-----------+-------- + 105 | 120 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)'); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])'); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)'); + estimated | actual +-----------+-------- + 150 | 150 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])'); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +-- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,5) = 1 OR d IS NOT NULL'); + estimated | actual +-----------+-------- + 200 | 200 +(1 row) + -- 100 distinct combinations with NULL values, all in the MCV list TRUNCATE mcv_lists; DROP STATISTICS mcv_lists_stats; @@ -1712,6 +2844,100 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR (1 row) DROP TABLE mcv_lists_multi; +-- statistics on integer expressions +CREATE TABLE expr_stats (a int, b int, c int); +INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i); +ANALYZE expr_stats; +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0'); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0'); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +CREATE STATISTICS expr_stats_1 (mcv) ON (a+b), (a-b), (2*a), (3*b) FROM expr_stats; +ANALYZE expr_stats; +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0'); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0'); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +DROP STATISTICS expr_stats_1; +DROP TABLE expr_stats; +-- statistics on a mix columns and expressions +CREATE TABLE expr_stats (a int, b int, c int); +INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i); +ANALYZE expr_stats; +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0'); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0'); + estimated | actual +-----------+-------- + 1 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (2*a), (3*b), (a+b), (a-b) FROM expr_stats; +ANALYZE expr_stats; +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0'); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0'); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +DROP TABLE expr_stats; +-- statistics on expressions with different data types +CREATE TABLE expr_stats (a int, b name, c text); +INSERT INTO expr_stats SELECT mod(i,10), md5(mod(i,10)::text), md5(mod(i,10)::text) FROM generate_series(1,1000) s(i); +ANALYZE expr_stats; +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0'''); + estimated | actual +-----------+-------- + 11 | 100 +(1 row) + +CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (b || c), (c || b) FROM expr_stats; +ANALYZE expr_stats; +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0'''); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +DROP TABLE expr_stats; -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in -- the underlying table. @@ -1743,21 +2969,21 @@ create statistics stts_s2.stts_yama (dependencies, mcv) on col1, col3 from stts_ insert into stts_t1 select i,i from generate_series(1,100) i; analyze stts_t1; \dX - List of extended statistics - Schema | Name | Definition | Ndistinct | Dependencies | MCV -----------+------------------------+--------------------------------------+-----------+--------------+--------- - public | func_deps_stat | a, b, c FROM functional_dependencies | | defined | - public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined - public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined - public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined - public | stts_1 | a, b FROM stts_t1 | defined | | - public | stts_2 | a, b FROM stts_t1 | defined | defined | - public | stts_3 | a, b FROM stts_t1 | defined | defined | defined - public | stts_4 | b, c FROM stts_t2 | defined | defined | defined - public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined - stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined - stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined - tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined + List of extended statistics + Schema | Name | Definition | Ndistinct | Dependencies | MCV +----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+--------- + public | func_deps_stat | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies | | defined | + public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined + public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined + public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined + public | stts_1 | a, b FROM stts_t1 | defined | | + public | stts_2 | a, b FROM stts_t1 | defined | defined | + public | stts_3 | a, b FROM stts_t1 | defined | defined | defined + public | stts_4 | b, c FROM stts_t2 | defined | defined | defined + public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined + stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined + stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined + tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined (12 rows) \dX stts_? @@ -1778,21 +3004,21 @@ analyze stts_t1; (1 row) \dX+ - List of extended statistics - Schema | Name | Definition | Ndistinct | Dependencies | MCV -----------+------------------------+--------------------------------------+-----------+--------------+--------- - public | func_deps_stat | a, b, c FROM functional_dependencies | | defined | - public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined - public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined - public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined - public | stts_1 | a, b FROM stts_t1 | defined | | - public | stts_2 | a, b FROM stts_t1 | defined | defined | - public | stts_3 | a, b FROM stts_t1 | defined | defined | defined - public | stts_4 | b, c FROM stts_t2 | defined | defined | defined - public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined - stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined - stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined - tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined + List of extended statistics + Schema | Name | Definition | Ndistinct | Dependencies | MCV +----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+--------- + public | func_deps_stat | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies | | defined | + public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined + public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined + public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined + public | stts_1 | a, b FROM stts_t1 | defined | | + public | stts_2 | a, b FROM stts_t1 | defined | defined | + public | stts_3 | a, b FROM stts_t1 | defined | defined | defined + public | stts_4 | b, c FROM stts_t2 | defined | defined | defined + public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined + stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined + stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined + tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined (12 rows) \dX+ stts_? @@ -1822,21 +3048,21 @@ analyze stts_t1; create role regress_stats_ext nosuperuser; set role regress_stats_ext; \dX - List of extended statistics - Schema | Name | Definition | Ndistinct | Dependencies | MCV -----------+------------------------+--------------------------------------+-----------+--------------+--------- - public | func_deps_stat | a, b, c FROM functional_dependencies | | defined | - public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined - public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined - public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined - public | stts_1 | a, b FROM stts_t1 | defined | | - public | stts_2 | a, b FROM stts_t1 | defined | defined | - public | stts_3 | a, b FROM stts_t1 | defined | defined | defined - public | stts_4 | b, c FROM stts_t2 | defined | defined | defined - public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined - stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined - stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined - tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined + List of extended statistics + Schema | Name | Definition | Ndistinct | Dependencies | MCV +----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+--------- + public | func_deps_stat | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies | | defined | + public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined + public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined + public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined + public | stts_1 | a, b FROM stts_t1 | defined | | + public | stts_2 | a, b FROM stts_t1 | defined | defined | + public | stts_3 | a, b FROM stts_t1 | defined | defined | defined + public | stts_4 | b, c FROM stts_t2 | defined | defined | defined + public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined + stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined + stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined + tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined (12 rows) reset role; diff --git a/src/test/regress/sql/create_table_like.sql b/src/test/regress/sql/create_table_like.sql index 06b76f949d..4929d373a2 100644 --- a/src/test/regress/sql/create_table_like.sql +++ b/src/test/regress/sql/create_table_like.sql @@ -124,7 +124,9 @@ CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) PRIMARY KEY, b text); CREATE INDEX ctlt1_b_key ON ctlt1 (b); CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b)); CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1; +CREATE STATISTICS ctlt1_expr_stat ON (a || b) FROM ctlt1; COMMENT ON STATISTICS ctlt1_a_b_stat IS 'ab stats'; +COMMENT ON STATISTICS ctlt1_expr_stat IS 'ab expr stats'; COMMENT ON COLUMN ctlt1.a IS 'A'; COMMENT ON COLUMN ctlt1.b IS 'B'; COMMENT ON CONSTRAINT ctlt1_a_check ON ctlt1 IS 't1_a_check'; diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index f9f12769e4..fb3af6e523 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -28,16 +28,21 @@ end; $$; -- Verify failures -CREATE TABLE ext_stats_test (x int, y int, z int); +CREATE TABLE ext_stats_test (x text, y int, z int); CREATE STATISTICS tst; CREATE STATISTICS tst ON a, b; CREATE STATISTICS tst FROM sometab; CREATE STATISTICS tst ON a, b FROM nonexistent; CREATE STATISTICS tst ON a, b FROM ext_stats_test; CREATE STATISTICS tst ON x, x, y FROM ext_stats_test; -CREATE STATISTICS tst ON x + y FROM ext_stats_test; -CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; +CREATE STATISTICS tst ON x, x, y, x, x, y, x, x, y FROM ext_stats_test; +CREATE STATISTICS tst ON x, x, y, x, x, (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test; +CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test; +CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), y FROM ext_stats_test; CREATE STATISTICS tst (unrecognized) ON x, y FROM ext_stats_test; +-- incorrect expressions +CREATE STATISTICS tst ON y + z FROM ext_stats_test; -- missing parentheses +CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -- tuple expression DROP TABLE ext_stats_test; -- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it @@ -97,6 +102,36 @@ CREATE STATISTICS ab1_a_b_stats ON a, b FROM ab1; ANALYZE ab1; DROP TABLE ab1 CASCADE; +-- basic test for statistics on expressions +CREATE TABLE ab1 (a INTEGER, b INTEGER, c TIMESTAMP, d TIMESTAMPTZ); + +-- expression stats may be built on a single expression column +CREATE STATISTICS ab1_exprstat_1 ON (a+b) FROM ab1; + +-- with a single expression, we only enable expression statistics +CREATE STATISTICS ab1_exprstat_2 ON (a+b) FROM ab1; +SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_2'; + +-- adding anything to the expression builds all statistics kinds +CREATE STATISTICS ab1_exprstat_3 ON (a+b), a FROM ab1; +SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_3'; + +-- date_trunc on timestamptz is not immutable, but that should not matter +CREATE STATISTICS ab1_exprstat_4 ON date_trunc('day', d) FROM ab1; + +-- date_trunc on timestamp is immutable +CREATE STATISTICS ab1_exprstat_5 ON date_trunc('day', c) FROM ab1; + +-- insert some data and run analyze, to test that these cases build properly +INSERT INTO ab1 +SELECT + generate_series(1,10), + generate_series(1,10), + generate_series('2020-10-01'::timestamp, '2020-10-10'::timestamp, interval '1 day'), + generate_series('2020-10-01'::timestamptz, '2020-10-10'::timestamptz, interval '1 day'); +ANALYZE ab1; +DROP TABLE ab1; + -- Verify supported object types for extended statistics CREATE schema tststats; @@ -164,6 +199,14 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d'); +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + -- correct command CREATE STATISTICS s10 ON a, b, c FROM ndistinct; @@ -184,6 +227,16 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c'); +-- partial improvement (match on attributes) +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)'); + +-- expressions - no improvement +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + -- last two plans keep using Group Aggregate, because 'd' is not covered -- by the statistic and while it's NULL-only we assume 200 values for it SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d'); @@ -216,6 +269,14 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d'); +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + DROP STATISTICS s10; SELECT s.stxkind, d.stxdndistinct @@ -234,6 +295,206 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d'); +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + +-- ndistinct estimates with statistics on expressions +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + +CREATE STATISTICS s10 (ndistinct) ON (a+1), (b+100), (2*c) FROM ndistinct; + +ANALYZE ndistinct; + +SELECT s.stxkind, d.stxdndistinct + FROM pg_statistic_ext s, pg_statistic_ext_data d + WHERE s.stxrelid = 'ndistinct'::regclass + AND d.stxoid = s.oid; + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)'); + +DROP STATISTICS s10; + +-- a mix of attributes and expressions +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)'); + +CREATE STATISTICS s10 (ndistinct) ON a, b, (2*c) FROM ndistinct; + +ANALYZE ndistinct; + +SELECT s.stxkind, d.stxdndistinct + FROM pg_statistic_ext s, pg_statistic_ext_data d + WHERE s.stxrelid = 'ndistinct'::regclass + AND d.stxoid = s.oid; + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)'); + +DROP STATISTICS s10; + +-- combination of multiple ndistinct statistics, with/without expressions +TRUNCATE ndistinct; + +-- two mostly independent groups of columns +INSERT INTO ndistinct (a, b, c, d) + SELECT mod(i,3), mod(i,9), mod(i,5), mod(i,20) + FROM generate_series(1,1000) s(i); + +ANALYZE ndistinct; + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + +-- basic statistics on both attributes (no expressions) +CREATE STATISTICS s11 (ndistinct) ON a, b FROM ndistinct; + +CREATE STATISTICS s12 (ndistinct) ON c, d FROM ndistinct; + +ANALYZE ndistinct; + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + + +-- replace the second statistics by statistics on expressions + +DROP STATISTICS s12; + +CREATE STATISTICS s12 (ndistinct) ON (c * 10), (d - 1) FROM ndistinct; + +ANALYZE ndistinct; + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + + +-- replace the second statistics by statistics on both attributes and expressions + +DROP STATISTICS s12; + +CREATE STATISTICS s12 (ndistinct) ON c, d, (c * 10), (d - 1) FROM ndistinct; + +ANALYZE ndistinct; + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + + +-- replace the other statistics by statistics on both attributes and expressions + +DROP STATISTICS s11; + +CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct; + +ANALYZE ndistinct; + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + + +-- replace statistics by somewhat overlapping ones (this expected to get worse estimate +-- because the first statistics shall be applied to 3 columns, and the second one can't +-- be really applied) + +DROP STATISTICS s11; +DROP STATISTICS s12; + +CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct; +CREATE STATISTICS s12 (ndistinct) ON a, (b+1), (c * 10) FROM ndistinct; + +ANALYZE ndistinct; + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)'); + +SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)'); + +DROP STATISTICS s11; +DROP STATISTICS s12; + -- functional dependencies tests CREATE TABLE functional_dependencies ( filler1 TEXT, @@ -272,6 +533,29 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE TRUNCATE functional_dependencies; DROP STATISTICS func_deps_stat; +-- now do the same thing, but with expressions +INSERT INTO functional_dependencies (a, b, c, filler1) + SELECT i, i, i, i FROM generate_series(1,5000) s(i); + +ANALYZE functional_dependencies; + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1'); + +-- create statistics +CREATE STATISTICS func_deps_stat (dependencies) ON (mod(a,11)), (mod(b::int, 13)), (mod(c, 7)) FROM functional_dependencies; + +ANALYZE functional_dependencies; + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1'); + +-- a => b, a => c, b => c +TRUNCATE functional_dependencies; +DROP STATISTICS func_deps_stat; + INSERT INTO functional_dependencies (a, b, c, filler1) SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i); @@ -397,9 +681,9 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])'); --- changing the type of column c causes its single-column stats to be dropped, --- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple --- clauses estimated with functional dependencies does not exceed this +-- changing the type of column c causes all its stats to be dropped, reverting +-- to default estimates without any statistics, i.e. 0.5% selectivity for each +-- condition ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric; SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1'); @@ -408,6 +692,132 @@ ANALYZE functional_dependencies; SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1'); +DROP STATISTICS func_deps_stat; + +-- now try functional dependencies with expressions + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2'); + +-- IN +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)'); + +-- OR clauses referencing the same attribute +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')'); + +-- OR clauses referencing different attributes +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X'''); + +-- ANY +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])'); + +-- ANY with inequalities should not benefit from functional dependencies +-- the estimates however improve thanks to having expression statistics +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])'); + +-- ALL (should not benefit from functional dependencies) +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])'); + +-- create statistics on expressions +CREATE STATISTICS func_deps_stat (dependencies) ON (a * 2), (b || 'X'), (c + 1) FROM functional_dependencies; + +ANALYZE functional_dependencies; + +-- print the detected dependencies +SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat'; + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2'); + +-- IN +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)'); + +-- OR clauses referencing the same attribute +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')'); + +-- OR clauses referencing different attributes +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X'''); + +-- ANY +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])'); + +-- ANY with inequalities should not benefit from functional dependencies +-- the estimates however improve thanks to having expression statistics +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X'''); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])'); + +-- ALL (should not benefit from functional dependencies) +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])'); + -- check the ability to use multiple functional dependencies CREATE TABLE functional_dependencies_multi ( a INTEGER, @@ -479,6 +889,28 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1'); +TRUNCATE mcv_lists; +DROP STATISTICS mcv_lists_stats; + +-- random data (no MCV list), but with expression +INSERT INTO mcv_lists (a, b, c, filler1) + SELECT i, i, i, i FROM generate_series(1,1000) s(i); + +ANALYZE mcv_lists; + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1'); + +-- create statistics +CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,7)), (mod(b::int,11)), (mod(c,13)) FROM mcv_lists; + +ANALYZE mcv_lists; + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1'); + -- 100 distinct combinations, all in the MCV list TRUNCATE mcv_lists; DROP STATISTICS mcv_lists_stats; @@ -565,6 +997,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ' SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)'); @@ -602,6 +1036,98 @@ ANALYZE mcv_lists; SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'''); + +-- 100 distinct combinations, all in the MCV list, but with expressions +TRUNCATE mcv_lists; +DROP STATISTICS mcv_lists_stats; + +INSERT INTO mcv_lists (a, b, c, filler1) + SELECT i, i, i, i FROM generate_series(1,1000) s(i); + +ANALYZE mcv_lists; + +-- without any stats on the expressions, we have to use default selectivities, which +-- is why the estimates here are different from the pre-computed case above + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])'); + +-- create statistics with expressions only (we create three separate stats, in order not to build more complex extended stats) +CREATE STATISTICS mcv_lists_stats_1 ON (mod(a,20)) FROM mcv_lists; +CREATE STATISTICS mcv_lists_stats_2 ON (mod(b::int,10)) FROM mcv_lists; +CREATE STATISTICS mcv_lists_stats_3 ON (mod(c,5)) FROM mcv_lists; + +ANALYZE mcv_lists; + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])'); + +DROP STATISTICS mcv_lists_stats_1; +DROP STATISTICS mcv_lists_stats_2; +DROP STATISTICS mcv_lists_stats_3; + +-- create statistics with both MCV and expressions +CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,20)), (mod(b::int,10)), (mod(c,5)) FROM mcv_lists; + +ANALYZE mcv_lists; + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])'); + +-- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,5) = 1 OR d IS NOT NULL'); + -- 100 distinct combinations with NULL values, all in the MCV list TRUNCATE mcv_lists; DROP STATISTICS mcv_lists_stats; @@ -894,6 +1420,57 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR DROP TABLE mcv_lists_multi; + +-- statistics on integer expressions +CREATE TABLE expr_stats (a int, b int, c int); +INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i); +ANALYZE expr_stats; + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0'); + +CREATE STATISTICS expr_stats_1 (mcv) ON (a+b), (a-b), (2*a), (3*b) FROM expr_stats; +ANALYZE expr_stats; + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0'); + +DROP STATISTICS expr_stats_1; +DROP TABLE expr_stats; + +-- statistics on a mix columns and expressions +CREATE TABLE expr_stats (a int, b int, c int); +INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i); +ANALYZE expr_stats; + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0'); + +CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (2*a), (3*b), (a+b), (a-b) FROM expr_stats; +ANALYZE expr_stats; + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0'); + +DROP TABLE expr_stats; + +-- statistics on expressions with different data types +CREATE TABLE expr_stats (a int, b name, c text); +INSERT INTO expr_stats SELECT mod(i,10), md5(mod(i,10)::text), md5(mod(i,10)::text) FROM generate_series(1,1000) s(i); +ANALYZE expr_stats; + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0'''); + +CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (b || c), (c || b) FROM expr_stats; +ANALYZE expr_stats; + +SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0'''); + +DROP TABLE expr_stats; + + -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in -- the underlying table.