Extended statistics on expressions

Allow defining extended statistics on expressions, not just just on
simple column references.  With this commit, expressions are supported
by all existing extended statistics kinds, improving the same types of
estimates. A simple example may look like this:

  CREATE TABLE t (a int);
  CREATE STATISTICS s ON mod(a,10), mod(a,20) FROM t;
  ANALYZE t;

The collected statistics are useful e.g. to estimate queries with those
expressions in WHERE or GROUP BY clauses:

  SELECT * FROM t WHERE mod(a,10) = 0 AND mod(a,20) = 0;

  SELECT 1 FROM t GROUP BY mod(a,10), mod(a,20);

This introduces new internal statistics kind 'e' (expressions) which is
built automatically when the statistics object definition includes any
expressions. This represents single-expression statistics, as if there
was an expression index (but without the index maintenance overhead).
The statistics is stored in pg_statistics_ext_data as an array of
composite types, which is possible thanks to 79f6a942bd.

CREATE STATISTICS allows building statistics on a single expression, in
which case in which case it's not possible to specify statistics kinds.

A new system view pg_stats_ext_exprs can be used to display expression
statistics, similarly to pg_stats and pg_stats_ext views.

ALTER TABLE ... ALTER COLUMN ... TYPE now treats indexes the same way it
treats indexes, i.e. it drops and recreates the statistics. This means
all statistics are reset, and we no longer try to preserve at least the
functional dependencies. This should not be a major issue in practice,
as the functional dependencies actually rely on per-column statistics,
which were always reset anyway.

Author: Tomas Vondra
Reviewed-by: Justin Pryzby, Dean Rasheed, Zhihong Yu
Discussion: https://postgr.es/m/ad7891d2-e90c-b446-9fe2-7419143847d7%40enterprisedb.com
This commit is contained in:
Tomas Vondra 2021-03-26 23:22:01 +01:00
parent 98376c18f1
commit a4d75c86bf
43 changed files with 5838 additions and 819 deletions

View File

@ -7385,8 +7385,22 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
<literal>d</literal> for n-distinct statistics,
<literal>f</literal> for functional dependency statistics, and
<literal>m</literal> for most common values (MCV) list statistics
<literal>e</literal> for expression statistics
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>stxexprs</structfield> <type>pg_node_tree</type>
</para>
<para>
Expression trees (in <function>nodeToString()</function>
representation) for statistics object attributes that are not simple
column references. This is a list with one element per expression.
Null if all statistics object attributes are simple references.
</para></entry>
</row>
</tbody>
</tgroup>
</table>
@ -7452,7 +7466,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
(references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>oid</structfield>)
</para>
<para>
Extended statistic object containing the definition for this data
Extended statistics object containing the definition for this data
</para></entry>
</row>
@ -7484,6 +7498,15 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
<structname>pg_mcv_list</structname> type
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>stxexprs</structfield> <type>pg_node_tree</type>
</para>
<para>
A list of any expressions covered by this statistics object.
</para></entry>
</row>
</tbody>
</tgroup>
</table>
@ -7637,6 +7660,16 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
see <xref linkend="logical-replication-publication"/>.
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>stxdexpr</structfield> <type>pg_statistic[]</type>
</para>
<para>
Per-expression statistics, serialized as an array of
<structname>pg_statistic</structname> type
</para></entry>
</row>
</tbody>
</tgroup>
</table>
@ -9444,6 +9477,11 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
<entry>extended planner statistics</entry>
</row>
<row>
<entry><link linkend="view-pg-stats-ext-exprs"><structname>pg_stats_ext_exprs</structname></link></entry>
<entry>extended planner statistics for expressions</entry>
</row>
<row>
<entry><link linkend="view-pg-tables"><structname>pg_tables</structname></link></entry>
<entry>tables</entry>
@ -12696,10 +12734,19 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
(references <link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.<structfield>attname</structfield>)
</para>
<para>
Name of the column described by this row
Names of the columns included in the extended statistics object
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>exprs</structfield> <type>text[]</type>
</para>
<para>
Expressions included in the extended statistics object
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>inherited</structfield> <type>bool</type>
@ -12851,7 +12898,8 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
<para>
The view <structname>pg_stats_ext</structname> provides access to
the information stored in the <link
information about each extended statistics object in the database,
combining information stored in the <link
linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
catalogs. This view allows access only to rows of
@ -12908,7 +12956,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
(references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
</para>
<para>
Name of schema containing extended statistic
Name of schema containing extended statistics object
</para></entry>
</row>
@ -12918,7 +12966,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
(references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>stxname</structfield>)
</para>
<para>
Name of extended statistics
Name of extended statistics object
</para></entry>
</row>
@ -12928,7 +12976,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
(references <link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.<structfield>rolname</structfield>)
</para>
<para>
Owner of the extended statistics
Owner of the extended statistics object
</para></entry>
</row>
@ -12938,7 +12986,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
(references <link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.<structfield>attname</structfield>)
</para>
<para>
Names of the columns the extended statistics is defined on
Names of the columns the extended statistics object is defined on
</para></entry>
</row>
@ -12947,7 +12995,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
<structfield>kinds</structfield> <type>char[]</type>
</para>
<para>
Types of extended statistics enabled for this record
Types of extended statistics object enabled for this record
</para></entry>
</row>
@ -13032,6 +13080,237 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
</sect1>
<sect1 id="view-pg-stats-ext-exprs">
<title><structname>pg_stats_ext_exprs</structname></title>
<indexterm zone="view-pg-stats-ext-exprs">
<primary>pg_stats_ext_exprs</primary>
</indexterm>
<para>
The view <structname>pg_stats_ext_exprs</structname> provides access to
information about all expressions included in extended statistics objects,
combining information stored in the <link
linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
catalogs. This view allows access only to rows of
<link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link> and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
that correspond to tables the user has permission to read, and therefore
it is safe to allow public read access to this view.
</para>
<para>
<structname>pg_stats_ext_exprs</structname> is also designed to present
the information in a more readable format than the underlying catalogs
&mdash; at the cost that its schema must be extended whenever the structure
of statistics in <link linkend="catalog-pg-statistic"><structname>pg_statistic</structname></link> changes.
</para>
<table>
<title><structname>pg_stats_ext_exprs</structname> Columns</title>
<tgroup cols="1">
<thead>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
Column Type
</para>
<para>
Description
</para></entry>
</row>
</thead>
<tbody>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>schemaname</structfield> <type>name</type>
(references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
</para>
<para>
Name of schema containing table
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>tablename</structfield> <type>name</type>
(references <link linkend="catalog-pg-class"><structname>pg_class</structname></link>.<structfield>relname</structfield>)
</para>
<para>
Name of table the statistics object is defined on
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>statistics_schemaname</structfield> <type>name</type>
(references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
</para>
<para>
Name of schema containing extended statistics object
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>statistics_name</structfield> <type>name</type>
(references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>stxname</structfield>)
</para>
<para>
Name of extended statistics object
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>statistics_owner</structfield> <type>name</type>
(references <link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.<structfield>rolname</structfield>)
</para>
<para>
Owner of the extended statistics object
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>expr</structfield> <type>text</type>
</para>
<para>
Expression included in the extended statistics object
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>null_frac</structfield> <type>float4</type>
</para>
<para>
Fraction of expression entries that are null
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>avg_width</structfield> <type>int4</type>
</para>
<para>
Average width in bytes of expression's entries
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>n_distinct</structfield> <type>float4</type>
</para>
<para>
If greater than zero, the estimated number of distinct values in the
expression. If less than zero, the negative of the number of distinct
values divided by the number of rows. (The negated form is used when
<command>ANALYZE</command> believes that the number of distinct values is
likely to increase as the table grows; the positive form is used when
the expression seems to have a fixed number of possible values.) For
example, -1 indicates a unique expression in which the number of distinct
values is the same as the number of rows.
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>most_common_vals</structfield> <type>anyarray</type>
</para>
<para>
A list of the most common values in the expression. (Null if
no values seem to be more common than any others.)
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>most_common_freqs</structfield> <type>float4[]</type>
</para>
<para>
A list of the frequencies of the most common values,
i.e., number of occurrences of each divided by total number of rows.
(Null when <structfield>most_common_vals</structfield> is.)
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>histogram_bounds</structfield> <type>anyarray</type>
</para>
<para>
A list of values that divide the expression's values into groups of
approximately equal population. The values in
<structfield>most_common_vals</structfield>, if present, are omitted from this
histogram calculation. (This expression is null if the expression data type
does not have a <literal>&lt;</literal> operator or if the
<structfield>most_common_vals</structfield> list accounts for the entire
population.)
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>correlation</structfield> <type>float4</type>
</para>
<para>
Statistical correlation between physical row ordering and
logical ordering of the expression values. This ranges from -1 to +1.
When the value is near -1 or +1, an index scan on the expression will
be estimated to be cheaper than when it is near zero, due to reduction
of random access to the disk. (This expression is null if the expression's
data type does not have a <literal>&lt;</literal> operator.)
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>most_common_elems</structfield> <type>anyarray</type>
</para>
<para>
A list of non-null element values most often appearing within values of
the expression. (Null for scalar types.)
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>most_common_elem_freqs</structfield> <type>float4[]</type>
</para>
<para>
A list of the frequencies of the most common element values, i.e., the
fraction of rows containing at least one instance of the given value.
Two or three additional values follow the per-element frequencies;
these are the minimum and maximum of the preceding per-element
frequencies, and optionally the frequency of null elements.
(Null when <structfield>most_common_elems</structfield> is.)
</para></entry>
</row>
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>elem_count_histogram</structfield> <type>float4[]</type>
</para>
<para>
A histogram of the counts of distinct non-null element values within the
values of the expression, followed by the average number of distinct
non-null elements. (Null for scalar types.)
</para></entry>
</row>
</tbody>
</tgroup>
</table>
<para>
The maximum number of entries in the array fields can be controlled on a
column-by-column basis using the <link linkend="sql-altertable"><command>ALTER
TABLE SET STATISTICS</command></link> command, or globally by setting the
<xref linkend="guc-default-statistics-target"/> run-time parameter.
</para>
</sect1>
<sect1 id="view-pg-tables">
<title><structname>pg_tables</structname></title>

View File

@ -21,9 +21,13 @@ PostgreSQL documentation
<refsynopsisdiv>
<synopsis>
CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
ON ( <replaceable class="parameter">expression</replaceable> )
FROM <replaceable class="parameter">table_name</replaceable>
CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
[ ( <replaceable class="parameter">statistics_kind</replaceable> [, ... ] ) ]
ON <replaceable class="parameter">column_name</replaceable>, <replaceable class="parameter">column_name</replaceable> [, ...]
ON { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) }, { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) } [, ...]
FROM <replaceable class="parameter">table_name</replaceable>
</synopsis>
@ -39,6 +43,19 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
database and will be owned by the user issuing the command.
</para>
<para>
The <command>CREATE STATISTICS</command> command has two basic forms. The
first form allows univariate statistics for a single expression to be
collected, providing benefits similar to an expression index without the
overhead of index maintenance. This form does not allow the statistics
kind to be specified, since the various statistics kinds refer only to
multivariate statistics. The second form of the command allows
multivariate statistics on multiple columns and/or expressions to be
collected, optionally specifying which statistics kinds to include. This
form will also automatically cause univariate statistics to be collected on
any expressions included in the list.
</para>
<para>
If a schema name is given (for example, <literal>CREATE STATISTICS
myschema.mystat ...</literal>) then the statistics object is created in the
@ -79,14 +96,16 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
<term><replaceable class="parameter">statistics_kind</replaceable></term>
<listitem>
<para>
A statistics kind to be computed in this statistics object.
A multivariate statistics kind to be computed in this statistics object.
Currently supported kinds are
<literal>ndistinct</literal>, which enables n-distinct statistics,
<literal>dependencies</literal>, which enables functional
dependency statistics, and <literal>mcv</literal> which enables
most-common values lists.
If this clause is omitted, all supported statistics kinds are
included in the statistics object.
included in the statistics object. Univariate expression statistics are
built automatically if the statistics definition includes any complex
expressions rather than just simple column references.
For more information, see <xref linkend="planner-stats-extended"/>
and <xref linkend="multivariate-statistics-examples"/>.
</para>
@ -98,8 +117,22 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
<listitem>
<para>
The name of a table column to be covered by the computed statistics.
At least two column names must be given; the order of the column names
is insignificant.
This is only allowed when building multivariate statistics. At least
two column names or expressions must be specified, and their order is
not significant.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><replaceable class="parameter">expression</replaceable></term>
<listitem>
<para>
An expression to be covered by the computed statistics. This may be
used to build univariate statistics on a single expression, or as part
of a list of multiple column names and/or expressions to build
multivariate statistics. In the latter case, separate univariate
statistics are built automatically for each expression in the list.
</para>
</listitem>
</varlistentry>
@ -125,6 +158,13 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
reading it. Once created, however, the ownership of the statistics
object is independent of the underlying table(s).
</para>
<para>
Expression statistics are per-expression and are similar to creating an
index on the expression, except that they avoid the overhead of index
maintenance. Expression statistics are built automatically for each
expression in the statistics object definition.
</para>
</refsect1>
<refsect1 id="sql-createstatistics-examples">
@ -196,6 +236,72 @@ EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 2);
in the table, allowing it to generate better estimates in both cases.
</para>
<para>
Create table <structname>t3</structname> with a single timestamp column,
and run queries using expressions on that column. Without extended
statistics, the planner has no information about the data distribution for
the expressions, and uses default estimates. The planner also does not
realize that the value of the date truncated to the month is fully
determined by the value of the date truncated to the day. Then expression
and ndistinct statistics are built on those two expressions:
<programlisting>
CREATE TABLE t3 (
a timestamp
);
INSERT INTO t3 SELECT i FROM generate_series('2020-01-01'::timestamp,
'2020-12-31'::timestamp,
'1 minute'::interval) s(i);
ANALYZE t3;
-- the number of matching rows will be drastically underestimated:
EXPLAIN ANALYZE SELECT * FROM t3
WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
EXPLAIN ANALYZE SELECT * FROM t3
WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
AND '2020-06-30'::timestamp;
EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
FROM t3 GROUP BY 1, 2;
-- build ndistinct statistics on the pair of expressions (per-expression
-- statistics are built automatically)
CREATE STATISTICS s3 (ndistinct) ON date_trunc('month', a), date_trunc('day', a) FROM t3;
ANALYZE t3;
-- now the row count estimates are more accurate:
EXPLAIN ANALYZE SELECT * FROM t3
WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
EXPLAIN ANALYZE SELECT * FROM t3
WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
AND '2020-06-30'::timestamp;
EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
FROM t3 GROUP BY 1, 2;
</programlisting>
Without expression and ndistinct statistics, the planner has no information
about the number of distinct values for the expressions, and has to rely
on default estimates. The equality and range conditions are assumed to have
0.5% selectivity, and the number of distinct values in the expression is
assumed to be the same as for the column (i.e. unique). This results in a
significant underestimate of the row count in the first two queries. Moreover,
the planner has no information about the relationship between the expressions,
so it assumes the two <literal>WHERE</literal> and <literal>GROUP BY</literal>
conditions are independent, and multiplies their selectivities together to
arrive at a severe overestimate of the group count in the aggregate query.
This is further exacerbated by the lack of accurate statistics for the
expressions, forcing the planner to use a default ndistinct estimate for the
expression derived from ndistinct for the column. With such statistics, the
planner recognizes that the conditions are correlated, and arrives at much
more accurate estimates.
</para>
</refsect1>
<refsect1>

View File

@ -49,15 +49,15 @@ include $(top_srcdir)/src/backend/common.mk
# Note: the order of this list determines the order in which the catalog
# header files are assembled into postgres.bki. BKI_BOOTSTRAP catalogs
# must appear first, and there are reputedly other, undocumented ordering
# dependencies.
# must appear first, and pg_statistic before pg_statistic_ext_data, and
# there are reputedly other, undocumented ordering dependencies.
CATALOG_HEADERS := \
pg_proc.h pg_type.h pg_attribute.h pg_class.h \
pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \
pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \
pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h \
pg_statistic_ext.h pg_statistic_ext_data.h \
pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
pg_statistic.h pg_statistic_ext.h pg_statistic_ext_data.h \
pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \
pg_database.h pg_db_role_setting.h pg_tablespace.h \
pg_authid.h pg_auth_members.h pg_shdepend.h pg_shdescription.h \

View File

@ -264,6 +264,7 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS
JOIN pg_attribute a
ON (a.attrelid = s.stxrelid AND a.attnum = k)
) AS attnames,
pg_get_statisticsobjdef_expressions(s.oid) as exprs,
s.stxkind AS kinds,
sd.stxdndistinct AS n_distinct,
sd.stxddependencies AS dependencies,
@ -290,6 +291,74 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS
WHERE NOT has_column_privilege(c.oid, a.attnum, 'select') )
AND (c.relrowsecurity = false OR NOT row_security_active(c.oid));
CREATE VIEW pg_stats_ext_exprs WITH (security_barrier) AS
SELECT cn.nspname AS schemaname,
c.relname AS tablename,
sn.nspname AS statistics_schemaname,
s.stxname AS statistics_name,
pg_get_userbyid(s.stxowner) AS statistics_owner,
stat.expr,
(stat.a).stanullfrac AS null_frac,
(stat.a).stawidth AS avg_width,
(stat.a).stadistinct AS n_distinct,
(CASE
WHEN (stat.a).stakind1 = 1 THEN (stat.a).stavalues1
WHEN (stat.a).stakind2 = 1 THEN (stat.a).stavalues2
WHEN (stat.a).stakind3 = 1 THEN (stat.a).stavalues3
WHEN (stat.a).stakind4 = 1 THEN (stat.a).stavalues4
WHEN (stat.a).stakind5 = 1 THEN (stat.a).stavalues5
END) AS most_common_vals,
(CASE
WHEN (stat.a).stakind1 = 1 THEN (stat.a).stanumbers1
WHEN (stat.a).stakind2 = 1 THEN (stat.a).stanumbers2
WHEN (stat.a).stakind3 = 1 THEN (stat.a).stanumbers3
WHEN (stat.a).stakind4 = 1 THEN (stat.a).stanumbers4
WHEN (stat.a).stakind5 = 1 THEN (stat.a).stanumbers5
END) AS most_common_freqs,
(CASE
WHEN (stat.a).stakind1 = 2 THEN (stat.a).stavalues1
WHEN (stat.a).stakind2 = 2 THEN (stat.a).stavalues2
WHEN (stat.a).stakind3 = 2 THEN (stat.a).stavalues3
WHEN (stat.a).stakind4 = 2 THEN (stat.a).stavalues4
WHEN (stat.a).stakind5 = 2 THEN (stat.a).stavalues5
END) AS histogram_bounds,
(CASE
WHEN (stat.a).stakind1 = 3 THEN (stat.a).stanumbers1[1]
WHEN (stat.a).stakind2 = 3 THEN (stat.a).stanumbers2[1]
WHEN (stat.a).stakind3 = 3 THEN (stat.a).stanumbers3[1]
WHEN (stat.a).stakind4 = 3 THEN (stat.a).stanumbers4[1]
WHEN (stat.a).stakind5 = 3 THEN (stat.a).stanumbers5[1]
END) correlation,
(CASE
WHEN (stat.a).stakind1 = 4 THEN (stat.a).stavalues1
WHEN (stat.a).stakind2 = 4 THEN (stat.a).stavalues2
WHEN (stat.a).stakind3 = 4 THEN (stat.a).stavalues3
WHEN (stat.a).stakind4 = 4 THEN (stat.a).stavalues4
WHEN (stat.a).stakind5 = 4 THEN (stat.a).stavalues5
END) AS most_common_elems,
(CASE
WHEN (stat.a).stakind1 = 4 THEN (stat.a).stanumbers1
WHEN (stat.a).stakind2 = 4 THEN (stat.a).stanumbers2
WHEN (stat.a).stakind3 = 4 THEN (stat.a).stanumbers3
WHEN (stat.a).stakind4 = 4 THEN (stat.a).stanumbers4
WHEN (stat.a).stakind5 = 4 THEN (stat.a).stanumbers5
END) AS most_common_elem_freqs,
(CASE
WHEN (stat.a).stakind1 = 5 THEN (stat.a).stanumbers1
WHEN (stat.a).stakind2 = 5 THEN (stat.a).stanumbers2
WHEN (stat.a).stakind3 = 5 THEN (stat.a).stanumbers3
WHEN (stat.a).stakind4 = 5 THEN (stat.a).stanumbers4
WHEN (stat.a).stakind5 = 5 THEN (stat.a).stanumbers5
END) AS elem_count_histogram
FROM pg_statistic_ext s JOIN pg_class c ON (c.oid = s.stxrelid)
LEFT JOIN pg_statistic_ext_data sd ON (s.oid = sd.stxoid)
LEFT JOIN pg_namespace cn ON (cn.oid = c.relnamespace)
LEFT JOIN pg_namespace sn ON (sn.oid = s.stxnamespace)
JOIN LATERAL (
SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr,
unnest(sd.stxdexpr)::pg_statistic AS a
) stat ON (stat.expr IS NOT NULL);
-- unprivileged users may read pg_statistic_ext but not pg_statistic_ext_data
REVOKE ALL on pg_statistic_ext_data FROM public;

View File

@ -29,6 +29,8 @@
#include "commands/comment.h"
#include "commands/defrem.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/optimizer.h"
#include "statistics/statistics.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
@ -62,7 +64,8 @@ ObjectAddress
CreateStatistics(CreateStatsStmt *stmt)
{
int16 attnums[STATS_MAX_DIMENSIONS];
int numcols = 0;
int nattnums = 0;
int numcols;
char *namestr;
NameData stxname;
Oid statoid;
@ -74,21 +77,25 @@ CreateStatistics(CreateStatsStmt *stmt)
Datum datavalues[Natts_pg_statistic_ext_data];
bool datanulls[Natts_pg_statistic_ext_data];
int2vector *stxkeys;
List *stxexprs = NIL;
Datum exprsDatum;
Relation statrel;
Relation datarel;
Relation rel = NULL;
Oid relid;
ObjectAddress parentobject,
myself;
Datum types[3]; /* one for each possible type of statistic */
Datum types[4]; /* one for each possible type of statistic */
int ntypes;
ArrayType *stxkind;
bool build_ndistinct;
bool build_dependencies;
bool build_mcv;
bool build_expressions;
bool requested_type = false;
int i;
ListCell *cell;
ListCell *cell2;
Assert(IsA(stmt, CreateStatsStmt));
@ -190,101 +197,124 @@ CreateStatistics(CreateStatsStmt *stmt)
}
/*
* Currently, we only allow simple column references in the expression
* list. That will change someday, and again the grammar already supports
* it so we have to enforce restrictions here. For now, we can convert
* the expression list to a simple array of attnums. While at it, enforce
* some constraints.
* Make sure no more than STATS_MAX_DIMENSIONS columns are used. There
* might be duplicates and so on, but we'll deal with those later.
*/
numcols = list_length(stmt->exprs);
if (numcols > STATS_MAX_DIMENSIONS)
ereport(ERROR,
(errcode(ERRCODE_TOO_MANY_COLUMNS),
errmsg("cannot have more than %d columns in statistics",
STATS_MAX_DIMENSIONS)));
/*
* Convert the expression list to a simple array of attnums, but also keep
* a list of more complex expressions. While at it, enforce some
* constraints.
*
* XXX We do only the bare minimum to separate simple attribute and
* complex expressions - for example "(a)" will be treated as a complex
* expression. No matter how elaborate the check is, there'll always be a
* way around it, if the user is determined (consider e.g. "(a+0)"), so
* it's not worth protecting against it.
*/
foreach(cell, stmt->exprs)
{
Node *expr = (Node *) lfirst(cell);
ColumnRef *cref;
char *attname;
StatsElem *selem;
HeapTuple atttuple;
Form_pg_attribute attForm;
TypeCacheEntry *type;
if (!IsA(expr, ColumnRef))
/*
* We should not get anything else than StatsElem, given the grammar.
* But let's keep it as a safety.
*/
if (!IsA(expr, StatsElem))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("only simple column references are allowed in CREATE STATISTICS")));
cref = (ColumnRef *) expr;
errmsg("only simple column references and expressions are allowed in CREATE STATISTICS")));
if (list_length(cref->fields) != 1)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("only simple column references are allowed in CREATE STATISTICS")));
attname = strVal((Value *) linitial(cref->fields));
selem = (StatsElem *) expr;
atttuple = SearchSysCacheAttName(relid, attname);
if (!HeapTupleIsValid(atttuple))
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("column \"%s\" does not exist",
attname)));
attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
if (selem->name) /* column reference */
{
char *attname;
/* Disallow use of system attributes in extended stats */
if (attForm->attnum <= 0)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("statistics creation on system columns is not supported")));
attname = selem->name;
/* Disallow data types without a less-than operator */
type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
if (type->lt_opr == InvalidOid)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
attname, format_type_be(attForm->atttypid))));
atttuple = SearchSysCacheAttName(relid, attname);
if (!HeapTupleIsValid(atttuple))
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("column \"%s\" does not exist",
attname)));
attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
/* Make sure no more than STATS_MAX_DIMENSIONS columns are used */
if (numcols >= STATS_MAX_DIMENSIONS)
ereport(ERROR,
(errcode(ERRCODE_TOO_MANY_COLUMNS),
errmsg("cannot have more than %d columns in statistics",
STATS_MAX_DIMENSIONS)));
/* Disallow use of system attributes in extended stats */
if (attForm->attnum <= 0)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("statistics creation on system columns is not supported")));
attnums[numcols] = attForm->attnum;
numcols++;
ReleaseSysCache(atttuple);
/* Disallow data types without a less-than operator */
type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
if (type->lt_opr == InvalidOid)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
attname, format_type_be(attForm->atttypid))));
attnums[nattnums] = attForm->attnum;
nattnums++;
ReleaseSysCache(atttuple);
}
else /* expression */
{
Node *expr = selem->expr;
Oid atttype;
Assert(expr != NULL);
/*
* Disallow data types without a less-than operator.
*
* We ignore this for statistics on a single expression, in which
* case we'll build the regular statistics only (and that code can
* deal with such data types).
*/
if (list_length(stmt->exprs) > 1)
{
atttype = exprType(expr);
type = lookup_type_cache(atttype, TYPECACHE_LT_OPR);
if (type->lt_opr == InvalidOid)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("expression cannot be used in multivariate statistics because its type %s has no default btree operator class",
format_type_be(atttype))));
}
stxexprs = lappend(stxexprs, expr);
}
}
/*
* Check that at least two columns were specified in the statement. The
* upper bound was already checked in the loop above.
*/
if (numcols < 2)
ereport(ERROR,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
errmsg("extended statistics require at least 2 columns")));
/*
* Sort the attnums, which makes detecting duplicates somewhat easier, and
* it does not hurt (it does not affect the efficiency, unlike for
* indexes, for example).
*/
qsort(attnums, numcols, sizeof(int16), compare_int16);
/*
* Check for duplicates in the list of columns. The attnums are sorted so
* just check consecutive elements.
*/
for (i = 1; i < numcols; i++)
{
if (attnums[i] == attnums[i - 1])
ereport(ERROR,
(errcode(ERRCODE_DUPLICATE_COLUMN),
errmsg("duplicate column name in statistics definition")));
}
/* Form an int2vector representation of the sorted column list */
stxkeys = buildint2vector(attnums, numcols);
/*
* Parse the statistics kinds.
*
* First check that if this is the case with a single expression, there
* are no statistics kinds specified (we don't allow that for the simple
* CREATE STATISTICS form).
*/
if ((list_length(stmt->exprs) == 1) && (list_length(stxexprs) == 1))
{
/* statistics kinds not specified */
if (list_length(stmt->stat_types) > 0)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("when building statistics on a single expression, statistics kinds may not be specified")));
}
/* OK, let's check that we recognize the statistics kinds. */
build_ndistinct = false;
build_dependencies = false;
build_mcv = false;
@ -313,14 +343,91 @@ CreateStatistics(CreateStatsStmt *stmt)
errmsg("unrecognized statistics kind \"%s\"",
type)));
}
/* If no statistic type was specified, build them all. */
if (!requested_type)
/*
* If no statistic type was specified, build them all (but only when the
* statistics is defined on more than one column/expression).
*/
if ((!requested_type) && (numcols >= 2))
{
build_ndistinct = true;
build_dependencies = true;
build_mcv = true;
}
/*
* When there are non-trivial expressions, build the expression stats
* automatically. This allows calculating good estimates for stats that
* consider per-clause estimates (e.g. functional dependencies).
*/
build_expressions = (list_length(stxexprs) > 0);
/*
* Check that at least two columns were specified in the statement, or
* that we're building statistics on a single expression.
*/
if ((numcols < 2) && (list_length(stxexprs) != 1))
ereport(ERROR,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
errmsg("extended statistics require at least 2 columns")));
/*
* Sort the attnums, which makes detecting duplicates somewhat easier, and
* it does not hurt (it does not matter for the contents, unlike for
* indexes, for example).
*/
qsort(attnums, nattnums, sizeof(int16), compare_int16);
/*
* Check for duplicates in the list of columns. The attnums are sorted so
* just check consecutive elements.
*/
for (i = 1; i < nattnums; i++)
{
if (attnums[i] == attnums[i - 1])
ereport(ERROR,
(errcode(ERRCODE_DUPLICATE_COLUMN),
errmsg("duplicate column name in statistics definition")));
}
/*
* Check for duplicate expressions. We do two loops, counting the
* occurrences of each expression. This is O(N^2) but we only allow small
* number of expressions and it's not executed often.
*
* XXX We don't cross-check attributes and expressions, because it does
* not seem worth it. In principle we could check that expressions don't
* contain trivial attribute references like "(a)", but the reasoning is
* similar to why we don't bother with extracting columns from
* expressions. It's either expensive or very easy to defeat for
* determined user, and there's no risk if we allow such statistics (the
* statistics is useless, but harmless).
*/
foreach(cell, stxexprs)
{
Node *expr1 = (Node *) lfirst(cell);
int cnt = 0;
foreach(cell2, stxexprs)
{
Node *expr2 = (Node *) lfirst(cell2);
if (equal(expr1, expr2))
cnt += 1;
}
/* every expression should find at least itself */
Assert(cnt >= 1);
if (cnt > 1)
ereport(ERROR,
(errcode(ERRCODE_DUPLICATE_COLUMN),
errmsg("duplicate expression in statistics definition")));
}
/* Form an int2vector representation of the sorted column list */
stxkeys = buildint2vector(attnums, nattnums);
/* construct the char array of enabled statistic types */
ntypes = 0;
if (build_ndistinct)
@ -329,9 +436,23 @@ CreateStatistics(CreateStatsStmt *stmt)
types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
if (build_mcv)
types[ntypes++] = CharGetDatum(STATS_EXT_MCV);
if (build_expressions)
types[ntypes++] = CharGetDatum(STATS_EXT_EXPRESSIONS);
Assert(ntypes > 0 && ntypes <= lengthof(types));
stxkind = construct_array(types, ntypes, CHAROID, 1, true, TYPALIGN_CHAR);
/* convert the expressions (if any) to a text datum */
if (stxexprs != NIL)
{
char *exprsString;
exprsString = nodeToString(stxexprs);
exprsDatum = CStringGetTextDatum(exprsString);
pfree(exprsString);
}
else
exprsDatum = (Datum) 0;
statrel = table_open(StatisticExtRelationId, RowExclusiveLock);
/*
@ -351,6 +472,10 @@ CreateStatistics(CreateStatsStmt *stmt)
values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys);
values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind);
values[Anum_pg_statistic_ext_stxexprs - 1] = exprsDatum;
if (exprsDatum == (Datum) 0)
nulls[Anum_pg_statistic_ext_stxexprs - 1] = true;
/* insert it into pg_statistic_ext */
htup = heap_form_tuple(statrel->rd_att, values, nulls);
CatalogTupleInsert(statrel, htup);
@ -373,6 +498,7 @@ CreateStatistics(CreateStatsStmt *stmt)
datanulls[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
datanulls[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
datanulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
datanulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = true;
/* insert it into pg_statistic_ext_data */
htup = heap_form_tuple(datarel->rd_att, datavalues, datanulls);
@ -396,12 +522,41 @@ CreateStatistics(CreateStatsStmt *stmt)
*/
ObjectAddressSet(myself, StatisticExtRelationId, statoid);
for (i = 0; i < numcols; i++)
/* add dependencies for plain column references */
for (i = 0; i < nattnums; i++)
{
ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]);
recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
}
/*
* If there are no dependencies on a column, give the statistics an auto
* dependency on the whole table. In most cases, this will be redundant,
* but it might not be if the statistics expressions contain no Vars
* (which might seem strange but possible). This is consistent with what
* we do for indexes in index_create.
*
* XXX We intentionally don't consider the expressions before adding this
* dependency, because recordDependencyOnSingleRelExpr may not create any
* dependencies for whole-row Vars.
*/
if (!nattnums)
{
ObjectAddressSet(parentobject, RelationRelationId, relid);
recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
}
/*
* Store dependencies on anything mentioned in statistics expressions,
* just like we do for index expressions.
*/
if (stxexprs)
recordDependencyOnSingleRelExpr(&myself,
(Node *) stxexprs,
relid,
DEPENDENCY_NORMAL,
DEPENDENCY_AUTO, false, true);
/*
* Also add dependencies on namespace and owner. These are required
* because the stats object might have a different namespace and/or owner
@ -582,87 +737,6 @@ RemoveStatisticsById(Oid statsOid)
table_close(relation, RowExclusiveLock);
}
/*
* Update a statistics object for ALTER COLUMN TYPE on a source column.
*
* This could throw an error if the type change can't be supported.
* If it can be supported, but the stats must be recomputed, a likely choice
* would be to set the relevant column(s) of the pg_statistic_ext_data tuple
* to null until the next ANALYZE. (Note that the type change hasn't actually
* happened yet, so one option that's *not* on the table is to recompute
* immediately.)
*
* For both ndistinct and functional-dependencies stats, the on-disk
* representation is independent of the source column data types, and it is
* plausible to assume that the old statistic values will still be good for
* the new column contents. (Obviously, if the ALTER COLUMN TYPE has a USING
* expression that substantially alters the semantic meaning of the column
* values, this assumption could fail. But that seems like a corner case
* that doesn't justify zapping the stats in common cases.)
*
* For MCV lists that's not the case, as those statistics store the datums
* internally. In this case we simply reset the statistics value to NULL.
*
* Note that "type change" includes collation change, which means we can rely
* on the MCV list being consistent with the collation info in pg_attribute
* during estimation.
*/
void
UpdateStatisticsForTypeChange(Oid statsOid, Oid relationOid, int attnum,
Oid oldColumnType, Oid newColumnType)
{
HeapTuple stup,
oldtup;
Relation rel;
Datum values[Natts_pg_statistic_ext_data];
bool nulls[Natts_pg_statistic_ext_data];
bool replaces[Natts_pg_statistic_ext_data];
oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statsOid));
if (!HeapTupleIsValid(oldtup))
elog(ERROR, "cache lookup failed for statistics object %u", statsOid);
/*
* When none of the defined statistics types contain datum values from the
* table's columns then there's no need to reset the stats. Functional
* dependencies and ndistinct stats should still hold true.
*/
if (!statext_is_kind_built(oldtup, STATS_EXT_MCV))
{
ReleaseSysCache(oldtup);
return;
}
/*
* OK, we need to reset some statistics. So let's build the new tuple,
* replacing the affected statistics types with NULL.
*/
memset(nulls, 0, Natts_pg_statistic_ext_data * sizeof(bool));
memset(replaces, 0, Natts_pg_statistic_ext_data * sizeof(bool));
memset(values, 0, Natts_pg_statistic_ext_data * sizeof(Datum));
replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
rel = table_open(StatisticExtDataRelationId, RowExclusiveLock);
/* replace the old tuple */
stup = heap_modify_tuple(oldtup,
RelationGetDescr(rel),
values,
nulls,
replaces);
ReleaseSysCache(oldtup);
CatalogTupleUpdate(rel, &stup->t_self, stup);
heap_freetuple(stup);
table_close(rel, RowExclusiveLock);
}
/*
* Select a nonconflicting name for a new statistics.
*
@ -731,18 +805,27 @@ ChooseExtendedStatisticNameAddition(List *exprs)
buf[0] = '\0';
foreach(lc, exprs)
{
ColumnRef *cref = (ColumnRef *) lfirst(lc);
StatsElem *selem = (StatsElem *) lfirst(lc);
const char *name;
/* It should be one of these, but just skip if it happens not to be */
if (!IsA(cref, ColumnRef))
if (!IsA(selem, StatsElem))
continue;
name = strVal((Value *) linitial(cref->fields));
name = selem->name;
if (buflen > 0)
buf[buflen++] = '_'; /* insert _ between names */
/*
* We use fixed 'expr' for expressions, which have empty column names.
* For indexes this is handled in ChooseIndexColumnNames, but we have
* no such function for stats and it does not seem worth adding. If a
* better name is needed, the user can specify it explicitly.
*/
if (!name)
name = "expr";
/*
* At this point we have buflen <= NAMEDATALEN. name should be less
* than NAMEDATALEN already, but use strlcpy for paranoia.
@ -754,3 +837,29 @@ ChooseExtendedStatisticNameAddition(List *exprs)
}
return pstrdup(buf);
}
/*
* StatisticsGetRelation: given a statistics's relation OID, get the OID of
* the relation it is an statistics on. Uses the system cache.
*/
Oid
StatisticsGetRelation(Oid statId, bool missing_ok)
{
HeapTuple tuple;
Form_pg_statistic_ext stx;
Oid result;
tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statId));
if (!HeapTupleIsValid(tuple))
{
if (missing_ok)
return InvalidOid;
elog(ERROR, "cache lookup failed for statistics object %u", statId);
}
stx = (Form_pg_statistic_ext) GETSTRUCT(tuple);
Assert(stx->oid == statId);
result = stx->stxrelid;
ReleaseSysCache(tuple);
return result;
}

View File

@ -41,6 +41,7 @@
#include "catalog/pg_namespace.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_tablespace.h"
#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_trigger.h"
#include "catalog/pg_type.h"
#include "catalog/storage.h"
@ -188,6 +189,8 @@ typedef struct AlteredTableInfo
List *changedIndexDefs; /* string definitions of same */
char *replicaIdentityIndex; /* index to reset as REPLICA IDENTITY */
char *clusterOnIndex; /* index to use for CLUSTER */
List *changedStatisticsOids; /* OIDs of statistics to rebuild */
List *changedStatisticsDefs; /* string definitions of same */
} AlteredTableInfo;
/* Struct describing one new constraint to check in Phase 3 scan */
@ -440,6 +443,8 @@ static ObjectAddress ATExecDropColumn(List **wqueue, Relation rel, const char *c
ObjectAddresses *addrs);
static ObjectAddress ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
IndexStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
static ObjectAddress ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
static ObjectAddress ATExecAddConstraint(List **wqueue,
AlteredTableInfo *tab, Relation rel,
Constraint *newConstraint, bool recurse, bool is_readd,
@ -496,6 +501,7 @@ static ObjectAddress ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
AlterTableCmd *cmd, LOCKMODE lockmode);
static void RememberConstraintForRebuilding(Oid conoid, AlteredTableInfo *tab);
static void RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab);
static void RememberStatisticsForRebuilding(Oid indoid, AlteredTableInfo *tab);
static void ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab,
LOCKMODE lockmode);
static void ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId,
@ -4756,6 +4762,10 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab,
address = ATExecAddIndex(tab, rel, (IndexStmt *) cmd->def, true,
lockmode);
break;
case AT_ReAddStatistics: /* ADD STATISTICS */
address = ATExecAddStatistics(tab, rel, (CreateStatsStmt *) cmd->def,
true, lockmode);
break;
case AT_AddConstraint: /* ADD CONSTRAINT */
/* Transform the command only during initial examination */
if (cur_pass == AT_PASS_ADD_CONSTR)
@ -8283,6 +8293,29 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
return address;
}
/*
* ALTER TABLE ADD STATISTICS
*
* This is no such command in the grammar, but we use this internally to add
* AT_ReAddStatistics subcommands to rebuild extended statistics after a table
* column type change.
*/
static ObjectAddress
ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode)
{
ObjectAddress address;
Assert(IsA(stmt, CreateStatsStmt));
/* The CreateStatsStmt has already been through transformStatsStmt */
Assert(stmt->transformed);
address = CreateStatistics(stmt);
return address;
}
/*
* ALTER TABLE ADD CONSTRAINT USING INDEX
*
@ -11830,9 +11863,7 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
* Give the extended-stats machinery a chance to fix anything
* that this column type change would break.
*/
UpdateStatisticsForTypeChange(foundObject.objectId,
RelationGetRelid(rel), attnum,
attTup->atttypid, targettype);
RememberStatisticsForRebuilding(foundObject.objectId, tab);
break;
case OCLASS_PROC:
@ -12202,6 +12233,32 @@ RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab)
}
}
/*
* Subroutine for ATExecAlterColumnType: remember that a statistics object
* needs to be rebuilt (which we might already know).
*/
static void
RememberStatisticsForRebuilding(Oid stxoid, AlteredTableInfo *tab)
{
/*
* This de-duplication check is critical for two independent reasons: we
* mustn't try to recreate the same statistics object twice, and if the
* statistics depends on more than one column whose type is to be altered,
* we must capture its definition string before applying any of the type
* changes. ruleutils.c will get confused if we ask again later.
*/
if (!list_member_oid(tab->changedStatisticsOids, stxoid))
{
/* OK, capture the index's existing definition string */
char *defstring = pg_get_statisticsobjdef_string(stxoid);
tab->changedStatisticsOids = lappend_oid(tab->changedStatisticsOids,
stxoid);
tab->changedStatisticsDefs = lappend(tab->changedStatisticsDefs,
defstring);
}
}
/*
* Cleanup after we've finished all the ALTER TYPE operations for a
* particular relation. We have to drop and recreate all the indexes
@ -12306,6 +12363,22 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode)
add_exact_object_address(&obj, objects);
}
/* add dependencies for new statistics */
forboth(oid_item, tab->changedStatisticsOids,
def_item, tab->changedStatisticsDefs)
{
Oid oldId = lfirst_oid(oid_item);
Oid relid;
relid = StatisticsGetRelation(oldId, false);
ATPostAlterTypeParse(oldId, relid, InvalidOid,
(char *) lfirst(def_item),
wqueue, lockmode, tab->rewrite);
ObjectAddressSet(obj, StatisticExtRelationId, oldId);
add_exact_object_address(&obj, objects);
}
/*
* Queue up command to restore replica identity index marking
*/
@ -12354,9 +12427,9 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode)
}
/*
* Parse the previously-saved definition string for a constraint or index
* against the newly-established column data type(s), and queue up the
* resulting command parsetrees for execution.
* Parse the previously-saved definition string for a constraint, index or
* statistics object against the newly-established column data type(s), and
* queue up the resulting command parsetrees for execution.
*
* This might fail if, for example, you have a WHERE clause that uses an
* operator that's not available for the new column type.
@ -12402,6 +12475,11 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd,
querytree_list = lappend(querytree_list, stmt);
querytree_list = list_concat(querytree_list, afterStmts);
}
else if (IsA(stmt, CreateStatsStmt))
querytree_list = lappend(querytree_list,
transformStatsStmt(oldRelId,
(CreateStatsStmt *) stmt,
cmd));
else
querytree_list = lappend(querytree_list, stmt);
}
@ -12540,6 +12618,20 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd,
elog(ERROR, "unexpected statement subtype: %d",
(int) stmt->subtype);
}
else if (IsA(stm, CreateStatsStmt))
{
CreateStatsStmt *stmt = (CreateStatsStmt *) stm;
AlterTableCmd *newcmd;
/* keep the statistics object's comment */
stmt->stxcomment = GetComment(oldId, StatisticExtRelationId, 0);
newcmd = makeNode(AlterTableCmd);
newcmd->subtype = AT_ReAddStatistics;
newcmd->def = (Node *) stmt;
tab->subcmds[AT_PASS_MISC] =
lappend(tab->subcmds[AT_PASS_MISC], newcmd);
}
else
elog(ERROR, "unexpected statement type: %d",
(int) nodeTag(stm));

View File

@ -2980,6 +2980,17 @@ _copyIndexElem(const IndexElem *from)
return newnode;
}
static StatsElem *
_copyStatsElem(const StatsElem *from)
{
StatsElem *newnode = makeNode(StatsElem);
COPY_STRING_FIELD(name);
COPY_NODE_FIELD(expr);
return newnode;
}
static ColumnDef *
_copyColumnDef(const ColumnDef *from)
{
@ -5699,6 +5710,9 @@ copyObjectImpl(const void *from)
case T_IndexElem:
retval = _copyIndexElem(from);
break;
case T_StatsElem:
retval = _copyStatsElem(from);
break;
case T_ColumnDef:
retval = _copyColumnDef(from);
break;

View File

@ -2596,6 +2596,16 @@ _equalIndexElem(const IndexElem *a, const IndexElem *b)
return true;
}
static bool
_equalStatsElem(const StatsElem *a, const StatsElem *b)
{
COMPARE_STRING_FIELD(name);
COMPARE_NODE_FIELD(expr);
return true;
}
static bool
_equalColumnDef(const ColumnDef *a, const ColumnDef *b)
{
@ -3724,6 +3734,9 @@ equal(const void *a, const void *b)
case T_IndexElem:
retval = _equalIndexElem(a, b);
break;
case T_StatsElem:
retval = _equalStatsElem(a, b);
break;
case T_ColumnDef:
retval = _equalColumnDef(a, b);
break;

View File

@ -2943,6 +2943,15 @@ _outIndexElem(StringInfo str, const IndexElem *node)
WRITE_ENUM_FIELD(nulls_ordering, SortByNulls);
}
static void
_outStatsElem(StringInfo str, const StatsElem *node)
{
WRITE_NODE_TYPE("STATSELEM");
WRITE_STRING_FIELD(name);
WRITE_NODE_FIELD(expr);
}
static void
_outQuery(StringInfo str, const Query *node)
{
@ -4286,6 +4295,9 @@ outNode(StringInfo str, const void *obj)
case T_IndexElem:
_outIndexElem(str, obj);
break;
case T_StatsElem:
_outStatsElem(str, obj);
break;
case T_Query:
_outQuery(str, obj);
break;

View File

@ -34,6 +34,7 @@
#include "foreign/fdwapi.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
@ -1308,6 +1309,7 @@ get_relation_constraints(PlannerInfo *root,
static List *
get_relation_statistics(RelOptInfo *rel, Relation relation)
{
Index varno = rel->relid;
List *statoidlist;
List *stainfos = NIL;
ListCell *l;
@ -1321,6 +1323,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
HeapTuple htup;
HeapTuple dtup;
Bitmapset *keys = NULL;
List *exprs = NIL;
int i;
htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
@ -1340,6 +1343,49 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
for (i = 0; i < staForm->stxkeys.dim1; i++)
keys = bms_add_member(keys, staForm->stxkeys.values[i]);
/*
* Preprocess expressions (if any). We read the expressions, run them
* through eval_const_expressions, and fix the varnos.
*/
{
bool isnull;
Datum datum;
/* decode expression (if any) */
datum = SysCacheGetAttr(STATEXTOID, htup,
Anum_pg_statistic_ext_stxexprs, &isnull);
if (!isnull)
{
char *exprsString;
exprsString = TextDatumGetCString(datum);
exprs = (List *) stringToNode(exprsString);
pfree(exprsString);
/*
* Run the expressions through eval_const_expressions. This is
* not just an optimization, but is necessary, because the
* planner will be comparing them to similarly-processed qual
* clauses, and may fail to detect valid matches without this.
* We must not use canonicalize_qual, however, since these
* aren't qual expressions.
*/
exprs = (List *) eval_const_expressions(NULL, (Node *) exprs);
/* May as well fix opfuncids too */
fix_opfuncids((Node *) exprs);
/*
* Modify the copies we obtain from the relcache to have the
* correct varno for the parent relation, so that they match
* up correctly against qual clauses.
*/
if (varno != 1)
ChangeVarNodes((Node *) exprs, 1, varno, 0);
}
}
/* add one StatisticExtInfo for each kind built */
if (statext_is_kind_built(dtup, STATS_EXT_NDISTINCT))
{
@ -1349,6 +1395,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
info->rel = rel;
info->kind = STATS_EXT_NDISTINCT;
info->keys = bms_copy(keys);
info->exprs = exprs;
stainfos = lappend(stainfos, info);
}
@ -1361,6 +1408,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
info->rel = rel;
info->kind = STATS_EXT_DEPENDENCIES;
info->keys = bms_copy(keys);
info->exprs = exprs;
stainfos = lappend(stainfos, info);
}
@ -1373,6 +1421,20 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
info->rel = rel;
info->kind = STATS_EXT_MCV;
info->keys = bms_copy(keys);
info->exprs = exprs;
stainfos = lappend(stainfos, info);
}
if (statext_is_kind_built(dtup, STATS_EXT_EXPRESSIONS))
{
StatisticExtInfo *info = makeNode(StatisticExtInfo);
info->statOid = statOid;
info->rel = rel;
info->kind = STATS_EXT_EXPRESSIONS;
info->keys = bms_copy(keys);
info->exprs = exprs;
stainfos = lappend(stainfos, info);
}

View File

@ -239,6 +239,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
WindowDef *windef;
JoinExpr *jexpr;
IndexElem *ielem;
StatsElem *selem;
Alias *alias;
RangeVar *range;
IntoClause *into;
@ -405,7 +406,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
old_aggr_definition old_aggr_list
oper_argtypes RuleActionList RuleActionMulti
opt_column_list columnList opt_name_list
sort_clause opt_sort_clause sortby_list index_params
sort_clause opt_sort_clause sortby_list index_params stats_params
opt_include opt_c_include index_including_params
name_list role_list from_clause from_list opt_array_bounds
qualified_name_list any_name any_name_list type_name_list
@ -512,6 +513,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
%type <list> func_alias_clause
%type <sortby> sortby
%type <ielem> index_elem index_elem_options
%type <selem> stats_param
%type <node> table_ref
%type <jexpr> joined_table
%type <range> relation_expr
@ -4097,7 +4099,7 @@ ExistingIndex: USING INDEX name { $$ = $3; }
CreateStatsStmt:
CREATE STATISTICS any_name
opt_name_list ON expr_list FROM from_list
opt_name_list ON stats_params FROM from_list
{
CreateStatsStmt *n = makeNode(CreateStatsStmt);
n->defnames = $3;
@ -4109,7 +4111,7 @@ CreateStatsStmt:
$$ = (Node *)n;
}
| CREATE STATISTICS IF_P NOT EXISTS any_name
opt_name_list ON expr_list FROM from_list
opt_name_list ON stats_params FROM from_list
{
CreateStatsStmt *n = makeNode(CreateStatsStmt);
n->defnames = $6;
@ -4122,6 +4124,36 @@ CreateStatsStmt:
}
;
/*
* Statistics attributes can be either simple column references, or arbitrary
* expressions in parens. For compatibility with index attributes permitted
* in CREATE INDEX, we allow an expression that's just a function call to be
* written without parens.
*/
stats_params: stats_param { $$ = list_make1($1); }
| stats_params ',' stats_param { $$ = lappend($1, $3); }
;
stats_param: ColId
{
$$ = makeNode(StatsElem);
$$->name = $1;
$$->expr = NULL;
}
| func_expr_windowless
{
$$ = makeNode(StatsElem);
$$->name = NULL;
$$->expr = $1;
}
| '(' a_expr ')'
{
$$ = makeNode(StatsElem);
$$->name = NULL;
$$->expr = $2;
}
;
/*****************************************************************************
*

View File

@ -484,6 +484,13 @@ check_agglevels_and_constraints(ParseState *pstate, Node *expr)
else
err = _("grouping operations are not allowed in index predicates");
break;
case EXPR_KIND_STATS_EXPRESSION:
if (isAgg)
err = _("aggregate functions are not allowed in statistics expressions");
else
err = _("grouping operations are not allowed in statistics expressions");
break;
case EXPR_KIND_ALTER_COL_TRANSFORM:
if (isAgg)
@ -910,6 +917,9 @@ transformWindowFuncCall(ParseState *pstate, WindowFunc *wfunc,
case EXPR_KIND_INDEX_EXPRESSION:
err = _("window functions are not allowed in index expressions");
break;
case EXPR_KIND_STATS_EXPRESSION:
err = _("window functions are not allowed in statistics expressions");
break;
case EXPR_KIND_INDEX_PREDICATE:
err = _("window functions are not allowed in index predicates");
break;

View File

@ -500,6 +500,7 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref)
case EXPR_KIND_FUNCTION_DEFAULT:
case EXPR_KIND_INDEX_EXPRESSION:
case EXPR_KIND_INDEX_PREDICATE:
case EXPR_KIND_STATS_EXPRESSION:
case EXPR_KIND_ALTER_COL_TRANSFORM:
case EXPR_KIND_EXECUTE_PARAMETER:
case EXPR_KIND_TRIGGER_WHEN:
@ -1741,6 +1742,9 @@ transformSubLink(ParseState *pstate, SubLink *sublink)
case EXPR_KIND_INDEX_PREDICATE:
err = _("cannot use subquery in index predicate");
break;
case EXPR_KIND_STATS_EXPRESSION:
err = _("cannot use subquery in statistics expression");
break;
case EXPR_KIND_ALTER_COL_TRANSFORM:
err = _("cannot use subquery in transform expression");
break;
@ -3030,6 +3034,8 @@ ParseExprKindName(ParseExprKind exprKind)
return "index expression";
case EXPR_KIND_INDEX_PREDICATE:
return "index predicate";
case EXPR_KIND_STATS_EXPRESSION:
return "statistics expression";
case EXPR_KIND_ALTER_COL_TRANSFORM:
return "USING";
case EXPR_KIND_EXECUTE_PARAMETER:

View File

@ -2503,6 +2503,9 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location)
case EXPR_KIND_INDEX_PREDICATE:
err = _("set-returning functions are not allowed in index predicates");
break;
case EXPR_KIND_STATS_EXPRESSION:
err = _("set-returning functions are not allowed in statistics expressions");
break;
case EXPR_KIND_ALTER_COL_TRANSFORM:
err = _("set-returning functions are not allowed in transform expressions");
break;

View File

@ -1917,6 +1917,9 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
stat_types = lappend(stat_types, makeString("dependencies"));
else if (enabled[i] == STATS_EXT_MCV)
stat_types = lappend(stat_types, makeString("mcv"));
else if (enabled[i] == STATS_EXT_EXPRESSIONS)
/* expression stats are not exposed to users */
continue;
else
elog(ERROR, "unrecognized statistics kind %c", enabled[i]);
}
@ -1924,14 +1927,47 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
/* Determine which columns the statistics are on */
for (i = 0; i < statsrec->stxkeys.dim1; i++)
{
ColumnRef *cref = makeNode(ColumnRef);
StatsElem *selem = makeNode(StatsElem);
AttrNumber attnum = statsrec->stxkeys.values[i];
cref->fields = list_make1(makeString(get_attname(heapRelid,
attnum, false)));
cref->location = -1;
selem->name = get_attname(heapRelid, attnum, false);
selem->expr = NULL;
def_names = lappend(def_names, cref);
def_names = lappend(def_names, selem);
}
/*
* Now handle expressions, if there are any. The order (with respect to
* regular attributes) does not really matter for extended stats, so we
* simply append them after simple column references.
*
* XXX Some places during build/estimation treat expressions as if they
* are before atttibutes, but for the CREATE command that's entirely
* irrelevant.
*/
datum = SysCacheGetAttr(STATEXTOID, ht_stats,
Anum_pg_statistic_ext_stxexprs, &isnull);
if (!isnull)
{
ListCell *lc;
List *exprs = NIL;
char *exprsString;
exprsString = TextDatumGetCString(datum);
exprs = (List *) stringToNode(exprsString);
foreach(lc, exprs)
{
StatsElem *selem = makeNode(StatsElem);
selem->name = NULL;
selem->expr = (Node *) lfirst(lc);
def_names = lappend(def_names, selem);
}
pfree(exprsString);
}
/* finally, build the output node */
@ -1942,6 +1978,7 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
stats->relations = list_make1(heapRel);
stats->stxcomment = NULL;
stats->if_not_exists = false;
stats->transformed = true; /* don't need transformStatsStmt again */
/* Clean up */
ReleaseSysCache(ht_stats);
@ -2866,6 +2903,84 @@ transformIndexStmt(Oid relid, IndexStmt *stmt, const char *queryString)
return stmt;
}
/*
* transformStatsStmt - parse analysis for CREATE STATISTICS
*
* To avoid race conditions, it's important that this function rely only on
* the passed-in relid (and not on stmt->relation) to determine the target
* relation.
*/
CreateStatsStmt *
transformStatsStmt(Oid relid, CreateStatsStmt *stmt, const char *queryString)
{
ParseState *pstate;
ParseNamespaceItem *nsitem;
ListCell *l;
Relation rel;
/* Nothing to do if statement already transformed. */
if (stmt->transformed)
return stmt;
/*
* We must not scribble on the passed-in CreateStatsStmt, so copy it.
* (This is overkill, but easy.)
*/
stmt = copyObject(stmt);
/* Set up pstate */
pstate = make_parsestate(NULL);
pstate->p_sourcetext = queryString;
/*
* Put the parent table into the rtable so that the expressions can refer
* to its fields without qualification. Caller is responsible for locking
* relation, but we still need to open it.
*/
rel = relation_open(relid, NoLock);
nsitem = addRangeTableEntryForRelation(pstate, rel,
AccessShareLock,
NULL, false, true);
/* no to join list, yes to namespaces */
addNSItemToQuery(pstate, nsitem, false, true, true);
/* take care of any expressions */
foreach(l, stmt->exprs)
{
StatsElem *selem = (StatsElem *) lfirst(l);
if (selem->expr)
{
/* Now do parse transformation of the expression */
selem->expr = transformExpr(pstate, selem->expr,
EXPR_KIND_STATS_EXPRESSION);
/* We have to fix its collations too */
assign_expr_collations(pstate, selem->expr);
}
}
/*
* Check that only the base rel is mentioned. (This should be dead code
* now that add_missing_from is history.)
*/
if (list_length(pstate->p_rtable) != 1)
ereport(ERROR,
(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
errmsg("statistics expressions can refer only to the table being indexed")));
free_parsestate(pstate);
/* Close relation */
table_close(rel, NoLock);
/* Mark statement as successfully transformed */
stmt->transformed = true;
return stmt;
}
/*
* transformRuleStmt -

View File

@ -70,15 +70,15 @@ static void generate_dependencies(DependencyGenerator state);
static DependencyGenerator DependencyGenerator_init(int n, int k);
static void DependencyGenerator_free(DependencyGenerator state);
static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
static double dependency_degree(int numrows, HeapTuple *rows, int k,
AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs);
static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
static bool dependency_is_fully_matched(MVDependency *dependency,
Bitmapset *attnums);
static bool dependency_is_compatible_clause(Node *clause, Index relid,
AttrNumber *attnum);
static bool dependency_is_compatible_expression(Node *clause, Index relid,
List *statlist, Node **expr);
static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
int ndependencies,
Bitmapset *attnums);
int ndependencies, Bitmapset *attnums);
static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
int varRelid, JoinType jointype,
SpecialJoinInfo *sjinfo,
@ -219,16 +219,13 @@ DependencyGenerator_next(DependencyGenerator state)
* the last one.
*/
static double
dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
VacAttrStats **stats, Bitmapset *attrs)
dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
{
int i,
nitems;
MultiSortSupport mss;
SortItem *items;
AttrNumber *attnums;
AttrNumber *attnums_dep;
int numattrs;
/* counters valid within a group */
int group_size = 0;
@ -244,15 +241,12 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
mss = multi_sort_init(k);
/*
* Transform the attrs from bitmap to an array to make accessing the i-th
* member easier, and then construct a filtered version with only attnums
* referenced by the dependency we validate.
* Translate the array of indexes to regular attnums for the dependency (we
* will need this to identify the columns in StatsBuildData).
*/
attnums = build_attnums_array(attrs, &numattrs);
attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
for (i = 0; i < k; i++)
attnums_dep[i] = attnums[dependency[i]];
attnums_dep[i] = data->attnums[dependency[i]];
/*
* Verify the dependency (a,b,...)->z, using a rather simple algorithm:
@ -270,7 +264,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
/* prepare the sort function for the dimensions */
for (i = 0; i < k; i++)
{
VacAttrStats *colstat = stats[dependency[i]];
VacAttrStats *colstat = data->stats[dependency[i]];
TypeCacheEntry *type;
type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
@ -289,8 +283,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
* descriptor. For now that assumption holds, but it might change in the
* future for example if we support statistics on multiple tables.
*/
items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
mss, k, attnums_dep);
items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
/*
* Walk through the sorted array, split it into rows according to the
@ -336,11 +329,10 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
pfree(items);
pfree(mss);
pfree(attnums);
pfree(attnums_dep);
/* Compute the 'degree of validity' as (supporting/total). */
return (n_supporting_rows * 1.0 / numrows);
return (n_supporting_rows * 1.0 / data->numrows);
}
/*
@ -360,23 +352,15 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
* (c) -> b
*/
MVDependencies *
statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
VacAttrStats **stats)
statext_dependencies_build(StatsBuildData *data)
{
int i,
k;
int numattrs;
AttrNumber *attnums;
/* result */
MVDependencies *dependencies = NULL;
/*
* Transform the bms into an array, to make accessing i-th member easier.
*/
attnums = build_attnums_array(attrs, &numattrs);
Assert(numattrs >= 2);
Assert(data->nattnums >= 2);
/*
* We'll try build functional dependencies starting from the smallest ones
@ -384,12 +368,12 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
* included in the statistics object. We start from the smallest ones
* because we want to be able to skip already implied ones.
*/
for (k = 2; k <= numattrs; k++)
for (k = 2; k <= data->nattnums; k++)
{
AttrNumber *dependency; /* array with k elements */
/* prepare a DependencyGenerator of variation */
DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k);
DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
/* generate all possible variations of k values (out of n) */
while ((dependency = DependencyGenerator_next(DependencyGenerator)))
@ -398,7 +382,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
MVDependency *d;
/* compute how valid the dependency seems */
degree = dependency_degree(numrows, rows, k, dependency, stats, attrs);
degree = dependency_degree(data, k, dependency);
/*
* if the dependency seems entirely invalid, don't store it
@ -413,7 +397,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
d->degree = degree;
d->nattributes = k;
for (i = 0; i < k; i++)
d->attributes[i] = attnums[dependency[i]];
d->attributes[i] = data->attnums[dependency[i]];
/* initialize the list of dependencies */
if (dependencies == NULL)
@ -747,6 +731,7 @@ static bool
dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
{
Var *var;
Node *clause_expr;
if (IsA(clause, RestrictInfo))
{
@ -774,9 +759,9 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
/* Make sure non-selected argument is a pseudoconstant. */
if (is_pseudo_constant_clause(lsecond(expr->args)))
var = linitial(expr->args);
clause_expr = linitial(expr->args);
else if (is_pseudo_constant_clause(linitial(expr->args)))
var = lsecond(expr->args);
clause_expr = lsecond(expr->args);
else
return false;
@ -805,8 +790,8 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
/*
* Reject ALL() variant, we only care about ANY/IN.
*
* FIXME Maybe we should check if all the values are the same, and
* allow ALL in that case? Doesn't seem very practical, though.
* XXX Maybe we should check if all the values are the same, and allow
* ALL in that case? Doesn't seem very practical, though.
*/
if (!expr->useOr)
return false;
@ -822,7 +807,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
if (!is_pseudo_constant_clause(lsecond(expr->args)))
return false;
var = linitial(expr->args);
clause_expr = linitial(expr->args);
/*
* If it's not an "=" operator, just ignore the clause, as it's not
@ -838,13 +823,13 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
}
else if (is_orclause(clause))
{
BoolExpr *expr = (BoolExpr *) clause;
BoolExpr *bool_expr = (BoolExpr *) clause;
ListCell *lc;
/* start with no attribute number */
*attnum = InvalidAttrNumber;
foreach(lc, expr->args)
foreach(lc, bool_expr->args)
{
AttrNumber clause_attnum;
@ -859,6 +844,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
if (*attnum == InvalidAttrNumber)
*attnum = clause_attnum;
/* ensure all the variables are the same (same attnum) */
if (*attnum != clause_attnum)
return false;
}
@ -872,7 +858,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
* "NOT x" can be interpreted as "x = false", so get the argument and
* proceed with seeing if it's a suitable Var.
*/
var = (Var *) get_notclausearg(clause);
clause_expr = (Node *) get_notclausearg(clause);
}
else
{
@ -880,20 +866,23 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
* A boolean expression "x" can be interpreted as "x = true", so
* proceed with seeing if it's a suitable Var.
*/
var = (Var *) clause;
clause_expr = (Node *) clause;
}
/*
* We may ignore any RelabelType node above the operand. (There won't be
* more than one, since eval_const_expressions has been applied already.)
*/
if (IsA(var, RelabelType))
var = (Var *) ((RelabelType *) var)->arg;
if (IsA(clause_expr, RelabelType))
clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
/* We only support plain Vars for now */
if (!IsA(var, Var))
if (!IsA(clause_expr, Var))
return false;
/* OK, we know we have a Var */
var = (Var *) clause_expr;
/* Ensure Var is from the correct relation */
if (var->varno != relid)
return false;
@ -1157,6 +1146,212 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
return s1;
}
/*
* dependency_is_compatible_expression
* Determines if the expression is compatible with functional dependencies
*
* Similar to dependency_is_compatible_clause, but doesn't enforce that the
* expression is a simple Var. OTOH we check that there's at least one
* statistics object matching the expression.
*/
static bool
dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
{
List *vars;
ListCell *lc,
*lc2;
Node *clause_expr;
if (IsA(clause, RestrictInfo))
{
RestrictInfo *rinfo = (RestrictInfo *) clause;
/* Pseudoconstants are not interesting (they couldn't contain a Var) */
if (rinfo->pseudoconstant)
return false;
/* Clauses referencing multiple, or no, varnos are incompatible */
if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
return false;
clause = (Node *) rinfo->clause;
}
if (is_opclause(clause))
{
/* If it's an opclause, check for Var = Const or Const = Var. */
OpExpr *expr = (OpExpr *) clause;
/* Only expressions with two arguments are candidates. */
if (list_length(expr->args) != 2)
return false;
/* Make sure non-selected argument is a pseudoconstant. */
if (is_pseudo_constant_clause(lsecond(expr->args)))
clause_expr = linitial(expr->args);
else if (is_pseudo_constant_clause(linitial(expr->args)))
clause_expr = lsecond(expr->args);
else
return false;
/*
* If it's not an "=" operator, just ignore the clause, as it's not
* compatible with functional dependencies.
*
* This uses the function for estimating selectivity, not the operator
* directly (a bit awkward, but well ...).
*
* XXX this is pretty dubious; probably it'd be better to check btree
* or hash opclass membership, so as not to be fooled by custom
* selectivity functions, and to be more consistent with decisions
* elsewhere in the planner.
*/
if (get_oprrest(expr->opno) != F_EQSEL)
return false;
/* OK to proceed with checking "var" */
}
else if (IsA(clause, ScalarArrayOpExpr))
{
/* If it's an scalar array operator, check for Var IN Const. */
ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
/*
* Reject ALL() variant, we only care about ANY/IN.
*
* FIXME Maybe we should check if all the values are the same, and
* allow ALL in that case? Doesn't seem very practical, though.
*/
if (!expr->useOr)
return false;
/* Only expressions with two arguments are candidates. */
if (list_length(expr->args) != 2)
return false;
/*
* We know it's always (Var IN Const), so we assume the var is the
* first argument, and pseudoconstant is the second one.
*/
if (!is_pseudo_constant_clause(lsecond(expr->args)))
return false;
clause_expr = linitial(expr->args);
/*
* If it's not an "=" operator, just ignore the clause, as it's not
* compatible with functional dependencies. The operator is identified
* simply by looking at which function it uses to estimate
* selectivity. That's a bit strange, but it's what other similar
* places do.
*/
if (get_oprrest(expr->opno) != F_EQSEL)
return false;
/* OK to proceed with checking "var" */
}
else if (is_orclause(clause))
{
BoolExpr *bool_expr = (BoolExpr *) clause;
ListCell *lc;
/* start with no expression (we'll use the first match) */
*expr = NULL;
foreach(lc, bool_expr->args)
{
Node *or_expr = NULL;
/*
* Had we found incompatible expression in the arguments, treat
* the whole expression as incompatible.
*/
if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
statlist, &or_expr))
return false;
if (*expr == NULL)
*expr = or_expr;
/* ensure all the expressions are the same */
if (!equal(or_expr, *expr))
return false;
}
/* the expression is already checked by the recursive call */
return true;
}
else if (is_notclause(clause))
{
/*
* "NOT x" can be interpreted as "x = false", so get the argument and
* proceed with seeing if it's a suitable Var.
*/
clause_expr = (Node *) get_notclausearg(clause);
}
else
{
/*
* A boolean expression "x" can be interpreted as "x = true", so
* proceed with seeing if it's a suitable Var.
*/
clause_expr = (Node *) clause;
}
/*
* We may ignore any RelabelType node above the operand. (There won't be
* more than one, since eval_const_expressions has been applied already.)
*/
if (IsA(clause_expr, RelabelType))
clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
vars = pull_var_clause(clause_expr, 0);
foreach(lc, vars)
{
Var *var = (Var *) lfirst(lc);
/* Ensure Var is from the correct relation */
if (var->varno != relid)
return false;
/* We also better ensure the Var is from the current level */
if (var->varlevelsup != 0)
return false;
/* Also ignore system attributes (we don't allow stats on those) */
if (!AttrNumberIsForUserDefinedAttr(var->varattno))
return false;
}
/*
* Check if we actually have a matching statistics for the expression.
*
* XXX Maybe this is an overkill. We'll eliminate the expressions later.
*/
foreach(lc, statlist)
{
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
/* ignore stats without dependencies */
if (info->kind != STATS_EXT_DEPENDENCIES)
continue;
foreach(lc2, info->exprs)
{
Node *stat_expr = (Node *) lfirst(lc2);
if (equal(clause_expr, stat_expr))
{
*expr = stat_expr;
return true;
}
}
}
return false;
}
/*
* dependencies_clauselist_selectivity
* Return the estimated selectivity of (a subset of) the given clauses
@ -1204,6 +1399,11 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
MVDependency **dependencies;
int ndependencies;
int i;
AttrNumber attnum_offset;
/* unique expressions */
Node **unique_exprs;
int unique_exprs_cnt;
/* check if there's any stats that might be useful for us. */
if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
@ -1212,6 +1412,15 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
list_length(clauses));
/*
* We allocate space as if every clause was a unique expression, although
* that's probably overkill. Some will be simple column references that
* we'll translate to attnums, and there might be duplicates. But it's
* easier and cheaper to just do one allocation than repalloc later.
*/
unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
unique_exprs_cnt = 0;
/*
* Pre-process the clauses list to extract the attnums seen in each item.
* We need to determine if there's any clauses which will be useful for
@ -1222,29 +1431,127 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
*
* We also skip clauses that we already estimated using different types of
* statistics (we treat them as incompatible).
*
* To handle expressions, we assign them negative attnums, as if it was a
* system attribute (this is fine, as we only allow extended stats on user
* attributes). And then we offset everything by the number of
* expressions, so that we can store the values in a bitmapset.
*/
listidx = 0;
foreach(l, clauses)
{
Node *clause = (Node *) lfirst(l);
AttrNumber attnum;
Node *expr = NULL;
if (!bms_is_member(listidx, *estimatedclauses) &&
dependency_is_compatible_clause(clause, rel->relid, &attnum))
/* ignore clause by default */
list_attnums[listidx] = InvalidAttrNumber;
if (!bms_is_member(listidx, *estimatedclauses))
{
list_attnums[listidx] = attnum;
clauses_attnums = bms_add_member(clauses_attnums, attnum);
/*
* If it's a simple column refrence, just extract the attnum. If
* it's an expression, assign a negative attnum as if it was a
* system attribute.
*/
if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
{
list_attnums[listidx] = attnum;
}
else if (dependency_is_compatible_expression(clause, rel->relid,
rel->statlist,
&expr))
{
/* special attnum assigned to this expression */
attnum = InvalidAttrNumber;
Assert(expr != NULL);
/* If the expression is duplicate, use the same attnum. */
for (i = 0; i < unique_exprs_cnt; i++)
{
if (equal(unique_exprs[i], expr))
{
/* negative attribute number to expression */
attnum = -(i + 1);
break;
}
}
/* not found in the list, so add it */
if (attnum == InvalidAttrNumber)
{
unique_exprs[unique_exprs_cnt++] = expr;
/* after incrementing the value, to get -1, -2, ... */
attnum = (-unique_exprs_cnt);
}
/* remember which attnum was assigned to this clause */
list_attnums[listidx] = attnum;
}
}
else
list_attnums[listidx] = InvalidAttrNumber;
listidx++;
}
Assert(listidx == list_length(clauses));
/*
* If there's not at least two distinct attnums then reject the whole list
* of clauses. We must return 1.0 so the calling function's selectivity is
* unaffected.
* How much we need to offset the attnums? If there are no expressions,
* then no offset is needed. Otherwise we need to offset enough for the
* lowest value (-unique_exprs_cnt) to become 1.
*/
if (unique_exprs_cnt > 0)
attnum_offset = (unique_exprs_cnt + 1);
else
attnum_offset = 0;
/*
* Now that we know how many expressions there are, we can offset the
* values just enough to build the bitmapset.
*/
for (i = 0; i < list_length(clauses); i++)
{
AttrNumber attnum;
/* ignore incompatible or already estimated clauses */
if (list_attnums[i] == InvalidAttrNumber)
continue;
/* make sure the attnum is in the expected range */
Assert(list_attnums[i] >= (-unique_exprs_cnt));
Assert(list_attnums[i] <= MaxHeapAttributeNumber);
/* make sure the attnum is positive (valid AttrNumber) */
attnum = list_attnums[i] + attnum_offset;
/*
* Either it's a regular attribute, or it's an expression, in which
* case we must not have seen it before (expressions are unique).
*
* XXX Check whether it's a regular attribute has to be done using the
* original attnum, while the second check has to use the value with
* an offset.
*/
Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
!bms_is_member(attnum, clauses_attnums));
/*
* Remember the offset attnum, both for attributes and expressions.
* We'll pass list_attnums to clauselist_apply_dependencies, which
* uses it to identify clauses in a bitmap. We could also pass the
* offset, but this is more convenient.
*/
list_attnums[i] = attnum;
clauses_attnums = bms_add_member(clauses_attnums, attnum);
}
/*
* If there's not at least two distinct attnums and expressions, then
* reject the whole list of clauses. We must return 1.0 so the calling
* function's selectivity is unaffected.
*/
if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
{
@ -1272,26 +1579,203 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
foreach(l, rel->statlist)
{
StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
Bitmapset *matched;
BMS_Membership membership;
int nmatched;
int nexprs;
int k;
MVDependencies *deps;
/* skip statistics that are not of the correct type */
if (stat->kind != STATS_EXT_DEPENDENCIES)
continue;
matched = bms_intersect(clauses_attnums, stat->keys);
membership = bms_membership(matched);
bms_free(matched);
/*
* Count matching attributes - we have to undo the attnum offsets. The
* input attribute numbers are not offset (expressions are not
* included in stat->keys, so it's not necessary). But we need to
* offset it before checking against clauses_attnums.
*/
nmatched = 0;
k = -1;
while ((k = bms_next_member(stat->keys, k)) >= 0)
{
AttrNumber attnum = (AttrNumber) k;
/* skip objects matching fewer than two attributes from clauses */
if (membership != BMS_MULTIPLE)
/* skip expressions */
if (!AttrNumberIsForUserDefinedAttr(attnum))
continue;
/* apply the same offset as above */
attnum += attnum_offset;
if (bms_is_member(attnum, clauses_attnums))
nmatched++;
}
/* count matching expressions */
nexprs = 0;
for (i = 0; i < unique_exprs_cnt; i++)
{
ListCell *lc;
foreach(lc, stat->exprs)
{
Node *stat_expr = (Node *) lfirst(lc);
/* try to match it */
if (equal(stat_expr, unique_exprs[i]))
nexprs++;
}
}
/*
* Skip objects matching fewer than two attributes/expressions from
* clauses.
*/
if (nmatched + nexprs < 2)
continue;
func_dependencies[nfunc_dependencies]
= statext_dependencies_load(stat->statOid);
deps = statext_dependencies_load(stat->statOid);
total_ndeps += func_dependencies[nfunc_dependencies]->ndeps;
nfunc_dependencies++;
/*
* The expressions may be represented by different attnums in the
* stats, we need to remap them to be consistent with the clauses.
* That will make the later steps (e.g. picking the strongest item and
* so on) much simpler and cheaper, because it won't need to care
* about the offset at all.
*
* When we're at it, we can ignore dependencies that are not fully
* matched by clauses (i.e. referencing attributes or expressions that
* are not in the clauses).
*
* We have to do this for all statistics, as long as there are any
* expressions - we need to shift the attnums in all dependencies.
*
* XXX Maybe we should do this always, because it also eliminates some
* of the dependencies early. It might be cheaper than having to walk
* the longer list in find_strongest_dependency later, especially as
* we need to do that repeatedly?
*
* XXX We have to do this even when there are no expressions in
* clauses, otherwise find_strongest_dependency may fail for stats
* with expressions (due to lookup of negative value in bitmap). So we
* need to at least filter out those dependencies. Maybe we could do
* it in a cheaper way (if there are no expr clauses, we can just
* discard all negative attnums without any lookups).
*/
if (unique_exprs_cnt > 0 || stat->exprs != NIL)
{
int ndeps = 0;
for (i = 0; i < deps->ndeps; i++)
{
bool skip = false;
MVDependency *dep = deps->deps[i];
int j;
for (j = 0; j < dep->nattributes; j++)
{
int idx;
Node *expr;
int k;
AttrNumber unique_attnum = InvalidAttrNumber;
AttrNumber attnum;
/* undo the per-statistics offset */
attnum = dep->attributes[j];
/*
* For regular attributes we can simply check if it
* matches any clause. If there's no matching clause, we
* can just ignore it. We need to offset the attnum
* though.
*/
if (AttrNumberIsForUserDefinedAttr(attnum))
{
dep->attributes[j] = attnum + attnum_offset;
if (!bms_is_member(dep->attributes[j], clauses_attnums))
{
skip = true;
break;
}
continue;
}
/*
* the attnum should be a valid system attnum (-1, -2,
* ...)
*/
Assert(AttributeNumberIsValid(attnum));
/*
* For expressions, we need to do two translations. First
* we have to translate the negative attnum to index in
* the list of expressions (in the statistics object).
* Then we need to see if there's a matching clause. The
* index of the unique expression determines the attnum
* (and we offset it).
*/
idx = -(1 + attnum);
/* Is the expression index is valid? */
Assert((idx >= 0) && (idx < list_length(stat->exprs)));
expr = (Node *) list_nth(stat->exprs, idx);
/* try to find the expression in the unique list */
for (k = 0; k < unique_exprs_cnt; k++)
{
/*
* found a matching unique expression, use the attnum
* (derived from index of the unique expression)
*/
if (equal(unique_exprs[k], expr))
{
unique_attnum = -(k + 1) + attnum_offset;
break;
}
}
/*
* Found no matching expression, so we can simply skip
* this dependency, because there's no chance it will be
* fully covered.
*/
if (unique_attnum == InvalidAttrNumber)
{
skip = true;
break;
}
/* otherwise remap it to the new attnum */
dep->attributes[j] = unique_attnum;
}
/* if found a matching dependency, keep it */
if (!skip)
{
/* maybe we've skipped something earlier, so move it */
if (ndeps != i)
deps->deps[ndeps] = deps->deps[i];
ndeps++;
}
}
deps->ndeps = ndeps;
}
/*
* It's possible we've removed all dependencies, in which case we
* don't bother adding it to the list.
*/
if (deps->ndeps > 0)
{
func_dependencies[nfunc_dependencies] = deps;
total_ndeps += deps->ndeps;
nfunc_dependencies++;
}
}
/* if no matching stats could be found then we've nothing to do */
@ -1300,6 +1784,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
pfree(func_dependencies);
bms_free(clauses_attnums);
pfree(list_attnums);
pfree(unique_exprs);
return 1.0;
}
@ -1347,6 +1832,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
pfree(func_dependencies);
bms_free(clauses_attnums);
pfree(list_attnums);
pfree(unique_exprs);
return s1;
}

File diff suppressed because it is too large Load Diff

View File

@ -74,7 +74,7 @@
((ndims) * sizeof(DimensionInfo)) + \
((nitems) * ITEM_SIZE(ndims)))
static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
static MultiSortSupport build_mss(StatsBuildData *data);
static SortItem *build_distinct_groups(int numrows, SortItem *items,
MultiSortSupport mss, int *ndistinct);
@ -181,32 +181,33 @@ get_mincount_for_mcv_list(int samplerows, double totalrows)
*
*/
MCVList *
statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
VacAttrStats **stats, double totalrows, int stattarget)
statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget)
{
int i,
numattrs,
numrows,
ngroups,
nitems;
AttrNumber *attnums;
double mincount;
SortItem *items;
SortItem *groups;
MCVList *mcvlist = NULL;
MultiSortSupport mss;
attnums = build_attnums_array(attrs, &numattrs);
/* comparator for all the columns */
mss = build_mss(stats, numattrs);
mss = build_mss(data);
/* sort the rows */
items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
mss, numattrs, attnums);
items = build_sorted_items(data, &nitems, mss,
data->nattnums, data->attnums);
if (!items)
return NULL;
/* for convenience */
numattrs = data->nattnums;
numrows = data->numrows;
/* transform the sorted rows into groups (sorted by frequency) */
groups = build_distinct_groups(nitems, items, mss, &ngroups);
@ -289,7 +290,7 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
/* store info about data type OIDs */
for (i = 0; i < numattrs; i++)
mcvlist->types[i] = stats[i]->attrtypid;
mcvlist->types[i] = data->stats[i]->attrtypid;
/* Copy the first chunk of groups into the result. */
for (i = 0; i < nitems; i++)
@ -347,9 +348,10 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
* build MultiSortSupport for the attributes passed in attrs
*/
static MultiSortSupport
build_mss(VacAttrStats **stats, int numattrs)
build_mss(StatsBuildData *data)
{
int i;
int numattrs = data->nattnums;
/* Sort by multiple columns (using array of SortSupport) */
MultiSortSupport mss = multi_sort_init(numattrs);
@ -357,7 +359,7 @@ build_mss(VacAttrStats **stats, int numattrs)
/* prepare the sort functions for all the attributes */
for (i = 0; i < numattrs; i++)
{
VacAttrStats *colstat = stats[i];
VacAttrStats *colstat = data->stats[i];
TypeCacheEntry *type;
type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
@ -1523,6 +1525,59 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
return byteasend(fcinfo);
}
/*
* match the attribute/expression to a dimension of the statistic
*
* Match the attribute/expression to statistics dimension. Optionally
* determine the collation.
*/
static int
mcv_match_expression(Node *expr, Bitmapset *keys, List *exprs, Oid *collid)
{
int idx = -1;
if (IsA(expr, Var))
{
/* simple Var, so just lookup using varattno */
Var *var = (Var *) expr;
if (collid)
*collid = var->varcollid;
idx = bms_member_index(keys, var->varattno);
/* make sure the index is valid */
Assert((idx >= 0) && (idx <= bms_num_members(keys)));
}
else
{
ListCell *lc;
/* expressions are stored after the simple columns */
idx = bms_num_members(keys);
if (collid)
*collid = exprCollation(expr);
/* expression - lookup in stats expressions */
foreach(lc, exprs)
{
Node *stat_expr = (Node *) lfirst(lc);
if (equal(expr, stat_expr))
break;
idx++;
}
/* make sure the index is valid */
Assert((idx >= bms_num_members(keys)) &&
(idx <= bms_num_members(keys) + list_length(exprs)));
}
return idx;
}
/*
* mcv_get_match_bitmap
* Evaluate clauses using the MCV list, and update the match bitmap.
@ -1544,7 +1599,8 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
*/
static bool *
mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
Bitmapset *keys, MCVList *mcvlist, bool is_or)
Bitmapset *keys, List *exprs,
MCVList *mcvlist, bool is_or)
{
int i;
ListCell *l;
@ -1582,77 +1638,78 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
OpExpr *expr = (OpExpr *) clause;
FmgrInfo opproc;
/* valid only after examine_clause_args returns true */
Var *var;
/* valid only after examine_opclause_args returns true */
Node *clause_expr;
Const *cst;
bool varonleft;
bool expronleft;
int idx;
Oid collid;
fmgr_info(get_opcode(expr->opno), &opproc);
/* extract the var and const from the expression */
if (examine_clause_args(expr->args, &var, &cst, &varonleft))
{
int idx;
/* extract the var/expr and const from the expression */
if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
elog(ERROR, "incompatible clause");
/* match the attribute to a dimension of the statistic */
idx = bms_member_index(keys, var->varattno);
/* match the attribute/expression to a dimension of the statistic */
idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
/*
* Walk through the MCV items and evaluate the current clause. We
* can skip items that were already ruled out, and terminate if
* there are no remaining MCV items that might possibly match.
*/
for (i = 0; i < mcvlist->nitems; i++)
{
bool match = true;
MCVItem *item = &mcvlist->items[i];
Assert(idx >= 0);
/*
* Walk through the MCV items and evaluate the current clause.
* We can skip items that were already ruled out, and
* terminate if there are no remaining MCV items that might
* possibly match.
* When the MCV item or the Const value is NULL we can treat
* this as a mismatch. We must not call the operator because
* of strictness.
*/
for (i = 0; i < mcvlist->nitems; i++)
if (item->isnull[idx] || cst->constisnull)
{
bool match = true;
MCVItem *item = &mcvlist->items[i];
/*
* When the MCV item or the Const value is NULL we can
* treat this as a mismatch. We must not call the operator
* because of strictness.
*/
if (item->isnull[idx] || cst->constisnull)
{
matches[i] = RESULT_MERGE(matches[i], is_or, false);
continue;
}
/*
* Skip MCV items that can't change result in the bitmap.
* Once the value gets false for AND-lists, or true for
* OR-lists, we don't need to look at more clauses.
*/
if (RESULT_IS_FINAL(matches[i], is_or))
continue;
/*
* First check whether the constant is below the lower
* boundary (in that case we can skip the bucket, because
* there's no overlap).
*
* We don't store collations used to build the statistics,
* but we can use the collation for the attribute itself,
* as stored in varcollid. We do reset the statistics
* after a type change (including collation change), so
* this is OK. We may need to relax this after allowing
* extended statistics on expressions.
*/
if (varonleft)
match = DatumGetBool(FunctionCall2Coll(&opproc,
var->varcollid,
item->values[idx],
cst->constvalue));
else
match = DatumGetBool(FunctionCall2Coll(&opproc,
var->varcollid,
cst->constvalue,
item->values[idx]));
/* update the match bitmap with the result */
matches[i] = RESULT_MERGE(matches[i], is_or, match);
matches[i] = RESULT_MERGE(matches[i], is_or, false);
continue;
}
/*
* Skip MCV items that can't change result in the bitmap. Once
* the value gets false for AND-lists, or true for OR-lists,
* we don't need to look at more clauses.
*/
if (RESULT_IS_FINAL(matches[i], is_or))
continue;
/*
* First check whether the constant is below the lower
* boundary (in that case we can skip the bucket, because
* there's no overlap).
*
* We don't store collations used to build the statistics, but
* we can use the collation for the attribute itself, as
* stored in varcollid. We do reset the statistics after a
* type change (including collation change), so this is OK.
* For expressions we use the collation extracted from the
* expression itself.
*/
if (expronleft)
match = DatumGetBool(FunctionCall2Coll(&opproc,
collid,
item->values[idx],
cst->constvalue));
else
match = DatumGetBool(FunctionCall2Coll(&opproc,
collid,
cst->constvalue,
item->values[idx]));
/* update the match bitmap with the result */
matches[i] = RESULT_MERGE(matches[i], is_or, match);
}
}
else if (IsA(clause, ScalarArrayOpExpr))
@ -1660,115 +1717,116 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
FmgrInfo opproc;
/* valid only after examine_clause_args returns true */
Var *var;
/* valid only after examine_opclause_args returns true */
Node *clause_expr;
Const *cst;
bool varonleft;
bool expronleft;
Oid collid;
int idx;
/* array evaluation */
ArrayType *arrayval;
int16 elmlen;
bool elmbyval;
char elmalign;
int num_elems;
Datum *elem_values;
bool *elem_nulls;
fmgr_info(get_opcode(expr->opno), &opproc);
/* extract the var and const from the expression */
if (examine_clause_args(expr->args, &var, &cst, &varonleft))
/* extract the var/expr and const from the expression */
if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
elog(ERROR, "incompatible clause");
/* ScalarArrayOpExpr has the Var always on the left */
Assert(expronleft);
/* XXX what if (cst->constisnull == NULL)? */
if (!cst->constisnull)
{
int idx;
arrayval = DatumGetArrayTypeP(cst->constvalue);
get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
&elmlen, &elmbyval, &elmalign);
deconstruct_array(arrayval,
ARR_ELEMTYPE(arrayval),
elmlen, elmbyval, elmalign,
&elem_values, &elem_nulls, &num_elems);
}
ArrayType *arrayval;
int16 elmlen;
bool elmbyval;
char elmalign;
int num_elems;
Datum *elem_values;
bool *elem_nulls;
/* match the attribute/expression to a dimension of the statistic */
idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
/* ScalarArrayOpExpr has the Var always on the left */
Assert(varonleft);
if (!cst->constisnull)
{
arrayval = DatumGetArrayTypeP(cst->constvalue);
get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
&elmlen, &elmbyval, &elmalign);
deconstruct_array(arrayval,
ARR_ELEMTYPE(arrayval),
elmlen, elmbyval, elmalign,
&elem_values, &elem_nulls, &num_elems);
}
/* match the attribute to a dimension of the statistic */
idx = bms_member_index(keys, var->varattno);
/*
* Walk through the MCV items and evaluate the current clause. We
* can skip items that were already ruled out, and terminate if
* there are no remaining MCV items that might possibly match.
*/
for (i = 0; i < mcvlist->nitems; i++)
{
int j;
bool match = (expr->useOr ? false : true);
MCVItem *item = &mcvlist->items[i];
/*
* Walk through the MCV items and evaluate the current clause.
* We can skip items that were already ruled out, and
* terminate if there are no remaining MCV items that might
* possibly match.
* When the MCV item or the Const value is NULL we can treat
* this as a mismatch. We must not call the operator because
* of strictness.
*/
for (i = 0; i < mcvlist->nitems; i++)
if (item->isnull[idx] || cst->constisnull)
{
int j;
bool match = (expr->useOr ? false : true);
MCVItem *item = &mcvlist->items[i];
/*
* When the MCV item or the Const value is NULL we can
* treat this as a mismatch. We must not call the operator
* because of strictness.
*/
if (item->isnull[idx] || cst->constisnull)
{
matches[i] = RESULT_MERGE(matches[i], is_or, false);
continue;
}
/*
* Skip MCV items that can't change result in the bitmap.
* Once the value gets false for AND-lists, or true for
* OR-lists, we don't need to look at more clauses.
*/
if (RESULT_IS_FINAL(matches[i], is_or))
continue;
for (j = 0; j < num_elems; j++)
{
Datum elem_value = elem_values[j];
bool elem_isnull = elem_nulls[j];
bool elem_match;
/* NULL values always evaluate as not matching. */
if (elem_isnull)
{
match = RESULT_MERGE(match, expr->useOr, false);
continue;
}
/*
* Stop evaluating the array elements once we reach
* match value that can't change - ALL() is the same
* as AND-list, ANY() is the same as OR-list.
*/
if (RESULT_IS_FINAL(match, expr->useOr))
break;
elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
var->varcollid,
item->values[idx],
elem_value));
match = RESULT_MERGE(match, expr->useOr, elem_match);
}
/* update the match bitmap with the result */
matches[i] = RESULT_MERGE(matches[i], is_or, match);
matches[i] = RESULT_MERGE(matches[i], is_or, false);
continue;
}
/*
* Skip MCV items that can't change result in the bitmap. Once
* the value gets false for AND-lists, or true for OR-lists,
* we don't need to look at more clauses.
*/
if (RESULT_IS_FINAL(matches[i], is_or))
continue;
for (j = 0; j < num_elems; j++)
{
Datum elem_value = elem_values[j];
bool elem_isnull = elem_nulls[j];
bool elem_match;
/* NULL values always evaluate as not matching. */
if (elem_isnull)
{
match = RESULT_MERGE(match, expr->useOr, false);
continue;
}
/*
* Stop evaluating the array elements once we reach match
* value that can't change - ALL() is the same as
* AND-list, ANY() is the same as OR-list.
*/
if (RESULT_IS_FINAL(match, expr->useOr))
break;
elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
collid,
item->values[idx],
elem_value));
match = RESULT_MERGE(match, expr->useOr, elem_match);
}
/* update the match bitmap with the result */
matches[i] = RESULT_MERGE(matches[i], is_or, match);
}
}
else if (IsA(clause, NullTest))
{
NullTest *expr = (NullTest *) clause;
Var *var = (Var *) (expr->arg);
Node *clause_expr = (Node *) (expr->arg);
/* match the attribute to a dimension of the statistic */
int idx = bms_member_index(keys, var->varattno);
/* match the attribute/expression to a dimension of the statistic */
int idx = mcv_match_expression(clause_expr, keys, exprs, NULL);
/*
* Walk through the MCV items and evaluate the current clause. We
@ -1811,7 +1869,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
Assert(list_length(bool_clauses) >= 2);
/* build the match bitmap for the OR-clauses */
bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys,
bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, exprs,
mcvlist, is_orclause(clause));
/*
@ -1839,7 +1897,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
Assert(list_length(not_args) == 1);
/* build the match bitmap for the NOT-clause */
not_matches = mcv_get_match_bitmap(root, not_args, keys,
not_matches = mcv_get_match_bitmap(root, not_args, keys, exprs,
mcvlist, false);
/*
@ -1982,7 +2040,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
mcv = statext_mcv_load(stat->statOid);
/* build a match bitmap for the clauses */
matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false);
matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs,
mcv, false);
/* sum frequencies for all the matching MCV items */
*basesel = 0.0;
@ -2056,7 +2115,7 @@ mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat,
/* build the match bitmap for the new clause */
new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys,
mcv, false);
stat->exprs, mcv, false);
/*
* Sum the frequencies for all the MCV items matching this clause and also

View File

@ -36,8 +36,7 @@
#include "utils/syscache.h"
#include "utils/typcache.h"
static double ndistinct_for_combination(double totalrows, int numrows,
HeapTuple *rows, VacAttrStats **stats,
static double ndistinct_for_combination(double totalrows, StatsBuildData *data,
int k, int *combination);
static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
static int n_choose_k(int n, int k);
@ -81,15 +80,18 @@ static void generate_combinations(CombinationGenerator *state);
*
* This computes the ndistinct estimate using the same estimator used
* in analyze.c and then computes the coefficient.
*
* To handle expressions easily, we treat them as system attributes with
* negative attnums, and offset everything by number of expressions to
* allow using Bitmapsets.
*/
MVNDistinct *
statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows,
Bitmapset *attrs, VacAttrStats **stats)
statext_ndistinct_build(double totalrows, StatsBuildData *data)
{
MVNDistinct *result;
int k;
int itemcnt;
int numattrs = bms_num_members(attrs);
int numattrs = data->nattnums;
int numcombs = num_combinations(numattrs);
result = palloc(offsetof(MVNDistinct, items) +
@ -112,13 +114,19 @@ statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows,
MVNDistinctItem *item = &result->items[itemcnt];
int j;
item->attrs = NULL;
item->attributes = palloc(sizeof(AttrNumber) * k);
item->nattributes = k;
/* translate the indexes to attnums */
for (j = 0; j < k; j++)
item->attrs = bms_add_member(item->attrs,
stats[combination[j]]->attr->attnum);
{
item->attributes[j] = data->attnums[combination[j]];
Assert(AttributeNumberIsValid(item->attributes[j]));
}
item->ndistinct =
ndistinct_for_combination(totalrows, numrows, rows,
stats, k, combination);
ndistinct_for_combination(totalrows, data, k, combination);
itemcnt++;
Assert(itemcnt <= result->nitems);
@ -189,7 +197,7 @@ statext_ndistinct_serialize(MVNDistinct *ndistinct)
{
int nmembers;
nmembers = bms_num_members(ndistinct->items[i].attrs);
nmembers = ndistinct->items[i].nattributes;
Assert(nmembers >= 2);
len += SizeOfItem(nmembers);
@ -214,22 +222,15 @@ statext_ndistinct_serialize(MVNDistinct *ndistinct)
for (i = 0; i < ndistinct->nitems; i++)
{
MVNDistinctItem item = ndistinct->items[i];
int nmembers = bms_num_members(item.attrs);
int x;
int nmembers = item.nattributes;
memcpy(tmp, &item.ndistinct, sizeof(double));
tmp += sizeof(double);
memcpy(tmp, &nmembers, sizeof(int));
tmp += sizeof(int);
x = -1;
while ((x = bms_next_member(item.attrs, x)) >= 0)
{
AttrNumber value = (AttrNumber) x;
memcpy(tmp, &value, sizeof(AttrNumber));
tmp += sizeof(AttrNumber);
}
memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers);
tmp += nmembers * sizeof(AttrNumber);
/* protect against overflows */
Assert(tmp <= ((char *) output + len));
@ -301,27 +302,21 @@ statext_ndistinct_deserialize(bytea *data)
for (i = 0; i < ndistinct->nitems; i++)
{
MVNDistinctItem *item = &ndistinct->items[i];
int nelems;
item->attrs = NULL;
/* ndistinct value */
memcpy(&item->ndistinct, tmp, sizeof(double));
tmp += sizeof(double);
/* number of attributes */
memcpy(&nelems, tmp, sizeof(int));
memcpy(&item->nattributes, tmp, sizeof(int));
tmp += sizeof(int);
Assert((nelems >= 2) && (nelems <= STATS_MAX_DIMENSIONS));
Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS));
while (nelems-- > 0)
{
AttrNumber attno;
item->attributes
= (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber));
memcpy(&attno, tmp, sizeof(AttrNumber));
tmp += sizeof(AttrNumber);
item->attrs = bms_add_member(item->attrs, attno);
}
memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes);
tmp += sizeof(AttrNumber) * item->nattributes;
/* still within the bytea */
Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
@ -369,17 +364,17 @@ pg_ndistinct_out(PG_FUNCTION_ARGS)
for (i = 0; i < ndist->nitems; i++)
{
int j;
MVNDistinctItem item = ndist->items[i];
int x = -1;
bool first = true;
if (i > 0)
appendStringInfoString(&str, ", ");
while ((x = bms_next_member(item.attrs, x)) >= 0)
for (j = 0; j < item.nattributes; j++)
{
appendStringInfo(&str, "%s%d", first ? "\"" : ", ", x);
first = false;
AttrNumber attnum = item.attributes[j];
appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum);
}
appendStringInfo(&str, "\": %d", (int) item.ndistinct);
}
@ -427,8 +422,8 @@ pg_ndistinct_send(PG_FUNCTION_ARGS)
* combination of multiple columns.
*/
static double
ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
VacAttrStats **stats, int k, int *combination)
ndistinct_for_combination(double totalrows, StatsBuildData *data,
int k, int *combination)
{
int i,
j;
@ -439,6 +434,7 @@ ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
Datum *values;
SortItem *items;
MultiSortSupport mss;
int numrows = data->numrows;
mss = multi_sort_init(k);
@ -467,25 +463,27 @@ ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
*/
for (i = 0; i < k; i++)
{
VacAttrStats *colstat = stats[combination[i]];
Oid typid;
TypeCacheEntry *type;
Oid collid = InvalidOid;
VacAttrStats *colstat = data->stats[combination[i]];
type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
typid = colstat->attrtypid;
collid = colstat->attrcollid;
type = lookup_type_cache(typid, TYPECACHE_LT_OPR);
if (type->lt_opr == InvalidOid) /* shouldn't happen */
elog(ERROR, "cache lookup failed for ordering operator for type %u",
colstat->attrtypid);
typid);
/* prepare the sort function for this dimension */
multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
multi_sort_add_dimension(mss, i, type->lt_opr, collid);
/* accumulate all the data for this dimension into the arrays */
for (j = 0; j < numrows; j++)
{
items[j].values[i] =
heap_getattr(rows[j],
colstat->attr->attnum,
colstat->tupDesc,
&items[j].isnull[i]);
items[j].values[i] = data->values[combination[i]][j];
items[j].isnull[i] = data->nulls[combination[i]][j];
}
}

View File

@ -1816,7 +1816,34 @@ ProcessUtilitySlow(ParseState *pstate,
break;
case T_CreateStatsStmt:
address = CreateStatistics((CreateStatsStmt *) parsetree);
{
Oid relid;
CreateStatsStmt *stmt = (CreateStatsStmt *) parsetree;
RangeVar *rel = (RangeVar *) linitial(stmt->relations);
if (!IsA(rel, RangeVar))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("only a single relation is allowed in CREATE STATISTICS")));
/*
* CREATE STATISTICS will influence future execution plans
* but does not interfere with currently executing plans.
* So it should be enough to take ShareUpdateExclusiveLock
* on relation, conflicting with ANALYZE and other DDL
* that sets statistical information, but not with normal
* queries.
*
* XXX RangeVarCallbackOwnsRelation not needed here, to
* keep the same behavior as before.
*/
relid = RangeVarGetRelid(rel, ShareUpdateExclusiveLock, false);
/* Run parse analysis ... */
stmt = transformStatsStmt(relid, stmt, queryString);
address = CreateStatistics(stmt);
}
break;
case T_AlterStatsStmt:

View File

@ -336,7 +336,8 @@ static char *pg_get_indexdef_worker(Oid indexrelid, int colno,
bool attrsOnly, bool keysOnly,
bool showTblSpc, bool inherits,
int prettyFlags, bool missing_ok);
static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok);
static char *pg_get_statisticsobj_worker(Oid statextid, bool columns_only,
bool missing_ok);
static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags,
bool attrsOnly, bool missing_ok);
static char *pg_get_constraintdef_worker(Oid constraintId, bool fullCommand,
@ -1507,7 +1508,36 @@ pg_get_statisticsobjdef(PG_FUNCTION_ARGS)
Oid statextid = PG_GETARG_OID(0);
char *res;
res = pg_get_statisticsobj_worker(statextid, true);
res = pg_get_statisticsobj_worker(statextid, false, true);
if (res == NULL)
PG_RETURN_NULL();
PG_RETURN_TEXT_P(string_to_text(res));
}
/*
* Internal version for use by ALTER TABLE.
* Includes a tablespace clause in the result.
* Returns a palloc'd C string; no pretty-printing.
*/
char *
pg_get_statisticsobjdef_string(Oid statextid)
{
return pg_get_statisticsobj_worker(statextid, false, false);
}
/*
* pg_get_statisticsobjdef_columns
* Get columns and expressions for an extended statistics object
*/
Datum
pg_get_statisticsobjdef_columns(PG_FUNCTION_ARGS)
{
Oid statextid = PG_GETARG_OID(0);
char *res;
res = pg_get_statisticsobj_worker(statextid, true, true);
if (res == NULL)
PG_RETURN_NULL();
@ -1519,7 +1549,7 @@ pg_get_statisticsobjdef(PG_FUNCTION_ARGS)
* Internal workhorse to decompile an extended statistics object.
*/
static char *
pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
pg_get_statisticsobj_worker(Oid statextid, bool columns_only, bool missing_ok)
{
Form_pg_statistic_ext statextrec;
HeapTuple statexttup;
@ -1534,6 +1564,11 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
bool dependencies_enabled;
bool mcv_enabled;
int i;
List *context;
ListCell *lc;
List *exprs = NIL;
bool has_exprs;
int ncolumns;
statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));
@ -1544,75 +1579,114 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
elog(ERROR, "cache lookup failed for statistics object %u", statextid);
}
/* has the statistics expressions? */
has_exprs = !heap_attisnull(statexttup, Anum_pg_statistic_ext_stxexprs, NULL);
statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup);
/*
* Get the statistics expressions, if any. (NOTE: we do not use the
* relcache versions of the expressions, because we want to display
* non-const-folded expressions.)
*/
if (has_exprs)
{
Datum exprsDatum;
bool isnull;
char *exprsString;
exprsDatum = SysCacheGetAttr(STATEXTOID, statexttup,
Anum_pg_statistic_ext_stxexprs, &isnull);
Assert(!isnull);
exprsString = TextDatumGetCString(exprsDatum);
exprs = (List *) stringToNode(exprsString);
pfree(exprsString);
}
else
exprs = NIL;
/* count the number of columns (attributes and expressions) */
ncolumns = statextrec->stxkeys.dim1 + list_length(exprs);
initStringInfo(&buf);
nsp = get_namespace_name(statextrec->stxnamespace);
appendStringInfo(&buf, "CREATE STATISTICS %s",
quote_qualified_identifier(nsp,
NameStr(statextrec->stxname)));
/*
* Decode the stxkind column so that we know which stats types to print.
*/
datum = SysCacheGetAttr(STATEXTOID, statexttup,
Anum_pg_statistic_ext_stxkind, &isnull);
Assert(!isnull);
arr = DatumGetArrayTypeP(datum);
if (ARR_NDIM(arr) != 1 ||
ARR_HASNULL(arr) ||
ARR_ELEMTYPE(arr) != CHAROID)
elog(ERROR, "stxkind is not a 1-D char array");
enabled = (char *) ARR_DATA_PTR(arr);
ndistinct_enabled = false;
dependencies_enabled = false;
mcv_enabled = false;
for (i = 0; i < ARR_DIMS(arr)[0]; i++)
if (!columns_only)
{
if (enabled[i] == STATS_EXT_NDISTINCT)
ndistinct_enabled = true;
if (enabled[i] == STATS_EXT_DEPENDENCIES)
dependencies_enabled = true;
if (enabled[i] == STATS_EXT_MCV)
mcv_enabled = true;
}
nsp = get_namespace_name(statextrec->stxnamespace);
appendStringInfo(&buf, "CREATE STATISTICS %s",
quote_qualified_identifier(nsp,
NameStr(statextrec->stxname)));
/*
* If any option is disabled, then we'll need to append the types clause
* to show which options are enabled. We omit the types clause on purpose
* when all options are enabled, so a pg_dump/pg_restore will create all
* statistics types on a newer postgres version, if the statistics had all
* options enabled on the original version.
*/
if (!ndistinct_enabled || !dependencies_enabled || !mcv_enabled)
{
bool gotone = false;
/*
* Decode the stxkind column so that we know which stats types to
* print.
*/
datum = SysCacheGetAttr(STATEXTOID, statexttup,
Anum_pg_statistic_ext_stxkind, &isnull);
Assert(!isnull);
arr = DatumGetArrayTypeP(datum);
if (ARR_NDIM(arr) != 1 ||
ARR_HASNULL(arr) ||
ARR_ELEMTYPE(arr) != CHAROID)
elog(ERROR, "stxkind is not a 1-D char array");
enabled = (char *) ARR_DATA_PTR(arr);
appendStringInfoString(&buf, " (");
ndistinct_enabled = false;
dependencies_enabled = false;
mcv_enabled = false;
if (ndistinct_enabled)
for (i = 0; i < ARR_DIMS(arr)[0]; i++)
{
appendStringInfoString(&buf, "ndistinct");
gotone = true;
if (enabled[i] == STATS_EXT_NDISTINCT)
ndistinct_enabled = true;
else if (enabled[i] == STATS_EXT_DEPENDENCIES)
dependencies_enabled = true;
else if (enabled[i] == STATS_EXT_MCV)
mcv_enabled = true;
/* ignore STATS_EXT_EXPRESSIONS (it's built automatically) */
}
if (dependencies_enabled)
/*
* If any option is disabled, then we'll need to append the types
* clause to show which options are enabled. We omit the types clause
* on purpose when all options are enabled, so a pg_dump/pg_restore
* will create all statistics types on a newer postgres version, if
* the statistics had all options enabled on the original version.
*
* But if the statistics is defined on just a single column, it has to
* be an expression statistics. In that case we don't need to specify
* kinds.
*/
if ((!ndistinct_enabled || !dependencies_enabled || !mcv_enabled) &&
(ncolumns > 1))
{
appendStringInfo(&buf, "%sdependencies", gotone ? ", " : "");
gotone = true;
bool gotone = false;
appendStringInfoString(&buf, " (");
if (ndistinct_enabled)
{
appendStringInfoString(&buf, "ndistinct");
gotone = true;
}
if (dependencies_enabled)
{
appendStringInfo(&buf, "%sdependencies", gotone ? ", " : "");
gotone = true;
}
if (mcv_enabled)
appendStringInfo(&buf, "%smcv", gotone ? ", " : "");
appendStringInfoChar(&buf, ')');
}
if (mcv_enabled)
appendStringInfo(&buf, "%smcv", gotone ? ", " : "");
appendStringInfoChar(&buf, ')');
appendStringInfoString(&buf, " ON ");
}
appendStringInfoString(&buf, " ON ");
/* decode simple column references */
for (colno = 0; colno < statextrec->stxkeys.dim1; colno++)
{
AttrNumber attnum = statextrec->stxkeys.values[colno];
@ -1626,14 +1700,109 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
appendStringInfoString(&buf, quote_identifier(attname));
}
appendStringInfo(&buf, " FROM %s",
generate_relation_name(statextrec->stxrelid, NIL));
context = deparse_context_for(get_relation_name(statextrec->stxrelid),
statextrec->stxrelid);
foreach(lc, exprs)
{
Node *expr = (Node *) lfirst(lc);
char *str;
int prettyFlags = PRETTYFLAG_INDENT;
str = deparse_expression_pretty(expr, context, false, false,
prettyFlags, 0);
if (colno > 0)
appendStringInfoString(&buf, ", ");
/* Need parens if it's not a bare function call */
if (looks_like_function(expr))
appendStringInfoString(&buf, str);
else
appendStringInfo(&buf, "(%s)", str);
colno++;
}
if (!columns_only)
appendStringInfo(&buf, " FROM %s",
generate_relation_name(statextrec->stxrelid, NIL));
ReleaseSysCache(statexttup);
return buf.data;
}
/*
* Generate text array of expressions for statistics object.
*/
Datum
pg_get_statisticsobjdef_expressions(PG_FUNCTION_ARGS)
{
Oid statextid = PG_GETARG_OID(0);
Form_pg_statistic_ext statextrec;
HeapTuple statexttup;
Datum datum;
bool isnull;
List *context;
ListCell *lc;
List *exprs = NIL;
bool has_exprs;
char *tmp;
ArrayBuildState *astate = NULL;
statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));
if (!HeapTupleIsValid(statexttup))
elog(ERROR, "cache lookup failed for statistics object %u", statextid);
/* has the statistics expressions? */
has_exprs = !heap_attisnull(statexttup, Anum_pg_statistic_ext_stxexprs, NULL);
/* no expressions? we're done */
if (!has_exprs)
{
ReleaseSysCache(statexttup);
PG_RETURN_NULL();
}
statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup);
/*
* Get the statistics expressions, and deparse them into text values.
*/
datum = SysCacheGetAttr(STATEXTOID, statexttup,
Anum_pg_statistic_ext_stxexprs, &isnull);
Assert(!isnull);
tmp = TextDatumGetCString(datum);
exprs = (List *) stringToNode(tmp);
pfree(tmp);
context = deparse_context_for(get_relation_name(statextrec->stxrelid),
statextrec->stxrelid);
foreach(lc, exprs)
{
Node *expr = (Node *) lfirst(lc);
char *str;
int prettyFlags = PRETTYFLAG_INDENT;
str = deparse_expression_pretty(expr, context, false, false,
prettyFlags, 0);
astate = accumArrayResult(astate,
PointerGetDatum(cstring_to_text(str)),
false,
TEXTOID,
CurrentMemoryContext);
}
ReleaseSysCache(statexttup);
PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
}
/*
* pg_get_partkeydef
*

View File

@ -3430,6 +3430,14 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
* If examine_variable is able to deduce anything about the GROUP BY
* expression, treat it as a single variable even if it's really more
* complicated.
*
* XXX This has the consequence that if there's a statistics on the
* expression, we don't split it into individual Vars. This affects
* our selection of statistics in estimate_multivariate_ndistinct,
* because it's probably better to use more accurate estimate for
* each expression and treat them as independent, than to combine
* estimates for the extracted variables when we don't know how that
* relates to the expressions.
*/
examine_variable(root, groupexpr, 0, &vardata);
if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique)
@ -3880,50 +3888,77 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
List **varinfos, double *ndistinct)
{
ListCell *lc;
Bitmapset *attnums = NULL;
int nmatches;
int nmatches_vars;
int nmatches_exprs;
Oid statOid = InvalidOid;
MVNDistinct *stats;
Bitmapset *matched = NULL;
StatisticExtInfo *matched_info = NULL;
/* bail out immediately if the table has no extended statistics */
if (!rel->statlist)
return false;
/* Determine the attnums we're looking for */
foreach(lc, *varinfos)
{
GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
AttrNumber attnum;
Assert(varinfo->rel == rel);
if (!IsA(varinfo->var, Var))
continue;
attnum = ((Var *) varinfo->var)->varattno;
if (!AttrNumberIsForUserDefinedAttr(attnum))
continue;
attnums = bms_add_member(attnums, attnum);
}
/* look for the ndistinct statistics matching the most vars */
nmatches = 1; /* we require at least two matches */
nmatches_vars = 0; /* we require at least two matches */
nmatches_exprs = 0;
foreach(lc, rel->statlist)
{
ListCell *lc2;
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
Bitmapset *shared;
int nshared;
int nshared_vars = 0;
int nshared_exprs = 0;
/* skip statistics of other kinds */
if (info->kind != STATS_EXT_NDISTINCT)
continue;
/* compute attnums shared by the vars and the statistics object */
shared = bms_intersect(info->keys, attnums);
nshared = bms_num_members(shared);
/*
* Determine how many expressions (and variables in non-matched
* expressions) match. We'll then use these numbers to pick the
* statistics object that best matches the clauses.
*/
foreach(lc2, *varinfos)
{
ListCell *lc3;
GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2);
AttrNumber attnum;
Assert(varinfo->rel == rel);
/* simple Var, search in statistics keys directly */
if (IsA(varinfo->var, Var))
{
attnum = ((Var *) varinfo->var)->varattno;
/*
* Ignore system attributes - we don't support statistics on
* them, so can't match them (and it'd fail as the values are
* negative).
*/
if (!AttrNumberIsForUserDefinedAttr(attnum))
continue;
if (bms_is_member(attnum, info->keys))
nshared_vars++;
continue;
}
/* expression - see if it's in the statistics */
foreach(lc3, info->exprs)
{
Node *expr = (Node *) lfirst(lc3);
if (equal(varinfo->var, expr))
{
nshared_exprs++;
break;
}
}
}
if (nshared_vars + nshared_exprs < 2)
continue;
/*
* Does this statistics object match more columns than the currently
@ -3932,18 +3967,21 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
* XXX This should break ties using name of the object, or something
* like that, to make the outcome stable.
*/
if (nshared > nmatches)
if ((nshared_exprs > nmatches_exprs) ||
(((nshared_exprs == nmatches_exprs)) && (nshared_vars > nmatches_vars)))
{
statOid = info->statOid;
nmatches = nshared;
matched = shared;
nmatches_vars = nshared_vars;
nmatches_exprs = nshared_exprs;
matched_info = info;
}
}
/* No match? */
if (statOid == InvalidOid)
return false;
Assert(nmatches > 1 && matched != NULL);
Assert(nmatches_vars + nmatches_exprs > 1);
stats = statext_ndistinct_load(statOid);
@ -3956,20 +3994,135 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
int i;
List *newlist = NIL;
MVNDistinctItem *item = NULL;
ListCell *lc2;
Bitmapset *matched = NULL;
AttrNumber attnum_offset;
/*
* How much we need to offset the attnums? If there are no
* expressions, no offset is needed. Otherwise offset enough to move
* the lowest one (which is equal to number of expressions) to 1.
*/
if (matched_info->exprs)
attnum_offset = (list_length(matched_info->exprs) + 1);
else
attnum_offset = 0;
/* see what actually matched */
foreach(lc2, *varinfos)
{
ListCell *lc3;
int idx;
bool found = false;
GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2);
/*
* Process a simple Var expression, by matching it to keys
* directly. If there's a matchine expression, we'll try
* matching it later.
*/
if (IsA(varinfo->var, Var))
{
AttrNumber attnum = ((Var *) varinfo->var)->varattno;
/*
* Ignore expressions on system attributes. Can't rely on
* the bms check for negative values.
*/
if (!AttrNumberIsForUserDefinedAttr(attnum))
continue;
/* Is the variable covered by the statistics? */
if (!bms_is_member(attnum, matched_info->keys))
continue;
attnum = attnum + attnum_offset;
/* ensure sufficient offset */
Assert(AttrNumberIsForUserDefinedAttr(attnum));
matched = bms_add_member(matched, attnum);
found = true;
}
/*
* XXX Maybe we should allow searching the expressions even if we
* found an attribute matching the expression? That would handle
* trivial expressions like "(a)" but it seems fairly useless.
*/
if (found)
continue;
/* expression - see if it's in the statistics */
idx = 0;
foreach(lc3, matched_info->exprs)
{
Node *expr = (Node *) lfirst(lc3);
if (equal(varinfo->var, expr))
{
AttrNumber attnum = -(idx + 1);
attnum = attnum + attnum_offset;
/* ensure sufficient offset */
Assert(AttrNumberIsForUserDefinedAttr(attnum));
matched = bms_add_member(matched, attnum);
/* there should be just one matching expression */
break;
}
idx++;
}
}
/* Find the specific item that exactly matches the combination */
for (i = 0; i < stats->nitems; i++)
{
int j;
MVNDistinctItem *tmpitem = &stats->items[i];
if (bms_subset_compare(tmpitem->attrs, matched) == BMS_EQUAL)
if (tmpitem->nattributes != bms_num_members(matched))
continue;
/* assume it's the right item */
item = tmpitem;
/* check that all item attributes/expressions fit the match */
for (j = 0; j < tmpitem->nattributes; j++)
{
item = tmpitem;
break;
AttrNumber attnum = tmpitem->attributes[j];
/*
* Thanks to how we constructed the matched bitmap above, we
* can just offset all attnums the same way.
*/
attnum = attnum + attnum_offset;
if (!bms_is_member(attnum, matched))
{
/* nah, it's not this item */
item = NULL;
break;
}
}
/*
* If the item has all the matched attributes, we know it's the
* right one - there can't be a better one. matching more.
*/
if (item)
break;
}
/* make sure we found an item */
/*
* Make sure we found an item. There has to be one, because ndistinct
* statistics includes all combinations of attributes.
*/
if (!item)
elog(ERROR, "corrupt MVNDistinct entry");
@ -3977,18 +4130,63 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
foreach(lc, *varinfos)
{
GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
AttrNumber attnum;
ListCell *lc3;
bool found = false;
if (!IsA(varinfo->var, Var))
/*
* Let's look at plain variables first, because it's the most
* common case and the check is quite cheap. We can simply get the
* attnum and check (with an offset) matched bitmap.
*/
if (IsA(varinfo->var, Var))
{
newlist = lappend(newlist, varinfo);
AttrNumber attnum = ((Var *) varinfo->var)->varattno;
/*
* If it's a system attribute, we're done. We don't support
* extended statistics on system attributes, so it's clearly
* not matched. Just keep the expression and continue.
*/
if (!AttrNumberIsForUserDefinedAttr(attnum))
{
newlist = lappend(newlist, varinfo);
continue;
}
/* apply the same offset as above */
attnum += attnum_offset;
/* if it's not matched, keep the varinfo */
if (!bms_is_member(attnum, matched))
newlist = lappend(newlist, varinfo);
/* The rest of the loop deals with complex expressions. */
continue;
}
attnum = ((Var *) varinfo->var)->varattno;
/*
* Process complex expressions, not just simple Vars.
*
* First, we search for an exact match of an expression. If we
* find one, we can just discard the whole GroupExprInfo, with all
* the variables we extracted from it.
*
* Otherwise we inspect the individual vars, and try matching it
* to variables in the item.
*/
foreach(lc3, matched_info->exprs)
{
Node *expr = (Node *) lfirst(lc3);
if (AttrNumberIsForUserDefinedAttr(attnum) &&
bms_is_member(attnum, matched))
if (equal(varinfo->var, expr))
{
found = true;
break;
}
}
/* found exact match, skip */
if (found)
continue;
newlist = lappend(newlist, varinfo);
@ -4690,6 +4888,13 @@ get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo,
*join_is_reversed = false;
}
/* statext_expressions_load copies the tuple, so just pfree it. */
static void
ReleaseDummy(HeapTuple tuple)
{
pfree(tuple);
}
/*
* examine_variable
* Try to look up statistical data about an expression.
@ -4830,6 +5035,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
* operator we are estimating for. FIXME later.
*/
ListCell *ilist;
ListCell *slist;
foreach(ilist, onerel->indexlist)
{
@ -4986,6 +5192,129 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
if (vardata->statsTuple)
break;
}
/*
* Search extended statistics for one with a matching expression.
* There might be multiple ones, so just grab the first one. In the
* future, we might consider the statistics target (and pick the most
* accurate statistics) and maybe some other parameters.
*/
foreach(slist, onerel->statlist)
{
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(slist);
ListCell *expr_item;
int pos;
/*
* Stop once we've found statistics for the expression (either
* from extended stats, or for an index in the preceding loop).
*/
if (vardata->statsTuple)
break;
/* skip stats without per-expression stats */
if (info->kind != STATS_EXT_EXPRESSIONS)
continue;
pos = 0;
foreach(expr_item, info->exprs)
{
Node *expr = (Node *) lfirst(expr_item);
Assert(expr);
/* strip RelabelType before comparing it */
if (expr && IsA(expr, RelabelType))
expr = (Node *) ((RelabelType *) expr)->arg;
/* found a match, see if we can extract pg_statistic row */
if (equal(node, expr))
{
HeapTuple t = statext_expressions_load(info->statOid, pos);
/* Get index's table for permission check */
RangeTblEntry *rte;
Oid userid;
vardata->statsTuple = t;
/*
* XXX Not sure if we should cache the tuple somewhere.
* Now we just create a new copy every time.
*/
vardata->freefunc = ReleaseDummy;
rte = planner_rt_fetch(onerel->relid, root);
Assert(rte->rtekind == RTE_RELATION);
/*
* Use checkAsUser if it's set, in case we're accessing
* the table via a view.
*/
userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
/*
* For simplicity, we insist on the whole table being
* selectable, rather than trying to identify which
* column(s) the statistics depends on. Also require all
* rows to be selectable --- there must be no
* securityQuals from security barrier views or RLS
* policies.
*/
vardata->acl_ok =
rte->securityQuals == NIL &&
(pg_class_aclcheck(rte->relid, userid,
ACL_SELECT) == ACLCHECK_OK);
/*
* If the user doesn't have permissions to access an
* inheritance child relation, check the permissions of
* the table actually mentioned in the query, since most
* likely the user does have that permission. Note that
* whole-table select privilege on the parent doesn't
* quite guarantee that the user could read all columns of
* the child. But in practice it's unlikely that any
* interesting security violation could result from
* allowing access to the expression stats, so we allow it
* anyway. See similar code in examine_simple_variable()
* for additional comments.
*/
if (!vardata->acl_ok &&
root->append_rel_array != NULL)
{
AppendRelInfo *appinfo;
Index varno = onerel->relid;
appinfo = root->append_rel_array[varno];
while (appinfo &&
planner_rt_fetch(appinfo->parent_relid,
root)->rtekind == RTE_RELATION)
{
varno = appinfo->parent_relid;
appinfo = root->append_rel_array[varno];
}
if (varno != onerel->relid)
{
/* Repeat access check on this rel */
rte = planner_rt_fetch(varno, root);
Assert(rte->rtekind == RTE_RELATION);
userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
vardata->acl_ok =
rte->securityQuals == NIL &&
(pg_class_aclcheck(rte->relid,
userid,
ACL_SELECT) == ACLCHECK_OK);
}
}
break;
}
pos++;
}
}
}
}

View File

@ -2637,6 +2637,18 @@ my %tests = (
unlike => { exclude_dump_test_schema => 1, },
},
'CREATE STATISTICS extended_stats_expression' => {
create_order => 99,
create_sql => 'CREATE STATISTICS dump_test.test_ext_stats_expr
ON (2 * col1) FROM dump_test.test_fifth_table',
regexp => qr/^
\QCREATE STATISTICS dump_test.test_ext_stats_expr ON ((2 * col1)) FROM dump_test.test_fifth_table;\E
/xms,
like =>
{ %full_runs, %dump_test_schema_runs, section_post_data => 1, },
unlike => { exclude_dump_test_schema => 1, },
},
'CREATE SEQUENCE test_table_col1_seq' => {
regexp => qr/^
\QCREATE SEQUENCE dump_test.test_table_col1_seq\E

View File

@ -2712,7 +2712,104 @@ describeOneTableDetails(const char *schemaname,
}
/* print any extended statistics */
if (pset.sversion >= 100000)
if (pset.sversion >= 140000)
{
printfPQExpBuffer(&buf,
"SELECT oid, "
"stxrelid::pg_catalog.regclass, "
"stxnamespace::pg_catalog.regnamespace AS nsp, "
"stxname,\n"
"pg_get_statisticsobjdef_columns(oid) AS columns,\n"
" 'd' = any(stxkind) AS ndist_enabled,\n"
" 'f' = any(stxkind) AS deps_enabled,\n"
" 'm' = any(stxkind) AS mcv_enabled,\n"
"stxstattarget\n"
"FROM pg_catalog.pg_statistic_ext stat\n"
"WHERE stxrelid = '%s'\n"
"ORDER BY 1;",
oid);
result = PSQLexec(buf.data);
if (!result)
goto error_return;
else
tuples = PQntuples(result);
if (tuples > 0)
{
printTableAddFooter(&cont, _("Statistics objects:"));
for (i = 0; i < tuples; i++)
{
bool gotone = false;
bool has_ndistinct;
bool has_dependencies;
bool has_mcv;
bool has_all;
bool has_some;
has_ndistinct = (strcmp(PQgetvalue(result, i, 5), "t") == 0);
has_dependencies = (strcmp(PQgetvalue(result, i, 6), "t") == 0);
has_mcv = (strcmp(PQgetvalue(result, i, 7), "t") == 0);
printfPQExpBuffer(&buf, " ");
/* statistics object name (qualified with namespace) */
appendPQExpBuffer(&buf, "\"%s\".\"%s\"",
PQgetvalue(result, i, 2),
PQgetvalue(result, i, 3));
/*
* When printing kinds we ignore expression statistics,
* which is used only internally and can't be specified by
* user. We don't print the kinds when either none are
* specified (in which case it has to be statistics on a
* single expr) or when all are specified (in which case
* we assume it's expanded by CREATE STATISTICS).
*/
has_all = (has_ndistinct && has_dependencies && has_mcv);
has_some = (has_ndistinct || has_dependencies || has_mcv);
if (has_some && !has_all)
{
appendPQExpBuffer(&buf, " (");
/* options */
if (has_ndistinct)
{
appendPQExpBufferStr(&buf, "ndistinct");
gotone = true;
}
if (has_dependencies)
{
appendPQExpBuffer(&buf, "%sdependencies", gotone ? ", " : "");
gotone = true;
}
if (has_mcv)
{
appendPQExpBuffer(&buf, "%smcv", gotone ? ", " : "");
}
appendPQExpBuffer(&buf, ")");
}
appendPQExpBuffer(&buf, " ON %s FROM %s",
PQgetvalue(result, i, 4),
PQgetvalue(result, i, 1));
/* Show the stats target if it's not default */
if (strcmp(PQgetvalue(result, i, 8), "-1") != 0)
appendPQExpBuffer(&buf, "; STATISTICS %s",
PQgetvalue(result, i, 8));
printTableAddFooter(&cont, buf.data);
}
}
PQclear(result);
}
else if (pset.sversion >= 100000)
{
printfPQExpBuffer(&buf,
"SELECT oid, "
@ -4468,18 +4565,27 @@ listExtendedStats(const char *pattern)
printfPQExpBuffer(&buf,
"SELECT \n"
"es.stxnamespace::pg_catalog.regnamespace::text AS \"%s\", \n"
"es.stxname AS \"%s\", \n"
"pg_catalog.format('%%s FROM %%s', \n"
" (SELECT pg_catalog.string_agg(pg_catalog.quote_ident(a.attname),', ') \n"
" FROM pg_catalog.unnest(es.stxkeys) s(attnum) \n"
" JOIN pg_catalog.pg_attribute a \n"
" ON (es.stxrelid = a.attrelid \n"
" AND a.attnum = s.attnum \n"
" AND NOT a.attisdropped)), \n"
"es.stxrelid::regclass) AS \"%s\"",
"es.stxname AS \"%s\", \n",
gettext_noop("Schema"),
gettext_noop("Name"),
gettext_noop("Definition"));
gettext_noop("Name"));
if (pset.sversion >= 140000)
appendPQExpBuffer(&buf,
"pg_catalog.format('%%s FROM %%s', \n"
" pg_get_statisticsobjdef_columns(es.oid), \n"
" es.stxrelid::regclass) AS \"%s\"",
gettext_noop("Definition"));
else
appendPQExpBuffer(&buf,
"pg_catalog.format('%%s FROM %%s', \n"
" (SELECT pg_catalog.string_agg(pg_catalog.quote_ident(a.attname),', ') \n"
" FROM pg_catalog.unnest(es.stxkeys) s(attnum) \n"
" JOIN pg_catalog.pg_attribute a \n"
" ON (es.stxrelid = a.attrelid \n"
" AND a.attnum = s.attnum \n"
" AND NOT a.attisdropped)), \n"
"es.stxrelid::regclass) AS \"%s\"",
gettext_noop("Definition"));
appendPQExpBuffer(&buf,
",\nCASE WHEN 'd' = any(es.stxkind) THEN 'defined' \n"

View File

@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 202103265
#define CATALOG_VERSION_NO 202103266
#endif

View File

@ -3658,6 +3658,14 @@
proname => 'pg_get_statisticsobjdef', provolatile => 's',
prorettype => 'text', proargtypes => 'oid',
prosrc => 'pg_get_statisticsobjdef' },
{ oid => '8887', descr => 'extended statistics columns',
proname => 'pg_get_statisticsobjdef_columns', provolatile => 's',
prorettype => 'text', proargtypes => 'oid',
prosrc => 'pg_get_statisticsobjdef_columns' },
{ oid => '8886', descr => 'extended statistics expressions',
proname => 'pg_get_statisticsobjdef_expressions', provolatile => 's',
prorettype => '_text', proargtypes => 'oid',
prosrc => 'pg_get_statisticsobjdef_expressions' },
{ oid => '3352', descr => 'partition key description',
proname => 'pg_get_partkeydef', provolatile => 's', prorettype => 'text',
proargtypes => 'oid', prosrc => 'pg_get_partkeydef' },

View File

@ -54,6 +54,9 @@ CATALOG(pg_statistic_ext,3381,StatisticExtRelationId)
#ifdef CATALOG_VARLEN
char stxkind[1] BKI_FORCE_NOT_NULL; /* statistics kinds requested
* to build */
pg_node_tree stxexprs; /* A list of expression trees for stats
* attributes that are not simple column
* references. */
#endif
} FormData_pg_statistic_ext;
@ -81,6 +84,7 @@ DECLARE_ARRAY_FOREIGN_KEY((stxrelid, stxkeys), pg_attribute, (attrelid, attnum))
#define STATS_EXT_NDISTINCT 'd'
#define STATS_EXT_DEPENDENCIES 'f'
#define STATS_EXT_MCV 'm'
#define STATS_EXT_EXPRESSIONS 'e'
#endif /* EXPOSE_TO_CLIENT_CODE */

View File

@ -38,6 +38,7 @@ CATALOG(pg_statistic_ext_data,3429,StatisticExtDataRelationId)
pg_ndistinct stxdndistinct; /* ndistinct coefficients (serialized) */
pg_dependencies stxddependencies; /* dependencies (serialized) */
pg_mcv_list stxdmcv; /* MCV (serialized) */
pg_statistic stxdexpr[1]; /* stats for expressions */
#endif

View File

@ -81,9 +81,7 @@ extern ObjectAddress AlterOperator(AlterOperatorStmt *stmt);
extern ObjectAddress CreateStatistics(CreateStatsStmt *stmt);
extern ObjectAddress AlterStatistics(AlterStatsStmt *stmt);
extern void RemoveStatisticsById(Oid statsOid);
extern void UpdateStatisticsForTypeChange(Oid statsOid,
Oid relationOid, int attnum,
Oid oldColumnType, Oid newColumnType);
extern Oid StatisticsGetRelation(Oid statId, bool missing_ok);
/* commands/aggregatecmds.c */
extern ObjectAddress DefineAggregate(ParseState *pstate, List *name, List *args, bool oldstyle,

View File

@ -454,6 +454,7 @@ typedef enum NodeTag
T_TypeName,
T_ColumnDef,
T_IndexElem,
T_StatsElem,
T_Constraint,
T_DefElem,
T_RangeTblEntry,

View File

@ -1914,7 +1914,8 @@ typedef enum AlterTableType
AT_AddIdentity, /* ADD IDENTITY */
AT_SetIdentity, /* SET identity column options */
AT_DropIdentity, /* DROP IDENTITY */
AT_AlterCollationRefreshVersion /* ALTER COLLATION ... REFRESH VERSION */
AT_AlterCollationRefreshVersion, /* ALTER COLLATION ... REFRESH VERSION */
AT_ReAddStatistics /* internal to commands/tablecmds.c */
} AlterTableType;
typedef struct ReplicaIdentityStmt
@ -2872,8 +2873,24 @@ typedef struct CreateStatsStmt
List *relations; /* rels to build stats on (list of RangeVar) */
char *stxcomment; /* comment to apply to stats, or NULL */
bool if_not_exists; /* do nothing if stats name already exists */
bool transformed; /* true when transformStatsStmt is finished */
} CreateStatsStmt;
/*
* StatsElem - statistics parameters (used in CREATE STATISTICS)
*
* For a plain attribute, 'name' is the name of the referenced table column
* and 'expr' is NULL. For an expression, 'name' is NULL and 'expr' is the
* expression tree.
*/
typedef struct StatsElem
{
NodeTag type;
char *name; /* name of attribute to index, or NULL */
Node *expr; /* expression to index, or NULL */
} StatsElem;
/* ----------------------
* Alter Statistics Statement
* ----------------------

View File

@ -923,6 +923,7 @@ typedef struct StatisticExtInfo
RelOptInfo *rel; /* back-link to statistic's table */
char kind; /* statistics kind of this entry */
Bitmapset *keys; /* attnums of the columns covered */
List *exprs; /* expressions */
} StatisticExtInfo;
/*

View File

@ -69,6 +69,7 @@ typedef enum ParseExprKind
EXPR_KIND_FUNCTION_DEFAULT, /* default parameter value for function */
EXPR_KIND_INDEX_EXPRESSION, /* index expression */
EXPR_KIND_INDEX_PREDICATE, /* index predicate */
EXPR_KIND_STATS_EXPRESSION, /* extended statistics expression */
EXPR_KIND_ALTER_COL_TRANSFORM, /* transform expr in ALTER COLUMN TYPE */
EXPR_KIND_EXECUTE_PARAMETER, /* parameter value in EXECUTE */
EXPR_KIND_TRIGGER_WHEN, /* WHEN condition in CREATE TRIGGER */

View File

@ -26,6 +26,8 @@ extern AlterTableStmt *transformAlterTableStmt(Oid relid, AlterTableStmt *stmt,
List **afterStmts);
extern IndexStmt *transformIndexStmt(Oid relid, IndexStmt *stmt,
const char *queryString);
extern CreateStatsStmt *transformStatsStmt(Oid relid, CreateStatsStmt *stmt,
const char *queryString);
extern void transformRuleStmt(RuleStmt *stmt, const char *queryString,
List **actions, Node **whereClause);
extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt);

View File

@ -57,19 +57,27 @@ typedef struct SortItem
int count;
} SortItem;
extern MVNDistinct *statext_ndistinct_build(double totalrows,
int numrows, HeapTuple *rows,
Bitmapset *attrs, VacAttrStats **stats);
/* a unified representation of the data the statistics is built on */
typedef struct StatsBuildData
{
int numrows;
int nattnums;
AttrNumber *attnums;
VacAttrStats **stats;
Datum **values;
bool **nulls;
} StatsBuildData;
extern MVNDistinct *statext_ndistinct_build(double totalrows, StatsBuildData *data);
extern bytea *statext_ndistinct_serialize(MVNDistinct *ndistinct);
extern MVNDistinct *statext_ndistinct_deserialize(bytea *data);
extern MVDependencies *statext_dependencies_build(int numrows, HeapTuple *rows,
Bitmapset *attrs, VacAttrStats **stats);
extern MVDependencies *statext_dependencies_build(StatsBuildData *data);
extern bytea *statext_dependencies_serialize(MVDependencies *dependencies);
extern MVDependencies *statext_dependencies_deserialize(bytea *data);
extern MCVList *statext_mcv_build(int numrows, HeapTuple *rows,
Bitmapset *attrs, VacAttrStats **stats,
extern MCVList *statext_mcv_build(StatsBuildData *data,
double totalrows, int stattarget);
extern bytea *statext_mcv_serialize(MCVList *mcv, VacAttrStats **stats);
extern MCVList *statext_mcv_deserialize(bytea *data);
@ -85,14 +93,14 @@ extern int multi_sort_compare_dims(int start, int end, const SortItem *a,
extern int compare_scalars_simple(const void *a, const void *b, void *arg);
extern int compare_datums_simple(Datum a, Datum b, SortSupport ssup);
extern AttrNumber *build_attnums_array(Bitmapset *attrs, int *numattrs);
extern AttrNumber *build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs);
extern SortItem *build_sorted_items(int numrows, int *nitems, HeapTuple *rows,
TupleDesc tdesc, MultiSortSupport mss,
extern SortItem *build_sorted_items(StatsBuildData *data, int *nitems,
MultiSortSupport mss,
int numattrs, AttrNumber *attnums);
extern bool examine_clause_args(List *args, Var **varp,
Const **cstp, bool *varonleftp);
extern bool examine_opclause_args(List *args, Node **exprp,
Const **cstp, bool *expronleftp);
extern Selectivity mcv_combine_selectivities(Selectivity simple_sel,
Selectivity mcv_sel,

View File

@ -26,7 +26,8 @@
typedef struct MVNDistinctItem
{
double ndistinct; /* ndistinct value for this combination */
Bitmapset *attrs; /* attr numbers of items */
int nattributes; /* number of attributes */
AttrNumber *attributes; /* attribute numbers */
} MVNDistinctItem;
/* A MVNDistinct object, comprising all possible combinations of columns */
@ -121,6 +122,8 @@ extern Selectivity statext_clauselist_selectivity(PlannerInfo *root,
extern bool has_stats_of_kind(List *stats, char requiredkind);
extern StatisticExtInfo *choose_best_statistics(List *stats, char requiredkind,
Bitmapset **clause_attnums,
List **clause_exprs,
int nclauses);
extern HeapTuple statext_expressions_load(Oid stxoid, int idx);
#endif /* STATISTICS_H */

View File

@ -41,4 +41,6 @@ extern char *generate_collation_name(Oid collid);
extern char *generate_opclass_name(Oid opclass);
extern char *get_range_partbound_string(List *bound_datums);
extern char *pg_get_statisticsobjdef_string(Oid statextid);
#endif /* RULEUTILS_H */

View File

@ -304,7 +304,9 @@ CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) PRIMARY KEY, b text);
CREATE INDEX ctlt1_b_key ON ctlt1 (b);
CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b));
CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1;
CREATE STATISTICS ctlt1_expr_stat ON (a || b) FROM ctlt1;
COMMENT ON STATISTICS ctlt1_a_b_stat IS 'ab stats';
COMMENT ON STATISTICS ctlt1_expr_stat IS 'ab expr stats';
COMMENT ON COLUMN ctlt1.a IS 'A';
COMMENT ON COLUMN ctlt1.b IS 'B';
COMMENT ON CONSTRAINT ctlt1_a_check ON ctlt1 IS 't1_a_check';
@ -414,7 +416,8 @@ Indexes:
Check constraints:
"ctlt1_a_check" CHECK (length(a) > 2)
Statistics objects:
"public"."ctlt_all_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM ctlt_all
"public"."ctlt_all_a_b_stat" ON a, b FROM ctlt_all
"public"."ctlt_all_expr_stat" ON ((a || b)) FROM ctlt_all
SELECT c.relname, objsubid, description FROM pg_description, pg_index i, pg_class c WHERE classoid = 'pg_class'::regclass AND objoid = i.indexrelid AND c.oid = i.indexrelid AND i.indrelid = 'ctlt_all'::regclass ORDER BY c.relname, objsubid;
relname | objsubid | description
@ -424,10 +427,11 @@ SELECT c.relname, objsubid, description FROM pg_description, pg_index i, pg_clas
(2 rows)
SELECT s.stxname, objsubid, description FROM pg_description, pg_statistic_ext s WHERE classoid = 'pg_statistic_ext'::regclass AND objoid = s.oid AND s.stxrelid = 'ctlt_all'::regclass ORDER BY s.stxname, objsubid;
stxname | objsubid | description
-------------------+----------+-------------
ctlt_all_a_b_stat | 0 | ab stats
(1 row)
stxname | objsubid | description
--------------------+----------+---------------
ctlt_all_a_b_stat | 0 | ab stats
ctlt_all_expr_stat | 0 | ab expr stats
(2 rows)
CREATE TABLE inh_error1 () INHERITS (ctlt1, ctlt4);
NOTICE: merging multiple inherited definitions of column "a"
@ -452,7 +456,8 @@ Indexes:
Check constraints:
"ctlt1_a_check" CHECK (length(a) > 2)
Statistics objects:
"public"."pg_attrdef_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM public.pg_attrdef
"public"."pg_attrdef_a_b_stat" ON a, b FROM public.pg_attrdef
"public"."pg_attrdef_expr_stat" ON ((a || b)) FROM public.pg_attrdef
DROP TABLE public.pg_attrdef;
-- Check that LIKE isn't confused when new table masks the old, either
@ -473,7 +478,8 @@ Indexes:
Check constraints:
"ctlt1_a_check" CHECK (length(a) > 2)
Statistics objects:
"ctl_schema"."ctlt1_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM ctlt1
"ctl_schema"."ctlt1_a_b_stat" ON a, b FROM ctlt1
"ctl_schema"."ctlt1_expr_stat" ON ((a || b)) FROM ctlt1
ROLLBACK;
DROP TABLE ctlt1, ctlt2, ctlt3, ctlt4, ctlt12_storage, ctlt12_comments, ctlt1_inh, ctlt13_inh, ctlt13_like, ctlt_all, ctla, ctlb CASCADE;

View File

@ -151,11 +151,6 @@ NOTICE: checking pg_aggregate {aggmfinalfn} => pg_proc {oid}
NOTICE: checking pg_aggregate {aggsortop} => pg_operator {oid}
NOTICE: checking pg_aggregate {aggtranstype} => pg_type {oid}
NOTICE: checking pg_aggregate {aggmtranstype} => pg_type {oid}
NOTICE: checking pg_statistic_ext {stxrelid} => pg_class {oid}
NOTICE: checking pg_statistic_ext {stxnamespace} => pg_namespace {oid}
NOTICE: checking pg_statistic_ext {stxowner} => pg_authid {oid}
NOTICE: checking pg_statistic_ext {stxrelid,stxkeys} => pg_attribute {attrelid,attnum}
NOTICE: checking pg_statistic_ext_data {stxoid} => pg_statistic_ext {oid}
NOTICE: checking pg_statistic {starelid} => pg_class {oid}
NOTICE: checking pg_statistic {staop1} => pg_operator {oid}
NOTICE: checking pg_statistic {staop2} => pg_operator {oid}
@ -168,6 +163,11 @@ NOTICE: checking pg_statistic {stacoll3} => pg_collation {oid}
NOTICE: checking pg_statistic {stacoll4} => pg_collation {oid}
NOTICE: checking pg_statistic {stacoll5} => pg_collation {oid}
NOTICE: checking pg_statistic {starelid,staattnum} => pg_attribute {attrelid,attnum}
NOTICE: checking pg_statistic_ext {stxrelid} => pg_class {oid}
NOTICE: checking pg_statistic_ext {stxnamespace} => pg_namespace {oid}
NOTICE: checking pg_statistic_ext {stxowner} => pg_authid {oid}
NOTICE: checking pg_statistic_ext {stxrelid,stxkeys} => pg_attribute {attrelid,attnum}
NOTICE: checking pg_statistic_ext_data {stxoid} => pg_statistic_ext {oid}
NOTICE: checking pg_rewrite {ev_class} => pg_class {oid}
NOTICE: checking pg_trigger {tgrelid} => pg_class {oid}
NOTICE: checking pg_trigger {tgparentid} => pg_trigger {oid}

View File

@ -2418,6 +2418,7 @@ pg_stats_ext| SELECT cn.nspname AS schemaname,
( SELECT array_agg(a.attname ORDER BY a.attnum) AS array_agg
FROM (unnest(s.stxkeys) k(k)
JOIN pg_attribute a ON (((a.attrelid = s.stxrelid) AND (a.attnum = k.k))))) AS attnames,
pg_get_statisticsobjdef_expressions(s.oid) AS exprs,
s.stxkind AS kinds,
sd.stxdndistinct AS n_distinct,
sd.stxddependencies AS dependencies,
@ -2439,6 +2440,78 @@ pg_stats_ext| SELECT cn.nspname AS schemaname,
FROM (unnest(s.stxkeys) k(k)
JOIN pg_attribute a ON (((a.attrelid = s.stxrelid) AND (a.attnum = k.k))))
WHERE (NOT has_column_privilege(c.oid, a.attnum, 'select'::text))))) AND ((c.relrowsecurity = false) OR (NOT row_security_active(c.oid))));
pg_stats_ext_exprs| SELECT cn.nspname AS schemaname,
c.relname AS tablename,
sn.nspname AS statistics_schemaname,
s.stxname AS statistics_name,
pg_get_userbyid(s.stxowner) AS statistics_owner,
stat.expr,
(stat.a).stanullfrac AS null_frac,
(stat.a).stawidth AS avg_width,
(stat.a).stadistinct AS n_distinct,
CASE
WHEN ((stat.a).stakind1 = 1) THEN (stat.a).stavalues1
WHEN ((stat.a).stakind2 = 1) THEN (stat.a).stavalues2
WHEN ((stat.a).stakind3 = 1) THEN (stat.a).stavalues3
WHEN ((stat.a).stakind4 = 1) THEN (stat.a).stavalues4
WHEN ((stat.a).stakind5 = 1) THEN (stat.a).stavalues5
ELSE NULL::anyarray
END AS most_common_vals,
CASE
WHEN ((stat.a).stakind1 = 1) THEN (stat.a).stanumbers1
WHEN ((stat.a).stakind2 = 1) THEN (stat.a).stanumbers2
WHEN ((stat.a).stakind3 = 1) THEN (stat.a).stanumbers3
WHEN ((stat.a).stakind4 = 1) THEN (stat.a).stanumbers4
WHEN ((stat.a).stakind5 = 1) THEN (stat.a).stanumbers5
ELSE NULL::real[]
END AS most_common_freqs,
CASE
WHEN ((stat.a).stakind1 = 2) THEN (stat.a).stavalues1
WHEN ((stat.a).stakind2 = 2) THEN (stat.a).stavalues2
WHEN ((stat.a).stakind3 = 2) THEN (stat.a).stavalues3
WHEN ((stat.a).stakind4 = 2) THEN (stat.a).stavalues4
WHEN ((stat.a).stakind5 = 2) THEN (stat.a).stavalues5
ELSE NULL::anyarray
END AS histogram_bounds,
CASE
WHEN ((stat.a).stakind1 = 3) THEN (stat.a).stanumbers1[1]
WHEN ((stat.a).stakind2 = 3) THEN (stat.a).stanumbers2[1]
WHEN ((stat.a).stakind3 = 3) THEN (stat.a).stanumbers3[1]
WHEN ((stat.a).stakind4 = 3) THEN (stat.a).stanumbers4[1]
WHEN ((stat.a).stakind5 = 3) THEN (stat.a).stanumbers5[1]
ELSE NULL::real
END AS correlation,
CASE
WHEN ((stat.a).stakind1 = 4) THEN (stat.a).stavalues1
WHEN ((stat.a).stakind2 = 4) THEN (stat.a).stavalues2
WHEN ((stat.a).stakind3 = 4) THEN (stat.a).stavalues3
WHEN ((stat.a).stakind4 = 4) THEN (stat.a).stavalues4
WHEN ((stat.a).stakind5 = 4) THEN (stat.a).stavalues5
ELSE NULL::anyarray
END AS most_common_elems,
CASE
WHEN ((stat.a).stakind1 = 4) THEN (stat.a).stanumbers1
WHEN ((stat.a).stakind2 = 4) THEN (stat.a).stanumbers2
WHEN ((stat.a).stakind3 = 4) THEN (stat.a).stanumbers3
WHEN ((stat.a).stakind4 = 4) THEN (stat.a).stanumbers4
WHEN ((stat.a).stakind5 = 4) THEN (stat.a).stanumbers5
ELSE NULL::real[]
END AS most_common_elem_freqs,
CASE
WHEN ((stat.a).stakind1 = 5) THEN (stat.a).stanumbers1
WHEN ((stat.a).stakind2 = 5) THEN (stat.a).stanumbers2
WHEN ((stat.a).stakind3 = 5) THEN (stat.a).stanumbers3
WHEN ((stat.a).stakind4 = 5) THEN (stat.a).stanumbers4
WHEN ((stat.a).stakind5 = 5) THEN (stat.a).stanumbers5
ELSE NULL::real[]
END AS elem_count_histogram
FROM (((((pg_statistic_ext s
JOIN pg_class c ON ((c.oid = s.stxrelid)))
LEFT JOIN pg_statistic_ext_data sd ON ((s.oid = sd.stxoid)))
LEFT JOIN pg_namespace cn ON ((cn.oid = c.relnamespace)))
LEFT JOIN pg_namespace sn ON ((sn.oid = s.stxnamespace)))
JOIN LATERAL ( SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr,
unnest(sd.stxdexpr) AS a) stat ON ((stat.expr IS NOT NULL)));
pg_tables| SELECT n.nspname AS schemaname,
c.relname AS tablename,
pg_get_userbyid(c.relowner) AS tableowner,

File diff suppressed because it is too large Load Diff

View File

@ -124,7 +124,9 @@ CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) PRIMARY KEY, b text);
CREATE INDEX ctlt1_b_key ON ctlt1 (b);
CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b));
CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1;
CREATE STATISTICS ctlt1_expr_stat ON (a || b) FROM ctlt1;
COMMENT ON STATISTICS ctlt1_a_b_stat IS 'ab stats';
COMMENT ON STATISTICS ctlt1_expr_stat IS 'ab expr stats';
COMMENT ON COLUMN ctlt1.a IS 'A';
COMMENT ON COLUMN ctlt1.b IS 'B';
COMMENT ON CONSTRAINT ctlt1_a_check ON ctlt1 IS 't1_a_check';

View File

@ -28,16 +28,21 @@ end;
$$;
-- Verify failures
CREATE TABLE ext_stats_test (x int, y int, z int);
CREATE TABLE ext_stats_test (x text, y int, z int);
CREATE STATISTICS tst;
CREATE STATISTICS tst ON a, b;
CREATE STATISTICS tst FROM sometab;
CREATE STATISTICS tst ON a, b FROM nonexistent;
CREATE STATISTICS tst ON a, b FROM ext_stats_test;
CREATE STATISTICS tst ON x, x, y FROM ext_stats_test;
CREATE STATISTICS tst ON x + y FROM ext_stats_test;
CREATE STATISTICS tst ON (x, y) FROM ext_stats_test;
CREATE STATISTICS tst ON x, x, y, x, x, y, x, x, y FROM ext_stats_test;
CREATE STATISTICS tst ON x, x, y, x, x, (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), y FROM ext_stats_test;
CREATE STATISTICS tst (unrecognized) ON x, y FROM ext_stats_test;
-- incorrect expressions
CREATE STATISTICS tst ON y + z FROM ext_stats_test; -- missing parentheses
CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -- tuple expression
DROP TABLE ext_stats_test;
-- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it
@ -97,6 +102,36 @@ CREATE STATISTICS ab1_a_b_stats ON a, b FROM ab1;
ANALYZE ab1;
DROP TABLE ab1 CASCADE;
-- basic test for statistics on expressions
CREATE TABLE ab1 (a INTEGER, b INTEGER, c TIMESTAMP, d TIMESTAMPTZ);
-- expression stats may be built on a single expression column
CREATE STATISTICS ab1_exprstat_1 ON (a+b) FROM ab1;
-- with a single expression, we only enable expression statistics
CREATE STATISTICS ab1_exprstat_2 ON (a+b) FROM ab1;
SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_2';
-- adding anything to the expression builds all statistics kinds
CREATE STATISTICS ab1_exprstat_3 ON (a+b), a FROM ab1;
SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_3';
-- date_trunc on timestamptz is not immutable, but that should not matter
CREATE STATISTICS ab1_exprstat_4 ON date_trunc('day', d) FROM ab1;
-- date_trunc on timestamp is immutable
CREATE STATISTICS ab1_exprstat_5 ON date_trunc('day', c) FROM ab1;
-- insert some data and run analyze, to test that these cases build properly
INSERT INTO ab1
SELECT
generate_series(1,10),
generate_series(1,10),
generate_series('2020-10-01'::timestamp, '2020-10-10'::timestamp, interval '1 day'),
generate_series('2020-10-01'::timestamptz, '2020-10-10'::timestamptz, interval '1 day');
ANALYZE ab1;
DROP TABLE ab1;
-- Verify supported object types for extended statistics
CREATE schema tststats;
@ -164,6 +199,14 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
-- correct command
CREATE STATISTICS s10 ON a, b, c FROM ndistinct;
@ -184,6 +227,16 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
-- partial improvement (match on attributes)
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
-- expressions - no improvement
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
-- last two plans keep using Group Aggregate, because 'd' is not covered
-- by the statistic and while it's NULL-only we assume 200 values for it
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
@ -216,6 +269,14 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
DROP STATISTICS s10;
SELECT s.stxkind, d.stxdndistinct
@ -234,6 +295,206 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
-- ndistinct estimates with statistics on expressions
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
CREATE STATISTICS s10 (ndistinct) ON (a+1), (b+100), (2*c) FROM ndistinct;
ANALYZE ndistinct;
SELECT s.stxkind, d.stxdndistinct
FROM pg_statistic_ext s, pg_statistic_ext_data d
WHERE s.stxrelid = 'ndistinct'::regclass
AND d.stxoid = s.oid;
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
DROP STATISTICS s10;
-- a mix of attributes and expressions
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
CREATE STATISTICS s10 (ndistinct) ON a, b, (2*c) FROM ndistinct;
ANALYZE ndistinct;
SELECT s.stxkind, d.stxdndistinct
FROM pg_statistic_ext s, pg_statistic_ext_data d
WHERE s.stxrelid = 'ndistinct'::regclass
AND d.stxoid = s.oid;
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
DROP STATISTICS s10;
-- combination of multiple ndistinct statistics, with/without expressions
TRUNCATE ndistinct;
-- two mostly independent groups of columns
INSERT INTO ndistinct (a, b, c, d)
SELECT mod(i,3), mod(i,9), mod(i,5), mod(i,20)
FROM generate_series(1,1000) s(i);
ANALYZE ndistinct;
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
-- basic statistics on both attributes (no expressions)
CREATE STATISTICS s11 (ndistinct) ON a, b FROM ndistinct;
CREATE STATISTICS s12 (ndistinct) ON c, d FROM ndistinct;
ANALYZE ndistinct;
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
-- replace the second statistics by statistics on expressions
DROP STATISTICS s12;
CREATE STATISTICS s12 (ndistinct) ON (c * 10), (d - 1) FROM ndistinct;
ANALYZE ndistinct;
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
-- replace the second statistics by statistics on both attributes and expressions
DROP STATISTICS s12;
CREATE STATISTICS s12 (ndistinct) ON c, d, (c * 10), (d - 1) FROM ndistinct;
ANALYZE ndistinct;
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
-- replace the other statistics by statistics on both attributes and expressions
DROP STATISTICS s11;
CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
ANALYZE ndistinct;
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
-- replace statistics by somewhat overlapping ones (this expected to get worse estimate
-- because the first statistics shall be applied to 3 columns, and the second one can't
-- be really applied)
DROP STATISTICS s11;
DROP STATISTICS s12;
CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
CREATE STATISTICS s12 (ndistinct) ON a, (b+1), (c * 10) FROM ndistinct;
ANALYZE ndistinct;
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
DROP STATISTICS s11;
DROP STATISTICS s12;
-- functional dependencies tests
CREATE TABLE functional_dependencies (
filler1 TEXT,
@ -272,6 +533,29 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
TRUNCATE functional_dependencies;
DROP STATISTICS func_deps_stat;
-- now do the same thing, but with expressions
INSERT INTO functional_dependencies (a, b, c, filler1)
SELECT i, i, i, i FROM generate_series(1,5000) s(i);
ANALYZE functional_dependencies;
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
-- create statistics
CREATE STATISTICS func_deps_stat (dependencies) ON (mod(a,11)), (mod(b::int, 13)), (mod(c, 7)) FROM functional_dependencies;
ANALYZE functional_dependencies;
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
-- a => b, a => c, b => c
TRUNCATE functional_dependencies;
DROP STATISTICS func_deps_stat;
INSERT INTO functional_dependencies (a, b, c, filler1)
SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
@ -397,9 +681,9 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
-- changing the type of column c causes its single-column stats to be dropped,
-- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple
-- clauses estimated with functional dependencies does not exceed this
-- changing the type of column c causes all its stats to be dropped, reverting
-- to default estimates without any statistics, i.e. 0.5% selectivity for each
-- condition
ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
@ -408,6 +692,132 @@ ANALYZE functional_dependencies;
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
DROP STATISTICS func_deps_stat;
-- now try functional dependencies with expressions
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
-- IN
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
-- OR clauses referencing the same attribute
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
-- OR clauses referencing different attributes
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
-- ANY
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
-- ANY with inequalities should not benefit from functional dependencies
-- the estimates however improve thanks to having expression statistics
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
-- ALL (should not benefit from functional dependencies)
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
-- create statistics on expressions
CREATE STATISTICS func_deps_stat (dependencies) ON (a * 2), (b || 'X'), (c + 1) FROM functional_dependencies;
ANALYZE functional_dependencies;
-- print the detected dependencies
SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat';
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
-- IN
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
-- OR clauses referencing the same attribute
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
-- OR clauses referencing different attributes
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
-- ANY
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
-- ANY with inequalities should not benefit from functional dependencies
-- the estimates however improve thanks to having expression statistics
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
-- ALL (should not benefit from functional dependencies)
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
-- check the ability to use multiple functional dependencies
CREATE TABLE functional_dependencies_multi (
a INTEGER,
@ -479,6 +889,28 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b =
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
-- random data (no MCV list), but with expression
INSERT INTO mcv_lists (a, b, c, filler1)
SELECT i, i, i, i FROM generate_series(1,1000) s(i);
ANALYZE mcv_lists;
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
-- create statistics
CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,7)), (mod(b::int,11)), (mod(c,13)) FROM mcv_lists;
ANALYZE mcv_lists;
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
-- 100 distinct combinations, all in the MCV list
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
@ -565,6 +997,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)');
@ -602,6 +1036,98 @@ ANALYZE mcv_lists;
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
-- 100 distinct combinations, all in the MCV list, but with expressions
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
INSERT INTO mcv_lists (a, b, c, filler1)
SELECT i, i, i, i FROM generate_series(1,1000) s(i);
ANALYZE mcv_lists;
-- without any stats on the expressions, we have to use default selectivities, which
-- is why the estimates here are different from the pre-computed case above
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
-- create statistics with expressions only (we create three separate stats, in order not to build more complex extended stats)
CREATE STATISTICS mcv_lists_stats_1 ON (mod(a,20)) FROM mcv_lists;
CREATE STATISTICS mcv_lists_stats_2 ON (mod(b::int,10)) FROM mcv_lists;
CREATE STATISTICS mcv_lists_stats_3 ON (mod(c,5)) FROM mcv_lists;
ANALYZE mcv_lists;
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
DROP STATISTICS mcv_lists_stats_1;
DROP STATISTICS mcv_lists_stats_2;
DROP STATISTICS mcv_lists_stats_3;
-- create statistics with both MCV and expressions
CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,20)), (mod(b::int,10)), (mod(c,5)) FROM mcv_lists;
ANALYZE mcv_lists;
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
-- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute)
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,5) = 1 OR d IS NOT NULL');
-- 100 distinct combinations with NULL values, all in the MCV list
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
@ -894,6 +1420,57 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR
DROP TABLE mcv_lists_multi;
-- statistics on integer expressions
CREATE TABLE expr_stats (a int, b int, c int);
INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
ANALYZE expr_stats;
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
CREATE STATISTICS expr_stats_1 (mcv) ON (a+b), (a-b), (2*a), (3*b) FROM expr_stats;
ANALYZE expr_stats;
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
DROP STATISTICS expr_stats_1;
DROP TABLE expr_stats;
-- statistics on a mix columns and expressions
CREATE TABLE expr_stats (a int, b int, c int);
INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
ANALYZE expr_stats;
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (2*a), (3*b), (a+b), (a-b) FROM expr_stats;
ANALYZE expr_stats;
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
DROP TABLE expr_stats;
-- statistics on expressions with different data types
CREATE TABLE expr_stats (a int, b name, c text);
INSERT INTO expr_stats SELECT mod(i,10), md5(mod(i,10)::text), md5(mod(i,10)::text) FROM generate_series(1,1000) s(i);
ANALYZE expr_stats;
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (b || c), (c || b) FROM expr_stats;
ANALYZE expr_stats;
SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
DROP TABLE expr_stats;
-- Permission tests. Users should not be able to see specific data values in
-- the extended statistics, if they lack permission to see those values in
-- the underlying table.