Convert pre-existing stats_ext tests to new style

The regression tests added in commit 7300a69950 test cardinality
estimates using a function that extracts the interesting pieces
from the EXPLAIN output, instead of testing the whole plan. That
seems both easier to understand and less fragile, so this applies
the same approach to pre-existing tests of ndistinct coefficients
and functional dependencies.

Discussion: https://postgr.es/m/dfdac334-9cf2-2597-fb27-f0fb3753f435@2ndquadrant.com
This commit is contained in:
Tomas Vondra 2019-04-16 00:02:22 +02:00
parent 3824ca30d1
commit dbb984128e
2 changed files with 235 additions and 397 deletions

View File

@ -2,9 +2,26 @@
-- We will be checking execution plans without/with statistics, so -- We will be checking execution plans without/with statistics, so
-- let's make sure we get simple non-parallel plans. Also set the -- let's make sure we get simple non-parallel plans. Also set the
-- work_mem low so that we can use small amounts of data. -- work_mem low so that we can use small amounts of data.
SET max_parallel_workers = 0; -- check the number of estimated/actual rows in the top node
SET max_parallel_workers_per_gather = 0; create function check_estimated_rows(text) returns table (estimated int, actual int)
SET work_mem = '128kB'; language plpgsql as
$$
-- Runs "explain analyze" on the query text passed as $1 and returns a single
-- (estimated, actual) row parsed from the first line of the plan output,
-- i.e. the row counts reported for the top plan node.
declare
-- one line of the EXPLAIN output
ln text;
-- capture groups produced by regexp_match for that line
tmp text[];
-- true until the first output line has been handled
first_row bool := true;
begin
for ln in
execute format('explain analyze %s', $1)
loop
-- only the first line is parsed; later lines are read and ignored
if first_row then
first_row := false;
-- group 1: digits after the first "rows=", group 2: digits after a
-- later " rows=" on the same line
tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
-- if the line does not match, tmp is NULL and both columns come back NULL
return query select tmp[1]::int, tmp[2]::int;
end if;
end loop;
end;
$$;
-- Verify failures -- Verify failures
CREATE STATISTICS tst; CREATE STATISTICS tst;
ERROR: syntax error at or near ";" ERROR: syntax error at or near ";"
@ -146,63 +163,38 @@ CREATE TABLE ndistinct (
-- over-estimates when using only per-column statistics -- over-estimates when using only per-column statistics
INSERT INTO ndistinct (a, b, c, filler1) INSERT INTO ndistinct (a, b, c, filler1)
SELECT i/100, i/100, i/100, cash_words((i/100)::money) SELECT i/100, i/100, i/100, cash_words((i/100)::money)
FROM generate_series(1,30000) s(i); FROM generate_series(1,1000) s(i);
ANALYZE ndistinct; ANALYZE ndistinct;
-- Group Aggregate, due to over-estimate of the number of groups -- Group Aggregate, due to over-estimate of the number of groups
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b; estimated | actual
QUERY PLAN -----------+--------
----------------------------------- 100 | 11
GroupAggregate (1 row)
Group Key: a, b
-> Sort
Sort Key: a, b
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c; estimated | actual
QUERY PLAN -----------+--------
----------------------------------- 100 | 11
GroupAggregate (1 row)
Group Key: b, c
-> Sort
Sort Key: b, c
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; estimated | actual
QUERY PLAN -----------+--------
----------------------------------- 100 | 11
GroupAggregate (1 row)
Group Key: a, b, c
-> Sort
Sort Key: a, b, c
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; estimated | actual
QUERY PLAN -----------+--------
----------------------------------- 200 | 11
GroupAggregate (1 row)
Group Key: a, b, c, d
-> Sort
Sort Key: a, b, c, d
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; estimated | actual
QUERY PLAN -----------+--------
----------------------------------- 200 | 11
GroupAggregate (1 row)
Group Key: b, c, d
-> Sort
Sort Key: b, c, d
-> Seq Scan on ndistinct
(5 rows)
-- correct command -- correct command
CREATE STATISTICS s10 ON a, b, c FROM ndistinct; CREATE STATISTICS s10 ON a, b, c FROM ndistinct;
@ -210,127 +202,87 @@ ANALYZE ndistinct;
SELECT stxkind, stxndistinct SELECT stxkind, stxndistinct
FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass; FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass;
stxkind | stxndistinct stxkind | stxndistinct
---------+--------------------------------------------------------- ---------+-----------------------------------------------------
{d,f,m} | {"3, 4": 301, "3, 6": 301, "4, 6": 301, "3, 4, 6": 301} {d,f,m} | {"3, 4": 11, "3, 6": 11, "4, 6": 11, "3, 4, 6": 11}
(1 row) (1 row)
-- Hash Aggregate, thanks to estimates improved by the statistic -- Hash Aggregate, thanks to estimates improved by the statistic
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b; estimated | actual
QUERY PLAN -----------+--------
----------------------------- 11 | 11
HashAggregate (1 row)
Group Key: a, b
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c; estimated | actual
QUERY PLAN -----------+--------
----------------------------- 11 | 11
HashAggregate (1 row)
Group Key: b, c
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; estimated | actual
QUERY PLAN -----------+--------
----------------------------- 11 | 11
HashAggregate (1 row)
Group Key: a, b, c
-> Seq Scan on ndistinct
(3 rows)
-- last two plans keep using Group Aggregate, because 'd' is not covered -- last two plans keep using Group Aggregate, because 'd' is not covered
-- by the statistic and while it's NULL-only we assume 200 values for it -- by the statistic and while it's NULL-only we assume 200 values for it
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; estimated | actual
QUERY PLAN -----------+--------
----------------------------------- 200 | 11
GroupAggregate (1 row)
Group Key: a, b, c, d
-> Sort
Sort Key: a, b, c, d
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; estimated | actual
QUERY PLAN -----------+--------
----------------------------------- 200 | 11
GroupAggregate (1 row)
Group Key: b, c, d
-> Sort
Sort Key: b, c, d
-> Seq Scan on ndistinct
(5 rows)
TRUNCATE TABLE ndistinct; TRUNCATE TABLE ndistinct;
-- under-estimates when using only per-column statistics -- under-estimates when using only per-column statistics
INSERT INTO ndistinct (a, b, c, filler1) INSERT INTO ndistinct (a, b, c, filler1)
SELECT mod(i,50), mod(i,51), mod(i,32), SELECT mod(i,50), mod(i,51), mod(i,32),
cash_words(mod(i,33)::int::money) cash_words(mod(i,33)::int::money)
FROM generate_series(1,10000) s(i); FROM generate_series(1,5000) s(i);
ANALYZE ndistinct; ANALYZE ndistinct;
SELECT stxkind, stxndistinct SELECT stxkind, stxndistinct
FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass; FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass;
stxkind | stxndistinct stxkind | stxndistinct
---------+------------------------------------------------------------- ---------+------------------------------------------------------------
{d,f,m} | {"3, 4": 2550, "3, 6": 800, "4, 6": 1632, "3, 4, 6": 10000} {d,f,m} | {"3, 4": 2550, "3, 6": 800, "4, 6": 1632, "3, 4, 6": 5000}
(1 row) (1 row)
-- plans using Group Aggregate, thanks to using correct estimates -- correct estimates
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b; estimated | actual
QUERY PLAN -----------+--------
----------------------------------- 2550 | 2550
GroupAggregate (1 row)
Group Key: a, b
-> Sort
Sort Key: a, b
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; estimated | actual
QUERY PLAN -----------+--------
----------------------------------- 5000 | 5000
GroupAggregate (1 row)
Group Key: a, b, c
-> Sort
Sort Key: a, b, c
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; estimated | actual
QUERY PLAN -----------+--------
----------------------------------- 5000 | 5000
GroupAggregate (1 row)
Group Key: a, b, c, d
-> Sort
Sort Key: a, b, c, d
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; estimated | actual
QUERY PLAN -----------+--------
----------------------------- 1632 | 1632
HashAggregate (1 row)
Group Key: b, c, d
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, d; estimated | actual
QUERY PLAN -----------+--------
----------------------------- 500 | 50
HashAggregate (1 row)
Group Key: a, d
-> Seq Scan on ndistinct
(3 rows)
DROP STATISTICS s10; DROP STATISTICS s10;
SELECT stxkind, stxndistinct SELECT stxkind, stxndistinct
@ -339,52 +291,36 @@ SELECT stxkind, stxndistinct
---------+-------------- ---------+--------------
(0 rows) (0 rows)
-- dropping the statistics switches the plans to Hash Aggregate, -- dropping the statistics results in under-estimates
-- due to under-estimates SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
EXPLAIN (COSTS off) estimated | actual
SELECT COUNT(*) FROM ndistinct GROUP BY a, b; -----------+--------
QUERY PLAN 500 | 2550
----------------------------- (1 row)
HashAggregate
Group Key: a, b
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; estimated | actual
QUERY PLAN -----------+--------
----------------------------- 500 | 5000
HashAggregate (1 row)
Group Key: a, b, c
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; estimated | actual
QUERY PLAN -----------+--------
----------------------------- 500 | 5000
HashAggregate (1 row)
Group Key: a, b, c, d
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; estimated | actual
QUERY PLAN -----------+--------
----------------------------- 500 | 1632
HashAggregate (1 row)
Group Key: b, c, d
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, d; estimated | actual
QUERY PLAN -----------+--------
----------------------------- 500 | 50
HashAggregate (1 row)
Group Key: a, d
-> Seq Scan on ndistinct
(3 rows)
-- functional dependencies tests -- functional dependencies tests
CREATE TABLE functional_dependencies ( CREATE TABLE functional_dependencies (
@ -396,51 +332,38 @@ CREATE TABLE functional_dependencies (
c INT, c INT,
d TEXT d TEXT
); );
SET random_page_cost = 1.2;
CREATE INDEX fdeps_ab_idx ON functional_dependencies (a, b); CREATE INDEX fdeps_ab_idx ON functional_dependencies (a, b);
CREATE INDEX fdeps_abc_idx ON functional_dependencies (a, b, c); CREATE INDEX fdeps_abc_idx ON functional_dependencies (a, b, c);
-- random data (no functional dependencies) -- random data (no functional dependencies)
INSERT INTO functional_dependencies (a, b, c, filler1) INSERT INTO functional_dependencies (a, b, c, filler1)
SELECT mod(i, 23), mod(i, 29), mod(i, 31), i FROM generate_series(1,5000) s(i); SELECT mod(i, 23), mod(i, 29), mod(i, 31), i FROM generate_series(1,5000) s(i);
ANALYZE functional_dependencies; ANALYZE functional_dependencies;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1'; estimated | actual
QUERY PLAN -----------+--------
--------------------------------------------------- 8 | 8
Bitmap Heap Scan on functional_dependencies (1 row)
Recheck Cond: ((a = 1) AND (b = '1'::text))
-> Bitmap Index Scan on fdeps_abc_idx
Index Cond: ((a = 1) AND (b = '1'::text))
(4 rows)
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1; estimated | actual
QUERY PLAN -----------+--------
----------------------------------------------------------- 1 | 1
Index Scan using fdeps_abc_idx on functional_dependencies (1 row)
Index Cond: ((a = 1) AND (b = '1'::text) AND (c = 1))
(2 rows)
-- create statistics -- create statistics
CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies; CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
ANALYZE functional_dependencies; ANALYZE functional_dependencies;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1'; estimated | actual
QUERY PLAN -----------+--------
--------------------------------------------------- 8 | 8
Bitmap Heap Scan on functional_dependencies (1 row)
Recheck Cond: ((a = 1) AND (b = '1'::text))
-> Bitmap Index Scan on fdeps_abc_idx
Index Cond: ((a = 1) AND (b = '1'::text))
(4 rows)
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1; estimated | actual
QUERY PLAN -----------+--------
----------------------------------------------------------- 1 | 1
Index Scan using fdeps_abc_idx on functional_dependencies (1 row)
Index Cond: ((a = 1) AND (b = '1'::text) AND (c = 1))
(2 rows)
-- a => b, a => c, b => c -- a => b, a => c, b => c
TRUNCATE functional_dependencies; TRUNCATE functional_dependencies;
@ -448,92 +371,48 @@ DROP STATISTICS func_deps_stat;
INSERT INTO functional_dependencies (a, b, c, filler1) INSERT INTO functional_dependencies (a, b, c, filler1)
SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i); SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
ANALYZE functional_dependencies; ANALYZE functional_dependencies;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1'; estimated | actual
QUERY PLAN -----------+--------
----------------------------------------------------------- 1 | 50
Index Scan using fdeps_abc_idx on functional_dependencies (1 row)
Index Cond: ((a = 1) AND (b = '1'::text))
(2 rows)
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1; estimated | actual
QUERY PLAN -----------+--------
----------------------------------------------------------- 1 | 50
Index Scan using fdeps_abc_idx on functional_dependencies (1 row)
Index Cond: ((a = 1) AND (b = '1'::text) AND (c = 1))
(2 rows)
-- create statistics -- create statistics
CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies; CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
ANALYZE functional_dependencies; ANALYZE functional_dependencies;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1'; estimated | actual
QUERY PLAN -----------+--------
--------------------------------------------------- 50 | 50
Bitmap Heap Scan on functional_dependencies (1 row)
Recheck Cond: ((a = 1) AND (b = '1'::text))
-> Bitmap Index Scan on fdeps_abc_idx
Index Cond: ((a = 1) AND (b = '1'::text))
(4 rows)
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1; estimated | actual
QUERY PLAN -----------+--------
--------------------------------------------------- 50 | 50
Bitmap Heap Scan on functional_dependencies (1 row)
Recheck Cond: ((a = 1) AND (b = '1'::text))
Filter: (c = 1)
-> Bitmap Index Scan on fdeps_ab_idx
Index Cond: ((a = 1) AND (b = '1'::text))
(5 rows)
-- check change of column type doesn't break it -- check change of column type doesn't break it
ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric; ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1; estimated | actual
QUERY PLAN -----------+--------
--------------------------------------------------- 50 | 50
Bitmap Heap Scan on functional_dependencies (1 row)
Recheck Cond: ((a = 1) AND (b = '1'::text))
Filter: (c = '1'::numeric)
-> Bitmap Index Scan on fdeps_ab_idx
Index Cond: ((a = 1) AND (b = '1'::text))
(5 rows)
ANALYZE functional_dependencies; ANALYZE functional_dependencies;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1; estimated | actual
QUERY PLAN -----------+--------
--------------------------------------------------- 50 | 50
Bitmap Heap Scan on functional_dependencies (1 row)
Recheck Cond: ((a = 1) AND (b = '1'::text))
Filter: (c = '1'::numeric)
-> Bitmap Index Scan on fdeps_ab_idx
Index Cond: ((a = 1) AND (b = '1'::text))
(5 rows)
RESET random_page_cost;
-- report the planner's estimated row count and the actual row count of the
-- top plan node for the given query
create function check_estimated_rows(text) returns table (estimated int, actual int)
language plpgsql as
$$
declare
    plan_line  text;    -- one line of EXPLAIN ANALYZE output
    row_counts text[];  -- captured "rows=" values from that line
begin
    for plan_line in
        execute format('explain analyze %s', $1)
    loop
        -- the top node is on the first output line, so parse it and stop
        row_counts := regexp_match(plan_line, 'rows=(\d*) .* rows=(\d*)');
        return query select row_counts[1]::int, row_counts[2]::int;
        return;
    end loop;
end;
$$;
-- MCV lists -- MCV lists
CREATE TABLE mcv_lists ( CREATE TABLE mcv_lists (
filler1 TEXT, filler1 TEXT,
@ -728,7 +607,6 @@ SELECT m.* FROM pg_statistic_ext,
0 | {1, 2, 3} | {f,f,f} | 1 | 1 0 | {1, 2, 3} | {f,f,f} | 1 | 1
(1 row) (1 row)
RESET random_page_cost;
-- mcv with arrays -- mcv with arrays
CREATE TABLE mcv_lists_arrays ( CREATE TABLE mcv_lists_arrays (
a TEXT[], a TEXT[],

View File

@ -3,9 +3,27 @@
-- We will be checking execution plans without/with statistics, so -- We will be checking execution plans without/with statistics, so
-- let's make sure we get simple non-parallel plans. Also set the -- let's make sure we get simple non-parallel plans. Also set the
-- work_mem low so that we can use small amounts of data. -- work_mem low so that we can use small amounts of data.
SET max_parallel_workers = 0;
SET max_parallel_workers_per_gather = 0; -- check the number of estimated/actual rows in the top node
SET work_mem = '128kB'; create function check_estimated_rows(text) returns table (estimated int, actual int)
language plpgsql as
$$
-- Runs "explain analyze" on the query text passed as $1 and returns a single
-- (estimated, actual) row parsed from the first line of the plan output,
-- i.e. the row counts reported for the top plan node.
declare
-- one line of the EXPLAIN output
ln text;
-- capture groups produced by regexp_match for that line
tmp text[];
-- true until the first output line has been handled
first_row bool := true;
begin
for ln in
execute format('explain analyze %s', $1)
loop
-- only the first line is parsed; later lines are read and ignored
if first_row then
first_row := false;
-- group 1: digits after the first "rows=", group 2: digits after a
-- later " rows=" on the same line
tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
-- if the line does not match, tmp is NULL and both columns come back NULL
return query select tmp[1]::int, tmp[2]::int;
end if;
end loop;
end;
$$;
-- Verify failures -- Verify failures
CREATE STATISTICS tst; CREATE STATISTICS tst;
@ -106,25 +124,20 @@ CREATE TABLE ndistinct (
-- over-estimates when using only per-column statistics -- over-estimates when using only per-column statistics
INSERT INTO ndistinct (a, b, c, filler1) INSERT INTO ndistinct (a, b, c, filler1)
SELECT i/100, i/100, i/100, cash_words((i/100)::money) SELECT i/100, i/100, i/100, cash_words((i/100)::money)
FROM generate_series(1,30000) s(i); FROM generate_series(1,1000) s(i);
ANALYZE ndistinct; ANALYZE ndistinct;
-- Group Aggregate, due to over-estimate of the number of groups -- Group Aggregate, due to over-estimate of the number of groups
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
-- correct command -- correct command
CREATE STATISTICS s10 ON a, b, c FROM ndistinct; CREATE STATISTICS s10 ON a, b, c FROM ndistinct;
@ -135,22 +148,17 @@ SELECT stxkind, stxndistinct
FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass; FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass;
-- Hash Aggregate, thanks to estimates improved by the statistic -- Hash Aggregate, thanks to estimates improved by the statistic
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
-- last two plans keep using Group Aggregate, because 'd' is not covered -- last two plans keep using Group Aggregate, because 'd' is not covered
-- by the statistic and while it's NULL-only we assume 200 values for it -- by the statistic and while it's NULL-only we assume 200 values for it
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
TRUNCATE TABLE ndistinct; TRUNCATE TABLE ndistinct;
@ -158,50 +166,39 @@ TRUNCATE TABLE ndistinct;
INSERT INTO ndistinct (a, b, c, filler1) INSERT INTO ndistinct (a, b, c, filler1)
SELECT mod(i,50), mod(i,51), mod(i,32), SELECT mod(i,50), mod(i,51), mod(i,32),
cash_words(mod(i,33)::int::money) cash_words(mod(i,33)::int::money)
FROM generate_series(1,10000) s(i); FROM generate_series(1,5000) s(i);
ANALYZE ndistinct; ANALYZE ndistinct;
SELECT stxkind, stxndistinct SELECT stxkind, stxndistinct
FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass; FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass;
-- plans using Group Aggregate, thanks to using correct estimates -- correct estimates
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
DROP STATISTICS s10; DROP STATISTICS s10;
SELECT stxkind, stxndistinct SELECT stxkind, stxndistinct
FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass; FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass;
-- dropping the statistics switches the plans to Hash Aggregate, -- dropping the statistics results in under-estimates
-- due to under-estimates SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
EXPLAIN (COSTS off) SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
-- functional dependencies tests -- functional dependencies tests
CREATE TABLE functional_dependencies ( CREATE TABLE functional_dependencies (
@ -214,8 +211,6 @@ CREATE TABLE functional_dependencies (
d TEXT d TEXT
); );
SET random_page_cost = 1.2;
CREATE INDEX fdeps_ab_idx ON functional_dependencies (a, b); CREATE INDEX fdeps_ab_idx ON functional_dependencies (a, b);
CREATE INDEX fdeps_abc_idx ON functional_dependencies (a, b, c); CREATE INDEX fdeps_abc_idx ON functional_dependencies (a, b, c);
@ -225,22 +220,18 @@ INSERT INTO functional_dependencies (a, b, c, filler1)
ANALYZE functional_dependencies; ANALYZE functional_dependencies;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1';
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1;
-- create statistics -- create statistics
CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies; CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
ANALYZE functional_dependencies; ANALYZE functional_dependencies;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1';
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1;
-- a => b, a => c, b => c -- a => b, a => c, b => c
TRUNCATE functional_dependencies; TRUNCATE functional_dependencies;
@ -251,56 +242,27 @@ INSERT INTO functional_dependencies (a, b, c, filler1)
ANALYZE functional_dependencies; ANALYZE functional_dependencies;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1';
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1;
-- create statistics -- create statistics
CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies; CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
ANALYZE functional_dependencies; ANALYZE functional_dependencies;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1';
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1;
-- check change of column type doesn't break it -- check change of column type doesn't break it
ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric; ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1;
ANALYZE functional_dependencies; ANALYZE functional_dependencies;
EXPLAIN (COSTS OFF) SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1;
RESET random_page_cost;
-- report the planner's estimated row count and the actual row count of the
-- top plan node for the given query
create function check_estimated_rows(text) returns table (estimated int, actual int)
language plpgsql as
$$
declare
    plan_line  text;    -- one line of EXPLAIN ANALYZE output
    row_counts text[];  -- captured "rows=" values from that line
begin
    for plan_line in
        execute format('explain analyze %s', $1)
    loop
        -- the top node is on the first output line, so parse it and stop
        row_counts := regexp_match(plan_line, 'rows=(\d*) .* rows=(\d*)');
        return query select row_counts[1]::int, row_counts[2]::int;
        return;
    end loop;
end;
$$;
-- MCV lists -- MCV lists
CREATE TABLE mcv_lists ( CREATE TABLE mcv_lists (
@ -419,8 +381,6 @@ ANALYZE mcv_lists;
SELECT m.* FROM pg_statistic_ext, SELECT m.* FROM pg_statistic_ext,
pg_mcv_list_items(stxmcv) m WHERE stxname = 'mcv_lists_stats'; pg_mcv_list_items(stxmcv) m WHERE stxname = 'mcv_lists_stats';
RESET random_page_cost;
-- mcv with arrays -- mcv with arrays
CREATE TABLE mcv_lists_arrays ( CREATE TABLE mcv_lists_arrays (
a TEXT[], a TEXT[],