diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out index 30bd866138..8ffb0fcfc4 100644 --- a/src/test/regress/expected/random.out +++ b/src/test/regress/expected/random.out @@ -1,81 +1,146 @@ -- -- RANDOM --- Test the random function +-- Test random() and allies -- --- count the number of tuples originally, should be 1000 -SELECT count(*) FROM onek; - count -------- - 1000 +-- Tests in this file may have a small probability of failure, +-- since we are dealing with randomness. Try to keep the failure +-- risk for any one test case under 1e-9. +-- +-- There should be no duplicates in 1000 random() values. +-- (Assuming 52 random bits in the float8 results, we could +-- take as many as 3000 values and still have less than 1e-9 chance +-- of failure, per https://en.wikipedia.org/wiki/Birthday_problem) +SELECT r, count(*) +FROM (SELECT random() r FROM generate_series(1, 1000)) ss +GROUP BY r HAVING count(*) > 1; + r | count +---+------- +(0 rows) + +-- The range should be [0, 1). We can expect that at least one out of 2000 +-- random values is in the lowest or highest 1% of the range with failure +-- probability less than about 1e-9. +SELECT count(*) FILTER (WHERE r < 0 OR r >= 1) AS out_of_range, + (count(*) FILTER (WHERE r < 0.01)) > 0 AS has_small, + (count(*) FILTER (WHERE r > 0.99)) > 0 AS has_large +FROM (SELECT random() r FROM generate_series(1, 2000)) ss; + out_of_range | has_small | has_large +--------------+-----------+----------- + 0 | t | t (1 row) --- pick three random rows, they shouldn't match -(SELECT unique1 AS random - FROM onek ORDER BY random() LIMIT 1) -INTERSECT -(SELECT unique1 AS random - FROM onek ORDER BY random() LIMIT 1) -INTERSECT -(SELECT unique1 AS random - FROM onek ORDER BY random() LIMIT 1); - random --------- -(0 rows) - --- count roughly 1/10 of the tuples -CREATE TABLE RANDOM_TBL AS - SELECT count(*) AS random - FROM onek WHERE random() < 1.0/10; --- select again, the count should be different -INSERT INTO RANDOM_TBL (random) - SELECT count(*) - FROM onek WHERE random() < 1.0/10; --- select again, the count should be different -INSERT INTO RANDOM_TBL (random) - SELECT count(*) - FROM onek WHERE random() < 1.0/10; --- select again, the count should be different -INSERT INTO RANDOM_TBL (random) - SELECT count(*) - FROM onek WHERE random() < 1.0/10; --- now test that they are different counts -SELECT random, count(random) FROM RANDOM_TBL - GROUP BY random HAVING count(random) > 3; - random | count ---------+------- -(0 rows) - -SELECT AVG(random) FROM RANDOM_TBL - HAVING AVG(random) NOT BETWEEN 80 AND 120; - avg ------ -(0 rows) +-- Check for uniform distribution using the Kolmogorov-Smirnov test. +CREATE FUNCTION ks_test_uniform_random() +RETURNS boolean AS +$$ +DECLARE + n int := 1000; -- Number of samples + c float8 := 1.94947; -- Critical value for 99.9% confidence + ok boolean; +BEGIN + ok := ( + WITH samples AS ( + SELECT random() r FROM generate_series(1, n) ORDER BY 1 + ), indexed_samples AS ( + SELECT (row_number() OVER())-1.0 i, r FROM samples + ) + SELECT max(abs(i/n-r)) < c / sqrt(n) FROM indexed_samples + ); + RETURN ok; +END +$$ +LANGUAGE plpgsql; +-- As written, ks_test_uniform_random() returns true about 99.9% +-- of the time. To get down to a roughly 1e-9 test failure rate, +-- just run it 3 times and accept if any one of them passes. +SELECT ks_test_uniform_random() OR + ks_test_uniform_random() OR + ks_test_uniform_random() AS uniform; + uniform +--------- + t +(1 row) -- now test random_normal() -TRUNCATE random_tbl; -INSERT INTO random_tbl (random) - SELECT count(*) - FROM onek WHERE random_normal(0, 1) < 0; -INSERT INTO random_tbl (random) - SELECT count(*) - FROM onek WHERE random_normal(0) < 0; -INSERT INTO random_tbl (random) - SELECT count(*) - FROM onek WHERE random_normal() < 0; -INSERT INTO random_tbl (random) - SELECT count(*) - FROM onek WHERE random_normal(stddev => 1, mean => 0) < 0; --- expect similar, but not identical values -SELECT random, count(random) FROM random_tbl - GROUP BY random HAVING count(random) > 3; - random | count ---------+------- +-- As above, there should be no duplicates in 1000 random_normal() values. +SELECT r, count(*) +FROM (SELECT random_normal() r FROM generate_series(1, 1000)) ss +GROUP BY r HAVING count(*) > 1; + r | count +---+------- (0 rows) --- approximately check expected distribution -SELECT AVG(random) FROM random_tbl - HAVING AVG(random) NOT BETWEEN 400 AND 600; - avg ------ -(0 rows) +-- ... unless we force the range (standard deviation) to zero. +-- This is a good place to check that the mean input does something, too. +SELECT r, count(*) +FROM (SELECT random_normal(10, 0) r FROM generate_series(1, 100)) ss +GROUP BY r; + r | count +----+------- + 10 | 100 +(1 row) + +SELECT r, count(*) +FROM (SELECT random_normal(-10, 0) r FROM generate_series(1, 100)) ss +GROUP BY r; + r | count +-----+------- + -10 | 100 +(1 row) + +-- setseed() should produce a reproducible series of random() values. +SELECT setseed(0.5); + setseed +--------- + +(1 row) + +SELECT random() FROM generate_series(1, 10); + random +--------------------- + 0.9851677175347999 + 0.825301858027981 + 0.12974610012450416 + 0.16356291958601088 + 0.6476186144084 + 0.8822771983038762 + 0.1404566845227775 + 0.15619865764623442 + 0.5145227426983392 + 0.7712969548127826 +(10 rows) + +-- Likewise for random_normal(); however, since its implementation relies +-- on libm functions that have different roundoff behaviors on different +-- machines, we have to round off the results a bit to get consistent output. +SET extra_float_digits = 0; +SELECT random_normal() FROM generate_series(1, 10); + random_normal +-------------------- + 0.208534644938377 + 0.264530240540963 + -0.606752467900428 + 0.825799427852654 + 1.70111611735357 + -0.223445463716189 + 0.249712419190998 + -1.2494722990669 + 0.125627152043677 + 0.475391614544013 +(10 rows) + +SELECT random_normal(mean => 1, stddev => 0.1) r FROM generate_series(1, 10); + r +------------------- + 1.00605972811732 + 1.09685453015002 + 1.02869206132007 + 0.909475676712336 + 0.983724763134265 + 0.939344549577623 + 1.18713500206363 + 0.962257684292933 + 0.914441206800407 + 0.964031055575433 +(10 rows) diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index f99e99373a..a930dfe48c 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -61,9 +61,6 @@ test: create_aggregate create_function_sql create_cast constraints triggers sele # ---------- test: sanity_check -# Note: the ignore: line does not skip random, just mark it as ignorable -ignore: random - # ---------- # Another group of parallel tests # aggregates depends on create_aggregate diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql index 3104af46b7..088c451b85 100644 --- a/src/test/regress/sql/random.sql +++ b/src/test/regress/sql/random.sql @@ -1,68 +1,85 @@ -- -- RANDOM --- Test the random function +-- Test random() and allies +-- +-- Tests in this file may have a small probability of failure, +-- since we are dealing with randomness. Try to keep the failure +-- risk for any one test case under 1e-9. -- --- count the number of tuples originally, should be 1000 -SELECT count(*) FROM onek; +-- There should be no duplicates in 1000 random() values. +-- (Assuming 52 random bits in the float8 results, we could +-- take as many as 3000 values and still have less than 1e-9 chance +-- of failure, per https://en.wikipedia.org/wiki/Birthday_problem) +SELECT r, count(*) +FROM (SELECT random() r FROM generate_series(1, 1000)) ss +GROUP BY r HAVING count(*) > 1; --- pick three random rows, they shouldn't match -(SELECT unique1 AS random - FROM onek ORDER BY random() LIMIT 1) -INTERSECT -(SELECT unique1 AS random - FROM onek ORDER BY random() LIMIT 1) -INTERSECT -(SELECT unique1 AS random - FROM onek ORDER BY random() LIMIT 1); +-- The range should be [0, 1). We can expect that at least one out of 2000 +-- random values is in the lowest or highest 1% of the range with failure +-- probability less than about 1e-9. --- count roughly 1/10 of the tuples -CREATE TABLE RANDOM_TBL AS - SELECT count(*) AS random - FROM onek WHERE random() < 1.0/10; +SELECT count(*) FILTER (WHERE r < 0 OR r >= 1) AS out_of_range, + (count(*) FILTER (WHERE r < 0.01)) > 0 AS has_small, + (count(*) FILTER (WHERE r > 0.99)) > 0 AS has_large +FROM (SELECT random() r FROM generate_series(1, 2000)) ss; --- select again, the count should be different -INSERT INTO RANDOM_TBL (random) - SELECT count(*) - FROM onek WHERE random() < 1.0/10; +-- Check for uniform distribution using the Kolmogorov-Smirnov test. --- select again, the count should be different -INSERT INTO RANDOM_TBL (random) - SELECT count(*) - FROM onek WHERE random() < 1.0/10; +CREATE FUNCTION ks_test_uniform_random() +RETURNS boolean AS +$$ +DECLARE + n int := 1000; -- Number of samples + c float8 := 1.94947; -- Critical value for 99.9% confidence + ok boolean; +BEGIN + ok := ( + WITH samples AS ( + SELECT random() r FROM generate_series(1, n) ORDER BY 1 + ), indexed_samples AS ( + SELECT (row_number() OVER())-1.0 i, r FROM samples + ) + SELECT max(abs(i/n-r)) < c / sqrt(n) FROM indexed_samples + ); + RETURN ok; +END +$$ +LANGUAGE plpgsql; --- select again, the count should be different -INSERT INTO RANDOM_TBL (random) - SELECT count(*) - FROM onek WHERE random() < 1.0/10; - --- now test that they are different counts -SELECT random, count(random) FROM RANDOM_TBL - GROUP BY random HAVING count(random) > 3; - -SELECT AVG(random) FROM RANDOM_TBL - HAVING AVG(random) NOT BETWEEN 80 AND 120; +-- As written, ks_test_uniform_random() returns true about 99.9% +-- of the time. To get down to a roughly 1e-9 test failure rate, +-- just run it 3 times and accept if any one of them passes. +SELECT ks_test_uniform_random() OR + ks_test_uniform_random() OR + ks_test_uniform_random() AS uniform; -- now test random_normal() -TRUNCATE random_tbl; -INSERT INTO random_tbl (random) - SELECT count(*) - FROM onek WHERE random_normal(0, 1) < 0; -INSERT INTO random_tbl (random) - SELECT count(*) - FROM onek WHERE random_normal(0) < 0; -INSERT INTO random_tbl (random) - SELECT count(*) - FROM onek WHERE random_normal() < 0; -INSERT INTO random_tbl (random) - SELECT count(*) - FROM onek WHERE random_normal(stddev => 1, mean => 0) < 0; +-- As above, there should be no duplicates in 1000 random_normal() values. +SELECT r, count(*) +FROM (SELECT random_normal() r FROM generate_series(1, 1000)) ss +GROUP BY r HAVING count(*) > 1; --- expect similar, but not identical values -SELECT random, count(random) FROM random_tbl - GROUP BY random HAVING count(random) > 3; +-- ... unless we force the range (standard deviation) to zero. +-- This is a good place to check that the mean input does something, too. +SELECT r, count(*) +FROM (SELECT random_normal(10, 0) r FROM generate_series(1, 100)) ss +GROUP BY r; +SELECT r, count(*) +FROM (SELECT random_normal(-10, 0) r FROM generate_series(1, 100)) ss +GROUP BY r; --- approximately check expected distribution -SELECT AVG(random) FROM random_tbl - HAVING AVG(random) NOT BETWEEN 400 AND 600; +-- setseed() should produce a reproducible series of random() values. + +SELECT setseed(0.5); + +SELECT random() FROM generate_series(1, 10); + +-- Likewise for random_normal(); however, since its implementation relies +-- on libm functions that have different roundoff behaviors on different +-- machines, we have to round off the results a bit to get consistent output. +SET extra_float_digits = 0; + +SELECT random_normal() FROM generate_series(1, 10); +SELECT random_normal(mean => 1, stddev => 0.1) r FROM generate_series(1, 10);