Re: [PATCH] random_normal function

Поиск
Список
Период
Сортировка
От	Tom Lane
Тема	Re: [PATCH] random_normal function
Дата	9 января 2023 г. 18:52:16
Msg-id	4173840.1673290336@sss.pgh.pa.us обсуждение исходный текст
Ответ на	Re: [PATCH] random_normal function (Tom Lane <tgl@sss.pgh.pa.us>)
Ответы	Re: [PATCH] random_normal function Re: [PATCH] random_normal function
Список	pgsql-hackers
Дерево обсуждения
I wrote:
> Hmm ... it occurred to me to try the same check on the existing
> random() tests (attached), and darn if they don't fail even more
> often, usually within 50K iterations.  So maybe we should rethink
> that whole thing.

I pushed Paul's patch with the previously-discussed tests, but
the more I look at random.sql the less I like it.  I propose
that we nuke the existing tests from orbit and replace with
something more or less as attached.  This is faster than what
we have, removes the unnecessary dependency on the "onek" table,
and I believe it to be a substantially more thorough test of the
random functions' properties.  (We could probably go further
than this, like trying to verify distribution properties.  But
it's been too long since college statistics for me to want to
write such tests myself, and I'm not real sure we need them.)

BTW, if this does bring the probability of failure down to the
one-in-a-billion range, I think we could also nuke the whole
"ignore:" business, simplifying pg_regress and allowing the
random test to be run in parallel with others.

            regards, tom lane

diff --git a/src/test/regress/expected/random.out b/src/test/regress/expected/random.out
index 30bd866138..2a9249f3cb 100644
--- a/src/test/regress/expected/random.out
+++ b/src/test/regress/expected/random.out
@@ -1,81 +1,110 @@
 --
 -- RANDOM
--- Test the random function
+-- Test random() and allies
 --
--- count the number of tuples originally, should be 1000
-SELECT count(*) FROM onek;
- count
--------
-  1000
+-- Tests in this file may have a small probability of failure,
+-- since we are dealing with randomness.  Try to keep the failure
+-- risk for any one test case under 1e-9.
+--
+-- There should be no duplicates in 1000 random() values.
+-- (Assuming 52 random bits in the float8 results, we could
+-- take as many as 3000 values and still have less than 1e-9 chance
+-- of failure, per https://en.wikipedia.org/wiki/Birthday_problem)
+SELECT r, count(*)
+FROM (SELECT random() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;
+ r | count
+---+-------
+(0 rows)
+
+-- The range should be [0, 1).  We can expect that at least one out of 2000
+-- random values is in the lowest or highest 1% of the range with failure
+-- probability less than about 1e-9.
+SELECT count(*) FILTER (WHERE r < 0 OR r >= 1) AS out_of_range,
+       (count(*) FILTER (WHERE r < 0.01)) > 0 AS has_small,
+       (count(*) FILTER (WHERE r > 0.99)) > 0 AS has_large
+FROM (SELECT random() r FROM generate_series(1, 2000)) ss;
+ out_of_range | has_small | has_large
+--------------+-----------+-----------
+            0 | t         | t
 (1 row)

--- pick three random rows, they shouldn't match
-(SELECT unique1 AS random
-  FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
-  FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
-  FROM onek ORDER BY random() LIMIT 1);
- random
---------
-(0 rows)
-
--- count roughly 1/10 of the tuples
-CREATE TABLE RANDOM_TBL AS
-  SELECT count(*) AS random
-  FROM onek WHERE random() < 1.0/10;
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
-  SELECT count(*)
-  FROM onek WHERE random() < 1.0/10;
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
-  SELECT count(*)
-  FROM onek WHERE random() < 1.0/10;
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
-  SELECT count(*)
-  FROM onek WHERE random() < 1.0/10;
--- now test that they are different counts
-SELECT random, count(random) FROM RANDOM_TBL
-  GROUP BY random HAVING count(random) > 3;
- random | count
---------+-------
-(0 rows)
-
-SELECT AVG(random) FROM RANDOM_TBL
-  HAVING AVG(random) NOT BETWEEN 80 AND 120;
- avg
------
-(0 rows)
-
 -- now test random_normal()
-TRUNCATE random_tbl;
-INSERT INTO random_tbl (random)
-  SELECT count(*)
-  FROM onek WHERE random_normal(0, 1) < 0;
-INSERT INTO random_tbl (random)
-  SELECT count(*)
-  FROM onek WHERE random_normal(0) < 0;
-INSERT INTO random_tbl (random)
-  SELECT count(*)
-  FROM onek WHERE random_normal() < 0;
-INSERT INTO random_tbl (random)
-  SELECT count(*)
-  FROM onek WHERE random_normal(stddev => 1, mean => 0) < 0;
--- expect similar, but not identical values
-SELECT random, count(random) FROM random_tbl
-  GROUP BY random HAVING count(random) > 3;
- random | count
---------+-------
+-- As above, there should be no duplicates in 1000 random_normal() values.
+SELECT r, count(*)
+FROM (SELECT random_normal() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;
+ r | count
+---+-------
 (0 rows)

--- approximately check expected distribution
-SELECT AVG(random) FROM random_tbl
-  HAVING AVG(random) NOT BETWEEN 400 AND 600;
- avg
------
-(0 rows)
+-- ... unless we force the range (standard deviation) to zero.
+-- This is a good place to check that the mean input does something, too.
+SELECT r, count(*)
+FROM (SELECT random_normal(10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;
+ r  | count
+----+-------
+ 10 |   100
+(1 row)
+
+SELECT r, count(*)
+FROM (SELECT random_normal(-10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;
+  r  | count
+-----+-------
+ -10 |   100
+(1 row)
+
+-- setseed() should produce a reproducible series of numbers.
+SELECT setseed(0.5);
+ setseed
+---------
+
+(1 row)
+
+SELECT random() FROM generate_series(1, 10);
+       random
+---------------------
+  0.9851677175347999
+   0.825301858027981
+ 0.12974610012450416
+ 0.16356291958601088
+     0.6476186144084
+  0.8822771983038762
+  0.1404566845227775
+ 0.15619865764623442
+  0.5145227426983392
+  0.7712969548127826
+(10 rows)
+
+SELECT random_normal() FROM generate_series(1, 10);
+    random_normal
+----------------------
+  0.20853464493837737
+   0.2645302405409627
+   -0.606752467900428
+   0.8257994278526545
+   1.7011161173535707
+ -0.22344546371618862
+  0.24971241919099813
+  -1.2494722990669047
+   0.1256271520436768
+  0.47539161454401274
+(10 rows)
+
+SELECT random_normal(mean => 1, stddev => 0.1) r FROM generate_series(1, 10);
+         r
+--------------------
+  1.006059728117318
+ 1.0968545301500248
+ 1.0286920613200707
+ 0.9094756767123359
+ 0.9837247631342648
+ 0.9393445495776231
+ 1.1871350020636253
+ 0.9622576842929335
+ 0.9144412068004066
+ 0.9640310555754329
+(10 rows)

diff --git a/src/test/regress/sql/random.sql b/src/test/regress/sql/random.sql
index 3104af46b7..adfd7b3261 100644
--- a/src/test/regress/sql/random.sql
+++ b/src/test/regress/sql/random.sql
@@ -1,68 +1,49 @@
 --
 -- RANDOM
--- Test the random function
+-- Test random() and allies
+--
+-- Tests in this file may have a small probability of failure,
+-- since we are dealing with randomness.  Try to keep the failure
+-- risk for any one test case under 1e-9.
 --

--- count the number of tuples originally, should be 1000
-SELECT count(*) FROM onek;
-
--- pick three random rows, they shouldn't match
-(SELECT unique1 AS random
-  FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
-  FROM onek ORDER BY random() LIMIT 1)
-INTERSECT
-(SELECT unique1 AS random
-  FROM onek ORDER BY random() LIMIT 1);
-
--- count roughly 1/10 of the tuples
-CREATE TABLE RANDOM_TBL AS
-  SELECT count(*) AS random
-  FROM onek WHERE random() < 1.0/10;
-
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
-  SELECT count(*)
-  FROM onek WHERE random() < 1.0/10;
-
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
-  SELECT count(*)
-  FROM onek WHERE random() < 1.0/10;
-
--- select again, the count should be different
-INSERT INTO RANDOM_TBL (random)
-  SELECT count(*)
-  FROM onek WHERE random() < 1.0/10;
-
--- now test that they are different counts
-SELECT random, count(random) FROM RANDOM_TBL
-  GROUP BY random HAVING count(random) > 3;
-
-SELECT AVG(random) FROM RANDOM_TBL
-  HAVING AVG(random) NOT BETWEEN 80 AND 120;
+-- There should be no duplicates in 1000 random() values.
+-- (Assuming 52 random bits in the float8 results, we could
+-- take as many as 3000 values and still have less than 1e-9 chance
+-- of failure, per https://en.wikipedia.org/wiki/Birthday_problem)
+SELECT r, count(*)
+FROM (SELECT random() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;
+
+-- The range should be [0, 1).  We can expect that at least one out of 2000
+-- random values is in the lowest or highest 1% of the range with failure
+-- probability less than about 1e-9.
+
+SELECT count(*) FILTER (WHERE r < 0 OR r >= 1) AS out_of_range,
+       (count(*) FILTER (WHERE r < 0.01)) > 0 AS has_small,
+       (count(*) FILTER (WHERE r > 0.99)) > 0 AS has_large
+FROM (SELECT random() r FROM generate_series(1, 2000)) ss;

 -- now test random_normal()

-TRUNCATE random_tbl;
-INSERT INTO random_tbl (random)
-  SELECT count(*)
-  FROM onek WHERE random_normal(0, 1) < 0;
-INSERT INTO random_tbl (random)
-  SELECT count(*)
-  FROM onek WHERE random_normal(0) < 0;
-INSERT INTO random_tbl (random)
-  SELECT count(*)
-  FROM onek WHERE random_normal() < 0;
-INSERT INTO random_tbl (random)
-  SELECT count(*)
-  FROM onek WHERE random_normal(stddev => 1, mean => 0) < 0;
+-- As above, there should be no duplicates in 1000 random_normal() values.
+SELECT r, count(*)
+FROM (SELECT random_normal() r FROM generate_series(1, 1000)) ss
+GROUP BY r HAVING count(*) > 1;

--- expect similar, but not identical values
-SELECT random, count(random) FROM random_tbl
-  GROUP BY random HAVING count(random) > 3;
+-- ... unless we force the range (standard deviation) to zero.
+-- This is a good place to check that the mean input does something, too.
+SELECT r, count(*)
+FROM (SELECT random_normal(10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;
+SELECT r, count(*)
+FROM (SELECT random_normal(-10, 0) r FROM generate_series(1, 100)) ss
+GROUP BY r;

--- approximately check expected distribution
-SELECT AVG(random) FROM random_tbl
-  HAVING AVG(random) NOT BETWEEN 400 AND 600;
+-- setseed() should produce a reproducible series of numbers.
+
+SELECT setseed(0.5);
+
+SELECT random() FROM generate_series(1, 10);
+SELECT random_normal() FROM generate_series(1, 10);
+SELECT random_normal(mean => 1, stddev => 0.1) r FROM generate_series(1, 10);
В списке pgsql-hackers по дате отправления:
Вход в личный кабинет

Восстановление пароля

Подтверждение аккаунта

Изменение пароля

Re: [PATCH] random_normal function