From be4b4dc75955318e763f5b2e3a990e35366ac797 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 1 Apr 2016 15:47:52 -0400
Subject: [PATCH] Omit null rows when applying the Haas-Stokes estimator for
 ndistinct.

Previously, we included null rows in the values of n and N that went
into the formula, which amounts to considering null as a value in its
own right; but the d and f1 values do not include nulls.  This is
inconsistent, and it contributes to significant underestimation of
ndistinct when the column is mostly nulls.  In any case stadistinct
is defined as the number of distinct non-null values, so we should
exclude nulls when doing this computation.

This is an aboriginal bug in our application of the Haas-Stokes formula,
but we'll refrain from back-patching for fear of destabilizing plan
choices in released branches.

While at it, make the code a bit more readable by omitting unnecessary
casts and intermediate variables.

Observation and original patch by Tomas Vondra, adjusted to fix both
uses of the formula by Alex Shulgin, cosmetic improvements by me
---
 src/backend/commands/analyze.c | 62 +++++++++++++++++++++-------------
 1 file changed, 38 insertions(+), 24 deletions(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 8a5f07c957..b0c65650ee 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2072,6 +2072,12 @@ compute_distinct_stats(VacAttrStatsP stats,
 			 * recommend are considerably more complex, and are numerically
 			 * very unstable when n is much smaller than N.
 			 *
+			 * In this calculation, we consider only non-nulls.  We used to
+			 * include rows with null values in the n and N counts, but that
+			 * leads to inaccurate answers in columns with many nulls, and
+			 * it's intuitively bogus anyway considering the desired result is
+			 * the number of distinct non-null values.
+			 *
 			 * We assume (not very reliably!) that all the multiply-occurring
 			 * values are reflected in the final track[] list, and the other
 			 * nonnull values all appeared but once.  (XXX this usually
@@ -2081,21 +2087,22 @@ compute_distinct_stats(VacAttrStatsP stats,
 			 */
 			int			f1 = nonnull_cnt - summultiple;
 			int			d = f1 + nmultiple;
-			double		numer,
-						denom,
-						stadistinct;
+			double		n = samplerows - null_cnt;
+			double		N = totalrows * (1.0 - stats->stanullfrac);
+			double		stadistinct;
 
-			numer = (double) samplerows *(double) d;
+			/* N == 0 shouldn't happen, but just in case ... */
+			if (N > 0)
+				stadistinct = (n * d) / ((n - f1) + f1 * n / N);
+			else
+				stadistinct = 0;
 
-			denom = (double) (samplerows - f1) +
-				(double) f1 *(double) samplerows / totalrows;
-
-			stadistinct = numer / denom;
 			/* Clamp to sane range in case of roundoff error */
-			if (stadistinct < (double) d)
-				stadistinct = (double) d;
-			if (stadistinct > totalrows)
-				stadistinct = totalrows;
+			if (stadistinct < d)
+				stadistinct = d;
+			if (stadistinct > N)
+				stadistinct = N;
+			/* And round to integer */
 			stats->stadistinct = floor(stadistinct + 0.5);
 		}
 
@@ -2425,26 +2432,33 @@ compute_scalar_stats(VacAttrStatsP stats,
 			 * recommend are considerably more complex, and are numerically
 			 * very unstable when n is much smaller than N.
 			 *
+			 * In this calculation, we consider only non-nulls.  We used to
+			 * include rows with null values in the n and N counts, but that
+			 * leads to inaccurate answers in columns with many nulls, and
+			 * it's intuitively bogus anyway considering the desired result is
+			 * the number of distinct non-null values.
+			 *
 			 * Overwidth values are assumed to have been distinct.
 			 *----------
 			 */
 			int			f1 = ndistinct - nmultiple + toowide_cnt;
 			int			d = f1 + nmultiple;
-			double		numer,
-						denom,
-						stadistinct;
+			double		n = samplerows - null_cnt;
+			double		N = totalrows * (1.0 - stats->stanullfrac);
+			double		stadistinct;
 
-			numer = (double) samplerows *(double) d;
+			/* N == 0 shouldn't happen, but just in case ... */
+			if (N > 0)
+				stadistinct = (n * d) / ((n - f1) + f1 * n / N);
+			else
+				stadistinct = 0;
 
-			denom = (double) (samplerows - f1) +
-				(double) f1 *(double) samplerows / totalrows;
-
-			stadistinct = numer / denom;
 			/* Clamp to sane range in case of roundoff error */
-			if (stadistinct < (double) d)
-				stadistinct = (double) d;
-			if (stadistinct > totalrows)
-				stadistinct = totalrows;
+			if (stadistinct < d)
+				stadistinct = d;
+			if (stadistinct > N)
+				stadistinct = N;
+			/* And round to integer */
 			stats->stadistinct = floor(stadistinct + 0.5);
 		}