From be4b4dc75955318e763f5b2e3a990e35366ac797 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 1 Apr 2016 15:47:52 -0400 Subject: [PATCH] Omit null rows when applying the Haas-Stokes estimator for ndistinct. Previously, we included null rows in the values of n and N that went into the formula, which amounts to considering null as a value in its own right; but the d and f1 values do not include nulls. This is inconsistent, and it contributes to significant underestimation of ndistinct when the column is mostly nulls. In any case stadistinct is defined as the number of distinct non-null values, so we should exclude nulls when doing this computation. This is an aboriginal bug in our application of the Haas-Stokes formula, but we'll refrain from back-patching for fear of destabilizing plan choices in released branches. While at it, make the code a bit more readable by omitting unnecessary casts and intermediate variables. Observation and original patch by Tomas Vondra, adjusted to fix both uses of the formula by Alex Shulgin, cosmetic improvements by me --- src/backend/commands/analyze.c | 62 +++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 8a5f07c957..b0c65650ee 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -2072,6 +2072,12 @@ compute_distinct_stats(VacAttrStatsP stats, * recommend are considerably more complex, and are numerically * very unstable when n is much smaller than N. * + * In this calculation, we consider only non-nulls. We used to + * include rows with null values in the n and N counts, but that + * leads to inaccurate answers in columns with many nulls, and + * it's intuitively bogus anyway considering the desired result is + * the number of distinct non-null values. + * * We assume (not very reliably!) that all the multiply-occurring * values are reflected in the final track[] list, and the other * nonnull values all appeared but once. (XXX this usually @@ -2081,21 +2087,22 @@ compute_distinct_stats(VacAttrStatsP stats, */ int f1 = nonnull_cnt - summultiple; int d = f1 + nmultiple; - double numer, - denom, - stadistinct; + double n = samplerows - null_cnt; + double N = totalrows * (1.0 - stats->stanullfrac); + double stadistinct; - numer = (double) samplerows *(double) d; + /* N == 0 shouldn't happen, but just in case ... */ + if (N > 0) + stadistinct = (n * d) / ((n - f1) + f1 * n / N); + else + stadistinct = 0; - denom = (double) (samplerows - f1) + - (double) f1 *(double) samplerows / totalrows; - - stadistinct = numer / denom; /* Clamp to sane range in case of roundoff error */ - if (stadistinct < (double) d) - stadistinct = (double) d; - if (stadistinct > totalrows) - stadistinct = totalrows; + if (stadistinct < d) + stadistinct = d; + if (stadistinct > N) + stadistinct = N; + /* And round to integer */ stats->stadistinct = floor(stadistinct + 0.5); } @@ -2425,26 +2432,33 @@ compute_scalar_stats(VacAttrStatsP stats, * recommend are considerably more complex, and are numerically * very unstable when n is much smaller than N. * + * In this calculation, we consider only non-nulls. We used to + * include rows with null values in the n and N counts, but that + * leads to inaccurate answers in columns with many nulls, and + * it's intuitively bogus anyway considering the desired result is + * the number of distinct non-null values. + * * Overwidth values are assumed to have been distinct. *---------- */ int f1 = ndistinct - nmultiple + toowide_cnt; int d = f1 + nmultiple; - double numer, - denom, - stadistinct; + double n = samplerows - null_cnt; + double N = totalrows * (1.0 - stats->stanullfrac); + double stadistinct; - numer = (double) samplerows *(double) d; + /* N == 0 shouldn't happen, but just in case ... */ + if (N > 0) + stadistinct = (n * d) / ((n - f1) + f1 * n / N); + else + stadistinct = 0; - denom = (double) (samplerows - f1) + - (double) f1 *(double) samplerows / totalrows; - - stadistinct = numer / denom; /* Clamp to sane range in case of roundoff error */ - if (stadistinct < (double) d) - stadistinct = (double) d; - if (stadistinct > totalrows) - stadistinct = totalrows; + if (stadistinct < d) + stadistinct = d; + if (stadistinct > N) + stadistinct = N; + /* And round to integer */ stats->stadistinct = floor(stadistinct + 0.5); }