From a8fe109ac186dea6616ec207c349ef92c214bfda Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Sun, 10 Jun 2001 02:59:35 +0000
Subject: [PATCH] Fix thinko in hash cost estimation: average frequency should
 be computed from total number of distinct values in whole relation, not #
 distinct values we expect to have after restriction clauses are applied.

---
 src/backend/optimizer/path/costsize.c | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 65c211deae..06793f1d8b 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -42,7 +42,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.75 2001/06/05 05:26:04 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.76 2001/06/10 02:59:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -873,6 +873,9 @@ estimate_hash_bucketsize(Query *root, Var *var)
 	if (ndistinct < 0.0)
 		ndistinct = -ndistinct * rel->tuples;
 
+	/* Also compute avg freq of all distinct data values in raw relation */
+	avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
+
 	/*
 	 * Adjust ndistinct to account for restriction clauses. Observe we are
 	 * assuming that the data distribution is affected uniformly by the
@@ -883,17 +886,6 @@ estimate_hash_bucketsize(Query *root, Var *var)
 	 */
 	ndistinct *= rel->rows / rel->tuples;
 
-	/*
-	 * Discourage use of hash join if there seem not to be very many distinct
-	 * data values. The threshold here is somewhat arbitrary, as is the
-	 * fraction used to "discourage" the choice.
-	 */
-	if (ndistinct < 50.0)
-	{
-		ReleaseSysCache(tuple);
-		return 0.5;
-	}
-
 	/*
 	 * Form initial estimate of bucketsize fraction. Here we use rel->rows,
 	 * ie the number of rows after applying restriction clauses, because
@@ -903,8 +895,8 @@ estimate_hash_bucketsize(Query *root, Var *var)
 	estfract = (double) NTUP_PER_BUCKET / rel->rows;
 
 	/*
-	 * Adjust estimated bucketsize if too few distinct values to fill
-	 * all the buckets.
+	 * Adjust estimated bucketsize if too few distinct values (after
+	 * restriction clauses) to fill all the buckets.
 	 */
 	needdistinct = rel->rows / (double) NTUP_PER_BUCKET;
 	if (ndistinct < needdistinct)
@@ -931,8 +923,6 @@ estimate_hash_bucketsize(Query *root, Var *var)
 	/*
 	 * Adjust estimated bucketsize upward to account for skewed distribution.
 	 */
-	avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
-
 	if (avgfreq > 0.0 && mcvfreq > avgfreq)
 		estfract *= mcvfreq / avgfreq;
 