diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index b8fed0304f..b8cee44fbd 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
- * $Id: nodeHash.c,v 1.57 2001/05/27 20:42:18 tgl Exp $
+ * $Id: nodeHash.c,v 1.58 2001/06/11 00:17:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -16,14 +16,12 @@
  *		ExecHash		- generate an in-memory hash table of the relation
  *		ExecInitHash	- initialize node and subnodes
  *		ExecEndHash		- shutdown node and subnodes
- *
  */
+#include "postgres.h"
 #include <sys/types.h>
 #include <math.h>
 
-#include "postgres.h"
-
 #include "executor/execdebug.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
@@ -209,111 +207,27 @@ ExecEndHash(Hash *node)
  *		create a hashtable in shared memory for hashjoin.
  * ----------------------------------------------------------------
  */
-#define FUDGE_FAC		2.0
-
 HashJoinTable
 ExecHashTableCreate(Hash *node)
 {
-    Plan       *outerNode;
-    double      ntuples;
-    int         tupsize;
-    double      inner_rel_bytes;
-    double      hash_table_bytes;
-    int         nbatch;
     HashJoinTable hashtable;
-    int         nbuckets;
+    Plan       *outerNode;
     int         totalbuckets;
-    int         bucketsize;
+    int         nbuckets;
+    int         nbatch;
     int         i;
     MemoryContext oldcxt;
 
     /*
      * Get information about the size of the relation to be hashed (it's
      * the "outer" subtree of this node, but the inner relation of the
-     * hashjoin).
-     *
-     * Caution: this is only the planner's estimates, and so can't be trusted
-     * too far.  Apply a healthy fudge factor.
+     * hashjoin).  Compute the appropriate size of the hash table.
      */
     outerNode = outerPlan(node);
-    ntuples = outerNode->plan_rows;
-    if (ntuples <= 0.0)         /* force a plausible size if no info */
-        ntuples = 1000.0;
 
-    /*
-     * estimate tupsize based on footprint of tuple in hashtable... but
-     * what about palloc overhead?
-     */
-    tupsize = MAXALIGN(outerNode->plan_width) +
-        MAXALIGN(sizeof(HashJoinTupleData));
-    inner_rel_bytes = ntuples * tupsize * FUDGE_FAC;
+    ExecChooseHashTableSize(outerNode->plan_rows, outerNode->plan_width,
+                            &totalbuckets, &nbuckets, &nbatch);
 
-    /*
-     * Target hashtable size is SortMem kilobytes, but not less than
-     * sqrt(estimated inner rel size), so as to avoid horrible
-     * performance.
-     */
-    hash_table_bytes = sqrt(inner_rel_bytes);
-    if (hash_table_bytes < (SortMem * 1024L))
-        hash_table_bytes = SortMem * 1024L;
-
-    /*
-     * Count the number of hash buckets we want for the whole relation,
-     * for an average bucket load of NTUP_PER_BUCKET (per virtual
-     * bucket!).
-     */
-    totalbuckets = (int) ceil(ntuples * FUDGE_FAC / NTUP_PER_BUCKET);
-
-    /*
-     * Count the number of buckets we think will actually fit in the
-     * target memory size, at a loading of NTUP_PER_BUCKET (physical
-     * buckets).  NOTE: FUDGE_FAC here determines the fraction of the
-     * hashtable space reserved to allow for nonuniform distribution of
-     * hash values.  Perhaps this should be a different number from the
-     * other uses of FUDGE_FAC, but since we have no real good way to pick
-     * either one...
-     */
-    bucketsize = NTUP_PER_BUCKET * tupsize;
-    nbuckets = (int) (hash_table_bytes / (bucketsize * FUDGE_FAC));
-    if (nbuckets <= 0)
-        nbuckets = 1;
-
-    if (totalbuckets <= nbuckets)
-    {
-
-        /*
-         * We have enough space, so no batching.  In theory we could even
-         * reduce nbuckets, but since that could lead to poor behavior if
-         * estimated ntuples is much less than reality, it seems better to
-         * make more buckets instead of fewer.
-         */
-        totalbuckets = nbuckets;
-        nbatch = 0;
-    }
-    else
-    {
-
-        /*
-         * Need to batch; compute how many batches we want to use.  Note
-         * that nbatch doesn't have to have anything to do with the ratio
-         * totalbuckets/nbuckets; in fact, it is the number of groups we
-         * will use for the part of the data that doesn't fall into the
-         * first nbuckets hash buckets.
-         */
-        nbatch = (int) ceil((inner_rel_bytes - hash_table_bytes) /
-                            hash_table_bytes);
-        if (nbatch <= 0)
-            nbatch = 1;
-    }
-
-    /*
-     * Now, totalbuckets is the number of (virtual) hashbuckets for the
-     * whole relation, and nbuckets is the number of physical hashbuckets
-     * we will use in the first pass.  Data falling into the first
-     * nbuckets virtual hashbuckets gets handled in the first pass;
-     * everything else gets divided into nbatch batches to be processed in
-     * additional passes.
-     */
 #ifdef HJDEBUG
     printf("nbatch = %d, totalbuckets = %d, nbuckets = %d\n",
            nbatch, totalbuckets, nbuckets);
@@ -407,6 +321,117 @@ ExecHashTableCreate(Hash *node)
 
     return hashtable;
 }
+
+/*
+ * Compute appropriate size for hashtable given the estimated size of the
+ * relation to be hashed (number of rows and average row width).
+ *
+ * Caution: the input is only the planner's estimates, and so can't be
+ * trusted too far.  Apply a healthy fudge factor.
+ *
+ * This is exported so that the planner's costsize.c can use it.
+ */
+
+/* Target bucket loading (tuples per bucket) */
+#define NTUP_PER_BUCKET		10
+/* Fudge factor to allow for inaccuracy of input estimates */
+#define FUDGE_FAC			2.0
+
+void
+ExecChooseHashTableSize(double ntuples, int tupwidth,
+                        int *virtualbuckets,
+                        int *physicalbuckets,
+                        int *numbatches)
+{
+    int         tupsize;
+    double      inner_rel_bytes;
+    double      hash_table_bytes;
+    int         nbatch;
+    int         nbuckets;
+    int         totalbuckets;
+    int         bucketsize;
+
+    /* Force a plausible relation size if no info */
+    if (ntuples <= 0.0)
+        ntuples = 1000.0;
+
+    /*
+     * Estimate tupsize based on footprint of tuple in hashtable... but
+     * what about palloc overhead?
+     */
+    tupsize = MAXALIGN(tupwidth) + MAXALIGN(sizeof(HashJoinTupleData));
+    inner_rel_bytes = ntuples * tupsize * FUDGE_FAC;
+
+    /*
+     * Target hashtable size is SortMem kilobytes, but not less than
+     * sqrt(estimated inner rel size), so as to avoid horrible
+     * performance.
+     */
+    hash_table_bytes = sqrt(inner_rel_bytes);
+    if (hash_table_bytes < (SortMem * 1024L))
+        hash_table_bytes = SortMem * 1024L;
+
+    /*
+     * Count the number of hash buckets we want for the whole relation,
+     * for an average bucket load of NTUP_PER_BUCKET (per virtual
+     * bucket!).
+     */
+    totalbuckets = (int) ceil(ntuples * FUDGE_FAC / NTUP_PER_BUCKET);
+
+    /*
+     * Count the number of buckets we think will actually fit in the
+     * target memory size, at a loading of NTUP_PER_BUCKET (physical
+     * buckets).  NOTE: FUDGE_FAC here determines the fraction of the
+     * hashtable space reserved to allow for nonuniform distribution of
+     * hash values.  Perhaps this should be a different number from the
+     * other uses of FUDGE_FAC, but since we have no real good way to pick
+     * either one...
+     */
+    bucketsize = NTUP_PER_BUCKET * tupsize;
+    nbuckets = (int) (hash_table_bytes / (bucketsize * FUDGE_FAC));
+    if (nbuckets <= 0)
+        nbuckets = 1;
+
+    if (totalbuckets <= nbuckets)
+    {
+        /*
+         * We have enough space, so no batching.  In theory we could even
+         * reduce nbuckets, but since that could lead to poor behavior if
+         * estimated ntuples is much less than reality, it seems better to
+         * make more buckets instead of fewer.
+         */
+        totalbuckets = nbuckets;
+        nbatch = 0;
+    }
+    else
+    {
+        /*
+         * Need to batch; compute how many batches we want to use.  Note
+         * that nbatch doesn't have to have anything to do with the ratio
+         * totalbuckets/nbuckets; in fact, it is the number of groups we
+         * will use for the part of the data that doesn't fall into the
+         * first nbuckets hash buckets.
+         */
+        nbatch = (int) ceil((inner_rel_bytes - hash_table_bytes) /
+                            hash_table_bytes);
+        if (nbatch <= 0)
+            nbatch = 1;
+    }
+
+    /*
+     * Now, totalbuckets is the number of (virtual) hashbuckets for the
+     * whole relation, and nbuckets is the number of physical hashbuckets
+     * we will use in the first pass.  Data falling into the first
+     * nbuckets virtual hashbuckets gets handled in the first pass;
+     * everything else gets divided into nbatch batches to be processed in
+     * additional passes.
+     */
+    *virtualbuckets = totalbuckets;
+    *physicalbuckets = nbuckets;
+    *numbatches = nbatch;
+}
+
+
 /* ----------------------------------------------------------------
  *		ExecHashTableDestroy
  *
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 06793f1d8b..2099adc664 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -42,7 +42,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.76 2001/06/10 02:59:35 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.77 2001/06/11 00:17:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -791,19 +791,19 @@ cost_hashjoin(Path *path, Query *root,
  * smart enough to figure out how the restrict clauses might change the
  * distribution, so this will have to do for now.
  *
- * The executor tries for average bucket loading of NTUP_PER_BUCKET by setting
- * number of buckets equal to ntuples / NTUP_PER_BUCKET, which would yield
- * a bucketsize fraction of NTUP_PER_BUCKET / ntuples.  But that goal will
- * be reached only if the data values are uniformly distributed among the
- * buckets, which requires (a) at least ntuples / NTUP_PER_BUCKET distinct
- * data values, and (b) a not-too-skewed data distribution.  Otherwise the
- * buckets will be nonuniformly occupied.  If the other relation in the join
- * has a similar distribution, the most-loaded buckets are exactly those
- * that will be probed most often.  Therefore, the "average" bucket size for
- * costing purposes should really be taken as something close to the "worst
- * case" bucket size.  We try to estimate this by first scaling up if there
- * are too few distinct data values, and then scaling up again by the
- * ratio of the most common value's frequency to the average frequency.
+ * We can get the number of buckets the executor will use for the given
+ * input relation.  If the data were perfectly distributed, with the same
+ * number of tuples going into each available bucket, then the bucketsize
+ * fraction would be 1/nbuckets.  But this happy state of affairs will occur
+ * only if (a) there are at least nbuckets distinct data values, and (b)
+ * we have a not-too-skewed data distribution.  Otherwise the buckets will
+ * be nonuniformly occupied.  If the other relation in the join has a key
+ * distribution similar to this one's, then the most-loaded buckets are
+ * exactly those that will be probed most often.  Therefore, the "average"
+ * bucket size for costing purposes should really be taken as something close
+ * to the "worst case" bucket size.  We try to estimate this by adjusting the
+ * fraction if there are too few distinct data values, and then scaling up
+ * by the ratio of the most common value's frequency to the average frequency.
  *
  * If no statistics are available, use a default estimate of 0.1.  This will
  * discourage use of a hash rather strongly if the inner relation is large,
@@ -815,11 +815,13 @@ estimate_hash_bucketsize(Query *root, Var *var)
 {
     Oid         relid;
     RelOptInfo *rel;
+    int         virtualbuckets;
+    int         physicalbuckets;
+    int         numbatches;
     HeapTuple   tuple;
     Form_pg_statistic stats;
     double      estfract,
                 ndistinct,
-                needdistinct,
                 mcvfreq,
                 avgfreq;
     float4     *numbers;
@@ -841,6 +843,12 @@ estimate_hash_bucketsize(Query *root, Var *var)
     if (rel->tuples <= 0.0 || rel->rows <= 0.0)
         return 0.1;             /* ensure we can divide below */
 
+    /* Get hash table size that executor would use for this relation */
+    ExecChooseHashTableSize(rel->rows, rel->width,
+                            &virtualbuckets,
+                            &physicalbuckets,
+                            &numbatches);
+
     tuple = SearchSysCache(STATRELATT,
                            ObjectIdGetDatum(relid),
                            Int16GetDatum(var->varattno),
@@ -857,7 +865,7 @@ estimate_hash_bucketsize(Query *root, Var *var)
         case ObjectIdAttributeNumber:
         case SelfItemPointerAttributeNumber:
             /* these are unique, so buckets should be well-distributed */
-            return (double) NTUP_PER_BUCKET / rel->rows;
+            return 1.0 / (double) virtualbuckets;
         case TableOidAttributeNumber:
             /* hashing this is a terrible idea... */
             return 1.0;
@@ -873,6 +881,12 @@ estimate_hash_bucketsize(Query *root, Var *var)
     if (ndistinct < 0.0)
         ndistinct = -ndistinct * rel->tuples;
 
+    if (ndistinct <= 0.0)       /* ensure we can divide */
+    {
+        ReleaseSysCache(tuple);
+        return 0.1;
+    }
+
     /* Also compute avg freq of all distinct data values in raw relation */
     avgfreq = (1.0 - stats->stanullfrac) / ndistinct;
 
@@ -887,20 +901,14 @@ estimate_hash_bucketsize(Query *root, Var *var)
         ndistinct *= rel->rows / rel->tuples;
 
     /*
-     * Form initial estimate of bucketsize fraction.  Here we use rel->rows,
-     * ie the number of rows after applying restriction clauses, because
-     * that's what the fraction will eventually be multiplied by in
-     * cost_heapjoin.
+     * Initial estimate of bucketsize fraction is 1/nbuckets as long as
+     * the number of buckets is less than the expected number of distinct
+     * values; otherwise it is 1/ndistinct.
      */
-    estfract = (double) NTUP_PER_BUCKET / rel->rows;
-
-    /*
-     * Adjust estimated bucketsize if too few distinct values (after
-     * restriction clauses) to fill all the buckets.
-     */
-    needdistinct = rel->rows / (double) NTUP_PER_BUCKET;
-    if (ndistinct < needdistinct)
-        estfract *= needdistinct / ndistinct;
+    if (ndistinct > (double) virtualbuckets)
+        estfract = 1.0 / (double) virtualbuckets;
+    else
+        estfract = 1.0 / ndistinct;
 
     /*
      * Look up the frequency of the most common value, if available.
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index e00bdfbc35..512edec6d1 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: nodeHash.h,v 1.19 2001/03/22 04:00:44 momjian Exp $
+ * $Id: nodeHash.h,v 1.20 2001/06/11 00:17:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -16,9 +16,6 @@
 
 #include "nodes/plannodes.h"
 
-/* NTUP_PER_BUCKET is exported because planner wants to see it */
-#define NTUP_PER_BUCKET		10
-
 extern TupleTableSlot *ExecHash(Hash *node);
 extern bool ExecInitHash(Hash *node, EState *estate, Plan *parent);
 extern int	ExecCountSlotsHash(Hash *node);
@@ -35,5 +32,9 @@ extern HeapTuple ExecScanHashBucket(HashJoinState *hjstate, List *hjclauses,
                                    ExprContext *econtext);
 extern void ExecHashTableReset(HashJoinTable hashtable, long ntuples);
 extern void ExecReScanHash(Hash *node, ExprContext *exprCtxt, Plan *parent);
+extern void ExecChooseHashTableSize(double ntuples, int tupwidth,
+                                    int *virtualbuckets,
+                                    int *physicalbuckets,
+                                    int *numbatches);
 
 #endif	 /* NODEHASH_H */
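
Note (illustration, not part of the patch): the following standalone C sketch mirrors the sizing arithmetic that ExecChooseHashTableSize now centralizes. TUPLE_OVERHEAD stands in for MAXALIGN(sizeof(HashJoinTupleData)) and the sort_mem_kb parameter for the SortMem setting; both are assumptions made so the sketch compiles outside the backend, not the backend's actual values.

/*
 * Sketch of the hash-table sizing logic: pick a target memory budget,
 * a total (virtual) bucket count for the whole relation, the physical
 * bucket count that fits in memory, and the number of overflow batches.
 */
#include <math.h>
#include <stdio.h>

#define NTUP_PER_BUCKET 10      /* target tuples per bucket */
#define FUDGE_FAC       2.0     /* distrust of planner estimates */
#define TUPLE_OVERHEAD  24      /* assumed per-tuple header/pointer cost */

static void
choose_hash_table_size(double ntuples, int tupwidth, long sort_mem_kb,
                       int *virtualbuckets, int *physicalbuckets,
                       int *numbatches)
{
    int     tupsize;
    double  inner_rel_bytes;
    double  hash_table_bytes;
    int     nbuckets;
    int     totalbuckets;
    int     nbatch;

    if (ntuples <= 0.0)         /* force a plausible size if no info */
        ntuples = 1000.0;

    /* estimated footprint of one tuple in the hash table */
    tupsize = tupwidth + TUPLE_OVERHEAD;
    inner_rel_bytes = ntuples * tupsize * FUDGE_FAC;

    /* target size: sort memory, but at least sqrt(inner relation size) */
    hash_table_bytes = sqrt(inner_rel_bytes);
    if (hash_table_bytes < sort_mem_kb * 1024.0)
        hash_table_bytes = sort_mem_kb * 1024.0;

    /* virtual buckets wanted for the whole relation */
    totalbuckets = (int) ceil(ntuples * FUDGE_FAC / NTUP_PER_BUCKET);

    /* physical buckets that actually fit in the memory budget */
    nbuckets = (int) (hash_table_bytes / (NTUP_PER_BUCKET * tupsize * FUDGE_FAC));
    if (nbuckets <= 0)
        nbuckets = 1;

    if (totalbuckets <= nbuckets)
    {
        /* everything fits: no batching, use all buckets in the first pass */
        totalbuckets = nbuckets;
        nbatch = 0;
    }
    else
    {
        /* data beyond the first nbuckets buckets is split into batches */
        nbatch = (int) ceil((inner_rel_bytes - hash_table_bytes) /
                            hash_table_bytes);
        if (nbatch <= 0)
            nbatch = 1;
    }

    *virtualbuckets = totalbuckets;
    *physicalbuckets = nbuckets;
    *numbatches = nbatch;
}

int
main(void)
{
    int     vbuckets,
            pbuckets,
            nbatch;

    /* e.g. 100000 estimated rows of width 40, with 1024 kB of sort memory */
    choose_hash_table_size(100000.0, 40, 1024L,
                           &vbuckets, &pbuckets, &nbatch);
    printf("virtual buckets = %d, physical buckets = %d, batches = %d\n",
           vbuckets, pbuckets, nbatch);
    return 0;
}

With inputs like these, the estimated inner relation overflows the memory budget, so only the first pbuckets virtual buckets are kept in memory during the first pass and the rest of the data is spread over nbatch batch files, which is exactly the split ExecHashTableCreate sets up from these three numbers.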
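On the planner side, a second sketch (again illustrative, not the patched function itself) shows how estimate_hash_bucketsize's new estimate is built: 1/virtualbuckets when there are enough distinct values to fill the executor's buckets, 1/ndistinct otherwise, then scaled up by the most common value's frequency relative to the average frequency as the surrounding comment describes. The function name, the parameter set, and the clamp to 1.0 are assumptions for the sketch.

#include <stdio.h>

/*
 * Illustrative bucketsize-fraction estimate.  virtualbuckets would come
 * from ExecChooseHashTableSize(); mcvfreq and avgfreq correspond to the
 * statistics the real code derives from pg_statistic.
 */
static double
bucketsize_fraction(double ndistinct, int virtualbuckets,
                    double mcvfreq, double avgfreq)
{
    double  estfract;

    if (ndistinct <= 0.0)
        return 0.1;             /* no usable statistics: default estimate */

    /* 1/nbuckets if the buckets can all be filled, else 1/ndistinct */
    if (ndistinct > (double) virtualbuckets)
        estfract = 1.0 / (double) virtualbuckets;
    else
        estfract = 1.0 / ndistinct;

    /* scale up by the skew of the most common value */
    if (avgfreq > 0.0 && mcvfreq > avgfreq)
        estfract *= mcvfreq / avgfreq;

    /* a fraction above 1 is meaningless */
    if (estfract > 1.0)
        estfract = 1.0;

    return estfract;
}

int
main(void)
{
    /*
     * e.g. 500 distinct values hashed into 20000 virtual buckets, with the
     * most common value three times as frequent as the average value.
     */
    printf("estimated bucketsize fraction = %g\n",
           bucketsize_fraction(500.0, 20000, 0.006, 0.002));
    return 0;
}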