diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 77d2651d6d..0d3c6e84d9 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -6,7 +6,7 @@ * Copyright (c) 1994, Regents of the University of California * * - * $Id: nodeHash.c,v 1.32 1999/04/07 23:33:30 tgl Exp $ + * $Id: nodeHash.c,v 1.33 1999/05/06 00:30:46 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -38,12 +38,13 @@ #include "utils/hsearch.h" extern int NBuffers; -static int HashTBSize; + +#define HJ_TEMP_NAMELEN 16 /* max length for mk_hj_temp file names */ static void mk_hj_temp(char *tempname); static int hashFunc(Datum key, int len, bool byVal); -static int ExecHashPartition(Hash *node); static RelativeAddr hashTableAlloc(int size, HashJoinTable hashtable); +static void * absHashTableAlloc(int size, HashJoinTable hashtable); static void ExecHashOverflowInsert(HashJoinTable hashtable, HashBucket bucket, HeapTuple heapTuple); @@ -270,13 +271,19 @@ ExecEndHash(Hash *node) static RelativeAddr hashTableAlloc(int size, HashJoinTable hashtable) { - RelativeAddr p; - - p = hashtable->top; - hashtable->top += size; + RelativeAddr p = hashtable->top; + hashtable->top += MAXALIGN(size); return p; } +static void * +absHashTableAlloc(int size, HashJoinTable hashtable) +{ + RelativeAddr p = hashTableAlloc(size, hashtable); + return ABSADDR(p); +} + + /* ---------------------------------------------------------------- * ExecHashTableCreate * @@ -290,9 +297,12 @@ HashJoinTable ExecHashTableCreate(Hash *node) { Plan *outerNode; + int HashTBSize; int nbatch; int ntuples; int tupsize; + int pages; + int sqrtpages; IpcMemoryId shmid; HashJoinTable hashtable; HashBucket bucket; @@ -307,43 +317,72 @@ ExecHashTableCreate(Hash *node) int *innerbatchSizes; RelativeAddr tempname; - nbatch = -1; - HashTBSize = NBuffers / 2; - while (nbatch < 0) - { - - /* - * determine number of batches for the hashjoin - */ - HashTBSize *= 2; - nbatch = ExecHashPartition(node); - } /* ---------------- - * get information about the size of the relation + * Get information about the size of the relation to be hashed + * (it's the "outer" subtree of this node, but the inner relation of + * the hashjoin). + * Caution: these are only the planner's estimates, and so + * can't be trusted too far. Apply a healthy fudge factor. * ---------------- */ outerNode = outerPlan(node); ntuples = outerNode->plan_size; - if (ntuples <= 0) - ntuples = 1000; /* XXX just a hack */ + if (ntuples <= 0) /* force a plausible size if no info */ + ntuples = 1000; tupsize = outerNode->plan_width + sizeof(HeapTupleData); + pages = (int) ceil((double) ntuples * tupsize * FUDGE_FAC / BLCKSZ); /* - * totalbuckets is the total number of hash buckets needed for the - * entire relation + * Max hashtable size is NBuffers pages, but not less than + * sqrt(estimated inner rel size), so as to avoid horrible performance. + * XXX since the hashtable is not allocated in shared mem anymore, + * it would probably be more appropriate to drive this from -S than -B.
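+ *
+ * A worked example with made-up numbers (assuming the default BLCKSZ
+ * of 8192 and FUDGE_FAC of 2.0): ntuples = 100000 and tupsize = 100
+ * give pages = ceil(100000 * 100 * 2.0 / 8192) = 2442, and hence
+ * sqrtpages = ceil(sqrt(2442)) = 50.  With the default NBuffers of 64
+ * the hashtable is allowed 64 pages; had the planner estimated a
+ * million tuples instead, sqrtpages = 157 would win out.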
*/ - totalbuckets = ceil((double) ntuples / NTUP_PER_BUCKET); - bucketsize = LONGALIGN(NTUP_PER_BUCKET * tupsize + sizeof(*bucket)); + sqrtpages = (int) ceil(sqrt((double) pages)); + HashTBSize = NBuffers; + if (sqrtpages > HashTBSize) + HashTBSize = sqrtpages; /* - * nbuckets is the number of hash buckets for the first pass of hybrid - * hashjoin + * Count the number of hash buckets we want for the whole relation, + * and the number we can actually fit in the allowed memory. + * NOTE: FUDGE_FAC here determines the fraction of the hashtable space + * saved for overflow records. Need a better approach... */ - nbuckets = (HashTBSize - nbatch) * BLCKSZ / (bucketsize * FUDGE_FAC); - if (totalbuckets < nbuckets) - totalbuckets = nbuckets; - if (nbatch == 0) + totalbuckets = (int) ceil((double) ntuples / NTUP_PER_BUCKET); + bucketsize = MAXALIGN(NTUP_PER_BUCKET * tupsize + sizeof(*bucket)); + nbuckets = (int) ((HashTBSize * BLCKSZ) / (bucketsize * FUDGE_FAC)); + + if (totalbuckets <= nbuckets) + { + /* We have enough space, so no batching. In theory we could + * even reduce HashTBSize, but as long as we don't have a way + * to deal with overflow-space overrun, best to leave the + * extra space available for overflow. + */ nbuckets = totalbuckets; + nbatch = 0; + } + else + { + /* Need to batch; compute how many batches we want to use. + * Note that nbatch doesn't have to have anything to do with + * the ratio totalbuckets/nbuckets; in fact, it is the number + * of groups we will use for the part of the data that doesn't + * fall into the first nbuckets hash buckets. + */ + nbatch = (int) ceil((double) (pages - HashTBSize) / HashTBSize); + if (nbatch <= 0) + nbatch = 1; + } + + /* Now, totalbuckets is the number of (virtual) hashbuckets for the + * whole relation, and nbuckets is the number of physical hashbuckets + * we will use in the first pass. Data falling into the first nbuckets + * virtual hashbuckets gets handled in the first pass; everything else + * gets divided into nbatch batches to be processed in additional + * passes. + */ #ifdef HJDEBUG printf("nbatch = %d, totalbuckets = %d, nbuckets = %d\n", nbatch, totalbuckets, nbuckets); @@ -351,10 +390,11 @@ ExecHashTableCreate(Hash *node) /* ---------------- * in non-parallel machines, we don't need to put the hash table - * in the shared memory. We just palloc it. + * in the shared memory. We just palloc it. The space needed + * is the hash area itself plus nbatch+1 I/O buffer pages. * ---------------- */ - hashtable = (HashJoinTable) palloc((HashTBSize + 1) * BLCKSZ); + hashtable = (HashJoinTable) palloc((HashTBSize + nbatch + 1) * BLCKSZ); shmid = 0; if (hashtable == NULL) @@ -367,13 +407,15 @@ ExecHashTableCreate(Hash *node) hashtable->totalbuckets = totalbuckets; hashtable->bucketsize = bucketsize; hashtable->shmid = shmid; - hashtable->top = sizeof(HashTableData); + hashtable->top = MAXALIGN(sizeof(HashTableData)); hashtable->bottom = HashTBSize * BLCKSZ; - /* - * hashtable->readbuf has to be long aligned!!! + * hashtable->readbuf has to be maxaligned!!! + * Note there are nbatch additional pages available after readbuf; + * these are used for buffering the outgoing batch data. 
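+ *
+ * A sketch of the palloc'd space, in relative addresses:
+ *   0 .. bottom       the hash area proper (HashTBSize pages):
+ *                     header, batch-name arrays, buckets, overflow
+ *   bottom            readbuf, one page for reading saved batches
+ *   bottom + BLCKSZ   nbatch one-page buffers for outgoing batches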
*/ hashtable->readbuf = hashtable->bottom; + hashtable->batch = hashtable->bottom + BLCKSZ; hashtable->nbatch = nbatch; hashtable->curbatch = 0; hashtable->pcount = hashtable->nprocess = 0; @@ -383,13 +425,13 @@ ExecHashTableCreate(Hash *node) * allocate and initialize the outer batches * --------------- */ - outerbatchNames = (RelativeAddr *) ABSADDR( - hashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable)); - outerbatchPos = (RelativeAddr *) ABSADDR( - hashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable)); + outerbatchNames = (RelativeAddr *) + absHashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable); + outerbatchPos = (RelativeAddr *) + absHashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable); for (i = 0; i < nbatch; i++) { - tempname = hashTableAlloc(12, hashtable); + tempname = hashTableAlloc(HJ_TEMP_NAMELEN, hashtable); mk_hj_temp(ABSADDR(tempname)); outerbatchNames[i] = tempname; outerbatchPos[i] = -1; @@ -400,15 +442,15 @@ ExecHashTableCreate(Hash *node) * allocate and initialize the inner batches * --------------- */ - innerbatchNames = (RelativeAddr *) ABSADDR( - hashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable)); - innerbatchPos = (RelativeAddr *) ABSADDR( - hashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable)); - innerbatchSizes = (int *) ABSADDR( - hashTableAlloc(nbatch * sizeof(int), hashtable)); + innerbatchNames = (RelativeAddr *) + absHashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable); + innerbatchPos = (RelativeAddr *) + absHashTableAlloc(nbatch * sizeof(RelativeAddr), hashtable); + innerbatchSizes = (int *) + absHashTableAlloc(nbatch * sizeof(int), hashtable); for (i = 0; i < nbatch; i++) { - tempname = hashTableAlloc(12, hashtable); + tempname = hashTableAlloc(HJ_TEMP_NAMELEN, hashtable); mk_hj_temp(ABSADDR(tempname)); innerbatchNames[i] = tempname; innerbatchPos[i] = -1; @@ -427,9 +469,8 @@ ExecHashTableCreate(Hash *node) hashtable->innerbatchSizes = (RelativeAddr) NULL; } - hashtable->batch = (RelativeAddr) LONGALIGN(hashtable->top + - bucketsize * nbuckets); - hashtable->overflownext = hashtable->batch + nbatch * BLCKSZ; + hashtable->overflownext = hashtable->top + bucketsize * nbuckets; + Assert(hashtable->overflownext < hashtable->bottom); /* ---------------- * initialize each hash bucket * ---------------- @@ -437,10 +478,10 @@ ExecHashTableCreate(Hash *node) bucket = (HashBucket) ABSADDR(hashtable->top); for (i = 0; i < nbuckets; i++) { - bucket->top = RELADDR((char *) bucket + sizeof(*bucket)); + bucket->top = RELADDR((char *) bucket + MAXALIGN(sizeof(*bucket))); bucket->bottom = bucket->top; bucket->firstotuple = bucket->lastotuple = -1; - bucket = (HashBucket) LONGALIGN(((char *) bucket + bucketsize)); + bucket = (HashBucket) ((char *) bucket + bucketsize); } return hashtable; } @@ -494,18 +535,18 @@ ExecHashTableInsert(HashJoinTable hashtable, */ bucket = (HashBucket) (ABSADDR(hashtable->top) + bucketno * hashtable->bucketsize); - if ((char *) LONGALIGN(ABSADDR(bucket->bottom)) - (char *) bucket + if (((char *) MAXALIGN(ABSADDR(bucket->bottom)) - (char *) bucket) + heapTuple->t_len + HEAPTUPLESIZE > hashtable->bucketsize) ExecHashOverflowInsert(hashtable, bucket, heapTuple); else { - memmove((char *) LONGALIGN(ABSADDR(bucket->bottom)), + memmove((char *) MAXALIGN(ABSADDR(bucket->bottom)), heapTuple, HEAPTUPLESIZE); - memmove((char *) LONGALIGN(ABSADDR(bucket->bottom)) + HEAPTUPLESIZE, + memmove((char *) MAXALIGN(ABSADDR(bucket->bottom)) + HEAPTUPLESIZE, heapTuple->t_data, heapTuple->t_len); - bucket->bottom = ((RelativeAddr) 
LONGALIGN(bucket->bottom) + + bucket->bottom = ((RelativeAddr) MAXALIGN(bucket->bottom) + heapTuple->t_len + HEAPTUPLESIZE); } } @@ -515,9 +556,8 @@ ExecHashTableInsert(HashJoinTable hashtable, * put the tuple into a tmp file for other batches * ----------------- */ - batchno = (float) (bucketno - hashtable->nbuckets) / - (float) (hashtable->totalbuckets - hashtable->nbuckets) - * nbatch; + batchno = (nbatch * (bucketno - hashtable->nbuckets)) / + (hashtable->totalbuckets - hashtable->nbuckets); buffer = ABSADDR(hashtable->batch) + batchno * BLCKSZ; batchSizes[batchno]++; pos = (char *) @@ -614,19 +654,11 @@ ExecHashOverflowInsert(HashJoinTable hashtable, * see if we run out of overflow space * ---------------- */ - newend = (RelativeAddr) LONGALIGN(hashtable->overflownext + sizeof(*otuple) + newend = (RelativeAddr) MAXALIGN(hashtable->overflownext + sizeof(*otuple) + heapTuple->t_len + HEAPTUPLESIZE); if (newend > hashtable->bottom) - { - /* ------------------ - * XXX the temporary hack above doesn't work because things - * above us don't know that we've moved the hash table! - * - Chris Dunlop, - * ------------------ - */ elog(ERROR, - "hash table out of memory. Use -B parameter to increase buffers."); - } + "hash table out of memory. Use -B parameter to increase buffers."); /* ---------------- * establish the overflow chain @@ -647,7 +679,7 @@ ExecHashOverflowInsert(HashJoinTable hashtable, * ---------------- */ otuple->next = -1; - otuple->tuple = RELADDR(LONGALIGN(((char *) otuple + sizeof(*otuple)))); + otuple->tuple = RELADDR(MAXALIGN(((char *) otuple + sizeof(*otuple)))); memmove(ABSADDR(otuple->tuple), heapTuple, HEAPTUPLESIZE); @@ -690,10 +722,10 @@ ExecScanHashBucket(HashJoinState *hjstate, { if (curtuple == NULL) heapTuple = (HeapTuple) - LONGALIGN(ABSADDR(bucket->top)); + MAXALIGN(ABSADDR(bucket->top)); else heapTuple = (HeapTuple) - LONGALIGN(((char *) curtuple + curtuple->t_len + HEAPTUPLESIZE)); + MAXALIGN(((char *) curtuple + curtuple->t_len + HEAPTUPLESIZE)); while (heapTuple < (HeapTuple) ABSADDR(bucket->bottom)) { @@ -713,7 +745,7 @@ ExecScanHashBucket(HashJoinState *hjstate, return heapTuple; heapTuple = (HeapTuple) - LONGALIGN(((char *) heapTuple + heapTuple->t_len + HEAPTUPLESIZE)); + MAXALIGN(((char *) heapTuple + heapTuple->t_len + HEAPTUPLESIZE)); } if (firstotuple == NULL) @@ -810,48 +842,12 @@ hashFunc(Datum key, int len, bool byVal) return h % PRIME2; } -/* ---------------------------------------------------------------- - * ExecHashPartition - * - * determine the number of batches needed for a hashjoin - * ---------------------------------------------------------------- - */ -static int -ExecHashPartition(Hash *node) -{ - Plan *outerNode; - int b; - int pages; - int ntuples; - int tupsize; - - /* - * get size information for plan node - */ - outerNode = outerPlan(node); - ntuples = outerNode->plan_size; - if (ntuples == 0) - ntuples = 1000; - tupsize = outerNode->plan_width + sizeof(HeapTupleData); - pages = ceil((double) ntuples * tupsize * FUDGE_FAC / BLCKSZ); - - /* - * if amount of buffer space below hashjoin threshold, return negative - */ - if (ceil(sqrt((double) pages)) > HashTBSize) - return -1; - if (pages <= HashTBSize) - b = 0; /* fit in memory, no partitioning */ - else - b = ceil((double) (pages - HashTBSize) / (double) (HashTBSize - 1)); - - return b; -} - /* ---------------------------------------------------------------- * ExecHashTableReset * * reset hash table header for new batch + * + * ntuples is the number of tuples in the inner 
relation's batch * ---------------------------------------------------------------- */ void @@ -860,29 +856,42 @@ ExecHashTableReset(HashJoinTable hashtable, int ntuples) int i; HashBucket bucket; - hashtable->nbuckets = hashtable->totalbuckets - = ceil((double) ntuples / NTUP_PER_BUCKET); + /* + * We can reset the number of hashbuckets since we are going to + * recalculate the hash values of all the tuples in the new batch + * anyway. We might as well spread out the hash values as much as + * we can within the available space. Note we must set nbuckets + * equal to totalbuckets since we will NOT generate any new output + * batches after this point. + */ + hashtable->nbuckets = hashtable->totalbuckets = + (int) (hashtable->bottom / (hashtable->bucketsize * FUDGE_FAC)); + /* + * reinitialize the overflow area to empty, and reinit each hash bucket. + */ hashtable->overflownext = hashtable->top + hashtable->bucketsize * hashtable->nbuckets; + Assert(hashtable->overflownext < hashtable->bottom); bucket = (HashBucket) ABSADDR(hashtable->top); for (i = 0; i < hashtable->nbuckets; i++) { - bucket->top = RELADDR((char *) bucket + sizeof(*bucket)); + bucket->top = RELADDR((char *) bucket + MAXALIGN(sizeof(*bucket))); bucket->bottom = bucket->top; bucket->firstotuple = bucket->lastotuple = -1; bucket = (HashBucket) ((char *) bucket + hashtable->bucketsize); } + hashtable->pcount = hashtable->nprocess; } -static int hjtmpcnt = 0; - static void mk_hj_temp(char *tempname) { - snprintf(tempname, strlen(tempname), "HJ%d.%d", (int) MyProcPid, hjtmpcnt); + static int hjtmpcnt = 0; + + snprintf(tempname, HJ_TEMP_NAMELEN, "HJ%d.%d", (int) MyProcPid, hjtmpcnt); hjtmpcnt = (hjtmpcnt + 1) % 1000; } diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index a2710148fb..b4ffdd5c73 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v 1.17 1999/02/13 23:15:23 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v 1.18 1999/05/06 00:30:47 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -650,8 +650,8 @@ ExecHashJoinGetSavedTuple(HashJoinState *hjstate, heapTuple = (HeapTuple) (*position); heapTuple->t_data = (HeapTupleHeader) ((char *) heapTuple + HEAPTUPLESIZE); - (*position) = (char *) LONGALIGN(*position + - heapTuple->t_len + HEAPTUPLESIZE); + (*position) = (char *) MAXALIGN(*position + + heapTuple->t_len + HEAPTUPLESIZE); return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, false); } @@ -843,7 +843,7 @@ ExecHashJoinSaveTuple(HeapTuple heapTuple, } memmove(position, heapTuple, HEAPTUPLESIZE); memmove(position + HEAPTUPLESIZE, heapTuple->t_data, heapTuple->t_len); - position = (char *) LONGALIGN(position + heapTuple->t_len + HEAPTUPLESIZE); + position = (char *) MAXALIGN(position + heapTuple->t_len + HEAPTUPLESIZE); *pageend = position - buffer; return position; diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index 66e88a0915..ec6c234f28 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -6,7 +6,7 @@ * * Copyright (c) 1994, Regents of the University of California * - * $Id: hashjoin.h,v 1.8 1999/02/13 23:21:24 momjian Exp $ + * $Id: hashjoin.h,v 1.9 1999/05/06 00:30:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -17,18 +17,23 @@ /* ----------------- * 
have to use relative address as pointers in the hashtable - * because the hashtable may reallocate in difference processes + * because the hashtable may reallocate in different processes + * + * XXX: this relative-address stuff is useless on all supported platforms + * and is an ever-dangerous source of bugs. Really ought to rip it out. + * ----------------- */ typedef int RelativeAddr; /* ------------------ - * the relative addresses are always relative to the head of the - * hashtable, the following macro converts them to absolute address. + * The relative addresses are always relative to the head of the + * hashtable, the following macros convert them to/from absolute addresses. + * NULL is represented as -1 (CAUTION: RELADDR() doesn't handle that!). + * CAUTION: ABSADDR evaluates its arg twice!! + * ------------------ */ -#define ABSADDR(X) ((X) < 0 ? NULL: (char*)hashtable + X) -#define RELADDR(X) (RelativeAddr)((char*)(X) - (char*)hashtable) +#define ABSADDR(X) ((X) < 0 ? (char*) NULL : (char*)hashtable + (X)) +#define RELADDR(X) ((RelativeAddr)((char*)(X) - (char*)hashtable)) typedef char **charPP; typedef int *intP; @@ -79,6 +84,4 @@ typedef struct HashBucketData typedef HashBucketData *HashBucket; -#define HASH_PERMISSION 0700 - #endif /* HASHJOIN_H */
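A side note on the new CAUTION comment: because ABSADDR() expands its argument twice, an argument with side effects goes wrong silently. A minimal standalone sketch of the hazard (the buffer, variable names, and main() here are stand-ins for illustration, not part of the patch):

#include <stdio.h>

typedef int RelativeAddr;	/* as in hashjoin.h */

static char space[1024];
static char *hashtable = space;	/* stand-in for the real hashtable */

#define ABSADDR(X) ((X) < 0 ? (char*) NULL : (char*)hashtable + (X))

int
main(void)
{
	RelativeAddr pos = 0;

	/* The macro tests one (pos++) and then adds a second (pos++):
	 * pos ends up 2, and the offset added is 1, not the 0 that was
	 * tested.  Hence the rule: pass only side-effect-free arguments. */
	char *p = ABSADDR(pos++);

	printf("pos = %d, offset = %d\n", pos, (int) (p - hashtable));
	return 0;
}

This prints "pos = 2, offset = 1".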