postgresql/src/include/executor/hashjoin.h

/*-------------------------------------------------------------------------
 *
 * hashjoin.h
 *	  internal structures for hash joins
 *
 *
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.41 2006/07/13 18:01:02 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
#ifndef HASHJOIN_H
#define HASHJOIN_H

#include "fmgr.h"
#include "storage/buffile.h"

/* ----------------------------------------------------------------
 *				hash-join hash table structures
 *
 * Each active hashjoin has a HashJoinTable control block, which is
 * palloc'd in the executor's per-query context.  All other storage needed
 * for the hashjoin is kept in private memory contexts, two for each hashjoin.
 * This makes it easy and fast to release the storage when we don't need it
 * anymore.  (Exception: data associated with the temp files lives in the
 * per-query context too, since we always call buffile.c in that context.)
 *
 * The hashtable contexts are made children of the per-query context, ensuring
 * that they will be discarded at end of statement even if the join is
 * aborted early by an error.  (Likewise, any temporary files we make will
 * be cleaned up by the virtual file manager in event of an error.)
 *
 * Storage that should live through the entire join is allocated from the
 * "hashCxt", while storage that is only wanted for the current batch is
 * allocated in the "batchCxt".  By resetting the batchCxt at the end of
 * each batch, we free all the per-batch storage reliably and without tedium.
 *
 * During the first scan of the inner relation, we get its tuples from the
 * executor.  If nbatch > 1 then tuples that don't belong in the first batch
 * get saved into inner-batch temp files.  The same applies to the first scan
 * of the outer relation, except that we write tuples to outer-batch temp
 * files.  After finishing the first scan, we do the following for each
 * remaining batch:
 *	1. Read tuples from inner batch file, load into hash buckets.
 *	2. Read tuples from outer batch file, match to hash buckets and output.
 *
 * It is possible to increase nbatch on the fly if the in-memory hash table
 * gets too big.  The hash-value-to-batch computation is arranged so that this
 * can only cause a tuple to go into a later batch than previously thought,
 * never into an earlier batch.  When we increase nbatch, we rescan the hash
 * table and dump out any tuples that are now of a later batch to the correct
 * inner batch file.  Subsequently, while reading either inner or outer batch
 * files, we might find tuples that no longer belong to the current batch;
 * if so, we just dump them out to the correct batch file.
 * ----------------------------------------------------------------
 */

/* these are in nodes/execnodes.h: */
/* typedef struct HashJoinTupleData *HashJoinTuple; */
/* typedef struct HashJoinTableData *HashJoinTable; */

/*
 * HashJoinTupleData: fixed-size header for one tuple held in the in-memory
 * hash table.
 *
 * Tuples that fall into the same bucket are chained together through "next".
 * The tuple body is not a declared member of this struct: it is stored
 * immediately after the header, at offset HJTUPLE_OVERHEAD, and is reached
 * with HJTUPLE_MINTUPLE().
 */
typedef struct HashJoinTupleData
{
	struct HashJoinTupleData *next;		/* link to next tuple in same bucket */
	uint32		hashvalue;		/* tuple's hash code */
	/* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */
} HashJoinTupleData;

/*
 * HJTUPLE_OVERHEAD is the MAXALIGN'd size of the HashJoinTupleData header,
 * i.e. the byte offset at which the MinimalTuple body starts.
 *
 * HJTUPLE_MINTUPLE(hjt) yields a MinimalTuple pointer to the tuple body that
 * follows the header pointed to by hjt.
 */
#define HJTUPLE_OVERHEAD  MAXALIGN(sizeof(struct HashJoinTupleData))
#define HJTUPLE_MINTUPLE(hjt)  \
	((MinimalTuple) &((char *) (hjt))[HJTUPLE_OVERHEAD])


/*
 * HashJoinTableData: control block for one active hash join.
 *
 * The members fall into these groups: the in-memory bucket array for the
 * current batch, batch bookkeeping (nbatch can be increased on the fly),
 * the per-batch temp file arrays, hash function lookup data, memory
 * accounting, and the two private memory contexts (whole-join and
 * current-batch-only).
 */
typedef struct HashJoinTableData
{
	int			nbuckets;		/* # buckets in the in-memory hash table */
	/* buckets[i] is head of list of tuples in i'th in-memory bucket */
	struct HashJoinTupleData **buckets;
	/* buckets array is per-batch storage, as are all the tuples */

	int			nbatch;			/* number of batches */
	int			curbatch;		/* current batch #; 0 during 1st pass */

	/*
	 * Values of nbatch recorded when each scan began.  NOTE(review):
	 * presumably compared with nbatch later to tell whether the batch count
	 * grew during that scan -- confirm against the hash join executor code.
	 */
	int			nbatch_original;	/* nbatch when we started inner scan */
	int			nbatch_outstart;	/* nbatch when we started outer scan */

	bool		growEnabled;	/* flag to shut off nbatch increases */

	double		totalTuples;	/* # tuples obtained from inner plan */

	/*
	 * These arrays are allocated for the life of the hash join, but only if
	 * nbatch > 1.	A file is opened only when we first write a tuple into it
	 * (otherwise its pointer remains NULL).  Note that the zero'th array
	 * elements never get used, since we will process rather than dump out any
	 * tuples of batch zero.
	 */
	BufFile   **innerBatchFile; /* buffered virtual temp file per batch */
	BufFile   **outerBatchFile; /* buffered virtual temp file per batch */

	/*
	 * Info about the datatype-specific hash functions for the datatypes being
	 * hashed.	We assume that the inner and outer sides of each hashclause
	 * are the same type, or at least share the same hash function. This is an
	 * array of the same length as the number of hash keys.
	 */
	FmgrInfo   *hashfunctions;	/* lookup data for hash functions */

	/*
	 * Memory accounting for the tuples held in memory.  NOTE(review):
	 * presumably spaceUsed exceeding spaceAllowed is what triggers an nbatch
	 * increase -- confirm against the code that maintains these fields.
	 */
	Size		spaceUsed;		/* memory space currently used by tuples */
	Size		spaceAllowed;	/* upper limit for space used */

	MemoryContext hashCxt;		/* context for whole-hash-join storage */
	MemoryContext batchCxt;		/* context for this-batch-only storage */
} HashJoinTableData;

#endif   /* HASHJOIN_H */
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`/*-------------------------------------------------------------------------`
			`*`
Change my-function-name-- to my_function_name, and optimizer renames. 1999-02-14 00:22:53 +01:00			`* hashjoin.h`
Rewrite hash join to use simple linked lists instead of a fixed-size hashtable. This should prevent 'hashtable out of memory' errors, unless you really do run out of memory. Note: target size for hashtable is now taken from -S postmaster switch, not -B, since it is local memory in the backend rather than shared memory. 1999-05-18 23:33:06 +02:00			`* internal structures for hash joins`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`*`
			`*`
Update copyright for 2006. Update scripts. 2006-03-05 16:59:11 +01:00			`* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group`
Add: * Portions Copyright (c) 1996-2000, PostgreSQL, Inc to all files copyright Regents of Berkeley. Man, that's a lot of files. 2000-01-26 06:58:53 +01:00			`* Portions Copyright (c) 1994, Regents of the University of California`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`*`
More include file adjustments. 2006-07-13 20:01:02 +02:00			`* $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.41 2006/07/13 18:01:02 momjian Exp $`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`*`
			`*-------------------------------------------------------------------------`
			`*/`
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`#ifndef HASHJOIN_H`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`#define HASHJOIN_H`

More include file adjustments. 2006-07-13 19:47:02 +02:00			`#include "fmgr.h"`
Split 'BufFile' routines out of fd.c into a new module, buffile.c. Extend BufFile so that it handles multi-segment temporary files transparently. This allows sorts and hashes to work with data exceeding 2Gig (or whatever the local limit on file size is). Change psort.c to use relative seeks instead of absolute seeks for backwards scanning, so that it won't fail when the data volume exceeds 2Gig. 1999-10-13 17:02:32 +02:00			`#include "storage/buffile.h"`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00
Rewrite hash join to use simple linked lists instead of a fixed-size hashtable. This should prevent 'hashtable out of memory' errors, unless you really do run out of memory. Note: target size for hashtable is now taken from -S postmaster switch, not -B, since it is local memory in the backend rather than shared memory. 1999-05-18 23:33:06 +02:00			`/* ----------------------------------------------------------------`
			`* hash-join hash table structures`
			`*`
Revise hash join code so that we can increase the number of batches on-the-fly, and thereby avoid blowing out memory when the planner has underestimated the hash table size. Hash join will now obey the work_mem limit with some faithfulness. Per my recent proposal (hash aggregate part isn't done yet though). 2005-03-06 23:15:05 +01:00			`* Each active hashjoin has a HashJoinTable control block, which is`
First stage of reclaiming memory in executor by resetting short-term memory contexts. Currently, only leaks in expressions executed as quals or projections are handled. Clean up some old dead cruft in executor while at it --- unused fields in state nodes, that sort of thing. 2000-07-12 04:37:39 +02:00			`* palloc'd in the executor's per-query context. All other storage needed`
			`* for the hashjoin is kept in private memory contexts, two for each hashjoin.`
Rewrite hash join to use simple linked lists instead of a fixed-size hashtable. This should prevent 'hashtable out of memory' errors, unless you really do run out of memory. Note: target size for hashtable is now taken from -S postmaster switch, not -B, since it is local memory in the backend rather than shared memory. 1999-05-18 23:33:06 +02:00			`* This makes it easy and fast to release the storage when we don't need it`
Revise hash join code so that we can increase the number of batches on-the-fly, and thereby avoid blowing out memory when the planner has underestimated the hash table size. Hash join will now obey the work_mem limit with some faithfulness. Per my recent proposal (hash aggregate part isn't done yet though). 2005-03-06 23:15:05 +01:00			`* anymore. (Exception: data associated with the temp files lives in the`
			`* per-query context too, since we always call buffile.c in that context.)`
Rewrite hash join to use simple linked lists instead of a fixed-size hashtable. This should prevent 'hashtable out of memory' errors, unless you really do run out of memory. Note: target size for hashtable is now taken from -S postmaster switch, not -B, since it is local memory in the backend rather than shared memory. 1999-05-18 23:33:06 +02:00			`*`
Fix a many-legged critter reported by chifungfan@yahoo.com: under the right circumstances a hash join executed as a DECLARE CURSOR/FETCH query would crash the backend. Problem as seen in current sources was that the hash tables were stored in a context that was a child of TransactionCommandContext, which got zapped at completion of the FETCH command --- but cursor cleanup executed at COMMIT expected the tables to still be valid. I haven't chased down the details as seen in 7.0.* but I'm sure it's the same general problem. 2000-08-22 06:06:22 +02:00			`* The hashtable contexts are made children of the per-query context, ensuring`
First phase of memory management rewrite (see backend/utils/mmgr/README for details). It doesn't really do that much yet, since there are no short-term memory contexts in the executor, but the infrastructure is in place and long-term contexts are handled reasonably. A few long- standing bugs have been fixed, such as 'VACUUM; anything' in a single query string crashing. Also, out-of-memory is now considered a recoverable ERROR, not FATAL. Eliminate a large amount of crufty, now-dead code in and around memory management. Fix problem with holding off SIGTRAP, SIGSEGV, etc in postmaster and backend startup. 2000-06-28 05:33:33 +02:00			`* that they will be discarded at end of statement even if the join is`
Rewrite hash join to use simple linked lists instead of a fixed-size hashtable. This should prevent 'hashtable out of memory' errors, unless you really do run out of memory. Note: target size for hashtable is now taken from -S postmaster switch, not -B, since it is local memory in the backend rather than shared memory. 1999-05-18 23:33:06 +02:00			`* aborted early by an error. (Likewise, any temporary files we make will`
			`* be cleaned up by the virtual file manager in event of an error.)`
Fix some nasty coredump bugs in hashjoin. This code was just about certain to fail anytime it decided the relation to be hashed was too big to fit in memory --- the code for 'batching' a series of hashjoins had multiple errors. I've fixed the easier problems. A remaining big problem is that you can get 'hashtable out of memory' if the code's guesstimate about how much overflow space it will need turns out wrong. That will require much more extensive revisions to fix, so I'm committing these fixes now before I start on that problem. 1999-05-06 02:30:47 +02:00			`*`
Rewrite hash join to use simple linked lists instead of a fixed-size hashtable. This should prevent 'hashtable out of memory' errors, unless you really do run out of memory. Note: target size for hashtable is now taken from -S postmaster switch, not -B, since it is local memory in the backend rather than shared memory. 1999-05-18 23:33:06 +02:00			`* Storage that should live through the entire join is allocated from the`
First phase of memory management rewrite (see backend/utils/mmgr/README for details). It doesn't really do that much yet, since there are no short-term memory contexts in the executor, but the infrastructure is in place and long-term contexts are handled reasonably. A few long- standing bugs have been fixed, such as 'VACUUM; anything' in a single query string crashing. Also, out-of-memory is now considered a recoverable ERROR, not FATAL. Eliminate a large amount of crufty, now-dead code in and around memory management. Fix problem with holding off SIGTRAP, SIGSEGV, etc in postmaster and backend startup. 2000-06-28 05:33:33 +02:00			`* "hashCxt", while storage that is only wanted for the current batch is`
			`* allocated in the "batchCxt". By resetting the batchCxt at the end of`
			`* each batch, we free all the per-batch storage reliably and without tedium.`
Revise hash join code so that we can increase the number of batches on-the-fly, and thereby avoid blowing out memory when the planner has underestimated the hash table size. Hash join will now obey the work_mem limit with some faithfulness. Per my recent proposal (hash aggregate part isn't done yet though). 2005-03-06 23:15:05 +01:00			`*`
			`* During first scan of inner relation, we get its tuples from executor.`
			`* If nbatch > 1 then tuples that don't belong in first batch get saved`
			`* into inner-batch temp files. The same statements apply for the`
			`* first scan of the outer relation, except we write tuples to outer-batch`
Standard pgindent run for 8.1. 2005-10-15 04:49:52 +02:00			`* temp files. After finishing the first scan, we do the following for`
Revise hash join code so that we can increase the number of batches on-the-fly, and thereby avoid blowing out memory when the planner has underestimated the hash table size. Hash join will now obey the work_mem limit with some faithfulness. Per my recent proposal (hash aggregate part isn't done yet though). 2005-03-06 23:15:05 +01:00			`* each remaining batch:`
			`* 1. Read tuples from inner batch file, load into hash buckets.`
			`* 2. Read tuples from outer batch file, match to hash buckets and output.`
			`*`
			`* It is possible to increase nbatch on the fly if the in-memory hash table`
			`* gets too big. The hash-value-to-batch computation is arranged so that this`
			`* can only cause a tuple to go into a later batch than previously thought,`
			`* never into an earlier batch. When we increase nbatch, we rescan the hash`
			`* table and dump out any tuples that are now of a later batch to the correct`
			`* inner batch file. Subsequently, while reading either inner or outer batch`
			`* files, we might find tuples that no longer belong to the current batch;`
			`* if so, we just dump them out to the correct batch file.`
Rewrite hash join to use simple linked lists instead of a fixed-size hashtable. This should prevent 'hashtable out of memory' errors, unless you really do run out of memory. Note: target size for hashtable is now taken from -S postmaster switch, not -B, since it is local memory in the backend rather than shared memory. 1999-05-18 23:33:06 +02:00			`* ----------------------------------------------------------------`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`*/`

Revise hash join code so that we can increase the number of batches on-the-fly, and thereby avoid blowing out memory when the planner has underestimated the hash table size. Hash join will now obey the work_mem limit with some faithfulness. Per my recent proposal (hash aggregate part isn't done yet though). 2005-03-06 23:15:05 +01:00			`/* these are in nodes/execnodes.h: */`
			`/* typedef struct HashJoinTupleData *HashJoinTuple; */`
			`/* typedef struct HashJoinTableData *HashJoinTable; */`

Rewrite hash join to use simple linked lists instead of a fixed-size hashtable. This should prevent 'hashtable out of memory' errors, unless you really do run out of memory. Note: target size for hashtable is now taken from -S postmaster switch, not -B, since it is local memory in the backend rather than shared memory. 1999-05-18 23:33:06 +02:00			`typedef struct HashJoinTupleData`
			`{`
Standard pgindent run for 8.1. 2005-10-15 04:49:52 +02:00			`struct HashJoinTupleData *next; /* link to next tuple in same bucket */`
Revise hash join code so that we can increase the number of batches on-the-fly, and thereby avoid blowing out memory when the planner has underestimated the hash table size. Hash join will now obey the work_mem limit with some faithfulness. Per my recent proposal (hash aggregate part isn't done yet though). 2005-03-06 23:15:05 +01:00			`uint32 hashvalue; /* tuple's hash code */`
Convert hash join code to use MinimalTuple format in tuple hash table and batch files. Should reduce memory and I/O demands for such joins. 2006-06-27 23:31:20 +02:00			`/* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */`
Another pgindent run. Sorry folks. 1999-05-26 00:43:53 +02:00			`} HashJoinTupleData;`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00
Convert hash join code to use MinimalTuple format in tuple hash table and batch files. Should reduce memory and I/O demands for such joins. 2006-06-27 23:31:20 +02:00			`#define HJTUPLE_OVERHEAD MAXALIGN(sizeof(HashJoinTupleData))`
			`#define HJTUPLE_MINTUPLE(hjtup) \`
			`((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))`


Arrange for hash join to skip scanning the outer relation if it detects that the inner one is completely empty. Per recent discussion. Also some cosmetic cleanups in nearby code. 2004-09-22 21:13:52 +02:00			`typedef struct HashJoinTableData`
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`{`
Revise hash join code so that we can increase the number of batches on-the-fly, and thereby avoid blowing out memory when the planner has underestimated the hash table size. Hash join will now obey the work_mem limit with some faithfulness. Per my recent proposal (hash aggregate part isn't done yet though). 2005-03-06 23:15:05 +01:00			`int nbuckets; /* # buckets in the in-memory hash table */`
			`/* buckets[i] is head of list of tuples in i'th in-memory bucket */`
			`struct HashJoinTupleData **buckets;`
Rewrite hash join to use simple linked lists instead of a fixed-size hashtable. This should prevent 'hashtable out of memory' errors, unless you really do run out of memory. Note: target size for hashtable is now taken from -S postmaster switch, not -B, since it is local memory in the backend rather than shared memory. 1999-05-18 23:33:06 +02:00			`/* buckets array is per-batch storage, as are all the tuples */`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00
Revise hash join code so that we can increase the number of batches on-the-fly, and thereby avoid blowing out memory when the planner has underestimated the hash table size. Hash join will now obey the work_mem limit with some faithfulness. Per my recent proposal (hash aggregate part isn't done yet though). 2005-03-06 23:15:05 +01:00			`int nbatch; /* number of batches */`
			`int curbatch; /* current batch #; 0 during 1st pass */`

			`int nbatch_original; /* nbatch when we started inner scan */`
			`int nbatch_outstart; /* nbatch when we started outer scan */`

			`bool growEnabled; /* flag to shut off nbatch increases */`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00
Create a new 'MultiExecProcNode' call API for plan nodes that don't return just a single tuple at a time. Currently the only such node type is Hash, but I expect we will soon have indexscans that can return tuple bitmaps. A side benefit is that EXPLAIN ANALYZE now shows the correct tuple count for a Hash node. 2005-04-16 22:07:35 +02:00			`double totalTuples; /* # tuples obtained from inner plan */`
Arrange for hash join to skip scanning the outer relation if it detects that the inner one is completely empty. Per recent discussion. Also some cosmetic cleanups in nearby code. 2004-09-22 21:13:52 +02:00
pgindent run over code. 1999-05-25 18:15:34 +02:00			`/*`
Standard pgindent run for 8.1. 2005-10-15 04:49:52 +02:00			`* These arrays are allocated for the life of the hash join, but only if`
			`* nbatch > 1. A file is opened only when we first write a tuple into it`
			`* (otherwise its pointer remains NULL). Note that the zero'th array`
			`* elements never get used, since we will process rather than dump out any`
			`* tuples of batch zero.`
Rewrite hash join to use simple linked lists instead of a fixed-size hashtable. This should prevent 'hashtable out of memory' errors, unless you really do run out of memory. Note: target size for hashtable is now taken from -S postmaster switch, not -B, since it is local memory in the backend rather than shared memory. 1999-05-18 23:33:06 +02:00			`*/`
pgindent run over code. 1999-05-25 18:15:34 +02:00			`BufFile **innerBatchFile; /* buffered virtual temp file per batch */`
			`BufFile **outerBatchFile; /* buffered virtual temp file per batch */`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00
First stage of reclaiming memory in executor by resetting short-term memory contexts. Currently, only leaks in expressions executed as quals or projections are handled. Clean up some old dead cruft in executor while at it --- unused fields in state nodes, that sort of thing. 2000-07-12 04:37:39 +02:00			`/*`
Standard pgindent run for 8.1. 2005-10-15 04:49:52 +02:00			`* Info about the datatype-specific hash functions for the datatypes being`
			`* hashed. We assume that the inner and outer sides of each hashclause`
			`* are the same type, or at least share the same hash function. This is an`
			`* array of the same length as the number of hash keys.`
First stage of reclaiming memory in executor by resetting short-term memory contexts. Currently, only leaks in expressions executed as quals or projections are handled. Clean up some old dead cruft in executor while at it --- unused fields in state nodes, that sort of thing. 2000-07-12 04:37:39 +02:00			`*/`
Revise hash join and hash aggregation code to use the same datatype- specific hash functions used by hash indexes, rather than the old not-datatype-aware ComputeHashFunc routine. This makes it safe to do hash joining on several datatypes that previously couldn't use hashing. The sets of datatypes that are hash indexable and hash joinable are now exactly the same, whereas before each had some that weren't in the other. 2003-06-23 00:04:55 +02:00			`FmgrInfo *hashfunctions; /* lookup data for hash functions */`
First stage of reclaiming memory in executor by resetting short-term memory contexts. Currently, only leaks in expressions executed as quals or projections are handled. Clean up some old dead cruft in executor while at it --- unused fields in state nodes, that sort of thing. 2000-07-12 04:37:39 +02:00
Revise hash join code so that we can increase the number of batches on-the-fly, and thereby avoid blowing out memory when the planner has underestimated the hash table size. Hash join will now obey the work_mem limit with some faithfulness. Per my recent proposal (hash aggregate part isn't done yet though). 2005-03-06 23:15:05 +01:00			`Size spaceUsed; /* memory space currently used by tuples */`
			`Size spaceAllowed; /* upper limit for space used */`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00
pgindent run over code. 1999-05-25 18:15:34 +02:00			`MemoryContext hashCxt; /* context for whole-hash-join storage */`
			`MemoryContext batchCxt; /* context for this-batch-only storage */`
Arrange for hash join to skip scanning the outer relation if it detects that the inner one is completely empty. Per recent discussion. Also some cosmetic cleanups in nearby code. 2004-09-22 21:13:52 +02:00			`} HashJoinTableData;`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00
New pgindent run with fixes suggested by Tom. Patch manually reviewed, initdb/regression tests pass. 2001-11-05 18:46:40 +01:00			`#endif /* HASHJOIN_H */`