/*------------------------------------------------------------------------- * * nodeHashjoin.c * Routines to handle hash join nodes * * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * $Header: /cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v 1.44 2002/12/13 19:45:52 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" #include "executor/executor.h" #include "executor/nodeHash.h" #include "executor/nodeHashjoin.h" #include "optimizer/clauses.h" #include "utils/memutils.h" static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate); static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate, BufFile *file, TupleTableSlot *tupleSlot); static int ExecHashJoinGetBatch(int bucketno, HashJoinTable hashtable); static int ExecHashJoinNewBatch(HashJoinState *hjstate); /* ---------------------------------------------------------------- * ExecHashJoin * * This function implements the Hybrid Hashjoin algorithm. * recursive partitioning remains to be added. * Note: the relation we build hash table on is the inner * the other one is outer. * ---------------------------------------------------------------- */ TupleTableSlot * /* return: a tuple or NULL */ ExecHashJoin(HashJoinState *node) { EState *estate; PlanState *outerNode; HashState *hashNode; List *hjclauses; List *outerkeys; List *joinqual; List *otherqual; ScanDirection dir; TupleTableSlot *inntuple; ExprContext *econtext; ExprDoneCond isDone; HashJoinTable hashtable; HeapTuple curtuple; TupleTableSlot *outerTupleSlot; TupleTableSlot *innerTupleSlot; int i; bool hashPhaseDone; /* * get information from HashJoin node */ hjclauses = node->hashclauses; estate = node->js.ps.state; joinqual = node->js.joinqual; otherqual = node->js.ps.qual; hashNode = (HashState *) innerPlanState(node); outerNode = outerPlanState(node); hashPhaseDone = node->hj_hashdone; dir = estate->es_direction; /* * get information from HashJoin state */ hashtable = node->hj_HashTable; outerkeys = node->hj_OuterHashKeys; econtext = node->js.ps.ps_ExprContext; /* * Check to see if we're still projecting out tuples from a previous * join tuple (because there is a function-returning-set in the * projection expressions). If so, try to project another one. */ if (node->js.ps.ps_TupFromTlist) { TupleTableSlot *result; result = ExecProject(node->js.ps.ps_ProjInfo, &isDone); if (isDone == ExprMultipleResult) return result; /* Done with that source tuple... */ node->js.ps.ps_TupFromTlist = false; } /* * Reset per-tuple memory context to free any expression evaluation * storage allocated in the previous tuple cycle. Note this can't * happen until we're done projecting out tuples from a join tuple. */ ResetExprContext(econtext); /* * if this is the first call, build the hash table for inner relation */ if (!hashPhaseDone) { /* if the hash phase not completed */ if (hashtable == NULL) { /* if the hash table has not been created */ /* * create the hash table */ hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan); node->hj_HashTable = hashtable; /* * execute the Hash node, to build the hash table */ hashNode->hashtable = hashtable; innerTupleSlot = ExecProcNode((PlanState *) hashNode); } node->hj_hashdone = true; /* * Open temp files for outer batches, if needed. Note that file * buffers are palloc'd in regular executor context. */ for (i = 0; i < hashtable->nbatch; i++) hashtable->outerBatchFile[i] = BufFileCreateTemp(); } else if (hashtable == NULL) return NULL; /* * Now get an outer tuple and probe into the hash table for matches */ outerTupleSlot = node->js.ps.ps_OuterTupleSlot; for (;;) { /* * If we don't have an outer tuple, get the next one */ if (node->hj_NeedNewOuter) { outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode, node); if (TupIsNull(outerTupleSlot)) { /* * when the last batch runs out, clean up and exit */ ExecHashTableDestroy(hashtable); node->hj_HashTable = NULL; return NULL; } node->js.ps.ps_OuterTupleSlot = outerTupleSlot; econtext->ecxt_outertuple = outerTupleSlot; node->hj_NeedNewOuter = false; node->hj_MatchedOuter = false; /* * now we have an outer tuple, find the corresponding bucket * for this tuple from the hash table */ node->hj_CurBucketNo = ExecHashGetBucket(hashtable, econtext, outerkeys); node->hj_CurTuple = NULL; /* * Now we've got an outer tuple and the corresponding hash * bucket, but this tuple may not belong to the current batch. * This need only be checked in the first pass. */ if (hashtable->curbatch == 0) { int batch = ExecHashJoinGetBatch(node->hj_CurBucketNo, hashtable); if (batch > 0) { /* * Need to postpone this outer tuple to a later batch. * Save it in the corresponding outer-batch file. */ int batchno = batch - 1; hashtable->outerBatchSize[batchno]++; ExecHashJoinSaveTuple(outerTupleSlot->val, hashtable->outerBatchFile[batchno]); node->hj_NeedNewOuter = true; continue; /* loop around for a new outer tuple */ } } } /* * OK, scan the selected hash bucket for matches */ for (;;) { curtuple = ExecScanHashBucket(node, hjclauses, econtext); if (curtuple == NULL) break; /* out of matches */ /* * we've got a match, but still need to test non-hashed quals */ inntuple = ExecStoreTuple(curtuple, node->hj_HashTupleSlot, InvalidBuffer, false); /* don't pfree this tuple */ econtext->ecxt_innertuple = inntuple; /* reset temp memory each time to avoid leaks from qual expr */ ResetExprContext(econtext); /* * if we pass the qual, then save state for next call and have * ExecProject form the projection, store it in the tuple * table, and return the slot. * * Only the joinquals determine MatchedOuter status, but all * quals must pass to actually return the tuple. */ if (ExecQual(joinqual, econtext, false)) { node->hj_MatchedOuter = true; if (otherqual == NIL || ExecQual(otherqual, econtext, false)) { TupleTableSlot *result; result = ExecProject(node->js.ps.ps_ProjInfo, &isDone); if (isDone != ExprEndResult) { node->js.ps.ps_TupFromTlist = (isDone == ExprMultipleResult); return result; } } } } /* * Now the current outer tuple has run out of matches, so check * whether to emit a dummy outer-join tuple. If not, loop around * to get a new outer tuple. */ node->hj_NeedNewOuter = true; if (!node->hj_MatchedOuter && node->js.jointype == JOIN_LEFT) { /* * We are doing an outer join and there were no join matches * for this outer tuple. Generate a fake join tuple with * nulls for the inner tuple, and return it if it passes the * non-join quals. */ econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot; if (ExecQual(otherqual, econtext, false)) { /* * qualification was satisfied so we project and return * the slot containing the result tuple using * ExecProject(). */ TupleTableSlot *result; result = ExecProject(node->js.ps.ps_ProjInfo, &isDone); if (isDone != ExprEndResult) { node->js.ps.ps_TupFromTlist = (isDone == ExprMultipleResult); return result; } } } } } /* ---------------------------------------------------------------- * ExecInitHashJoin * * Init routine for HashJoin node. * ---------------------------------------------------------------- */ HashJoinState * ExecInitHashJoin(HashJoin *node, EState *estate) { HashJoinState *hjstate; Plan *outerNode; Hash *hashNode; List *hclauses; List *hcl; /* * create state structure */ hjstate = makeNode(HashJoinState); hjstate->js.ps.plan = (Plan *) node; hjstate->js.ps.state = estate; /* * Miscellaneous initialization * * create expression context for node */ ExecAssignExprContext(estate, &hjstate->js.ps); /* * initialize child expressions */ hjstate->js.ps.targetlist = (List *) ExecInitExpr((Expr *) node->join.plan.targetlist, (PlanState *) hjstate); hjstate->js.ps.qual = (List *) ExecInitExpr((Expr *) node->join.plan.qual, (PlanState *) hjstate); hjstate->js.jointype = node->join.jointype; hjstate->js.joinqual = (List *) ExecInitExpr((Expr *) node->join.joinqual, (PlanState *) hjstate); hjstate->hashclauses = (List *) ExecInitExpr((Expr *) node->hashclauses, (PlanState *) hjstate); /* * initialize child nodes */ outerNode = outerPlan(node); hashNode = (Hash *) innerPlan(node); outerPlanState(hjstate) = ExecInitNode(outerNode, estate); innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate); #define HASHJOIN_NSLOTS 3 /* * tuple table initialization */ ExecInitResultTupleSlot(estate, &hjstate->js.ps); hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate); switch (node->join.jointype) { case JOIN_INNER: break; case JOIN_LEFT: hjstate->hj_NullInnerTupleSlot = ExecInitNullTupleSlot(estate, ExecGetTupType(innerPlanState(hjstate))); break; default: elog(ERROR, "ExecInitHashJoin: unsupported join type %d", (int) node->join.jointype); } /* * now for some voodoo. our temporary tuple slot is actually the * result tuple slot of the Hash node (which is our inner plan). we * do this because Hash nodes don't return tuples via ExecProcNode() * -- instead the hash join node uses ExecScanHashBucket() to get at * the contents of the hash table. -cim 6/9/91 */ { HashState *hashstate = (HashState *) innerPlanState(hjstate); TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot; hjstate->hj_HashTupleSlot = slot; } /* * initialize tuple type and projection info */ ExecAssignResultTypeFromTL(&hjstate->js.ps); ExecAssignProjectionInfo(&hjstate->js.ps); ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot, ExecGetTupType(outerPlanState(hjstate)), false); /* * initialize hash-specific info */ hjstate->hj_hashdone = false; hjstate->hj_HashTable = (HashJoinTable) NULL; hjstate->hj_CurBucketNo = 0; hjstate->hj_CurTuple = (HashJoinTuple) NULL; /* * The planner already made a list of the inner hashkeys for us, * but we also need a list of the outer hashkeys. Each list of * exprs must then be prepared for execution. */ hjstate->hj_InnerHashKeys = (List *) ExecInitExpr((Expr *) hashNode->hashkeys, innerPlanState(hjstate)); ((HashState *) innerPlanState(hjstate))->hashkeys = hjstate->hj_InnerHashKeys; hclauses = NIL; foreach(hcl, node->hashclauses) { hclauses = lappend(hclauses, get_leftop(lfirst(hcl))); } hjstate->hj_OuterHashKeys = (List *) ExecInitExpr((Expr *) hclauses, (PlanState *) hjstate); hjstate->js.ps.ps_OuterTupleSlot = NULL; hjstate->js.ps.ps_TupFromTlist = false; hjstate->hj_NeedNewOuter = true; hjstate->hj_MatchedOuter = false; return hjstate; } int ExecCountSlotsHashJoin(HashJoin *node) { return ExecCountSlotsNode(outerPlan(node)) + ExecCountSlotsNode(innerPlan(node)) + HASHJOIN_NSLOTS; } /* ---------------------------------------------------------------- * ExecEndHashJoin * * clean up routine for HashJoin node * ---------------------------------------------------------------- */ void ExecEndHashJoin(HashJoinState *node) { /* * free hash table in case we end plan before all tuples are retrieved */ if (node->hj_HashTable) { ExecHashTableDestroy(node->hj_HashTable); node->hj_HashTable = NULL; } /* * Free the projection info and the scan attribute info */ ExecFreeProjectionInfo(&node->js.ps); ExecFreeExprContext(&node->js.ps); /* * clean up subtrees */ ExecEndNode(outerPlanState(node)); ExecEndNode(innerPlanState(node)); /* * clean out the tuple table */ ExecClearTuple(node->js.ps.ps_ResultTupleSlot); ExecClearTuple(node->hj_OuterTupleSlot); ExecClearTuple(node->hj_HashTupleSlot); } /* ---------------------------------------------------------------- * ExecHashJoinOuterGetTuple * * get the next outer tuple for hashjoin: either by * executing a plan node as in the first pass, or from * the tmp files for the hashjoin batches. * ---------------------------------------------------------------- */ static TupleTableSlot * ExecHashJoinOuterGetTuple(PlanState *node, HashJoinState *hjstate) { HashJoinTable hashtable = hjstate->hj_HashTable; int curbatch = hashtable->curbatch; TupleTableSlot *slot; if (curbatch == 0) { /* if it is the first pass */ slot = ExecProcNode(node); if (!TupIsNull(slot)) return slot; /* * We have just reached the end of the first pass. Try to switch * to a saved batch. */ curbatch = ExecHashJoinNewBatch(hjstate); } /* * Try to read from a temp file. Loop allows us to advance to new * batch as needed. */ while (curbatch <= hashtable->nbatch) { slot = ExecHashJoinGetSavedTuple(hjstate, hashtable->outerBatchFile[curbatch - 1], hjstate->hj_OuterTupleSlot); if (!TupIsNull(slot)) return slot; curbatch = ExecHashJoinNewBatch(hjstate); } /* Out of batches... */ return NULL; } /* ---------------------------------------------------------------- * ExecHashJoinGetSavedTuple * * read the next tuple from a tmp file * ---------------------------------------------------------------- */ static TupleTableSlot * ExecHashJoinGetSavedTuple(HashJoinState *hjstate, BufFile *file, TupleTableSlot *tupleSlot) { HeapTupleData htup; size_t nread; HeapTuple heapTuple; nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData)); if (nread == 0) return NULL; /* end of file */ if (nread != sizeof(HeapTupleData)) elog(ERROR, "Read from hashjoin temp file failed"); heapTuple = palloc(HEAPTUPLESIZE + htup.t_len); memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData)); heapTuple->t_datamcxt = CurrentMemoryContext; heapTuple->t_data = (HeapTupleHeader) ((char *) heapTuple + HEAPTUPLESIZE); nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len); if (nread != (size_t) htup.t_len) elog(ERROR, "Read from hashjoin temp file failed"); return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true); } /* ---------------------------------------------------------------- * ExecHashJoinNewBatch * * switch to a new hashjoin batch * ---------------------------------------------------------------- */ static int ExecHashJoinNewBatch(HashJoinState *hjstate) { HashJoinTable hashtable = hjstate->hj_HashTable; int nbatch = hashtable->nbatch; int newbatch = hashtable->curbatch + 1; long *innerBatchSize = hashtable->innerBatchSize; long *outerBatchSize = hashtable->outerBatchSize; BufFile *innerFile; TupleTableSlot *slot; ExprContext *econtext; List *innerhashkeys; if (newbatch > 1) { /* * We no longer need the previous outer batch file; close it right * away to free disk space. */ BufFileClose(hashtable->outerBatchFile[newbatch - 2]); hashtable->outerBatchFile[newbatch - 2] = NULL; } /* * We can skip over any batches that are empty on either side. Release * associated temp files right away. */ while (newbatch <= nbatch && (innerBatchSize[newbatch - 1] == 0L || outerBatchSize[newbatch - 1] == 0L)) { BufFileClose(hashtable->innerBatchFile[newbatch - 1]); hashtable->innerBatchFile[newbatch - 1] = NULL; BufFileClose(hashtable->outerBatchFile[newbatch - 1]); hashtable->outerBatchFile[newbatch - 1] = NULL; newbatch++; } if (newbatch > nbatch) return newbatch; /* no more batches */ /* * Rewind inner and outer batch files for this batch, so that we can * start reading them. */ if (BufFileSeek(hashtable->outerBatchFile[newbatch - 1], 0, 0L, SEEK_SET)) elog(ERROR, "Failed to rewind hash temp file"); innerFile = hashtable->innerBatchFile[newbatch - 1]; if (BufFileSeek(innerFile, 0, 0L, SEEK_SET)) elog(ERROR, "Failed to rewind hash temp file"); /* * Reload the hash table with the new inner batch */ ExecHashTableReset(hashtable, innerBatchSize[newbatch - 1]); econtext = hjstate->js.ps.ps_ExprContext; innerhashkeys = hjstate->hj_InnerHashKeys; while ((slot = ExecHashJoinGetSavedTuple(hjstate, innerFile, hjstate->hj_HashTupleSlot)) && !TupIsNull(slot)) { econtext->ecxt_innertuple = slot; ExecHashTableInsert(hashtable, econtext, innerhashkeys); } /* * after we build the hash table, the inner batch file is no longer * needed */ BufFileClose(innerFile); hashtable->innerBatchFile[newbatch - 1] = NULL; hashtable->curbatch = newbatch; return newbatch; } /* ---------------------------------------------------------------- * ExecHashJoinGetBatch * * determine the batch number for a bucketno * +----------------+-------+-------+ ... +-------+ * 0 nbuckets totalbuckets * batch 0 1 2 ... * ---------------------------------------------------------------- */ static int ExecHashJoinGetBatch(int bucketno, HashJoinTable hashtable) { int b; if (bucketno < hashtable->nbuckets || hashtable->nbatch == 0) return 0; b = (hashtable->nbatch * (bucketno - hashtable->nbuckets)) / (hashtable->totalbuckets - hashtable->nbuckets); return b + 1; } /* ---------------------------------------------------------------- * ExecHashJoinSaveTuple * * save a tuple to a tmp file. * * The data recorded in the file for each tuple is an image of its * HeapTupleData (with meaningless t_data pointer) followed by the * HeapTupleHeader and tuple data. * ---------------------------------------------------------------- */ void ExecHashJoinSaveTuple(HeapTuple heapTuple, BufFile *file) { size_t written; written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData)); if (written != sizeof(HeapTupleData)) elog(ERROR, "Write to hashjoin temp file failed"); written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len); if (written != (size_t) heapTuple->t_len) elog(ERROR, "Write to hashjoin temp file failed"); } void ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt) { if (!node->hj_hashdone) return; node->hj_hashdone = false; /* * Unfortunately, currently we have to destroy hashtable in all * cases... */ if (node->hj_HashTable) { ExecHashTableDestroy(node->hj_HashTable); node->hj_HashTable = NULL; } node->hj_CurBucketNo = 0; node->hj_CurTuple = (HashJoinTuple) NULL; node->js.ps.ps_OuterTupleSlot = (TupleTableSlot *) NULL; node->js.ps.ps_TupFromTlist = false; node->hj_NeedNewOuter = true; node->hj_MatchedOuter = false; /* * if chgParam of subnodes is not null then plans will be re-scanned * by first ExecProcNode. */ if (((PlanState *) node)->lefttree->chgParam == NULL) ExecReScan(((PlanState *) node)->lefttree, exprCtxt); if (((PlanState *) node)->righttree->chgParam == NULL) ExecReScan(((PlanState *) node)->righttree, exprCtxt); }