diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 0892cdbe3e..1fd6f0cb33 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994-5, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/explain.c,v 1.175 2008/05/14 19:10:29 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/explain.c,v 1.176 2008/08/07 03:04:03 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -558,19 +558,47 @@ explain_outNode(StringInfo str, pname = "Unique"; break; case T_SetOp: - switch (((SetOp *) plan)->cmd) + switch (((SetOp *) plan)->strategy) { - case SETOPCMD_INTERSECT: - pname = "SetOp Intersect"; + case SETOP_SORTED: + switch (((SetOp *) plan)->cmd) + { + case SETOPCMD_INTERSECT: + pname = "SetOp Intersect"; + break; + case SETOPCMD_INTERSECT_ALL: + pname = "SetOp Intersect All"; + break; + case SETOPCMD_EXCEPT: + pname = "SetOp Except"; + break; + case SETOPCMD_EXCEPT_ALL: + pname = "SetOp Except All"; + break; + default: + pname = "SetOp ???"; + break; + } break; - case SETOPCMD_INTERSECT_ALL: - pname = "SetOp Intersect All"; - break; - case SETOPCMD_EXCEPT: - pname = "SetOp Except"; - break; - case SETOPCMD_EXCEPT_ALL: - pname = "SetOp Except All"; + case SETOP_HASHED: + switch (((SetOp *) plan)->cmd) + { + case SETOPCMD_INTERSECT: + pname = "HashSetOp Intersect"; + break; + case SETOPCMD_INTERSECT_ALL: + pname = "HashSetOp Intersect All"; + break; + case SETOPCMD_EXCEPT: + pname = "HashSetOp Except"; + break; + case SETOPCMD_EXCEPT_ALL: + pname = "HashSetOp Except All"; + break; + default: + pname = "HashSetOp ???"; + break; + } break; default: pname = "SetOp ???"; diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c index c60ceff125..2421b0a6df 100644 --- a/src/backend/executor/nodeSetOp.c +++ b/src/backend/executor/nodeSetOp.c @@ -4,33 +4,38 @@ * Routines to handle INTERSECT and EXCEPT selection * * The input of a SetOp node consists of tuples from two relations, - * which have been combined into one dataset and sorted on all the nonjunk - * attributes. In addition there is a junk attribute that shows which - * relation each tuple came from. The SetOp node scans each group of - * identical tuples to determine how many came from each input relation. - * Then it is a simple matter to emit the output demanded by the SQL spec - * for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL. + * which have been combined into one dataset, with a junk attribute added + * that shows which relation each tuple came from. In SETOP_SORTED mode, + * the input has furthermore been sorted according to all the grouping + * columns (ie, all the non-junk attributes). The SetOp node scans each + * group of identical tuples to determine how many came from each input + * relation. Then it is a simple matter to emit the output demanded by the + * SQL spec for INTERSECT, INTERSECT ALL, EXCEPT, or EXCEPT ALL. + * + * In SETOP_HASHED mode, the input is delivered in no particular order. + * We build a hash table in memory with one entry for each group of + * identical tuples, and count the number of tuples in the group from + * each relation. After seeing all the input, we scan the hashtable and + * generate the correct output using those counts. * * This node type is not used for UNION or UNION ALL, since those can be * implemented more cheaply (there's no need for the junk attribute to * identify the source relation). * + * Note that SetOp does no qual checking nor projection. The delivered + * output tuples are just copies of the first-to-arrive tuple in each + * input group. + * * * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.25 2008/01/01 19:45:49 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/executor/nodeSetOp.c,v 1.26 2008/08/07 03:04:03 tgl Exp $ * *------------------------------------------------------------------------- */ -/* - * INTERFACE ROUTINES - * ExecSetOp - filter input to generate INTERSECT/EXCEPT results - * ExecInitSetOp - initialize node and subnodes.. - * ExecEndSetOp - shutdown node and subnodes - */ #include "postgres.h" @@ -39,6 +44,160 @@ #include "utils/memutils.h" +/* + * SetOpStatePerGroupData - per-group working state + * + * These values are working state that is initialized at the start of + * an input tuple group and updated for each input tuple. + * + * In SETOP_SORTED mode, we need only one of these structs, and it's kept in + * the plan state node. In SETOP_HASHED mode, the hash table contains one + * of these for each tuple group. + */ +typedef struct SetOpStatePerGroupData +{ + long numLeft; /* number of left-input dups in group */ + long numRight; /* number of right-input dups in group */ +} SetOpStatePerGroupData; + +/* + * To implement hashed mode, we need a hashtable that stores a + * representative tuple and the duplicate counts for each distinct set + * of grouping columns. We compute the hash key from the grouping columns. + */ +typedef struct SetOpHashEntryData *SetOpHashEntry; + +typedef struct SetOpHashEntryData +{ + TupleHashEntryData shared; /* common header for hash table entries */ + SetOpStatePerGroupData pergroup; +} SetOpHashEntryData; + + +static TupleTableSlot *setop_retrieve_direct(SetOpState *setopstate); +static void setop_fill_hash_table(SetOpState *setopstate); +static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate); + + +/* + * Initialize state for a new group of input values. + */ +static void +initialize_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup) +{ + pergroup->numLeft = pergroup->numRight = 0; +} + +/* + * Advance the appropriate counter for one input tuple. + */ +static void +advance_counts(SetOpState *setopstate, SetOpStatePerGroup pergroup, + TupleTableSlot *inputslot) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + int flag; + bool isNull; + + flag = DatumGetInt32(slot_getattr(inputslot, + node->flagColIdx, + &isNull)); + Assert(!isNull); + if (flag) + pergroup->numRight++; + else + pergroup->numLeft++; +} + +/* + * Initialize the hash table to empty. + */ +static void +build_hash_table(SetOpState *setopstate) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + + Assert(node->strategy == SETOP_HASHED); + Assert(node->numGroups > 0); + + setopstate->hashtable = BuildTupleHashTable(node->numCols, + node->dupColIdx, + setopstate->eqfunctions, + setopstate->hashfunctions, + node->numGroups, + sizeof(SetOpHashEntryData), + setopstate->tableContext, + setopstate->tempContext); +} + +/* + * Find or create a hashtable entry for the tuple group containing the + * given tuple. + */ +static SetOpHashEntry +lookup_hash_entry(SetOpState *setopstate, TupleTableSlot *inputslot) +{ + SetOpHashEntry entry; + bool isnew; + + /* find or create the hashtable entry */ + entry = (SetOpHashEntry) LookupTupleHashEntry(setopstate->hashtable, + inputslot, + &isnew); + + if (isnew) + { + /* initialize counts for new tuple group */ + initialize_counts(setopstate, &entry->pergroup); + } + + /* Must reset temp context after each hashtable lookup */ + MemoryContextReset(setopstate->tempContext); + + return entry; +} + +/* + * We've completed processing a tuple group. Decide how many copies (if any) + * of its representative row to emit, and store the count into numOutput. + * This logic is straight from the SQL92 specification. + */ +static void +set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup) +{ + SetOp *plannode = (SetOp *) setopstate->ps.plan; + + switch (plannode->cmd) + { + case SETOPCMD_INTERSECT: + if (pergroup->numLeft > 0 && pergroup->numRight > 0) + setopstate->numOutput = 1; + else + setopstate->numOutput = 0; + break; + case SETOPCMD_INTERSECT_ALL: + setopstate->numOutput = + (pergroup->numLeft < pergroup->numRight) ? + pergroup->numLeft : pergroup->numRight; + break; + case SETOPCMD_EXCEPT: + if (pergroup->numLeft > 0 && pergroup->numRight == 0) + setopstate->numOutput = 1; + else + setopstate->numOutput = 0; + break; + case SETOPCMD_EXCEPT_ALL: + setopstate->numOutput = + (pergroup->numLeft < pergroup->numRight) ? + 0 : (pergroup->numLeft - pergroup->numRight); + break; + default: + elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd); + break; + } +} + + /* ---------------------------------------------------------------- * ExecSetOp * ---------------------------------------------------------------- @@ -47,14 +206,7 @@ TupleTableSlot * /* return: a tuple or NULL */ ExecSetOp(SetOpState *node) { SetOp *plannode = (SetOp *) node->ps.plan; - TupleTableSlot *resultTupleSlot; - PlanState *outerPlan; - - /* - * get information from the node - */ - outerPlan = outerPlanState(node); - resultTupleSlot = node->ps.ps_ResultTupleSlot; + TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot; /* * If the previously-returned tuple needs to be returned more than once, @@ -66,142 +218,218 @@ ExecSetOp(SetOpState *node) return resultTupleSlot; } - /* Flag that we have no current tuple */ - ExecClearTuple(resultTupleSlot); + /* Otherwise, we're done if we are out of groups */ + if (node->setop_done) + return NULL; + + /* Fetch the next tuple group according to the correct strategy */ + if (plannode->strategy == SETOP_HASHED) + { + if (!node->table_filled) + setop_fill_hash_table(node); + return setop_retrieve_hash_table(node); + } + else + return setop_retrieve_direct(node); +} + +/* + * ExecSetOp for non-hashed case + */ +static TupleTableSlot * +setop_retrieve_direct(SetOpState *setopstate) +{ + SetOp *node = (SetOp *) setopstate->ps.plan; + PlanState *outerPlan; + SetOpStatePerGroup pergroup; + TupleTableSlot *outerslot; + TupleTableSlot *resultTupleSlot; /* - * Absorb groups of duplicate tuples, counting them, and saving the first - * of each group as a possible return value. At the end of each group, - * decide whether to return anything. - * - * We assume that the tuples arrive in sorted order so we can detect - * duplicates easily. + * get state info from node */ - for (;;) + outerPlan = outerPlanState(setopstate); + pergroup = setopstate->pergroup; + resultTupleSlot = setopstate->ps.ps_ResultTupleSlot; + + /* + * We loop retrieving groups until we find one we should return + */ + while (!setopstate->setop_done) { - TupleTableSlot *inputTupleSlot; - bool endOfGroup; + /* + * If we don't already have the first tuple of the new group, fetch it + * from the outer plan. + */ + if (setopstate->grp_firstTuple == NULL) + { + outerslot = ExecProcNode(outerPlan); + if (!TupIsNull(outerslot)) + { + /* Make a copy of the first input tuple */ + setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot); + } + else + { + /* outer plan produced no tuples at all */ + setopstate->setop_done = true; + return NULL; + } + } /* - * fetch a tuple from the outer subplan, unless we already did. + * Store the copied first input tuple in the tuple table slot + * reserved for it. The tuple will be deleted when it is cleared + * from the slot. */ - if (node->ps.ps_OuterTupleSlot == NULL && - !node->subplan_done) - { - node->ps.ps_OuterTupleSlot = - ExecProcNode(outerPlan); - if (TupIsNull(node->ps.ps_OuterTupleSlot)) - node->subplan_done = true; - } - inputTupleSlot = node->ps.ps_OuterTupleSlot; + ExecStoreTuple(setopstate->grp_firstTuple, + resultTupleSlot, + InvalidBuffer, + true); + setopstate->grp_firstTuple = NULL; /* don't keep two pointers */ - if (TupIsNull(resultTupleSlot)) - { - /* - * First of group: save a copy in result slot, and reset - * duplicate-counters for new group. - */ - if (node->subplan_done) - return NULL; /* no more tuples */ - ExecCopySlot(resultTupleSlot, inputTupleSlot); - node->numLeft = 0; - node->numRight = 0; - endOfGroup = false; - } - else if (node->subplan_done) - { - /* - * Reached end of input, so finish processing final group - */ - endOfGroup = true; - } - else - { - /* - * Else test if the new tuple and the previously saved tuple - * match. - */ - if (execTuplesMatch(inputTupleSlot, - resultTupleSlot, - plannode->numCols, plannode->dupColIdx, - node->eqfunctions, - node->tempContext)) - endOfGroup = false; - else - endOfGroup = true; - } + /* Initialize working state for a new input tuple group */ + initialize_counts(setopstate, pergroup); - if (endOfGroup) + /* Count the first input tuple */ + advance_counts(setopstate, pergroup, resultTupleSlot); + + /* + * Scan the outer plan until we exhaust it or cross a group boundary. + */ + for (;;) { - /* - * We've reached the end of the group containing resultTuple. - * Decide how many copies (if any) to emit. This logic is - * straight from the SQL92 specification. - */ - switch (plannode->cmd) + outerslot = ExecProcNode(outerPlan); + if (TupIsNull(outerslot)) { - case SETOPCMD_INTERSECT: - if (node->numLeft > 0 && node->numRight > 0) - node->numOutput = 1; - else - node->numOutput = 0; - break; - case SETOPCMD_INTERSECT_ALL: - node->numOutput = - (node->numLeft < node->numRight) ? - node->numLeft : node->numRight; - break; - case SETOPCMD_EXCEPT: - if (node->numLeft > 0 && node->numRight == 0) - node->numOutput = 1; - else - node->numOutput = 0; - break; - case SETOPCMD_EXCEPT_ALL: - node->numOutput = - (node->numLeft < node->numRight) ? - 0 : (node->numLeft - node->numRight); - break; - default: - elog(ERROR, "unrecognized set op: %d", - (int) plannode->cmd); - break; - } - /* Fall out of for-loop if we have tuples to emit */ - if (node->numOutput > 0) + /* no more outer-plan tuples available */ + setopstate->setop_done = true; break; - /* Else flag that we have no current tuple, and loop around */ - ExecClearTuple(resultTupleSlot); - } - else - { - /* - * Current tuple is member of same group as resultTuple. Count it - * in the appropriate counter. - */ - int flag; - bool isNull; + } - flag = DatumGetInt32(slot_getattr(inputTupleSlot, - plannode->flagColIdx, - &isNull)); - Assert(!isNull); - if (flag) - node->numRight++; - else - node->numLeft++; - /* Set flag to fetch a new input tuple, and loop around */ - node->ps.ps_OuterTupleSlot = NULL; + /* + * Check whether we've crossed a group boundary. + */ + if (!execTuplesMatch(resultTupleSlot, + outerslot, + node->numCols, node->dupColIdx, + setopstate->eqfunctions, + setopstate->tempContext)) + { + /* + * Save the first input tuple of the next group. + */ + setopstate->grp_firstTuple = ExecCopySlotTuple(outerslot); + break; + } + + /* Still in same group, so count this tuple */ + advance_counts(setopstate, pergroup, outerslot); + } + + /* + * Done scanning input tuple group. See if we should emit any + * copies of result tuple, and if so return the first copy. + */ + set_output_count(setopstate, pergroup); + + if (setopstate->numOutput > 0) + { + setopstate->numOutput--; + return resultTupleSlot; } } + /* No more groups */ + ExecClearTuple(resultTupleSlot); + return NULL; +} + +/* + * ExecSetOp for hashed case: phase 1, read input and build hash table + */ +static void +setop_fill_hash_table(SetOpState *setopstate) +{ + PlanState *outerPlan; + SetOpHashEntry entry; + TupleTableSlot *outerslot; + /* - * If we fall out of loop, then we need to emit at least one copy of - * resultTuple. + * get state info from node */ - Assert(node->numOutput > 0); - node->numOutput--; - return resultTupleSlot; + outerPlan = outerPlanState(setopstate); + + /* + * Process each outer-plan tuple, and then fetch the next one, until we + * exhaust the outer plan. + */ + for (;;) + { + outerslot = ExecProcNode(outerPlan); + if (TupIsNull(outerslot)) + break; + + /* Find or build hashtable entry for this tuple's group */ + entry = lookup_hash_entry(setopstate, outerslot); + + /* Advance the counts */ + advance_counts(setopstate, &entry->pergroup, outerslot); + } + + setopstate->table_filled = true; + /* Initialize to walk the hash table */ + ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter); +} + +/* + * ExecSetOp for hashed case: phase 2, retrieving groups from hash table + */ +static TupleTableSlot * +setop_retrieve_hash_table(SetOpState *setopstate) +{ + SetOpHashEntry entry; + TupleTableSlot *resultTupleSlot; + + /* + * get state info from node + */ + resultTupleSlot = setopstate->ps.ps_ResultTupleSlot; + + /* + * We loop retrieving groups until we find one we should return + */ + while (!setopstate->setop_done) + { + /* + * Find the next entry in the hash table + */ + entry = (SetOpHashEntry) ScanTupleHashTable(&setopstate->hashiter); + if (entry == NULL) + { + /* No more entries in hashtable, so done */ + setopstate->setop_done = true; + return NULL; + } + + /* + * See if we should emit any copies of this tuple, and if so return + * the first copy. + */ + set_output_count(setopstate, &entry->pergroup); + + if (setopstate->numOutput > 0) + { + setopstate->numOutput--; + return ExecStoreMinimalTuple(entry->shared.firstTuple, + resultTupleSlot, + false); + } + } + + /* No more groups */ + ExecClearTuple(resultTupleSlot); + return NULL; } /* ---------------------------------------------------------------- @@ -226,9 +454,14 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags) setopstate->ps.plan = (Plan *) node; setopstate->ps.state = estate; - setopstate->ps.ps_OuterTupleSlot = NULL; - setopstate->subplan_done = false; + setopstate->eqfunctions = NULL; + setopstate->hashfunctions = NULL; + setopstate->setop_done = false; setopstate->numOutput = 0; + setopstate->pergroup = NULL; + setopstate->grp_firstTuple = NULL; + setopstate->hashtable = NULL; + setopstate->tableContext = NULL; /* * Miscellaneous initialization @@ -244,6 +477,19 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags) ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); + /* + * If hashing, we also need a longer-lived context to store the hash + * table. The table can't just be kept in the per-query context because + * we want to be able to throw it away in ExecReScanSetOp. + */ + if (node->strategy == SETOP_HASHED) + setopstate->tableContext = + AllocSetContextCreate(CurrentMemoryContext, + "SetOp hash table", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + #define SETOP_NSLOTS 1 /* @@ -252,8 +498,13 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags) ExecInitResultTupleSlot(estate, &setopstate->ps); /* - * then initialize outer plan + * initialize child nodes + * + * If we are hashing then the child plan does not need + * to handle REWIND efficiently; see ExecReScanSetOp. */ + if (node->strategy == SETOP_HASHED) + eflags &= ~EXEC_FLAG_REWIND; outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags); /* @@ -264,11 +515,30 @@ ExecInitSetOp(SetOp *node, EState *estate, int eflags) setopstate->ps.ps_ProjInfo = NULL; /* - * Precompute fmgr lookup data for inner loop + * Precompute fmgr lookup data for inner loop. We need both equality and + * hashing functions to do it by hashing, but only equality if not + * hashing. */ - setopstate->eqfunctions = - execTuplesMatchPrepare(node->numCols, - node->dupOperators); + if (node->strategy == SETOP_HASHED) + execTuplesHashPrepare(node->numCols, + node->dupOperators, + &setopstate->eqfunctions, + &setopstate->hashfunctions); + else + setopstate->eqfunctions = + execTuplesMatchPrepare(node->numCols, + node->dupOperators); + + if (node->strategy == SETOP_HASHED) + { + build_hash_table(setopstate); + setopstate->table_filled = false; + } + else + { + setopstate->pergroup = + (SetOpStatePerGroup) palloc0(sizeof(SetOpStatePerGroupData)); + } return setopstate; } @@ -293,9 +563,11 @@ ExecEndSetOp(SetOpState *node) { /* clean up tuple table */ ExecClearTuple(node->ps.ps_ResultTupleSlot); - node->ps.ps_OuterTupleSlot = NULL; + /* free subsidiary stuff including hashtable */ MemoryContextDelete(node->tempContext); + if (node->tableContext) + MemoryContextDelete(node->tableContext); ExecEndNode(outerPlanState(node)); } @@ -305,10 +577,50 @@ void ExecReScanSetOp(SetOpState *node, ExprContext *exprCtxt) { ExecClearTuple(node->ps.ps_ResultTupleSlot); - node->ps.ps_OuterTupleSlot = NULL; - node->subplan_done = false; + node->setop_done = false; node->numOutput = 0; + if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED) + { + /* + * In the hashed case, if we haven't yet built the hash table then we + * can just return; nothing done yet, so nothing to undo. If subnode's + * chgParam is not NULL then it will be re-scanned by ExecProcNode, + * else no reason to re-scan it at all. + */ + if (!node->table_filled) + return; + + /* + * If we do have the hash table and the subplan does not have any + * parameter changes, then we can just rescan the existing hash table; + * no need to build it again. + */ + if (((PlanState *) node)->lefttree->chgParam == NULL) + { + ResetTupleHashIterator(node->hashtable, &node->hashiter); + return; + } + } + + /* Release first tuple of group, if we have made a copy */ + if (node->grp_firstTuple != NULL) + { + heap_freetuple(node->grp_firstTuple); + node->grp_firstTuple = NULL; + } + + /* Release any hashtable storage */ + if (node->tableContext) + MemoryContextResetAndDeleteChildren(node->tableContext); + + /* And rebuild empty hashtable if needed */ + if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED) + { + build_hash_table(node); + node->table_filled = false; + } + /* * if chgParam of subnode is not null then plan will be re-scanned by * first ExecProcNode. diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index e562949d93..4767b924e9 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -15,7 +15,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.397 2008/08/07 01:11:46 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.398 2008/08/07 03:04:03 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -652,10 +652,12 @@ _copySetOp(SetOp *from) * copy remainder of node */ COPY_SCALAR_FIELD(cmd); + COPY_SCALAR_FIELD(strategy); COPY_SCALAR_FIELD(numCols); COPY_POINTER_FIELD(dupColIdx, from->numCols * sizeof(AttrNumber)); COPY_POINTER_FIELD(dupOperators, from->numCols * sizeof(Oid)); COPY_SCALAR_FIELD(flagColIdx); + COPY_SCALAR_FIELD(numGroups); return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 26f8fb3810..8440864b0e 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.331 2008/08/07 01:11:48 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.332 2008/08/07 03:04:03 tgl Exp $ * * NOTES * Every node type that can appear in stored rules' parsetrees *must* @@ -599,6 +599,7 @@ _outSetOp(StringInfo str, SetOp *node) _outPlanInfo(str, (Plan *) node); WRITE_ENUM_FIELD(cmd, SetOpCmd); + WRITE_ENUM_FIELD(strategy, SetOpStrategy); WRITE_INT_FIELD(numCols); appendStringInfo(str, " :dupColIdx"); @@ -610,6 +611,7 @@ _outSetOp(StringInfo str, SetOp *node) appendStringInfo(str, " %u", node->dupOperators[i]); WRITE_INT_FIELD(flagColIdx); + WRITE_LONG_FIELD(numGroups); } static void diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 80b6ed2edf..4cf2a5208b 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.242 2008/08/02 21:32:00 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.243 2008/08/07 03:04:03 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -3108,8 +3108,9 @@ make_unique(Plan *lefttree, List *distinctList) * already be sorted accordingly. */ SetOp * -make_setop(SetOpCmd cmd, Plan *lefttree, - List *distinctList, AttrNumber flagColIdx) +make_setop(SetOpCmd cmd, SetOpStrategy strategy, Plan *lefttree, + List *distinctList, AttrNumber flagColIdx, long numGroups, + double outputRows) { SetOp *node = makeNode(SetOp); Plan *plan = &node->plan; @@ -3120,20 +3121,13 @@ make_setop(SetOpCmd cmd, Plan *lefttree, ListCell *slitem; copy_plan_costsize(plan, lefttree); + plan->plan_rows = outputRows; /* * Charge one cpu_operator_cost per comparison per input tuple. We assume * all columns get compared at most of the tuples. */ - plan->total_cost += cpu_operator_cost * plan->plan_rows * numCols; - - /* - * We make the unsupported assumption that there will be 10% as many - * tuples out as in. Any way to do better? - */ - plan->plan_rows *= 0.1; - if (plan->plan_rows < 1) - plan->plan_rows = 1; + plan->total_cost += cpu_operator_cost * lefttree->plan_rows * numCols; plan->targetlist = lefttree->targetlist; plan->qual = NIL; @@ -3160,10 +3154,12 @@ make_setop(SetOpCmd cmd, Plan *lefttree, } node->cmd = cmd; + node->strategy = strategy; node->numCols = numCols; node->dupColIdx = dupColIdx; node->dupOperators = dupOperators; node->flagColIdx = flagColIdx; + node->numGroups = numGroups; return node; } diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 750d59a951..3a155c0d0a 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -22,7 +22,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.150 2008/08/07 01:11:50 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.151 2008/08/07 03:04:03 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -60,6 +60,7 @@ static Plan *generate_union_plan(SetOperationStmt *op, PlannerInfo *root, double tuple_fraction, List *refnames_tlist, List **sortClauses); static Plan *generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root, + double tuple_fraction, List *refnames_tlist, List **sortClauses); static List *recurse_union_children(Node *setOp, PlannerInfo *root, double tuple_fraction, @@ -229,7 +230,7 @@ recurse_set_operations(Node *setOp, PlannerInfo *root, refnames_tlist, sortClauses); else - plan = generate_nonunion_plan(op, root, + plan = generate_nonunion_plan(op, root, tuple_fraction, refnames_tlist, sortClauses); @@ -341,6 +342,7 @@ generate_union_plan(SetOperationStmt *op, PlannerInfo *root, */ static Plan * generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root, + double tuple_fraction, List *refnames_tlist, List **sortClauses) { @@ -351,6 +353,10 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root, *groupList, *planlist, *child_sortclauses; + double dNumDistinctRows; + double dNumOutputRows; + long numDistinctRows; + bool use_hash; SetOpCmd cmd; /* Recurse on children, ensuring their outputs are marked */ @@ -393,10 +399,32 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root, return plan; } + /* + * XXX for the moment, take the number of distinct groups as being the + * total input size, ie, the worst case. This is too conservative, but + * we don't want to risk having the hashtable overrun memory; also, + * it's not clear how to get a decent estimate of the true size. + */ + dNumDistinctRows = plan->plan_rows; + + /* Also convert to long int --- but 'ware overflow! */ + numDistinctRows = (long) Min(dNumDistinctRows, (double) LONG_MAX); + + /* + * The output size is taken as 10% of that, which is a completely bogus + * guess, but it's what we've used historically. + */ + dNumOutputRows = ceil(dNumDistinctRows * 0.1); + /* * Decide whether to hash or sort, and add a sort node if needed. */ - plan = (Plan *) make_sort_from_sortclauses(root, groupList, plan); + use_hash = choose_hashed_setop(root, groupList, plan, + tuple_fraction, dNumDistinctRows, + (op->op == SETOP_INTERSECT) ? "INTERSECT" : "EXCEPT"); + + if (!use_hash) + plan = (Plan *) make_sort_from_sortclauses(root, groupList, plan); /* * Finally, add a SetOp plan node to generate the correct output. @@ -414,9 +442,12 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root, cmd = SETOPCMD_INTERSECT; /* keep compiler quiet */ break; } - plan = (Plan *) make_setop(cmd, plan, groupList, list_length(op->colTypes) + 1); + plan = (Plan *) make_setop(cmd, use_hash ? SETOP_HASHED : SETOP_SORTED, + plan, groupList, list_length(op->colTypes) + 1, + numDistinctRows, dNumOutputRows); - *sortClauses = groupList; + /* Result is sorted only if we're not hashing */ + *sortClauses = use_hash ? NIL : groupList; return plan; } diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 96b7a16084..ce41c750af 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.185 2008/07/26 19:15:35 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.186 2008/08/07 03:04:03 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1414,21 +1414,31 @@ typedef struct HashState /* ---------------- * SetOpState information * - * SetOp nodes are used "on top of" sort nodes to discard - * duplicate tuples returned from the sort phase. These are - * more complex than a simple Unique since we have to count - * how many duplicates to return. + * Even in "sorted" mode, SetOp nodes are more complex than a simple + * Unique, since we have to count how many duplicates to return. But + * we also support hashing, so this is really more like a cut-down + * form of Agg. * ---------------- */ +/* this struct is private in nodeSetOp.c: */ +typedef struct SetOpStatePerGroupData *SetOpStatePerGroup; + typedef struct SetOpState { PlanState ps; /* its first field is NodeTag */ - FmgrInfo *eqfunctions; /* per-field lookup data for equality fns */ - bool subplan_done; /* has subplan returned EOF? */ - long numLeft; /* number of left-input dups of cur group */ - long numRight; /* number of right-input dups of cur group */ + FmgrInfo *eqfunctions; /* per-grouping-field equality fns */ + FmgrInfo *hashfunctions; /* per-grouping-field hash fns */ + bool setop_done; /* indicates completion of output scan */ long numOutput; /* number of dups left to output */ MemoryContext tempContext; /* short-term context for comparisons */ + /* these fields are used in SETOP_SORTED mode: */ + SetOpStatePerGroup pergroup; /* per-group working state */ + HeapTuple grp_firstTuple; /* copy of first tuple of current group */ + /* these fields are used in SETOP_HASHED mode: */ + TupleHashTable hashtable; /* hash table with one entry per group */ + MemoryContext tableContext; /* memory context containing hash table */ + bool table_filled; /* hash table filled yet? */ + TupleHashIterator hashiter; /* for iterating through hash table */ } SetOpState; /* ---------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 6fb93e810d..e2eaadd29f 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.100 2008/04/13 20:51:21 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.101 2008/08/07 03:04:04 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -525,15 +525,23 @@ typedef enum SetOpCmd SETOPCMD_EXCEPT_ALL } SetOpCmd; +typedef enum SetOpStrategy +{ + SETOP_SORTED, /* input must be sorted */ + SETOP_HASHED /* use internal hashtable */ +} SetOpStrategy; + typedef struct SetOp { Plan plan; SetOpCmd cmd; /* what to do */ + SetOpStrategy strategy; /* how to do it */ int numCols; /* number of columns to check for * duplicate-ness */ AttrNumber *dupColIdx; /* their indexes in the target list */ Oid *dupOperators; /* equality operators to compare with */ AttrNumber flagColIdx; /* where is the flag column, if any */ + long numGroups; /* estimated number of groups in input */ } SetOp; /* ---------------- diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 0b46a17370..616f0ea2ad 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.108 2008/05/02 21:26:10 tgl Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.109 2008/08/07 03:04:04 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -61,8 +61,9 @@ extern Plan *materialize_finished_plan(Plan *subplan); extern Unique *make_unique(Plan *lefttree, List *distinctList); extern Limit *make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, int64 offset_est, int64 count_est); -extern SetOp *make_setop(SetOpCmd cmd, Plan *lefttree, - List *distinctList, AttrNumber flagColIdx); +extern SetOp *make_setop(SetOpCmd cmd, SetOpStrategy strategy, Plan *lefttree, + List *distinctList, AttrNumber flagColIdx, long numGroups, + double outputRows); extern Result *make_result(PlannerInfo *root, List *tlist, Node *resconstantqual, Plan *subplan); extern bool is_projection_capable_plan(Plan *plan); diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out index 722f9651be..295dacea86 100644 --- a/src/test/regress/expected/union.out +++ b/src/test/regress/expected/union.out @@ -275,21 +275,21 @@ SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl; 4567890123456789 (3 rows) -SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl; +SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1; q2 ------------------- -4567890123456789 456 (2 rows) -SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl; +SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl ORDER BY 1; q2 ------------------- -4567890123456789 456 (2 rows) -SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl; +SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl ORDER BY 1; q2 ------------------- -4567890123456789 @@ -326,7 +326,7 @@ SELECT f1 FROM float8_tbl INTERSECT SELECT f1 FROM int4_tbl; 0 (1 row) -SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl; +SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl ORDER BY 1; f1 ----------------------- -1.2345678901234e+200 @@ -369,14 +369,14 @@ SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT q2 -4567890123456789 (7 rows) -SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl; +SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1; q1 ------------------- -4567890123456789 456 (2 rows) -SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl))); +SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1))); q1 ------------------- 123 @@ -388,7 +388,7 @@ SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FR 456 (7 rows) -(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl; +(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1; q1 ------------------- -4567890123456789 diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql index 0b83d6b185..81e19d165c 100644 --- a/src/test/regress/sql/union.sql +++ b/src/test/regress/sql/union.sql @@ -97,11 +97,11 @@ SELECT q2 FROM int8_tbl INTERSECT SELECT q1 FROM int8_tbl; SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl; -SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl; +SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1; -SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl; +SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl ORDER BY 1; -SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl; +SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl ORDER BY 1; SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl; @@ -115,7 +115,7 @@ SELECT q1 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q2 FROM int8_tbl; SELECT f1 FROM float8_tbl INTERSECT SELECT f1 FROM int4_tbl; -SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl; +SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl ORDER BY 1; -- -- Operator precedence and (((((extra))))) parentheses @@ -127,11 +127,11 @@ SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT q2 (((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl))) UNION ALL SELECT q2 FROM int8_tbl; -SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl; +SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1; -SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl))); +SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1))); -(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl; +(((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1; -- -- Subqueries with ORDER BY & LIMIT clauses