From 2d1d96b1cea8f67a095e8f28372af4081605f681 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 7 Aug 2008 01:11:52 +0000 Subject: [PATCH] Teach the system how to use hashing for UNION. (INTERSECT/EXCEPT will follow, but seem like a separate patch since most of the remaining work is on the executor side.) I took the opportunity to push selection of the grouping operators for set operations into the parser where it belongs. Otherwise this is just a small exercise in making prepunion.c consider both alternatives. As with the recent DISTINCT patch, this means we can UNION on datatypes that can hash but not sort, and it means that UNION without ORDER BY is no longer certain to produce sorted output. --- src/backend/catalog/dependency.c | 11 +- src/backend/nodes/copyfuncs.c | 3 +- src/backend/nodes/equalfuncs.c | 3 +- src/backend/nodes/outfuncs.c | 3 +- src/backend/nodes/readfuncs.c | 3 +- src/backend/optimizer/plan/planner.c | 110 +-------- src/backend/optimizer/prep/prepunion.c | 282 +++++++++++++++++++---- src/backend/optimizer/util/clauses.c | 5 +- src/backend/optimizer/util/pathnode.c | 29 +-- src/backend/optimizer/util/tlist.c | 106 ++++++++- src/backend/parser/analyze.c | 29 ++- src/backend/parser/parse_clause.c | 21 +- src/include/catalog/catversion.h | 4 +- src/include/nodes/parsenodes.h | 11 +- src/include/optimizer/tlist.h | 7 +- src/include/parser/parse_clause.h | 5 +- src/test/regress/expected/copyselect.out | 4 +- src/test/regress/expected/union.out | 17 +- src/test/regress/sql/copyselect.sql | 4 +- src/test/regress/sql/union.sql | 17 +- 20 files changed, 462 insertions(+), 212 deletions(-) diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index d58b9d4a2f..5f5c91366f 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/dependency.c,v 1.77 2008/08/02 21:31:59 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/dependency.c,v 1.78 2008/08/07 01:11:46 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1597,6 +1597,15 @@ find_expr_references_walker(Node *node, context->rtables = list_delete_first(context->rtables); return result; } + else if (IsA(node, SetOperationStmt)) + { + SetOperationStmt *setop = (SetOperationStmt *) node; + + /* we need to look at the groupClauses for operator references */ + find_expr_references_walker((Node *) setop->groupClauses, context); + /* fall through to examine child nodes */ + } + return expression_tree_walker(node, find_expr_references_walker, (void *) context); } diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 652c726a67..e562949d93 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -15,7 +15,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.396 2008/08/02 21:31:59 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/copyfuncs.c,v 1.397 2008/08/07 01:11:46 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1943,6 +1943,7 @@ _copySetOperationStmt(SetOperationStmt *from) COPY_NODE_FIELD(rarg); COPY_NODE_FIELD(colTypes); COPY_NODE_FIELD(colTypmods); + COPY_NODE_FIELD(groupClauses); return newnode; } diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 4e011947ef..dc0b2cca37 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -18,7 +18,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/equalfuncs.c,v 1.325 2008/08/02 21:31:59 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/equalfuncs.c,v 1.326 2008/08/07 01:11:47 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -839,6 +839,7 @@ _equalSetOperationStmt(SetOperationStmt *a, SetOperationStmt *b) COMPARE_NODE_FIELD(rarg); COMPARE_NODE_FIELD(colTypes); COMPARE_NODE_FIELD(colTypmods); + COMPARE_NODE_FIELD(groupClauses); return true; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 408b9b2a75..26f8fb3810 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.330 2008/08/05 02:43:17 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.331 2008/08/07 01:11:48 tgl Exp $ * * NOTES * Every node type that can appear in stored rules' parsetrees *must* @@ -1780,6 +1780,7 @@ _outSetOperationStmt(StringInfo str, SetOperationStmt *node) WRITE_NODE_FIELD(rarg); WRITE_NODE_FIELD(colTypes); WRITE_NODE_FIELD(colTypmods); + WRITE_NODE_FIELD(groupClauses); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index e7de4898c0..c46593d22c 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/readfuncs.c,v 1.211 2008/08/02 21:31:59 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/readfuncs.c,v 1.212 2008/08/07 01:11:49 tgl Exp $ * * NOTES * Path and Plan nodes do not have any readfuncs support, because we @@ -232,6 +232,7 @@ _readSetOperationStmt(void) READ_NODE_FIELD(rarg); READ_NODE_FIELD(colTypes); READ_NODE_FIELD(colTypmods); + READ_NODE_FIELD(groupClauses); READ_DONE(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 5b7fde2533..c818f0ddf1 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.239 2008/08/05 16:03:10 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.240 2008/08/07 01:11:50 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -68,10 +68,6 @@ static double preprocess_limit(PlannerInfo *root, double tuple_fraction, int64 *offset_est, int64 *count_est); static void preprocess_groupclause(PlannerInfo *root); -static Oid *extract_grouping_ops(List *groupClause); -static AttrNumber *extract_grouping_cols(List *groupClause, List *tlist); -static bool grouping_is_sortable(List *groupClause); -static bool grouping_is_hashable(List *groupClause); static bool choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, double limit_tuples, Path *cheapest_path, Path *sorted_path, @@ -784,10 +780,9 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) /* * If there's a top-level ORDER BY, assume we have to fetch all the - * tuples. This might seem too simplistic given all the hackery below - * to possibly avoid the sort ... but a nonzero tuple_fraction is only - * of use to plan_set_operations() when the setop is UNION ALL, and - * the result of UNION ALL is always unsorted. + * tuples. This might be too simplistic given all the hackery below + * to possibly avoid the sort; but the odds of accurate estimates + * here are pretty low anyway. */ if (parse->sortClause) tuple_fraction = 0.0; @@ -818,7 +813,8 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) */ Assert(parse->commandType == CMD_SELECT); - tlist = postprocess_setop_tlist(result_plan->targetlist, tlist); + tlist = postprocess_setop_tlist(copyObject(result_plan->targetlist), + tlist); /* * Can't handle FOR UPDATE/SHARE here (parser should have checked @@ -1714,100 +1710,6 @@ preprocess_groupclause(PlannerInfo *root) parse->groupClause = new_groupclause; } -/* - * extract_grouping_ops - make an array of the equality operator OIDs - * for a SortGroupClause list - */ -static Oid * -extract_grouping_ops(List *groupClause) -{ - int numCols = list_length(groupClause); - int colno = 0; - Oid *groupOperators; - ListCell *glitem; - - groupOperators = (Oid *) palloc(sizeof(Oid) * numCols); - - foreach(glitem, groupClause) - { - SortGroupClause *groupcl = (SortGroupClause *) lfirst(glitem); - - groupOperators[colno] = groupcl->eqop; - Assert(OidIsValid(groupOperators[colno])); - colno++; - } - - return groupOperators; -} - -/* - * extract_grouping_cols - make an array of the grouping column resnos - * for a SortGroupClause list - */ -static AttrNumber * -extract_grouping_cols(List *groupClause, List *tlist) -{ - AttrNumber *grpColIdx; - int numCols = list_length(groupClause); - int colno = 0; - ListCell *glitem; - - grpColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols); - - foreach(glitem, groupClause) - { - SortGroupClause *groupcl = (SortGroupClause *) lfirst(glitem); - TargetEntry *tle = get_sortgroupclause_tle(groupcl, tlist); - - grpColIdx[colno++] = tle->resno; - } - - return grpColIdx; -} - -/* - * grouping_is_sortable - is it possible to implement grouping list by sorting? - * - * This is easy since the parser will have included a sortop if one exists. - */ -static bool -grouping_is_sortable(List *groupClause) -{ - ListCell *glitem; - - foreach(glitem, groupClause) - { - SortGroupClause *groupcl = (SortGroupClause *) lfirst(glitem); - - if (!OidIsValid(groupcl->sortop)) - return false; - } - return true; -} - -/* - * grouping_is_hashable - is it possible to implement grouping list by hashing? - * - * We assume hashing is OK if the equality operators are marked oprcanhash. - * (If there isn't actually a supporting hash function, the executor will - * complain at runtime; but this is a misdeclaration of the operator, not - * a system bug.) - */ -static bool -grouping_is_hashable(List *groupClause) -{ - ListCell *glitem; - - foreach(glitem, groupClause) - { - SortGroupClause *groupcl = (SortGroupClause *) lfirst(glitem); - - if (!op_hashjoinable(groupcl->eqop)) - return false; - } - return true; -} - /* * choose_hashed_grouping - should we use hashed grouping? * diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 31ba005edb..750d59a951 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -22,7 +22,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.149 2008/08/02 21:32:00 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.150 2008/08/07 01:11:50 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -32,8 +32,12 @@ #include "access/heapam.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" +#include "miscadmin.h" #include "nodes/makefuncs.h" #include "optimizer/clauses.h" +#include "optimizer/cost.h" +#include "optimizer/pathnode.h" +#include "optimizer/paths.h" #include "optimizer/plancat.h" #include "optimizer/planmain.h" #include "optimizer/planner.h" @@ -61,6 +65,13 @@ static List *recurse_union_children(Node *setOp, PlannerInfo *root, double tuple_fraction, SetOperationStmt *top_union, List *refnames_tlist); +static Plan *make_union_unique(SetOperationStmt *op, Plan *plan, + PlannerInfo *root, double tuple_fraction, + List **sortClauses); +static bool choose_hashed_setop(PlannerInfo *root, List *groupClauses, + Plan *input_plan, + double tuple_fraction, double dNumDistinctRows, + const char *construct); static List *generate_setop_tlist(List *colTypes, int flag, Index varno, bool hack_constants, @@ -69,7 +80,7 @@ static List *generate_setop_tlist(List *colTypes, int flag, static List *generate_append_tlist(List *colTypes, bool flag, List *input_plans, List *refnames_tlist); -static List *generate_setop_sortlist(List *targetlist); +static List *generate_setop_grouplist(SetOperationStmt *op, List *targetlist); static void expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti); static void make_inh_translation_lists(Relation oldrelation, @@ -99,7 +110,8 @@ static List *adjust_inherited_tlist(List *tlist, * top level has already been factored into tuple_fraction. * * *sortClauses is an output argument: it is set to a list of SortGroupClauses - * representing the result ordering of the topmost set operation. + * representing the result ordering of the topmost set operation. (This will + * be NIL if the output isn't ordered.) */ Plan * plan_set_operations(PlannerInfo *root, double tuple_fraction, @@ -287,8 +299,8 @@ generate_union_plan(SetOperationStmt *op, PlannerInfo *root, /* * If any of my children are identical UNION nodes (same op, all-flag, and * colTypes) then they can be merged into this node so that we generate - * only one Append and Sort for the lot. Recurse to find such nodes and - * compute their children's plans. + * only one Append and unique-ification for the lot. Recurse to find such + * nodes and compute their children's plans. */ planlist = list_concat(recurse_union_children(op->larg, root, tuple_fraction, @@ -314,22 +326,12 @@ generate_union_plan(SetOperationStmt *op, PlannerInfo *root, /* * For UNION ALL, we just need the Append plan. For UNION, need to add - * Sort and Unique nodes to produce unique output. + * node(s) to remove duplicates. */ - if (!op->all) - { - List *sortList; - - sortList = generate_setop_sortlist(tlist); - if (sortList) - { - plan = (Plan *) make_sort_from_sortclauses(root, sortList, plan); - plan = (Plan *) make_unique(plan, sortList); - } - *sortClauses = sortList; - } + if (op->all) + *sortClauses = NIL; /* result of UNION ALL is always unsorted */ else - *sortClauses = NIL; + plan = make_union_unique(op, plan, root, tuple_fraction, sortClauses); return plan; } @@ -346,7 +348,7 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root, *rplan, *plan; List *tlist, - *sortList, + *groupList, *planlist, *child_sortclauses; SetOpCmd cmd; @@ -381,19 +383,24 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root, */ plan = (Plan *) make_append(planlist, false, tlist); - /* - * Sort the child results, then add a SetOp plan node to generate the - * correct output. - */ - sortList = generate_setop_sortlist(tlist); + /* Identify the grouping semantics */ + groupList = generate_setop_grouplist(op, tlist); - if (sortList == NIL) /* nothing to sort on? */ + /* punt if nothing to group on (can this happen?) */ + if (groupList == NIL) { *sortClauses = NIL; return plan; } - plan = (Plan *) make_sort_from_sortclauses(root, sortList, plan); + /* + * Decide whether to hash or sort, and add a sort node if needed. + */ + plan = (Plan *) make_sort_from_sortclauses(root, groupList, plan); + + /* + * Finally, add a SetOp plan node to generate the correct output. + */ switch (op->op) { case SETOP_INTERSECT: @@ -403,14 +410,13 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root, cmd = op->all ? SETOPCMD_EXCEPT_ALL : SETOPCMD_EXCEPT; break; default: - elog(ERROR, "unrecognized set op: %d", - (int) op->op); + elog(ERROR, "unrecognized set op: %d", (int) op->op); cmd = SETOPCMD_INTERSECT; /* keep compiler quiet */ break; } - plan = (Plan *) make_setop(cmd, plan, sortList, list_length(op->colTypes) + 1); + plan = (Plan *) make_setop(cmd, plan, groupList, list_length(op->colTypes) + 1); - *sortClauses = sortList; + *sortClauses = groupList; return plan; } @@ -465,6 +471,171 @@ recurse_union_children(Node *setOp, PlannerInfo *root, &child_sortclauses)); } +/* + * Add nodes to the given plan tree to unique-ify the result of a UNION. + */ +static Plan * +make_union_unique(SetOperationStmt *op, Plan *plan, + PlannerInfo *root, double tuple_fraction, + List **sortClauses) +{ + List *groupList; + double dNumDistinctRows; + long numDistinctRows; + + /* Identify the grouping semantics */ + groupList = generate_setop_grouplist(op, plan->targetlist); + + /* punt if nothing to group on (can this happen?) */ + if (groupList == NIL) + { + *sortClauses = NIL; + return plan; + } + + /* + * XXX for the moment, take the number of distinct groups as being the + * total input size, ie, the worst case. This is too conservative, but + * we don't want to risk having the hashtable overrun memory; also, + * it's not clear how to get a decent estimate of the true size. One + * should note as well the propensity of novices to write UNION rather + * than UNION ALL even when they don't expect any duplicates... + */ + dNumDistinctRows = plan->plan_rows; + + /* Also convert to long int --- but 'ware overflow! */ + numDistinctRows = (long) Min(dNumDistinctRows, (double) LONG_MAX); + + /* Decide whether to hash or sort */ + if (choose_hashed_setop(root, groupList, plan, + tuple_fraction, dNumDistinctRows, + "UNION")) + { + /* Hashed aggregate plan --- no sort needed */ + plan = (Plan *) make_agg(root, + plan->targetlist, + NIL, + AGG_HASHED, + list_length(groupList), + extract_grouping_cols(groupList, + plan->targetlist), + extract_grouping_ops(groupList), + numDistinctRows, + 0, + plan); + /* Hashed aggregation produces randomly-ordered results */ + *sortClauses = NIL; + } + else + { + /* Sort and Unique */ + plan = (Plan *) make_sort_from_sortclauses(root, groupList, plan); + plan = (Plan *) make_unique(plan, groupList); + plan->plan_rows = dNumDistinctRows; + /* We know the sort order of the result */ + *sortClauses = groupList; + } + + return plan; +} + +/* + * choose_hashed_setop - should we use hashing for a set operation? + */ +static bool +choose_hashed_setop(PlannerInfo *root, List *groupClauses, + Plan *input_plan, + double tuple_fraction, double dNumDistinctRows, + const char *construct) +{ + int numDistinctCols = list_length(groupClauses); + bool can_sort; + bool can_hash; + Size hashentrysize; + List *needed_pathkeys; + Path hashed_p; + Path sorted_p; + + /* Check whether the operators support sorting or hashing */ + can_sort = grouping_is_sortable(groupClauses); + can_hash = grouping_is_hashable(groupClauses); + if (can_hash && can_sort) + { + /* we have a meaningful choice to make, continue ... */ + } + else if (can_hash) + return true; + else if (can_sort) + return false; + else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /* translator: %s is UNION, INTERSECT, or EXCEPT */ + errmsg("could not implement %s", construct), + errdetail("Some of the datatypes only support hashing, while others only support sorting."))); + + /* Prefer sorting when enable_hashagg is off */ + if (!enable_hashagg) + return false; + + /* + * Don't do it if it doesn't look like the hashtable will fit into + * work_mem. + */ + hashentrysize = MAXALIGN(input_plan->plan_width) + MAXALIGN(sizeof(MinimalTupleData)); + + if (hashentrysize * dNumDistinctRows > work_mem * 1024L) + return false; + + /* + * See if the estimated cost is no more than doing it the other way. + * + * We need to consider input_plan + hashagg versus input_plan + sort + + * group. Note that the actual result plan might involve a SetOp or + * Unique node, not Agg or Group, but the cost estimates for Agg and Group + * should be close enough for our purposes here. + * + * These path variables are dummies that just hold cost fields; we don't + * make actual Paths for these steps. + */ + cost_agg(&hashed_p, root, AGG_HASHED, 0, + numDistinctCols, dNumDistinctRows, + input_plan->startup_cost, input_plan->total_cost, + input_plan->plan_rows); + + /* + * Now for the sorted case. Note that the input is *always* unsorted, + * since it was made by appending unrelated sub-relations together. + */ + sorted_p.startup_cost = input_plan->startup_cost; + sorted_p.total_cost = input_plan->total_cost; + /* XXX this is more expensive than cost_sort really needs: */ + needed_pathkeys = make_pathkeys_for_sortclauses(root, + groupClauses, + input_plan->targetlist, + true); + cost_sort(&sorted_p, root, needed_pathkeys, sorted_p.total_cost, + input_plan->plan_rows, input_plan->plan_width, -1.0); + cost_group(&sorted_p, root, numDistinctCols, dNumDistinctRows, + sorted_p.startup_cost, sorted_p.total_cost, + input_plan->plan_rows); + + /* + * Now make the decision using the top-level tuple fraction. First we + * have to convert an absolute count (LIMIT) into fractional form. + */ + if (tuple_fraction >= 1.0) + tuple_fraction /= dNumDistinctRows; + + if (compare_fractional_path_costs(&hashed_p, &sorted_p, + tuple_fraction) < 0) + { + /* Hashed is cheaper, so use it */ + return true; + } + return false; +} + /* * Generate targetlist for a set-operation plan node * @@ -677,30 +848,47 @@ generate_append_tlist(List *colTypes, bool flag, } /* - * generate_setop_sortlist - * Build a SortGroupClause list enumerating all the non-resjunk - * tlist entries, using default ordering properties. + * generate_setop_grouplist + * Build a SortGroupClause list defining the sort/grouping properties + * of the setop's output columns. * - * For now, we require all the items to be sortable. Eventually we - * should implement hashing setops and allow hash-only datatypes. + * Parse analysis already determined the properties and built a suitable + * list, except that the entries do not have sortgrouprefs set because + * the parser output representation doesn't include a tlist for each + * setop. So what we need to do here is copy that list and install + * proper sortgrouprefs into it and into the targetlist. */ static List * -generate_setop_sortlist(List *targetlist) +generate_setop_grouplist(SetOperationStmt *op, List *targetlist) { - List *sortlist = NIL; - ListCell *l; + List *grouplist = (List *) copyObject(op->groupClauses); + ListCell *lg; + ListCell *lt; + Index refno = 1; - foreach(l, targetlist) + lg = list_head(grouplist); + foreach(lt, targetlist) { - TargetEntry *tle = (TargetEntry *) lfirst(l); + TargetEntry *tle = (TargetEntry *) lfirst(lt); + SortGroupClause *sgc; - if (!tle->resjunk) - sortlist = addTargetToGroupList(NULL, tle, - sortlist, targetlist, - true, /* XXX fixme someday */ - false); + /* tlist shouldn't have any sortgrouprefs yet */ + Assert(tle->ressortgroupref == 0); + + if (tle->resjunk) + continue; /* ignore resjunk columns */ + + /* non-resjunk columns should have grouping clauses */ + Assert(lg != NULL); + sgc = (SortGroupClause *) lfirst(lg); + lg = lnext(lg); + Assert(sgc->tleSortGroupRef == 0); + + /* we could use assignSortGroupRef here, but seems a bit silly */ + sgc->tleSortGroupRef = tle->ressortgroupref = refno++; } - return sortlist; + Assert(lg == NULL); + return grouplist; } diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 04ef2224c9..858d4abcbd 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/clauses.c,v 1.260 2008/08/02 21:32:00 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/clauses.c,v 1.261 2008/08/07 01:11:50 tgl Exp $ * * HISTORY * AUTHOR DATE MAJOR EVENT @@ -3933,6 +3933,8 @@ expression_tree_walker(Node *node, return true; if (walker(setop->rarg, context)) return true; + + /* groupClauses are deemed uninteresting */ } break; case T_InClauseInfo: @@ -4535,6 +4537,7 @@ expression_tree_mutator(Node *node, FLATCOPY(newnode, setop, SetOperationStmt); MUTATE(newnode->larg, setop->larg, Node *); MUTATE(newnode->rarg, setop->rarg, Node *); + /* We do not mutate groupClauses by default */ return (Node *) newnode; } break; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index ad3c10ac27..655443e9ef 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.144 2008/08/02 21:32:00 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.145 2008/08/07 01:11:50 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -24,7 +24,6 @@ #include "optimizer/paths.h" #include "optimizer/tlist.h" #include "parser/parse_expr.h" -#include "parser/parse_oper.h" #include "parser/parsetree.h" #include "utils/selfuncs.h" #include "utils/lsyscache.h" @@ -1003,12 +1002,6 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) /* * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row, * except with ALL. - * - * XXX this code knows that prepunion.c will adopt the default sort/group - * operators for each column datatype to determine uniqueness. It'd - * probably be better if these operators were chosen at parse time and - * stored into the parsetree, instead of leaving bits of the planner to - * decide semantics. */ if (query->setOperations) { @@ -1019,24 +1012,26 @@ query_is_distinct_for(Query *query, List *colnos, List *opids) if (!topop->all) { + ListCell *lg; + /* We're good if all the nonjunk output columns are in colnos */ + lg = list_head(topop->groupClauses); foreach(l, query->targetList) { TargetEntry *tle = (TargetEntry *) lfirst(l); - Oid tle_eq_opr; + SortGroupClause *sgc; if (tle->resjunk) continue; /* ignore resjunk columns */ + /* non-resjunk columns should have grouping clauses */ + Assert(lg != NULL); + sgc = (SortGroupClause *) lfirst(lg); + lg = lnext(lg); + opid = distinct_col_search(tle->resno, colnos, opids); - if (!OidIsValid(opid)) - break; /* exit early if no match */ - /* check for compatible semantics */ - get_sort_group_operators(exprType((Node *) tle->expr), - false, false, false, - NULL, &tle_eq_opr, NULL); - if (!OidIsValid(tle_eq_opr) || - !equality_ops_are_compatible(opid, tle_eq_opr)) + if (!OidIsValid(opid) || + !equality_ops_are_compatible(opid, sgc->eqop)) break; /* exit early if no match */ } if (l == NULL) /* had matches for all? */ diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c index fc0880ad56..a2c627fd4d 100644 --- a/src/backend/optimizer/util/tlist.c +++ b/src/backend/optimizer/util/tlist.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/tlist.c,v 1.79 2008/08/02 21:32:00 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/tlist.c,v 1.80 2008/08/07 01:11:50 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -18,6 +18,7 @@ #include "optimizer/tlist.h" #include "optimizer/var.h" #include "parser/parse_expr.h" +#include "utils/lsyscache.h" /***************************************************************************** @@ -202,6 +203,109 @@ get_sortgrouplist_exprs(List *sgClauses, List *targetList) } +/***************************************************************************** + * Functions to extract data from a list of SortGroupClauses + * + * These don't really belong in tlist.c, but they are sort of related to the + * functions just above, and they don't seem to deserve their own file. + *****************************************************************************/ + +/* + * extract_grouping_ops - make an array of the equality operator OIDs + * for a SortGroupClause list + */ +Oid * +extract_grouping_ops(List *groupClause) +{ + int numCols = list_length(groupClause); + int colno = 0; + Oid *groupOperators; + ListCell *glitem; + + groupOperators = (Oid *) palloc(sizeof(Oid) * numCols); + + foreach(glitem, groupClause) + { + SortGroupClause *groupcl = (SortGroupClause *) lfirst(glitem); + + groupOperators[colno] = groupcl->eqop; + Assert(OidIsValid(groupOperators[colno])); + colno++; + } + + return groupOperators; +} + +/* + * extract_grouping_cols - make an array of the grouping column resnos + * for a SortGroupClause list + */ +AttrNumber * +extract_grouping_cols(List *groupClause, List *tlist) +{ + AttrNumber *grpColIdx; + int numCols = list_length(groupClause); + int colno = 0; + ListCell *glitem; + + grpColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols); + + foreach(glitem, groupClause) + { + SortGroupClause *groupcl = (SortGroupClause *) lfirst(glitem); + TargetEntry *tle = get_sortgroupclause_tle(groupcl, tlist); + + grpColIdx[colno++] = tle->resno; + } + + return grpColIdx; +} + +/* + * grouping_is_sortable - is it possible to implement grouping list by sorting? + * + * This is easy since the parser will have included a sortop if one exists. + */ +bool +grouping_is_sortable(List *groupClause) +{ + ListCell *glitem; + + foreach(glitem, groupClause) + { + SortGroupClause *groupcl = (SortGroupClause *) lfirst(glitem); + + if (!OidIsValid(groupcl->sortop)) + return false; + } + return true; +} + +/* + * grouping_is_hashable - is it possible to implement grouping list by hashing? + * + * We assume hashing is OK if the equality operators are marked oprcanhash. + * (If there isn't actually a supporting hash function, the executor will + * complain at runtime; but this is a misdeclaration of the operator, not + * a system bug.) + */ +bool +grouping_is_hashable(List *groupClause) +{ + ListCell *glitem; + + foreach(glitem, groupClause) + { + SortGroupClause *groupcl = (SortGroupClause *) lfirst(glitem); + + if (!op_hashjoinable(groupcl->eqop)) + return false; + } + return true; +} + + + /* * Does tlist have same output datatypes as listed in colTypes? * diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 2cc3f28d20..3de232ba71 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -17,7 +17,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/parser/analyze.c,v 1.375 2008/08/02 21:32:00 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/parser/analyze.c,v 1.376 2008/08/07 01:11:51 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -33,6 +33,7 @@ #include "parser/parse_clause.h" #include "parser/parse_coerce.h" #include "parser/parse_expr.h" +#include "parser/parse_oper.h" #include "parser/parse_relation.h" #include "parser/parse_target.h" #include "parser/parsetree.h" @@ -1326,6 +1327,7 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt) op->colTypes = NIL; op->colTypmods = NIL; + op->groupClauses = NIL; /* don't have a "foreach4", so chase two of the lists by hand */ lcm = list_head(lcoltypmods); rcm = list_head(rcoltypmods); @@ -1349,6 +1351,31 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt) op->colTypes = lappend_oid(op->colTypes, rescoltype); op->colTypmods = lappend_int(op->colTypmods, rescoltypmod); + /* + * For all cases except UNION ALL, identify the grouping operators + * (and, if available, sorting operators) that will be used to + * eliminate duplicates. + */ + if (op->op != SETOP_UNION || !op->all) + { + SortGroupClause *grpcl = makeNode(SortGroupClause); + Oid sortop; + Oid eqop; + + /* determine the eqop and optional sortop */ + get_sort_group_operators(rescoltype, + false, true, false, + &sortop, &eqop, NULL); + + /* we don't have a tlist yet, so can't assign sortgrouprefs */ + grpcl->tleSortGroupRef = 0; + grpcl->eqop = eqop; + grpcl->sortop = sortop; + grpcl->nulls_first = false; /* OK with or without sortop */ + + op->groupClauses = lappend(op->groupClauses, grpcl); + } + lcm = lnext(lcm); rcm = lnext(rcm); } diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c index 2b04ee5e33..79b45414d4 100644 --- a/src/backend/parser/parse_clause.c +++ b/src/backend/parser/parse_clause.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.174 2008/08/05 02:43:17 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.175 2008/08/07 01:11:51 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -70,6 +70,9 @@ static List *addTargetToSortList(ParseState *pstate, TargetEntry *tle, List *sortlist, List *targetlist, SortByDir sortby_dir, SortByNulls sortby_nulls, List *sortby_opname, bool resolveUnknown); +static List *addTargetToGroupList(ParseState *pstate, TargetEntry *tle, + List *grouplist, List *targetlist, + bool resolveUnknown); /* @@ -1355,7 +1358,7 @@ transformGroupClause(ParseState *pstate, List *grouplist, if (!found) result = addTargetToGroupList(pstate, tle, result, *targetlist, - false, true); + true); } return result; @@ -1456,7 +1459,7 @@ transformDistinctClause(ParseState *pstate, continue; /* ignore junk */ result = addTargetToGroupList(pstate, tle, result, *targetlist, - false, true); + true); } return result; @@ -1551,7 +1554,7 @@ transformDistinctOnClause(ParseState *pstate, List *distinctlist, errmsg("SELECT DISTINCT ON expressions must match initial ORDER BY expressions"))); result = addTargetToGroupList(pstate, tle, result, *targetlist, - false, true); + true); } return result; @@ -1679,10 +1682,6 @@ addTargetToSortList(ParseState *pstate, TargetEntry *tle, * the TLE is considered "already in the list" if it appears there with any * sorting semantics. * - * If requireSortOp is TRUE, we require a sorting operator to be found too. - * XXX this argument should eventually be obsolete, but for now there are - * parts of the system that can't support non-sortable grouping lists. - * * If resolveUnknown is TRUE, convert TLEs of type UNKNOWN to TEXT. If not, * do nothing (which implies the search for an equality operator will fail). * pstate should be provided if resolveUnknown is TRUE, but can be NULL @@ -1690,10 +1689,10 @@ addTargetToSortList(ParseState *pstate, TargetEntry *tle, * * Returns the updated SortGroupClause list. */ -List * +static List * addTargetToGroupList(ParseState *pstate, TargetEntry *tle, List *grouplist, List *targetlist, - bool requireSortOp, bool resolveUnknown) + bool resolveUnknown) { Oid restype = exprType((Node *) tle->expr); Oid sortop; @@ -1716,7 +1715,7 @@ addTargetToGroupList(ParseState *pstate, TargetEntry *tle, /* determine the eqop and optional sortop */ get_sort_group_operators(restype, - requireSortOp, true, false, + false, true, false, &sortop, &eqop, NULL); grpcl->tleSortGroupRef = assignSortGroupRef(tle, targetlist); diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 6be25b05f9..b844577c88 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -37,7 +37,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.475 2008/08/05 12:09:30 mha Exp $ + * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.476 2008/08/07 01:11:51 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 200808051 +#define CATALOG_VERSION_NO 200808061 #endif diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index f1a1e828fc..65c3698a84 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/parsenodes.h,v 1.370 2008/08/02 21:32:00 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/parsenodes.h,v 1.371 2008/08/07 01:11:51 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -797,7 +797,12 @@ typedef struct SelectStmt * top-level Query node containing the leaf SELECTs as subqueries in its * range table. Its setOperations field shows the tree of set operations, * with leaf SelectStmt nodes replaced by RangeTblRef nodes, and internal - * nodes replaced by SetOperationStmt nodes. + * nodes replaced by SetOperationStmt nodes. Information about the output + * column types is added, too. (Note that the child nodes do not necessarily + * produce these types directly, but we've checked that their output types + * can be coerced to the output column type.) Also, if it's not UNION ALL, + * information about the types' sort/group semantics is provided in the form + * of a SortGroupClause list (same representation as, eg, DISTINCT). * ---------------------- */ typedef struct SetOperationStmt @@ -812,6 +817,8 @@ typedef struct SetOperationStmt /* Fields derived during parse analysis: */ List *colTypes; /* OID list of output column type OIDs */ List *colTypmods; /* integer list of output column typmods */ + List *groupClauses; /* a list of SortGroupClause's */ + /* groupClauses is NIL if UNION ALL, but must be set otherwise */ } SetOperationStmt; diff --git a/src/include/optimizer/tlist.h b/src/include/optimizer/tlist.h index 8a8966213b..9899b14c82 100644 --- a/src/include/optimizer/tlist.h +++ b/src/include/optimizer/tlist.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/tlist.h,v 1.50 2008/08/02 21:32:01 tgl Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/tlist.h,v 1.51 2008/08/07 01:11:52 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -32,6 +32,11 @@ extern Node *get_sortgroupclause_expr(SortGroupClause *sgClause, extern List *get_sortgrouplist_exprs(List *sgClauses, List *targetList); +extern Oid *extract_grouping_ops(List *groupClause); +extern AttrNumber *extract_grouping_cols(List *groupClause, List *tlist); +extern bool grouping_is_sortable(List *groupClause); +extern bool grouping_is_hashable(List *groupClause); + extern bool tlist_same_datatypes(List *tlist, List *colTypes, bool junkOK); #endif /* TLIST_H */ diff --git a/src/include/parser/parse_clause.h b/src/include/parser/parse_clause.h index 357a2947cb..ffea3466b5 100644 --- a/src/include/parser/parse_clause.h +++ b/src/include/parser/parse_clause.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/parser/parse_clause.h,v 1.51 2008/08/02 21:32:01 tgl Exp $ + * $PostgreSQL: pgsql/src/include/parser/parse_clause.h,v 1.52 2008/08/07 01:11:52 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -35,9 +35,6 @@ extern List *transformDistinctClause(ParseState *pstate, extern List *transformDistinctOnClause(ParseState *pstate, List *distinctlist, List **targetlist, List *sortClause); -extern List *addTargetToGroupList(ParseState *pstate, TargetEntry *tle, - List *grouplist, List *targetlist, - bool requireSortOp, bool resolveUnknown); extern Index assignSortGroupRef(TargetEntry *tle, List *tlist); extern bool targetIsInSortList(TargetEntry *tle, Oid sortop, List *sortList); diff --git a/src/test/regress/expected/copyselect.out b/src/test/regress/expected/copyselect.out index c42bad143e..8a42b0e3d8 100644 --- a/src/test/regress/expected/copyselect.out +++ b/src/test/regress/expected/copyselect.out @@ -73,7 +73,7 @@ copy (select * from test1 join test2 using (id)) to stdout; -- -- Test UNION SELECT -- -copy (select t from test1 where id = 1 UNION select * from v_test1) to stdout; +copy (select t from test1 where id = 1 UNION select * from v_test1 ORDER BY 1) to stdout; a v_a v_b @@ -83,7 +83,7 @@ v_e -- -- Test subselect -- -copy (select * from (select t from test1 where id = 1 UNION select * from v_test1) t1) to stdout; +copy (select * from (select t from test1 where id = 1 UNION select * from v_test1 ORDER BY 1) t1) to stdout; a v_a v_b diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out index abcbc9503a..722f9651be 100644 --- a/src/test/regress/expected/union.out +++ b/src/test/regress/expected/union.out @@ -102,7 +102,7 @@ SELECT 1.1 AS three UNION SELECT 2 UNION SELECT 3; 3 (3 rows) -SELECT 1.1::float8 AS two UNION SELECT 2 UNION SELECT 2.0::float8; +SELECT 1.1::float8 AS two UNION SELECT 2 UNION SELECT 2.0::float8 ORDER BY 1; two ----- 1.1 @@ -129,7 +129,8 @@ SELECT 1.1 AS two UNION (SELECT 2 UNION ALL SELECT 2); -- SELECT f1 AS five FROM FLOAT8_TBL UNION -SELECT f1 FROM FLOAT8_TBL; +SELECT f1 FROM FLOAT8_TBL +ORDER BY 1; five ----------------------- -1.2345678901234e+200 @@ -158,7 +159,8 @@ SELECT f1 FROM FLOAT8_TBL; SELECT f1 AS nine FROM FLOAT8_TBL UNION -SELECT f1 FROM INT4_TBL; +SELECT f1 FROM INT4_TBL +ORDER BY 1; nine ----------------------- -1.2345678901234e+200 @@ -205,7 +207,8 @@ SELECT f1 FROM INT4_TBL SELECT CAST(f1 AS char(4)) AS three FROM VARCHAR_TBL UNION -SELECT f1 FROM CHAR_TBL; +SELECT f1 FROM CHAR_TBL +ORDER BY 1; three ------- a @@ -215,7 +218,8 @@ SELECT f1 FROM CHAR_TBL; SELECT f1 AS three FROM VARCHAR_TBL UNION -SELECT CAST(f1 AS varchar) FROM CHAR_TBL; +SELECT CAST(f1 AS varchar) FROM CHAR_TBL +ORDER BY 1; three ------- a @@ -242,7 +246,8 @@ SELECT f1 AS five FROM TEXT_TBL UNION SELECT f1 FROM VARCHAR_TBL UNION -SELECT TRIM(TRAILING FROM f1) FROM CHAR_TBL; +SELECT TRIM(TRAILING FROM f1) FROM CHAR_TBL +ORDER BY 1; five ------------------- a diff --git a/src/test/regress/sql/copyselect.sql b/src/test/regress/sql/copyselect.sql index c2526487c8..beca507ae7 100644 --- a/src/test/regress/sql/copyselect.sql +++ b/src/test/regress/sql/copyselect.sql @@ -53,11 +53,11 @@ copy (select * from test1 join test2 using (id)) to stdout; -- -- Test UNION SELECT -- -copy (select t from test1 where id = 1 UNION select * from v_test1) to stdout; +copy (select t from test1 where id = 1 UNION select * from v_test1 ORDER BY 1) to stdout; -- -- Test subselect -- -copy (select * from (select t from test1 where id = 1 UNION select * from v_test1) t1) to stdout; +copy (select * from (select t from test1 where id = 1 UNION select * from v_test1 ORDER BY 1) t1) to stdout; -- -- Test headers, CSV and quotes -- diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql index 0f846091cd..0b83d6b185 100644 --- a/src/test/regress/sql/union.sql +++ b/src/test/regress/sql/union.sql @@ -34,7 +34,7 @@ SELECT 1.0::float8 AS two UNION ALL SELECT 1; SELECT 1.1 AS three UNION SELECT 2 UNION SELECT 3; -SELECT 1.1::float8 AS two UNION SELECT 2 UNION SELECT 2.0::float8; +SELECT 1.1::float8 AS two UNION SELECT 2 UNION SELECT 2.0::float8 ORDER BY 1; SELECT 1.1 AS three UNION SELECT 2 UNION ALL SELECT 2; @@ -46,7 +46,8 @@ SELECT 1.1 AS two UNION (SELECT 2 UNION ALL SELECT 2); SELECT f1 AS five FROM FLOAT8_TBL UNION -SELECT f1 FROM FLOAT8_TBL; +SELECT f1 FROM FLOAT8_TBL +ORDER BY 1; SELECT f1 AS ten FROM FLOAT8_TBL UNION ALL @@ -54,7 +55,8 @@ SELECT f1 FROM FLOAT8_TBL; SELECT f1 AS nine FROM FLOAT8_TBL UNION -SELECT f1 FROM INT4_TBL; +SELECT f1 FROM INT4_TBL +ORDER BY 1; SELECT f1 AS ten FROM FLOAT8_TBL UNION ALL @@ -68,11 +70,13 @@ SELECT f1 FROM INT4_TBL SELECT CAST(f1 AS char(4)) AS three FROM VARCHAR_TBL UNION -SELECT f1 FROM CHAR_TBL; +SELECT f1 FROM CHAR_TBL +ORDER BY 1; SELECT f1 AS three FROM VARCHAR_TBL UNION -SELECT CAST(f1 AS varchar) FROM CHAR_TBL; +SELECT CAST(f1 AS varchar) FROM CHAR_TBL +ORDER BY 1; SELECT f1 AS eight FROM VARCHAR_TBL UNION ALL @@ -82,7 +86,8 @@ SELECT f1 AS five FROM TEXT_TBL UNION SELECT f1 FROM VARCHAR_TBL UNION -SELECT TRIM(TRAILING FROM f1) FROM CHAR_TBL; +SELECT TRIM(TRAILING FROM f1) FROM CHAR_TBL +ORDER BY 1; -- -- INTERSECT and EXCEPT