Split out into a separate function the code in grouping_planner() that

decides whether to use hashed grouping instead of sort-plus-uniq
grouping. The function needs an annoyingly large number of parameters,
but this still seems like a win for legibility, since it removes over
a hundred lines from grouping_planner (which is still too big :-().
This commit is contained in:
Tom Lane 2005-04-10 19:50:08 +00:00
parent 313de22c85
commit 6985592967
1 changed files with 156 additions and 140 deletions

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.182 2005/04/06 16:34:05 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.183 2005/04/10 19:50:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -58,6 +58,10 @@ static Node *preprocess_expression(Query *parse, Node *expr, int kind);
static void preprocess_qual_conditions(Query *parse, Node *jtnode);
static Plan *inheritance_planner(Query *parse, List *inheritlist);
static Plan *grouping_planner(Query *parse, double tuple_fraction);
static bool choose_hashed_grouping(Query *parse, double tuple_fraction,
Path *cheapest_path, Path *sorted_path,
List *sort_pathkeys, List *group_pathkeys,
double dNumGroups, AggClauseCounts *agg_counts);
static bool hash_safe_grouping(Query *parse);
static List *make_subplanTargetList(Query *parse, List *tlist,
AttrNumber **groupColIdx, bool *need_tlist_eval);
@ -920,34 +924,25 @@ grouping_planner(Query *parse, double tuple_fraction)
sort_pathkeys = canonicalize_pathkeys(parse, sort_pathkeys);
/*
* Consider whether we might want to use hashed grouping.
* If grouping, estimate the number of groups. (We can't do this
* until after running query_planner(), either.) Then decide
* whether we want to use hashed grouping.
*/
if (parse->groupClause)
{
List *groupExprs;
double cheapest_path_rows;
int cheapest_path_width;
/*
* Beware in this section of the possibility that
* cheapest_path->parent is NULL. This could happen if user
* does something silly like SELECT 'foo' GROUP BY 1;
* Beware of the possibility that cheapest_path->parent is NULL.
* This could happen if user does something silly like
* SELECT 'foo' GROUP BY 1;
*/
if (cheapest_path->parent)
{
cheapest_path_rows = cheapest_path->parent->rows;
cheapest_path_width = cheapest_path->parent->width;
}
else
{
cheapest_path_rows = 1; /* assume non-set result */
cheapest_path_width = 100; /* arbitrary */
}
/*
* Always estimate the number of groups. We can't do this
* until after running query_planner(), either.
*/
groupExprs = get_sortgrouplist_exprs(parse->groupClause,
parse->targetList);
dNumGroups = estimate_num_groups(parse,
@ -956,130 +951,11 @@ grouping_planner(Query *parse, double tuple_fraction)
/* Also want it as a long int --- but 'ware overflow! */
numGroups = (long) Min(dNumGroups, (double) LONG_MAX);
/*
* Check can't-do-it conditions, including whether the
* grouping operators are hashjoinable.
*
* Executor doesn't support hashed aggregation with DISTINCT
* aggregates. (Doing so would imply storing *all* the input
* values in the hash table, which seems like a certain
* loser.)
*/
if (!enable_hashagg || !hash_safe_grouping(parse))
use_hashed_grouping = false;
else if (agg_counts.numDistinctAggs != 0)
use_hashed_grouping = false;
else
{
/*
* Use hashed grouping if (a) we think we can fit the
* hashtable into work_mem, *and* (b) the estimated cost
* is no more than doing it the other way. While avoiding
* the need for sorted input is usually a win, the fact
* that the output won't be sorted may be a loss; so we
* need to do an actual cost comparison.
*/
Size hashentrysize;
/* Estimate per-hash-entry space at tuple width... */
hashentrysize = cheapest_path_width;
/* plus space for pass-by-ref transition values... */
hashentrysize += agg_counts.transitionSpace;
/* plus the per-hash-entry overhead */
hashentrysize += hash_agg_entry_size(agg_counts.numAggs);
if (hashentrysize * dNumGroups <= work_mem * 1024L)
{
/*
* Okay, do the cost comparison. We need to consider
* cheapest_path + hashagg [+ final sort] versus
* either cheapest_path [+ sort] + group or agg [+
* final sort] or presorted_path + group or agg [+
* final sort] where brackets indicate a step that may
* not be needed. We assume query_planner() will have
* returned a presorted path only if it's a winner
* compared to cheapest_path for this purpose.
*
* These path variables are dummies that just hold cost
* fields; we don't make actual Paths for these steps.
*/
Path hashed_p;
Path sorted_p;
cost_agg(&hashed_p, parse,
AGG_HASHED, agg_counts.numAggs,
numGroupCols, dNumGroups,
cheapest_path->startup_cost,
cheapest_path->total_cost,
cheapest_path_rows);
/* Result of hashed agg is always unsorted */
if (sort_pathkeys)
cost_sort(&hashed_p, parse, sort_pathkeys,
hashed_p.total_cost,
dNumGroups,
cheapest_path_width);
if (sorted_path)
{
sorted_p.startup_cost = sorted_path->startup_cost;
sorted_p.total_cost = sorted_path->total_cost;
current_pathkeys = sorted_path->pathkeys;
}
else
{
sorted_p.startup_cost = cheapest_path->startup_cost;
sorted_p.total_cost = cheapest_path->total_cost;
current_pathkeys = cheapest_path->pathkeys;
}
if (!pathkeys_contained_in(group_pathkeys,
current_pathkeys))
{
cost_sort(&sorted_p, parse, group_pathkeys,
sorted_p.total_cost,
cheapest_path_rows,
cheapest_path_width);
current_pathkeys = group_pathkeys;
}
if (parse->hasAggs)
cost_agg(&sorted_p, parse,
AGG_SORTED, agg_counts.numAggs,
numGroupCols, dNumGroups,
sorted_p.startup_cost,
sorted_p.total_cost,
cheapest_path_rows);
else
cost_group(&sorted_p, parse,
numGroupCols, dNumGroups,
sorted_p.startup_cost,
sorted_p.total_cost,
cheapest_path_rows);
/* The Agg or Group node will preserve ordering */
if (sort_pathkeys &&
!pathkeys_contained_in(sort_pathkeys,
current_pathkeys))
{
cost_sort(&sorted_p, parse, sort_pathkeys,
sorted_p.total_cost,
dNumGroups,
cheapest_path_width);
}
/*
* Now make the decision using the top-level tuple
* fraction. First we have to convert an absolute
* count (LIMIT) into fractional form.
*/
if (tuple_fraction >= 1.0)
tuple_fraction /= dNumGroups;
if (compare_fractional_path_costs(&hashed_p, &sorted_p,
tuple_fraction) < 0)
{
/* Hashed is cheaper, so use it */
use_hashed_grouping = true;
}
}
}
use_hashed_grouping =
choose_hashed_grouping(parse, tuple_fraction,
cheapest_path, sorted_path,
sort_pathkeys, group_pathkeys,
dNumGroups, &agg_counts);
}
/*
@ -1331,6 +1207,146 @@ grouping_planner(Query *parse, double tuple_fraction)
return result_plan;
}
/*
* choose_hashed_grouping - should we use hashed grouping?
*/
static bool
choose_hashed_grouping(Query *parse, double tuple_fraction,
Path *cheapest_path, Path *sorted_path,
List *sort_pathkeys, List *group_pathkeys,
double dNumGroups, AggClauseCounts *agg_counts)
{
int numGroupCols = list_length(parse->groupClause);
double cheapest_path_rows;
int cheapest_path_width;
Size hashentrysize;
List *current_pathkeys;
Path hashed_p;
Path sorted_p;
/*
* Check can't-do-it conditions, including whether the grouping operators
* are hashjoinable.
*
* Executor doesn't support hashed aggregation with DISTINCT aggregates.
* (Doing so would imply storing *all* the input values in the hash table,
* which seems like a certain loser.)
*/
if (!enable_hashagg)
return false;
if (agg_counts->numDistinctAggs != 0)
return false;
if (!hash_safe_grouping(parse))
return false;
/*
* Don't do it if it doesn't look like the hashtable will fit into
* work_mem.
*
* Beware here of the possibility that cheapest_path->parent is NULL.
* This could happen if user does something silly like
* SELECT 'foo' GROUP BY 1;
*/
if (cheapest_path->parent)
{
cheapest_path_rows = cheapest_path->parent->rows;
cheapest_path_width = cheapest_path->parent->width;
}
else
{
cheapest_path_rows = 1; /* assume non-set result */
cheapest_path_width = 100; /* arbitrary */
}
/* Estimate per-hash-entry space at tuple width... */
hashentrysize = cheapest_path_width;
/* plus space for pass-by-ref transition values... */
hashentrysize += agg_counts->transitionSpace;
/* plus the per-hash-entry overhead */
hashentrysize += hash_agg_entry_size(agg_counts->numAggs);
if (hashentrysize * dNumGroups > work_mem * 1024L)
return false;
/*
* See if the estimated cost is no more than doing it the other way.
* While avoiding the need for sorted input is usually a win, the fact
* that the output won't be sorted may be a loss; so we need to do an
* actual cost comparison.
*
* We need to consider
* cheapest_path + hashagg [+ final sort]
* versus either
* cheapest_path [+ sort] + group or agg [+ final sort]
* or
* presorted_path + group or agg [+ final sort]
* where brackets indicate a step that may not be needed. We assume
* query_planner() will have returned a presorted path only if it's a
* winner compared to cheapest_path for this purpose.
*
* These path variables are dummies that just hold cost fields; we don't
* make actual Paths for these steps.
*/
cost_agg(&hashed_p, parse, AGG_HASHED, agg_counts->numAggs,
numGroupCols, dNumGroups,
cheapest_path->startup_cost, cheapest_path->total_cost,
cheapest_path_rows);
/* Result of hashed agg is always unsorted */
if (sort_pathkeys)
cost_sort(&hashed_p, parse, sort_pathkeys, hashed_p.total_cost,
dNumGroups, cheapest_path_width);
if (sorted_path)
{
sorted_p.startup_cost = sorted_path->startup_cost;
sorted_p.total_cost = sorted_path->total_cost;
current_pathkeys = sorted_path->pathkeys;
}
else
{
sorted_p.startup_cost = cheapest_path->startup_cost;
sorted_p.total_cost = cheapest_path->total_cost;
current_pathkeys = cheapest_path->pathkeys;
}
if (!pathkeys_contained_in(group_pathkeys,
current_pathkeys))
{
cost_sort(&sorted_p, parse, group_pathkeys, sorted_p.total_cost,
cheapest_path_rows, cheapest_path_width);
current_pathkeys = group_pathkeys;
}
if (parse->hasAggs)
cost_agg(&sorted_p, parse, AGG_SORTED, agg_counts->numAggs,
numGroupCols, dNumGroups,
sorted_p.startup_cost, sorted_p.total_cost,
cheapest_path_rows);
else
cost_group(&sorted_p, parse, numGroupCols, dNumGroups,
sorted_p.startup_cost, sorted_p.total_cost,
cheapest_path_rows);
/* The Agg or Group node will preserve ordering */
if (sort_pathkeys &&
!pathkeys_contained_in(sort_pathkeys, current_pathkeys))
cost_sort(&sorted_p, parse, sort_pathkeys, sorted_p.total_cost,
dNumGroups, cheapest_path_width);
/*
* Now make the decision using the top-level tuple fraction. First we
* have to convert an absolute count (LIMIT) into fractional form.
*/
if (tuple_fraction >= 1.0)
tuple_fraction /= dNumGroups;
if (compare_fractional_path_costs(&hashed_p, &sorted_p,
tuple_fraction) < 0)
{
/* Hashed is cheaper, so use it */
return true;
}
return false;
}
/*
* hash_safe_grouping - are grouping operators hashable?
*