Change the division of labor between grouping_planner and query_planner

so that the latter estimates the number of groups that grouping will
produce.  This is needed because it is primarily query_planner that
makes the decision between fast-start and fast-finish plans, and in the
original coding it was unable to make more than a crude rule-of-thumb
choice when the query involved grouping.  This revision helps us make
saner choices for queries like SELECT ... GROUP BY ... LIMIT, as in a
recent example from Mark Kirkwood.  Also move the responsibility for
canonicalizing sort_pathkeys and group_pathkeys into query_planner;
this information has to be available anyway to support the first change,
and doing it this way lets us get rid of compare_noncanonical_pathkeys
entirely.
This commit is contained in:
Tom Lane 2005-08-27 22:13:44 +00:00
parent 9e56c5a4cf
commit 4e5fbb34b3
7 changed files with 144 additions and 206 deletions

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.259 2005/08/01 20:31:08 tgl Exp $
* $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.260 2005/08/27 22:13:43 tgl Exp $
*
* NOTES
* Every node type that can appear in stored rules' parsetrees *must*
@ -1169,6 +1169,9 @@ _outPlannerInfo(StringInfo str, PlannerInfo *node)
WRITE_NODE_FIELD(full_join_clauses);
WRITE_NODE_FIELD(in_info_list);
WRITE_NODE_FIELD(query_pathkeys);
WRITE_NODE_FIELD(group_pathkeys);
WRITE_NODE_FIELD(sort_pathkeys);
WRITE_FLOAT_FIELD(tuple_fraction, "%.4f");
WRITE_BOOL_FIELD(hasJoinRTEs);
WRITE_BOOL_FIELD(hasOuterJoins);
WRITE_BOOL_FIELD(hasHavingQual);

View File

@ -11,7 +11,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/path/pathkeys.c,v 1.71 2005/07/28 22:27:00 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/path/pathkeys.c,v 1.72 2005/08/27 22:13:43 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -800,54 +800,6 @@ compare_pathkeys(List *keys1, List *keys2)
return PATHKEYS_BETTER2; /* key2 is longer */
}
/*
 * compare_noncanonical_pathkeys
 *	  Classify the relationship between two pathkey lists that have not
 *	  been put into canonical form: equal, one strictly "better" than the
 *	  other, or simply different.
 *
 * "Better" means superset: a list that has every key of the other list
 * and then some.  So both ((A) (B)) and ((A B)) beat ((A)).
 *
 * The sole caller at present is grouping_planner(), which only hands us
 * lists whose sublists have exactly one member (that is what
 * make_pathkeys_for_sortclauses produces).  We exploit that: rather than
 * performing a real two-way subset-inclusion check per sublist pair, we
 * assert each sublist is a singleton and compare the pair with equal().
 * Generalize this if a caller ever needs more.
 *
 * Returns PATHKEYS_EQUAL, PATHKEYS_BETTER1 (keys1 is the longer list),
 * PATHKEYS_BETTER2 (keys2 is the longer list), or PATHKEYS_DIFFERENT.
 */
PathKeysComparison
compare_noncanonical_pathkeys(List *keys1, List *keys2)
{
	ListCell   *cell1;
	ListCell   *cell2;

	/* Walk the two lists in lockstep until either one runs out */
	forboth(cell1, keys1, cell2, keys2)
	{
		List	   *left = (List *) lfirst(cell1);
		List	   *right = (List *) lfirst(cell2);

		Assert(list_length(left) == 1);
		Assert(list_length(right) == 1);

		/* First mismatch settles it; nothing further can change the answer */
		if (!equal(left, right))
			return PATHKEYS_DIFFERENT;
	}

	/*
	 * The common prefix matched.  Whichever list still has cells left is
	 * the longer one and hence the superset.  (Leftover sublists are
	 * assumed non-NIL, since a pathkey list should never contain a NIL
	 * sublist.)  If neither has anything left, the lists are equal.
	 */
	if (cell1 != NULL)
		return PATHKEYS_BETTER1;	/* keys1 is longer */
	if (cell2 != NULL)
		return PATHKEYS_BETTER2;	/* keys2 is longer */
	return PATHKEYS_EQUAL;
}
/*
* pathkeys_contained_in
* Common special case of compare_pathkeys: we just want to know
@ -867,24 +819,6 @@ pathkeys_contained_in(List *keys1, List *keys2)
return false;
}
/*
 * noncanonical_pathkeys_contained_in
 *	  Counterpart of pathkeys_contained_in for pathkey lists that have not
 *	  been canonicalized.
 *
 * Returns true when compare_noncanonical_pathkeys reports the lists as
 * equal or keys2 as the better (longer) one; false in every other case.
 */
bool
noncanonical_pathkeys_contained_in(List *keys1, List *keys2)
{
	PathKeysComparison outcome = compare_noncanonical_pathkeys(keys1, keys2);

	if (outcome == PATHKEYS_EQUAL || outcome == PATHKEYS_BETTER2)
		return true;
	return false;
}
/*
* get_cheapest_path_for_pathkeys
* Find the cheapest path (according to the specified criterion) that

View File

@ -14,7 +14,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.86 2005/07/02 23:00:41 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.87 2005/08/27 22:13:43 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -25,9 +25,11 @@
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/planmain.h"
#include "optimizer/tlist.h"
#include "utils/selfuncs.h"
/*--------------------
/*
* query_planner
* Generate a path (that is, a simplified plan) for a basic query,
* which may involve joins but not any fancier features.
@ -51,6 +53,8 @@
* *cheapest_path receives the overall-cheapest path for the query
* *sorted_path receives the cheapest presorted path for the query,
* if any (NULL if there is no useful presorted path)
* *num_groups receives the estimated number of groups (or of distinct rows
* for an ungrouped, unaggregated SELECT DISTINCT), or 1 if the query uses
* neither
*
* Note: the PlannerInfo node also includes a query_pathkeys field, which is
* both an input and an output of query_planner(). The input value signals
@ -61,17 +65,21 @@
* PlannerInfo field and not a passed parameter is that the low-level routines
* in indxpath.c need to see it.)
*
* Note: the PlannerInfo node also includes group_pathkeys and sort_pathkeys,
* which like query_pathkeys need to be canonicalized once the info is
* available.
*
* tuple_fraction is interpreted as follows:
* 0: expect all tuples to be retrieved (normal case)
* 0 < tuple_fraction < 1: expect the given fraction of tuples available
* from the plan to be retrieved
* tuple_fraction >= 1: tuple_fraction is the absolute number of tuples
* expected to be retrieved (ie, a LIMIT specification)
*--------------------
*/
void
query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
Path **cheapest_path, Path **sorted_path)
Path **cheapest_path, Path **sorted_path,
double *num_groups)
{
Query *parse = root->parse;
List *constant_quals;
@ -82,6 +90,8 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
/* Make tuple_fraction accessible to lower-level routines */
root->tuple_fraction = tuple_fraction;
*num_groups = 1; /* default result */
/*
* If the query has an empty join tree, then it's something easy like
* "SELECT 2+2;" or "INSERT ... VALUES()". Fall through quickly.
@ -156,9 +166,12 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
/*
* We should now have all the pathkey equivalence sets built, so it's
* now possible to convert the requested query_pathkeys to canonical
* form.
* form. Also canonicalize the groupClause and sortClause pathkeys
* for use later.
*/
root->query_pathkeys = canonicalize_pathkeys(root, root->query_pathkeys);
root->group_pathkeys = canonicalize_pathkeys(root, root->group_pathkeys);
root->sort_pathkeys = canonicalize_pathkeys(root, root->sort_pathkeys);
/*
* Ready to do the primary planning.
@ -169,12 +182,87 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction,
elog(ERROR, "failed to construct the join relation");
/*
* Now that we have an estimate of the final rel's size, we can
* convert a tuple_fraction specified as an absolute count (ie, a
* LIMIT option) into a fraction of the total tuples.
* If there's grouping going on, estimate the number of result groups.
* We couldn't do this any earlier because it depends on relation size
* estimates that were set up above.
*
* Then convert tuple_fraction to fractional form if it is absolute,
* and adjust it based on the knowledge that grouping_planner will be
* doing grouping or aggregation work with our result.
*
* This introduces some undesirable coupling between this code and
* grouping_planner, but the alternatives seem even uglier; we couldn't
* pass back completed paths without making these decisions here.
*/
if (tuple_fraction >= 1.0)
tuple_fraction /= final_rel->rows;
if (parse->groupClause)
{
List *groupExprs;
groupExprs = get_sortgrouplist_exprs(parse->groupClause,
parse->targetList);
*num_groups = estimate_num_groups(root,
groupExprs,
final_rel->rows);
/*
* In GROUP BY mode, an absolute LIMIT is relative to the number
* of groups not the number of tuples. If the caller gave us
* a fraction, keep it as-is. (In both cases, we are effectively
* assuming that all the groups are about the same size.)
*/
if (tuple_fraction >= 1.0)
tuple_fraction /= *num_groups;
/*
* If both GROUP BY and ORDER BY are specified, we will need two
* levels of sort --- and, therefore, certainly need to read all
* the tuples --- unless ORDER BY is a subset of GROUP BY.
*/
if (parse->groupClause && parse->sortClause &&
!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys))
tuple_fraction = 0.0;
}
else if (parse->hasAggs || root->hasHavingQual)
{
/*
* Ungrouped aggregate will certainly want to read all the tuples,
* and it will deliver a single result row (so leave *num_groups 1).
*/
tuple_fraction = 0.0;
}
else if (parse->distinctClause)
{
/*
* Since there was no grouping or aggregation, it's reasonable to
* assume the UNIQUE filter has effects comparable to GROUP BY.
* Return the estimated number of output rows for use by caller.
* (If DISTINCT is used with grouping, we ignore its effects for
* rowcount estimation purposes; this amounts to assuming the grouped
* rows are distinct already.)
*/
List *distinctExprs;
distinctExprs = get_sortgrouplist_exprs(parse->distinctClause,
parse->targetList);
*num_groups = estimate_num_groups(root,
distinctExprs,
final_rel->rows);
/*
* Adjust tuple_fraction the same way as for GROUP BY, too.
*/
if (tuple_fraction >= 1.0)
tuple_fraction /= *num_groups;
}
else
{
/*
* Plain non-grouped, non-aggregated query: an absolute tuple
* fraction can be divided by the number of tuples.
*/
if (tuple_fraction >= 1.0)
tuple_fraction /= final_rel->rows;
}
/*
* Pick out the cheapest-total path and the cheapest presorted path

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.191 2005/08/18 17:51:11 tgl Exp $
* $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.192 2005/08/27 22:13:43 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -63,7 +63,6 @@ static double preprocess_limit(PlannerInfo *root,
int *offset_est, int *count_est);
static bool choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
Path *cheapest_path, Path *sorted_path,
List *sort_pathkeys, List *group_pathkeys,
double dNumGroups, AggClauseCounts *agg_counts);
static bool hash_safe_grouping(PlannerInfo *root);
static List *make_subplanTargetList(PlannerInfo *root, List *tlist,
@ -655,6 +654,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
Plan *result_plan;
List *current_pathkeys;
List *sort_pathkeys;
double dNumGroups = 0;
/* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */
if (parse->limitCount || parse->limitOffset)
@ -727,11 +727,9 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
AttrNumber *groupColIdx = NULL;
bool need_tlist_eval = true;
QualCost tlist_cost;
double sub_tuple_fraction;
Path *cheapest_path;
Path *sorted_path;
Path *best_path;
double dNumGroups = 0;
long numGroups = 0;
AggClauseCounts agg_counts;
int numGroupCols = list_length(parse->groupClause);
@ -750,13 +748,14 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
&groupColIdx, &need_tlist_eval);
/*
* Calculate pathkeys that represent grouping/ordering
* requirements
* Calculate pathkeys that represent grouping/ordering requirements.
* Stash them in PlannerInfo so that query_planner can canonicalize
* them.
*/
group_pathkeys = make_pathkeys_for_sortclauses(parse->groupClause,
tlist);
sort_pathkeys = make_pathkeys_for_sortclauses(parse->sortClause,
tlist);
root->group_pathkeys =
make_pathkeys_for_sortclauses(parse->groupClause, tlist);
root->sort_pathkeys =
make_pathkeys_for_sortclauses(parse->sortClause, tlist);
/*
* Will need actual number of aggregates for estimating costs.
@ -787,112 +786,36 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
* Needs more thought...)
*/
if (parse->groupClause)
root->query_pathkeys = group_pathkeys;
root->query_pathkeys = root->group_pathkeys;
else if (parse->sortClause)
root->query_pathkeys = sort_pathkeys;
root->query_pathkeys = root->sort_pathkeys;
else
root->query_pathkeys = NIL;
/*
* With grouping or aggregation, the tuple fraction to pass to
* query_planner() may be different from what it is at top level.
*/
sub_tuple_fraction = tuple_fraction;
if (parse->groupClause)
{
/*
* In GROUP BY mode, we have the little problem that we don't
* really know how many input tuples will be needed to make a
* group, so we can't translate an output LIMIT count into an
* input count. For lack of a better idea, assume 25% of the
* input data will be processed if there is any output limit.
* However, if the caller gave us a fraction rather than an
* absolute count, we can keep using that fraction (which
* amounts to assuming that all the groups are about the same
* size).
*/
if (sub_tuple_fraction >= 1.0)
sub_tuple_fraction = 0.25;
/*
* If both GROUP BY and ORDER BY are specified, we will need
* two levels of sort --- and, therefore, certainly need to
* read all the input tuples --- unless ORDER BY is a subset
* of GROUP BY. (We have not yet canonicalized the pathkeys,
* so must use the slower noncanonical comparison method.)
*/
if (parse->groupClause && parse->sortClause &&
!noncanonical_pathkeys_contained_in(sort_pathkeys,
group_pathkeys))
sub_tuple_fraction = 0.0;
}
else if (parse->hasAggs)
{
/*
* Ungrouped aggregate will certainly want all the input
* tuples.
*/
sub_tuple_fraction = 0.0;
}
else if (parse->distinctClause)
{
/*
* SELECT DISTINCT, like GROUP, will absorb an unpredictable
* number of input tuples per output tuple. Handle the same
* way.
*/
if (sub_tuple_fraction >= 1.0)
sub_tuple_fraction = 0.25;
}
/*
* Generate the best unsorted and presorted paths for this Query
* (but note there may not be any presorted path).
* (but note there may not be any presorted path). query_planner
* will also estimate the number of groups in the query, and
* canonicalize all the pathkeys.
*/
query_planner(root, sub_tlist, sub_tuple_fraction,
&cheapest_path, &sorted_path);
query_planner(root, sub_tlist, tuple_fraction,
&cheapest_path, &sorted_path, &dNumGroups);
group_pathkeys = root->group_pathkeys;
sort_pathkeys = root->sort_pathkeys;
/*
* We couldn't canonicalize group_pathkeys and sort_pathkeys
* before running query_planner(), so do it now.
*/
group_pathkeys = canonicalize_pathkeys(root, group_pathkeys);
sort_pathkeys = canonicalize_pathkeys(root, sort_pathkeys);
/*
* If grouping, estimate the number of groups. (We can't do this
* until after running query_planner(), either.) Then decide
* whether we want to use hashed grouping.
* If grouping, decide whether we want to use hashed grouping.
*/
if (parse->groupClause)
{
List *groupExprs;
double cheapest_path_rows;
/*
* Beware of the possibility that cheapest_path->parent is NULL.
* This could happen if user does something silly like
* SELECT 'foo' GROUP BY 1;
*/
if (cheapest_path->parent)
cheapest_path_rows = cheapest_path->parent->rows;
else
cheapest_path_rows = 1; /* assume non-set result */
groupExprs = get_sortgrouplist_exprs(parse->groupClause,
parse->targetList);
dNumGroups = estimate_num_groups(root,
groupExprs,
cheapest_path_rows);
/* Also want it as a long int --- but 'ware overflow! */
numGroups = (long) Min(dNumGroups, (double) LONG_MAX);
use_hashed_grouping =
choose_hashed_grouping(root, tuple_fraction,
cheapest_path, sorted_path,
sort_pathkeys, group_pathkeys,
dNumGroups, &agg_counts);
/* Also convert # groups to long int --- but 'ware overflow! */
numGroups = (long) Min(dNumGroups, (double) LONG_MAX);
}
/*
@ -1130,19 +1053,10 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
/*
* If there was grouping or aggregation, leave plan_rows as-is
* (ie, assume the result was already mostly unique). If not,
* it's reasonable to assume the UNIQUE filter has effects
* comparable to GROUP BY.
* use the number of distinct-groups calculated by query_planner.
*/
if (!parse->groupClause && !root->hasHavingQual && !parse->hasAggs)
{
List *distinctExprs;
distinctExprs = get_sortgrouplist_exprs(parse->distinctClause,
parse->targetList);
result_plan->plan_rows = estimate_num_groups(root,
distinctExprs,
result_plan->plan_rows);
}
result_plan->plan_rows = dNumGroups;
}
/*
@ -1360,7 +1274,6 @@ preprocess_limit(PlannerInfo *root, double tuple_fraction,
static bool
choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
Path *cheapest_path, Path *sorted_path,
List *sort_pathkeys, List *group_pathkeys,
double dNumGroups, AggClauseCounts *agg_counts)
{
int numGroupCols = list_length(root->parse->groupClause);
@ -1439,8 +1352,8 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
cheapest_path->startup_cost, cheapest_path->total_cost,
cheapest_path_rows);
/* Result of hashed agg is always unsorted */
if (sort_pathkeys)
cost_sort(&hashed_p, root, sort_pathkeys, hashed_p.total_cost,
if (root->sort_pathkeys)
cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost,
dNumGroups, cheapest_path_width);
if (sorted_path)
@ -1455,12 +1368,11 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
sorted_p.total_cost = cheapest_path->total_cost;
current_pathkeys = cheapest_path->pathkeys;
}
if (!pathkeys_contained_in(group_pathkeys,
current_pathkeys))
if (!pathkeys_contained_in(root->group_pathkeys, current_pathkeys))
{
cost_sort(&sorted_p, root, group_pathkeys, sorted_p.total_cost,
cost_sort(&sorted_p, root, root->group_pathkeys, sorted_p.total_cost,
cheapest_path_rows, cheapest_path_width);
current_pathkeys = group_pathkeys;
current_pathkeys = root->group_pathkeys;
}
if (root->parse->hasAggs)
@ -1473,9 +1385,9 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction,
sorted_p.startup_cost, sorted_p.total_cost,
cheapest_path_rows);
/* The Agg or Group node will preserve ordering */
if (sort_pathkeys &&
!pathkeys_contained_in(sort_pathkeys, current_pathkeys))
cost_sort(&sorted_p, root, sort_pathkeys, sorted_p.total_cost,
if (root->sort_pathkeys &&
!pathkeys_contained_in(root->sort_pathkeys, current_pathkeys))
cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost,
dNumGroups, cheapest_path_width);
/*

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.117 2005/07/23 21:05:48 tgl Exp $
* $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.118 2005/08/27 22:13:43 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -101,6 +101,9 @@ typedef struct PlannerInfo
List *query_pathkeys; /* desired pathkeys for query_planner(),
* and actual pathkeys afterwards */
List *group_pathkeys; /* groupClause pathkeys, if any */
List *sort_pathkeys; /* sortClause pathkeys, if any */
double tuple_fraction; /* tuple_fraction passed to query_planner */
bool hasJoinRTEs; /* true if any RTEs are RTE_JOIN kind */

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/optimizer/paths.h,v 1.86 2005/07/28 20:26:22 tgl Exp $
* $PostgreSQL: pgsql/src/include/optimizer/paths.h,v 1.87 2005/08/27 22:13:44 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -102,9 +102,6 @@ extern void generate_implied_equalities(PlannerInfo *root);
extern List *canonicalize_pathkeys(PlannerInfo *root, List *pathkeys);
extern PathKeysComparison compare_pathkeys(List *keys1, List *keys2);
extern bool pathkeys_contained_in(List *keys1, List *keys2);
extern PathKeysComparison compare_noncanonical_pathkeys(List *keys1,
List *keys2);
extern bool noncanonical_pathkeys_contained_in(List *keys1, List *keys2);
extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
CostSelector cost_criterion);
extern Path *get_cheapest_fractional_path_for_pathkeys(List *paths,

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.87 2005/08/18 17:51:12 tgl Exp $
* $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.88 2005/08/27 22:13:44 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -22,7 +22,8 @@
*/
extern void query_planner(PlannerInfo *root, List *tlist,
double tuple_fraction,
Path **cheapest_path, Path **sorted_path);
Path **cheapest_path, Path **sorted_path,
double *num_groups);
/*
* prototypes for plan/planagg.c