From 0f61d4dd1b4f95832dcd81c9688dac56fd6b5687 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 19 Nov 2010 17:31:50 -0500 Subject: [PATCH] Improve relation width estimation for subqueries. As per the ancient comment for set_rel_width, it really wasn't much good for relations that aren't plain tables: it would never find any stats and would always fall back on datatype-based estimates, which are often pretty silly. Fix that by copying up width estimates from the subquery planning process. At some point we might want to do this for CTEs too, but that would be a significantly more invasive patch because the sub-PlannerInfo is no longer accessible by the time it's needed. I refrained from doing anything about that, partly for fear of breaking the unmerged CTE-related patches. In passing, also generate less bogus width estimates for whole-row Vars. Per a gripe from Jon Nelson. --- src/backend/optimizer/path/allpaths.c | 5 +- src/backend/optimizer/path/costsize.c | 137 ++++++++++++++++++++++++-- src/backend/optimizer/plan/planner.c | 2 +- src/backend/optimizer/util/plancat.c | 22 +++-- src/include/optimizer/cost.h | 2 + src/include/optimizer/plancat.h | 2 +- 6 files changed, 150 insertions(+), 20 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index aa9a90cbfa..ce893a77be 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -758,11 +758,8 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, rel->subrtable = subroot->parse->rtable; rel->subrowmark = subroot->rowMarks; - /* Copy number of output rows from subplan */ - rel->tuples = rel->subplan->plan_rows; - /* Mark rel with estimated output rows, width, etc */ - set_baserel_size_estimates(root, rel); + set_subquery_size_estimates(root, rel, subroot); /* Convert subquery pathkeys to outer representation */ pathkeys = convert_subquery_pathkeys(root, rel, subroot->query_pathkeys); diff --git 
a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 16a5d0a3ca..0724f9a6c9 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -76,6 +76,7 @@
 #include "optimizer/cost.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/placeholder.h"
+#include "optimizer/plancat.h"
 #include "optimizer/planmain.h"
 #include "optimizer/restrictinfo.h"
 #include "parser/parsetree.h"
@@ -2986,7 +2987,7 @@ approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
  *		Set the size estimates for the given base relation.
  *
  * The rel's targetlist and restrictinfo list must have been constructed
- * already.
+ * already, and rel->tuples must be set.
  *
  * We set the following fields of the rel node:
  *	rows: the estimated number of output tuples (after applying
@@ -3151,6 +3152,76 @@ set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
 	rel->rows = clamp_row_est(nrows);
 }
 
+/*
+ * set_subquery_size_estimates
+ *		Set the size estimates for a base relation that is a subquery.
+ *
+ * The rel's targetlist and restrictinfo list must have been constructed
+ * already, and the plan for the subquery must have been completed.
+ * We look at the subquery's plan and PlannerInfo (subroot) to extract data.
+ *
+ * We set the same fields as set_baserel_size_estimates.
+ */
+void
+set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel,
+							PlannerInfo *subroot)
+{
+	RangeTblEntry *rte;
+	ListCell   *lc;
+
+	/* Should only be applied to base relations that are subqueries */
+	Assert(rel->relid > 0);
+	rte = planner_rt_fetch(rel->relid, root);
+	Assert(rte->rtekind == RTE_SUBQUERY);
+
+	/* Copy raw number of output rows from subplan */
+	rel->tuples = rel->subplan->plan_rows;
+
+	/*
+	 * Compute per-output-column width estimates by examining the subquery's
+	 * targetlist.  For any output that is a plain Var, get the width estimate
+	 * that was made while planning the subquery.  Otherwise, fall back on a
+	 * datatype-based estimate.
+	 */
+	foreach(lc, subroot->parse->targetList)
+	{
+		TargetEntry *te = (TargetEntry *) lfirst(lc);
+		Node	   *texpr = (Node *) te->expr;
+		int32		item_width = 0;
+
+		Assert(IsA(te, TargetEntry));
+		/* junk columns aren't visible to upper query */
+		if (te->resjunk)
+			continue;
+		/* never store outside the rel's attr_widths[] range */
+		if (te->resno < rel->min_attr || te->resno > rel->max_attr)
+			continue;
+
+		/*
+		 * XXX This currently doesn't work for subqueries containing set
+		 * operations, because the Vars in their tlists are bogus references
+		 * to the first leaf subquery, which wouldn't give the right answer
+		 * even if we could still get to its PlannerInfo.  So fall back on
+		 * datatype in that case, or if the subquery cached no width (zero).
+		 */
+		if (IsA(texpr, Var) &&
+			subroot->parse->setOperations == NULL)
+		{
+			Var		   *var = (Var *) texpr;
+			RelOptInfo *subrel = find_base_rel(subroot, var->varno);
+
+			item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
+		}
+		if (item_width <= 0)
+			item_width = get_typavgwidth(exprType(texpr), exprTypmod(texpr));
+		Assert(item_width > 0);
+		rel->attr_widths[te->resno - rel->min_attr] = item_width;
+	}
+
+	/* Now estimate number of output rows, etc */
+	set_baserel_size_estimates(root, rel);
+}
+
 /*
  * set_function_size_estimates
  *		Set the size estimates for a base relation that is a function call.
@@ -3251,11 +3322,17 @@ set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, Plan *cteplan)
  * set_rel_width
  *		Set the estimated output width of a base relation.
  *
+ * The estimated output width is the sum of the per-attribute width estimates
+ * for the actually-referenced columns, plus any PHVs or other expressions
+ * that have to be calculated at this relation.  This is the amount of data
+ * we'd need to pass upwards in case of a sort, hash, etc.
+ *
  * NB: this works best on plain relations because it prefers to look at
- * real Vars.
It will fail to make use of pg_statistic info when applied - * to a subquery relation, even if the subquery outputs are simple vars - * that we could have gotten info for. Is it worth trying to be smarter - * about subqueries? + * real Vars. For subqueries, set_subquery_size_estimates will already have + * copied up whatever per-column estimates were made within the subquery, + * and for other types of rels there isn't much we can do anyway. We fall + * back on (fairly stupid) datatype-based width estimates if we can't get + * any better number. * * The per-attribute width estimates are cached for possible re-use while * building join relations. @@ -3265,6 +3342,7 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel) { Oid reloid = planner_rt_fetch(rel->relid, root)->relid; int32 tuple_width = 0; + bool have_wholerow_var = false; ListCell *lc; foreach(lc, rel->reltargetlist) @@ -3284,8 +3362,18 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel) ndx = var->varattno - rel->min_attr; /* - * The width probably hasn't been cached yet, but may as well - * check + * If it's a whole-row Var, we'll deal with it below after we + * have already cached as many attr widths as possible. + */ + if (var->varattno == 0) + { + have_wholerow_var = true; + continue; + } + + /* + * The width may have been cached already (especially if it's + * a subquery), so don't duplicate effort. */ if (rel->attr_widths[ndx] > 0) { @@ -3294,7 +3382,7 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel) } /* Try to get column width from statistics */ - if (reloid != InvalidOid) + if (reloid != InvalidOid && var->varattno > 0) { item_width = get_attavgwidth(reloid, var->varattno); if (item_width > 0) @@ -3335,6 +3423,39 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel) tuple_width += item_width; } } + + /* + * If we have a whole-row reference, estimate its width as the sum of + * per-column widths plus sizeof(HeapTupleHeaderData). 
+ */ + if (have_wholerow_var) + { + int32 wholerow_width = sizeof(HeapTupleHeaderData); + + if (reloid != InvalidOid) + { + /* Real relation, so estimate true tuple width */ + wholerow_width += get_relation_data_width(reloid, + rel->attr_widths - rel->min_attr); + } + else + { + /* Do what we can with info for a phony rel */ + AttrNumber i; + + for (i = 1; i <= rel->max_attr; i++) + wholerow_width += rel->attr_widths[i - rel->min_attr]; + } + + rel->attr_widths[0 - rel->min_attr] = wholerow_width; + + /* + * Include the whole-row Var as part of the output tuple. Yes, + * that really is what happens at runtime. + */ + tuple_width += wholerow_width; + } + Assert(tuple_width >= 0); rel->width = tuple_width; } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 6324bce240..a1e5900592 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -3102,7 +3102,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) * set_baserel_size_estimates, just do a quick hack for rows and width. */ rel->rows = rel->tuples; - rel->width = get_relation_data_width(tableOid); + rel->width = get_relation_data_width(tableOid, NULL); root->total_table_pages = rel->pages; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 7ffa11588d..aafaf843fc 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -322,7 +322,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * estimate_rel_size - estimate # pages and # tuples in a table or index * * If attr_widths isn't NULL, it points to the zero-index entry of the - * relation's attr_width[] cache; we fill this in if we have need to compute + * relation's attr_widths[] cache; we fill this in if we have need to compute * the attribute widths for estimation purposes. 
*/ void @@ -435,8 +435,9 @@ estimate_rel_size(Relation rel, int32 *attr_widths, * get_rel_data_width * * Estimate the average width of (the data part of) the relation's tuples. - * If attr_widths isn't NULL, also store per-column width estimates into - * that array. + * + * If attr_widths isn't NULL, it points to the zero-index entry of the + * relation's attr_widths[] cache; use and update that cache as appropriate. * * Currently we ignore dropped columns. Ideally those should be included * in the result, but we haven't got any way to get info about them; and @@ -456,6 +457,14 @@ get_rel_data_width(Relation rel, int32 *attr_widths) if (att->attisdropped) continue; + + /* use previously cached data, if any */ + if (attr_widths != NULL && attr_widths[i] > 0) + { + tuple_width += attr_widths[i]; + continue; + } + /* This should match set_rel_width() in costsize.c */ item_width = get_attavgwidth(RelationGetRelid(rel), i); if (item_width <= 0) @@ -474,10 +483,11 @@ get_rel_data_width(Relation rel, int32 *attr_widths) /* * get_relation_data_width * - * External API for get_rel_data_width + * External API for get_rel_data_width: same behavior except we have to + * open the relcache entry. 
*/ int32 -get_relation_data_width(Oid relid) +get_relation_data_width(Oid relid, int32 *attr_widths) { int32 result; Relation relation; @@ -485,7 +495,7 @@ get_relation_data_width(Oid relid) /* As above, assume relation is already locked */ relation = heap_open(relid, NoLock); - result = get_rel_data_width(relation, NULL); + result = get_rel_data_width(relation, attr_widths); heap_close(relation, NoLock); diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index e1dcd6df14..8df1b95abe 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -121,6 +121,8 @@ extern void set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel, RelOptInfo *inner_rel, SpecialJoinInfo *sjinfo, List *restrictlist); +extern void set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel, + PlannerInfo *subroot); extern void set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index de7de84cb3..ca7b2c6469 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -31,7 +31,7 @@ extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, extern void estimate_rel_size(Relation rel, int32 *attr_widths, BlockNumber *pages, double *tuples); -extern int32 get_relation_data_width(Oid relid); +extern int32 get_relation_data_width(Oid relid, int32 *attr_widths); extern bool relation_excluded_by_constraints(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte);