Improve relation width estimation for subqueries.

As per the ancient comment for set_rel_width, it really wasn't much good
for relations that aren't plain tables: it would never find any stats and
would always fall back on datatype-based estimates, which are often pretty
silly.  Fix that by copying up width estimates from the subquery planning
process.

At some point we might want to do this for CTEs too, but that would be a
significantly more invasive patch because the sub-PlannerInfo is no longer
accessible by the time it's needed.  I refrained from doing anything about
that, partly for fear of breaking the unmerged CTE-related patches.

In passing, also generate less bogus width estimates for whole-row Vars.

Per a gripe from Jon Nelson.
This commit is contained in:
Tom Lane 2010-11-19 17:31:50 -05:00
parent fe24d78161
commit 0f61d4dd1b
6 changed files with 150 additions and 20 deletions

View File

@ -758,11 +758,8 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel,
rel->subrtable = subroot->parse->rtable;
rel->subrowmark = subroot->rowMarks;
/* Copy number of output rows from subplan */
rel->tuples = rel->subplan->plan_rows;
/* Mark rel with estimated output rows, width, etc */
set_baserel_size_estimates(root, rel);
set_subquery_size_estimates(root, rel, subroot);
/* Convert subquery pathkeys to outer representation */
pathkeys = convert_subquery_pathkeys(root, rel, subroot->query_pathkeys);

View File

@ -76,6 +76,7 @@
#include "optimizer/cost.h"
#include "optimizer/pathnode.h"
#include "optimizer/placeholder.h"
#include "optimizer/plancat.h"
#include "optimizer/planmain.h"
#include "optimizer/restrictinfo.h"
#include "parser/parsetree.h"
@ -2986,7 +2987,7 @@ approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
* Set the size estimates for the given base relation.
*
* The rel's targetlist and restrictinfo list must have been constructed
* already.
* already, and rel->tuples must be set.
*
* We set the following fields of the rel node:
* rows: the estimated number of output tuples (after applying
@ -3151,6 +3152,76 @@ set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
rel->rows = clamp_row_est(nrows);
}
/*
 * set_subquery_size_estimates
 *		Set the size estimates for a base relation that is a subquery.
 *
 * The rel's targetlist and restrictinfo list must have been constructed
 * already, and the plan for the subquery must have been completed.
 * We look at the subquery's plan and PlannerInfo to extract data.
 *
 * root:	planner state of the query level that contains rel
 * rel:		the subquery base relation whose estimates are to be set;
 *			rel->subplan must already point at the finished subplan
 * subroot:	planner state that was used to plan the subquery
 *
 * rel->tuples is copied from the subplan's row estimate, per-column width
 * estimates are stored into rel->attr_widths[], and then
 * set_baserel_size_estimates is called, so we end up setting the same
 * fields it does.
 */
void
set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel,
							PlannerInfo *subroot)
{
	RangeTblEntry *rte;
	ListCell   *lc;

	/* Should only be applied to base relations that are subqueries */
	Assert(rel->relid > 0);
	rte = planner_rt_fetch(rel->relid, root);
	Assert(rte->rtekind == RTE_SUBQUERY);

	/* Copy raw number of output rows from subplan */
	rel->tuples = rel->subplan->plan_rows;

	/*
	 * Compute per-output-column width estimates by examining the subquery's
	 * targetlist.  For any output that is a plain Var, get the width estimate
	 * that was made while planning the subquery.  Otherwise, fall back on a
	 * datatype-based estimate.
	 */
	foreach(lc, subroot->parse->targetList)
	{
		TargetEntry *te = (TargetEntry *) lfirst(lc);
		Node	   *texpr = (Node *) te->expr;
		int32		item_width = 0;

		Assert(IsA(te, TargetEntry));

		/* junk columns aren't visible to upper query */
		if (te->resjunk)
			continue;

		/*
		 * Ignore any non-junk column for which this rel has no attr_widths[]
		 * slot.  This is a runtime test rather than an Assert so that a
		 * non-assert build can never index outside the array.
		 * NOTE(review): presumably this can happen when the subquery is an
		 * expanded view that has more columns than the outer query level
		 * knows about -- confirm against callers.
		 */
		if (te->resno < rel->min_attr || te->resno > rel->max_attr)
			continue;

		/*
		 * XXX This currently doesn't work for subqueries containing set
		 * operations, because the Vars in their tlists are bogus references
		 * to the first leaf subquery, which wouldn't give the right answer
		 * even if we could still get to its PlannerInfo.  So fall back on
		 * datatype in that case.
		 */
		if (IsA(texpr, Var) &&
			subroot->parse->setOperations == NULL)
		{
			Var		   *var = (Var *) texpr;
			RelOptInfo *subrel = find_base_rel(subroot, var->varno);

			item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
		}

		/*
		 * Not a plain Var, or the sub-rel has no cached width for it (a
		 * non-positive entry means "not cached"): use the datatype-based
		 * estimate instead.
		 */
		if (item_width <= 0)
			item_width = get_typavgwidth(exprType(texpr), exprTypmod(texpr));

		/*
		 * Only record usable estimates.  If we still have nothing positive,
		 * leave the slot alone; set_rel_width treats a non-positive entry as
		 * uncached and will compute its own estimate.
		 */
		if (item_width > 0)
			rel->attr_widths[te->resno - rel->min_attr] = item_width;
	}

	/* Now estimate number of output rows, etc */
	set_baserel_size_estimates(root, rel);
}
/*
* set_function_size_estimates
* Set the size estimates for a base relation that is a function call.
@ -3251,11 +3322,17 @@ set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, Plan *cteplan)
* set_rel_width
* Set the estimated output width of a base relation.
*
* The estimated output width is the sum of the per-attribute width estimates
* for the actually-referenced columns, plus any PHVs or other expressions
* that have to be calculated at this relation. This is the amount of data
* we'd need to pass upwards in case of a sort, hash, etc.
*
* NB: this works best on plain relations because it prefers to look at
* real Vars. It will fail to make use of pg_statistic info when applied
* to a subquery relation, even if the subquery outputs are simple vars
* that we could have gotten info for. Is it worth trying to be smarter
* about subqueries?
* real Vars. For subqueries, set_subquery_size_estimates will already have
* copied up whatever per-column estimates were made within the subquery,
* and for other types of rels there isn't much we can do anyway. We fall
* back on (fairly stupid) datatype-based width estimates if we can't get
* any better number.
*
* The per-attribute width estimates are cached for possible re-use while
* building join relations.
@ -3265,6 +3342,7 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
{
Oid reloid = planner_rt_fetch(rel->relid, root)->relid;
int32 tuple_width = 0;
bool have_wholerow_var = false;
ListCell *lc;
foreach(lc, rel->reltargetlist)
@ -3284,8 +3362,18 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
ndx = var->varattno - rel->min_attr;
/*
* The width probably hasn't been cached yet, but may as well
* check
* If it's a whole-row Var, we'll deal with it below after we
* have already cached as many attr widths as possible.
*/
if (var->varattno == 0)
{
have_wholerow_var = true;
continue;
}
/*
* The width may have been cached already (especially if it's
* a subquery), so don't duplicate effort.
*/
if (rel->attr_widths[ndx] > 0)
{
@ -3294,7 +3382,7 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
}
/* Try to get column width from statistics */
if (reloid != InvalidOid)
if (reloid != InvalidOid && var->varattno > 0)
{
item_width = get_attavgwidth(reloid, var->varattno);
if (item_width > 0)
@ -3335,6 +3423,39 @@ set_rel_width(PlannerInfo *root, RelOptInfo *rel)
tuple_width += item_width;
}
}
/*
* If we have a whole-row reference, estimate its width as the sum of
* per-column widths plus sizeof(HeapTupleHeaderData).
*/
if (have_wholerow_var)
{
int32 wholerow_width = sizeof(HeapTupleHeaderData);
if (reloid != InvalidOid)
{
/* Real relation, so estimate true tuple width */
wholerow_width += get_relation_data_width(reloid,
rel->attr_widths - rel->min_attr);
}
else
{
/* Do what we can with info for a phony rel */
AttrNumber i;
for (i = 1; i <= rel->max_attr; i++)
wholerow_width += rel->attr_widths[i - rel->min_attr];
}
rel->attr_widths[0 - rel->min_attr] = wholerow_width;
/*
* Include the whole-row Var as part of the output tuple. Yes,
* that really is what happens at runtime.
*/
tuple_width += wholerow_width;
}
Assert(tuple_width >= 0);
rel->width = tuple_width;
}

View File

@ -3102,7 +3102,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid)
* set_baserel_size_estimates, just do a quick hack for rows and width.
*/
rel->rows = rel->tuples;
rel->width = get_relation_data_width(tableOid);
rel->width = get_relation_data_width(tableOid, NULL);
root->total_table_pages = rel->pages;

View File

@ -322,7 +322,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
* estimate_rel_size - estimate # pages and # tuples in a table or index
*
* If attr_widths isn't NULL, it points to the zero-index entry of the
* relation's attr_width[] cache; we fill this in if we have need to compute
* relation's attr_widths[] cache; we fill this in if we have need to compute
* the attribute widths for estimation purposes.
*/
void
@ -435,8 +435,9 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
* get_rel_data_width
*
* Estimate the average width of (the data part of) the relation's tuples.
* If attr_widths isn't NULL, also store per-column width estimates into
* that array.
*
* If attr_widths isn't NULL, it points to the zero-index entry of the
* relation's attr_widths[] cache; use and update that cache as appropriate.
*
* Currently we ignore dropped columns. Ideally those should be included
* in the result, but we haven't got any way to get info about them; and
@ -456,6 +457,14 @@ get_rel_data_width(Relation rel, int32 *attr_widths)
if (att->attisdropped)
continue;
/* use previously cached data, if any */
if (attr_widths != NULL && attr_widths[i] > 0)
{
tuple_width += attr_widths[i];
continue;
}
/* This should match set_rel_width() in costsize.c */
item_width = get_attavgwidth(RelationGetRelid(rel), i);
if (item_width <= 0)
@ -474,10 +483,11 @@ get_rel_data_width(Relation rel, int32 *attr_widths)
/*
* get_relation_data_width
*
* External API for get_rel_data_width
* External API for get_rel_data_width: same behavior except we have to
* open the relcache entry.
*/
int32
get_relation_data_width(Oid relid)
get_relation_data_width(Oid relid, int32 *attr_widths)
{
int32 result;
Relation relation;
@ -485,7 +495,7 @@ get_relation_data_width(Oid relid)
/* As above, assume relation is already locked */
relation = heap_open(relid, NoLock);
result = get_rel_data_width(relation, NULL);
result = get_rel_data_width(relation, attr_widths);
heap_close(relation, NoLock);

View File

@ -121,6 +121,8 @@ extern void set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
RelOptInfo *inner_rel,
SpecialJoinInfo *sjinfo,
List *restrictlist);
extern void set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel,
PlannerInfo *subroot);
extern void set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel);
extern void set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel);
extern void set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel,

View File

@ -31,7 +31,7 @@ extern void get_relation_info(PlannerInfo *root, Oid relationObjectId,
extern void estimate_rel_size(Relation rel, int32 *attr_widths,
BlockNumber *pages, double *tuples);
extern int32 get_relation_data_width(Oid relid);
extern int32 get_relation_data_width(Oid relid, int32 *attr_widths);
extern bool relation_excluded_by_constraints(PlannerInfo *root,
RelOptInfo *rel, RangeTblEntry *rte);