/*
 * Reviewer summary of the patch that follows (the patch text itself is
 * unchanged).
 *
 * doc/src/sgml/perform.sgml
 *   - Removes the paragraph saying the average width figure is pretty bogus.
 *
 * src/backend/optimizer/path/costsize.c
 *   - Removes the _DEFAULT_ATTRIBUTE_WIDTH_ (12) macro and the static
 *     helper compute_attribute_width().
 *   - set_rel_width() now loops over rel->targetlist itself.  For an entry
 *     whose expr is a Var, it maps var->varno to a relation OID with
 *     getrelid() and, if that OID is valid, asks get_attavgwidth() for the
 *     column width; a positive answer is added to the total.  In every
 *     other case (not a Var, invalid OID, or no positive width) it adds
 *     get_typavgwidth(restype, restypmod) and asserts the result is > 0.
 *   - relation_byte_size() now applies MAXALIGN to
 *     (width + sizeof(HeapTupleData)) before multiplying by the tuple count.
 *
 * src/backend/utils/cache/lsyscache.c and src/include/utils/lsyscache.h
 *   - Adds get_typavgwidth(typid, typmod): returns get_typlen() when it is
 *     positive; otherwise consults type_maximum_size().  With a positive
 *     maximum it returns the maximum itself for BPCHAROID or when the
 *     maximum is <= 32, returns 32 + (maxwidth - 32) / 2 when the maximum
 *     is < 1000, and returns 32 + (1000 - 32) / 2 beyond that.  With no
 *     positive maximum it returns 32.
 *   - Adds get_attavgwidth(relid, attnum): searches the STATRELATT syscache
 *     and returns stawidth when the tuple is valid and stawidth > 0,
 *     releasing the cache entry first; otherwise returns 0.
 *   - Declares both new functions as extern in lsyscache.h.
 */
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 77e1c77d8f..bfe66eb5dc 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1,5 +1,5 @@ @@ -97,13 +97,6 @@ $Header: /cvsroot/pgsql/doc/src/sgml/perform.sgml,v 1.3 2001/03/24 23:03:26 pete by the query (again, without considering the effects of LIMIT). - - Average width is pretty bogus because the thing really doesn't have - any idea of the average length of variable-length columns. I'm thinking - about improving that in the future, but it may not be worth the trouble, - because the width isn't used for very much. - - Here are some examples (using the regress test database after a vacuum analyze, and almost-7.0 sources): diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index bdfbbb1818..7dfe834b77 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -41,7 +41,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.71 2001/05/07 00:43:20 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.72 2001/05/09 00:35:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -61,12 +61,6 @@ #include "utils/syscache.h" -/* - * The length of a variable-length field in bytes (stupid estimate...) 
- */ -#define _DEFAULT_ATTRIBUTE_WIDTH_ 12 - - #define LOG2(x) (log(x) / 0.693147180559945) #define LOG6(x) (log(x) / 1.79175946922805) @@ -90,7 +84,6 @@ bool enable_hashjoin = true; static bool cost_qual_eval_walker(Node *node, Cost *total); static void set_rel_width(Query *root, RelOptInfo *rel); -static int compute_attribute_width(TargetEntry *tlistentry); static double relation_byte_size(double tuples, int width); static double page_size(double tuples, int width); @@ -1082,38 +1075,56 @@ set_joinrel_size_estimates(Query *root, RelOptInfo *rel, /* * set_rel_width * Set the estimated output width of the relation. + * + * NB: this works best on base relations because it prefers to look at + * real Vars. It will fail to make use of pg_statistic info when applied + * to a subquery relation, even if the subquery outputs are simple vars + * that we could have gotten info for. Is it worth trying to be smarter + * about subqueries? */ static void set_rel_width(Query *root, RelOptInfo *rel) { - int tuple_width = 0; - List *tle; + int32 tuple_width = 0; + List *tllist; - foreach(tle, rel->targetlist) - tuple_width += compute_attribute_width((TargetEntry *) lfirst(tle)); + foreach(tllist, rel->targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(tllist); + int32 item_width; + + /* + * If it's a Var, try to get statistical info from pg_statistic. + */ + if (tle->expr && IsA(tle->expr, Var)) + { + Var *var = (Var *) tle->expr; + Oid relid; + + relid = getrelid(var->varno, root->rtable); + if (relid != InvalidOid) + { + item_width = get_attavgwidth(relid, var->varattno); + if (item_width > 0) + { + tuple_width += item_width; + continue; + } + } + } + /* + * Not a Var, or can't find statistics for it. Estimate using + * just the type info. 
+ */ + item_width = get_typavgwidth(tle->resdom->restype, + tle->resdom->restypmod); + Assert(item_width > 0); + tuple_width += item_width; + } Assert(tuple_width >= 0); rel->width = tuple_width; } -/* - * compute_attribute_width - * Given a target list entry, find the size in bytes of the attribute. - * - * If a field is variable-length, we make a default assumption. Would be - * better if VACUUM recorded some stats about the average field width... - * also, we have access to the atttypmod, but fail to use it... - */ -static int -compute_attribute_width(TargetEntry *tlistentry) -{ - int width = get_typlen(tlistentry->resdom->restype); - - if (width < 0) - return _DEFAULT_ATTRIBUTE_WIDTH_; - else - return width; -} - /* * relation_byte_size * Estimate the storage space in bytes for a given number of tuples @@ -1122,7 +1133,7 @@ compute_attribute_width(TargetEntry *tlistentry) static double relation_byte_size(double tuples, int width) { - return tuples * ((double) (width + sizeof(HeapTupleData))); + return tuples * ((double) MAXALIGN(width + sizeof(HeapTupleData))); } /* diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 3995de5d7a..ee15a940cc 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/cache/lsyscache.c,v 1.53 2001/05/07 00:43:24 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/cache/lsyscache.c,v 1.54 2001/05/09 00:35:09 tgl Exp $ * * NOTES * Eventually, the index information should go through here, too. @@ -749,6 +749,56 @@ get_typdefault(Oid typid) return returnValue; } +/* + * get_typavgwidth + * + * Given a type OID and a typmod value (pass -1 if typmod is unknown), + * estimate the average width of values of the type. 
This is used by + * the planner, which doesn't require absolutely correct results; + * it's OK (and expected) to guess if we don't know for sure. + */ +int32 +get_typavgwidth(Oid typid, int32 typmod) +{ + int typlen = get_typlen(typid); + int32 maxwidth; + + /* + * Easy if it's a fixed-width type + */ + if (typlen > 0) + return typlen; + /* + * type_maximum_size knows the encoding of typmod for some datatypes; + * don't duplicate that knowledge here. + */ + maxwidth = type_maximum_size(typid, typmod); + if (maxwidth > 0) + { + /* + * For BPCHAR, the max width is also the only width. Otherwise + * we need to guess about the typical data width given the max. + * A sliding scale for percentage of max width seems reasonable. + */ + if (typid == BPCHAROID) + return maxwidth; + if (maxwidth <= 32) + return maxwidth; /* assume full width */ + if (maxwidth < 1000) + return 32 + (maxwidth - 32) / 2; /* assume 50% */ + /* + * Beyond 1000, assume we're looking at something like + * "varchar(10000)" where the limit isn't actually reached often, + * and use a fixed estimate. + */ + return 32 + (1000 - 32) / 2; + } + /* + * Ooops, we have no idea ... wild guess time. + */ + return 32; +} + /* * get_typtype * @@ -782,6 +832,32 @@ get_typtype(Oid typid) /* ---------- STATISTICS CACHE ---------- */ +/* + * get_attavgwidth + * + * Given the table and attribute number of a column, get the average + * width of entries in the column. Return zero if no data available. 
+ */ +int32 +get_attavgwidth(Oid relid, AttrNumber attnum) +{ + HeapTuple tp; + + tp = SearchSysCache(STATRELATT, + ObjectIdGetDatum(relid), + Int16GetDatum(attnum), + 0, 0); + if (HeapTupleIsValid(tp)) + { + int32 stawidth = ((Form_pg_statistic) GETSTRUCT(tp))->stawidth; + + ReleaseSysCache(tp); + if (stawidth > 0) + return stawidth; + } + return 0; +} + /* * get_attstatsslot * diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index 6b35deed28..3f18a4aea6 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: lsyscache.h,v 1.31 2001/05/07 00:43:26 tgl Exp $ + * $Id: lsyscache.h,v 1.32 2001/05/09 00:35:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -39,6 +39,8 @@ extern bool get_typbyval(Oid typid); extern void get_typlenbyval(Oid typid, int16 *typlen, bool *typbyval); extern char get_typstorage(Oid typid); extern Datum get_typdefault(Oid typid); +extern int32 get_typavgwidth(Oid typid, int32 typmod); +extern int32 get_attavgwidth(Oid relid, AttrNumber attnum); extern bool get_attstatsslot(HeapTuple statstuple, Oid atttype, int32 atttypmod, int reqkind, Oid reqop,