From 6cda3ad8feb9b534fe9fb5866bbe5930596d0027 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 9 May 2001 00:35:09 +0000 Subject: [PATCH] Cause planner to make use of average-column-width statistic that is now collected by ANALYZE. Also, add some modest amount of intelligence to guesses that are used for varlena columns in the absence of any ANALYZE statistics. The 'width' reported by EXPLAIN is finally something less than totally bogus for varlena columns ... and, in consequence, hashjoin estimating should be a little better ... --- doc/src/sgml/perform.sgml | 9 +--- src/backend/optimizer/path/costsize.c | 75 +++++++++++++++----------- src/backend/utils/cache/lsyscache.c | 78 ++++++++++++++++++++++++++- src/include/utils/lsyscache.h | 4 +- 4 files changed, 124 insertions(+), 42 deletions(-) diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 77e1c77d8f..bfe66eb5dc 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1,5 +1,5 @@ @@ -97,13 +97,6 @@ $Header: /cvsroot/pgsql/doc/src/sgml/perform.sgml,v 1.3 2001/03/24 23:03:26 pete by the query (again, without considering the effects of LIMIT). - - Average width is pretty bogus because the thing really doesn't have - any idea of the average length of variable-length columns. I'm thinking - about improving that in the future, but it may not be worth the trouble, - because the width isn't used for very much. - - Here are some examples (using the regress test database after a vacuum analyze, and almost-7.0 sources): diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index bdfbbb1818..7dfe834b77 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -41,7 +41,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.71 2001/05/07 00:43:20 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.72 2001/05/09 00:35:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -61,12 +61,6 @@ #include "utils/syscache.h" -/* - * The length of a variable-length field in bytes (stupid estimate...) - */ -#define _DEFAULT_ATTRIBUTE_WIDTH_ 12 - - #define LOG2(x) (log(x) / 0.693147180559945) #define LOG6(x) (log(x) / 1.79175946922805) @@ -90,7 +84,6 @@ bool enable_hashjoin = true; static bool cost_qual_eval_walker(Node *node, Cost *total); static void set_rel_width(Query *root, RelOptInfo *rel); -static int compute_attribute_width(TargetEntry *tlistentry); static double relation_byte_size(double tuples, int width); static double page_size(double tuples, int width); @@ -1082,38 +1075,56 @@ set_joinrel_size_estimates(Query *root, RelOptInfo *rel, /* * set_rel_width * Set the estimated output width of the relation. + * + * NB: this works best on base relations because it prefers to look at + * real Vars. It will fail to make use of pg_statistic info when applied + * to a subquery relation, even if the subquery outputs are simple vars + * that we could have gotten info for. Is it worth trying to be smarter + * about subqueries? */ static void set_rel_width(Query *root, RelOptInfo *rel) { - int tuple_width = 0; - List *tle; + int32 tuple_width = 0; + List *tllist; - foreach(tle, rel->targetlist) - tuple_width += compute_attribute_width((TargetEntry *) lfirst(tle)); + foreach(tllist, rel->targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(tllist); + int32 item_width; + + /* + * If it's a Var, try to get statistical info from pg_statistic. + */ + if (tle->expr && IsA(tle->expr, Var)) + { + Var *var = (Var *) tle->expr; + Oid relid; + + relid = getrelid(var->varno, root->rtable); + if (relid != InvalidOid) + { + item_width = get_attavgwidth(relid, var->varattno); + if (item_width > 0) + { + tuple_width += item_width; + continue; + } + } + } + /* + * Not a Var, or can't find statistics for it. Estimate using + * just the type info. + */ + item_width = get_typavgwidth(tle->resdom->restype, + tle->resdom->restypmod); + Assert(item_width > 0); + tuple_width += item_width; + } Assert(tuple_width >= 0); rel->width = tuple_width; } -/* - * compute_attribute_width - * Given a target list entry, find the size in bytes of the attribute. - * - * If a field is variable-length, we make a default assumption. Would be - * better if VACUUM recorded some stats about the average field width... - * also, we have access to the atttypmod, but fail to use it... - */ -static int -compute_attribute_width(TargetEntry *tlistentry) -{ - int width = get_typlen(tlistentry->resdom->restype); - - if (width < 0) - return _DEFAULT_ATTRIBUTE_WIDTH_; - else - return width; -} - /* * relation_byte_size * Estimate the storage space in bytes for a given number of tuples @@ -1122,7 +1133,7 @@ compute_attribute_width(TargetEntry *tlistentry) static double relation_byte_size(double tuples, int width) { - return tuples * ((double) (width + sizeof(HeapTupleData))); + return tuples * ((double) MAXALIGN(width + sizeof(HeapTupleData))); } /* diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 3995de5d7a..ee15a940cc 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/cache/lsyscache.c,v 1.53 2001/05/07 00:43:24 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/cache/lsyscache.c,v 1.54 2001/05/09 00:35:09 tgl Exp $ * * NOTES * Eventually, the index information should go through here, too. @@ -749,6 +749,56 @@ get_typdefault(Oid typid) return returnValue; } +/* + * get_typavgwidth + * + * Given a type OID and a typmod value (pass -1 if typmod is unknown), + * estimate the average width of values of the type. This is used by + * the planner, which doesn't require absolutely correct results; + * it's OK (and expected) to guess if we don't know for sure. + */ +int32 +get_typavgwidth(Oid typid, int32 typmod) +{ + int typlen = get_typlen(typid); + int32 maxwidth; + + /* + * Easy if it's a fixed-width type + */ + if (typlen > 0) + return typlen; + /* + * type_maximum_size knows the encoding of typmod for some datatypes; + * don't duplicate that knowledge here. + */ + maxwidth = type_maximum_size(typid, typmod); + if (maxwidth > 0) + { + /* + * For BPCHAR, the max width is also the only width. Otherwise + * we need to guess about the typical data width given the max. + * A sliding scale for percentage of max width seems reasonable. + */ + if (typid == BPCHAROID) + return maxwidth; + if (maxwidth <= 32) + return maxwidth; /* assume full width */ + if (maxwidth < 1000) + return 32 + (maxwidth - 32) / 2; /* assume 50% */ + /* + * Beyond 1000, assume we're looking at something like + * "varchar(10000)" where the limit isn't actually reached often, + * and use a fixed estimate. + */ + return 32 + (1000 - 32) / 2; + } + /* + * Ooops, we have no idea ... wild guess time. + */ + return 32; +} + /* * get_typtype * @@ -782,6 +832,32 @@ get_typtype(Oid typid) /* ---------- STATISTICS CACHE ---------- */ +/* + * get_attavgwidth + * + * Given the table and attribute number of a column, get the average + * width of entries in the column. Return zero if no data available. + */ +int32 +get_attavgwidth(Oid relid, AttrNumber attnum) +{ + HeapTuple tp; + + tp = SearchSysCache(STATRELATT, + ObjectIdGetDatum(relid), + Int16GetDatum(attnum), + 0, 0); + if (HeapTupleIsValid(tp)) + { + int32 stawidth = ((Form_pg_statistic) GETSTRUCT(tp))->stawidth; + + ReleaseSysCache(tp); + if (stawidth > 0) + return stawidth; + } + return 0; +} + /* * get_attstatsslot * diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index 6b35deed28..3f18a4aea6 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: lsyscache.h,v 1.31 2001/05/07 00:43:26 tgl Exp $ + * $Id: lsyscache.h,v 1.32 2001/05/09 00:35:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -39,6 +39,8 @@ extern bool get_typbyval(Oid typid); extern void get_typlenbyval(Oid typid, int16 *typlen, bool *typbyval); extern char get_typstorage(Oid typid); extern Datum get_typdefault(Oid typid); +extern int32 get_typavgwidth(Oid typid, int32 typmod); +extern int32 get_attavgwidth(Oid relid, AttrNumber attnum); extern bool get_attstatsslot(HeapTuple statstuple, Oid atttype, int32 atttypmod, int reqkind, Oid reqop,