diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c index 5f9b43bf7f..ada1e78f6f 100644 --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -30,6 +30,7 @@ #include "utils/fmgroids.h" #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" +#include "utils/selfuncs.h" #include "utils/syscache.h" #include "utils/typcache.h" @@ -73,13 +74,18 @@ static double dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs); static bool dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums); -static bool dependency_implies_attribute(MVDependency *dependency, - AttrNumber attnum); static bool dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum); static MVDependency *find_strongest_dependency(MVDependencies **dependencies, int ndependencies, Bitmapset *attnums); +static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses, + int varRelid, JoinType jointype, + SpecialJoinInfo *sjinfo, + MVDependency **dependencies, + int ndependencies, + AttrNumber *list_attnums, + Bitmapset **estimatedclauses); static void generate_dependencies_recurse(DependencyGenerator state, int index, @@ -613,19 +619,6 @@ dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums) return true; } -/* - * dependency_implies_attribute - * check that the attnum matches is implied by the functional dependency - */ -static bool -dependency_implies_attribute(MVDependency *dependency, AttrNumber attnum) -{ - if (attnum == dependency->attributes[dependency->nattributes - 1]) - return true; - - return false; -} - /* * statext_dependencies_load * Load the functional dependencies for the indicated pg_statistic_ext tuple @@ -985,6 +978,183 @@ find_strongest_dependency(MVDependencies **dependencies, int ndependencies, return strongest; } +/* + * clauselist_apply_dependencies + * Apply the specified functional dependencies to a list of clauses and + * return the estimated selecvitity of the clauses that are compatible + * with any of the given dependencies. + * + * This will estimate all not-already-estimated clauses that are compatible + * with functional dependencies, and which have an attribute mentioned by any + * of the given dependencies (either as an implying or implied attribute). + * + * Given (lists of) clauses on attributes (a,b) and a functional dependency + * (a=>b), the per-column selectivities P(a) and P(b) are notionally combined + * using the formula + * + * P(a,b) = f * P(a) + (1-f) * P(a) * P(b) + * + * where 'f' is the degree of dependency. This reflects the fact that we + * expect a fraction f of all rows to be consistent with the dependency + * (a=>b), and so have a selectivity of P(a), while the remaining rows are + * treated as independent. + * + * In practice, we use a slightly modified version of this formula, which uses + * a selectivity of Min(P(a), P(b)) for the dependent rows, since the result + * should obviously not exceed either column's individual selectivity. I.e., + * we actually combine selectivities using the formula + * + * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b) + * + * This can make quite a difference if the specific values matching the + * clauses are not consistent with the functional dependency. + */ +static Selectivity +clauselist_apply_dependencies(PlannerInfo *root, List *clauses, + int varRelid, JoinType jointype, + SpecialJoinInfo *sjinfo, + MVDependency **dependencies, int ndependencies, + AttrNumber *list_attnums, + Bitmapset **estimatedclauses) +{ + Bitmapset *attnums; + int i; + int j; + int nattrs; + Selectivity *attr_sel; + int attidx; + int listidx; + ListCell *l; + Selectivity s1; + + /* + * Extract the attnums of all implying and implied attributes from all the + * given dependencies. Each of these attributes is expected to have at + * least 1 not-already-estimated compatible clause that we will estimate + * here. + */ + attnums = NULL; + for (i = 0; i < ndependencies; i++) + { + for (j = 0; j < dependencies[i]->nattributes; j++) + { + AttrNumber attnum = dependencies[i]->attributes[j]; + + attnums = bms_add_member(attnums, attnum); + } + } + + /* + * Compute per-column selectivity estimates for each of these attributes, + * and mark all the corresponding clauses as estimated. + */ + nattrs = bms_num_members(attnums); + attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs); + + attidx = 0; + i = -1; + while ((i = bms_next_member(attnums, i)) >= 0) + { + List *attr_clauses = NIL; + Selectivity simple_sel; + + listidx = -1; + foreach(l, clauses) + { + Node *clause = (Node *) lfirst(l); + + listidx++; + if (list_attnums[listidx] == i) + { + attr_clauses = lappend(attr_clauses, clause); + *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + } + } + + simple_sel = clauselist_selectivity_simple(root, attr_clauses, varRelid, + jointype, sjinfo, NULL); + attr_sel[attidx++] = simple_sel; + } + + /* + * Now combine these selectivities using the dependency information. For + * chains of dependencies such as a -> b -> c, the b -> c dependency will + * come before the a -> b dependency in the array, so we traverse the + * array backwards to ensure such chains are computed in the right order. + * + * As explained above, pairs of selectivities are combined using the + * formula + * + * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b) + * + * to ensure that the combined selectivity is never greater than either + * individual selectivity. + * + * Where multiple dependencies apply (e.g., a -> b -> c), we use + * conditional probabilities to compute the overall result as follows: + * + * P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a) + * + * so we replace the selectivities of all implied attributes with + * conditional probabilities, that are conditional on all their implying + * attributes. The selectivities of all other non-implied attributes are + * left as they are. + */ + for (i = ndependencies - 1; i >= 0; i--) + { + MVDependency *dependency = dependencies[i]; + AttrNumber attnum; + Selectivity s2; + double f; + + /* Selectivity of all the implying attributes */ + s1 = 1.0; + for (j = 0; j < dependency->nattributes - 1; j++) + { + attnum = dependency->attributes[j]; + attidx = bms_member_index(attnums, attnum); + s1 *= attr_sel[attidx]; + } + + /* Original selectivity of the implied attribute */ + attnum = dependency->attributes[j]; + attidx = bms_member_index(attnums, attnum); + s2 = attr_sel[attidx]; + + /* + * Replace s2 with the conditional probability s2 given s1, computed + * using the formula P(b|a) = P(a,b) / P(a), which simplifies to + * + * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b) + * + * where P(a) = s1, the selectivity of the implying attributes, and + * P(b) = s2, the selectivity of the implied attribute. + */ + f = dependency->degree; + + if (s1 <= s2) + attr_sel[attidx] = f + (1 - f) * s2; + else + attr_sel[attidx] = f * s2 / s1 + (1 - f) * s2; + } + + /* + * The overall selectivity of all the clauses on all these attributes is + * then the product of all the original (non-implied) probabilities and + * the new conditional (implied) probabilities. + */ + s1 = 1.0; + for (i = 0; i < nattrs; i++) + s1 *= attr_sel[i]; + + CLAMP_PROBABILITY(s1); + + pfree(attr_sel); + bms_free(attnums); + + return s1; +} + /* * dependencies_clauselist_selectivity * Return the estimated selectivity of (a subset of) the given clauses @@ -999,15 +1169,16 @@ find_strongest_dependency(MVDependencies **dependencies, int ndependencies, * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected * dependency, we then combine the per-clause selectivities using the formula * - * P(a,b) = P(a) * [f + (1-f)*P(b)] + * P(a,b) = f * P(a) + (1-f) * P(a) * P(b) * - * where 'f' is the degree of the dependency. + * where 'f' is the degree of the dependency. (Actually we use a slightly + * modified version of this formula -- see clauselist_apply_dependencies()). * * With clauses on more than two attributes, the dependencies are applied * recursively, starting with the widest/strongest dependencies. For example * P(a,b,c) is first split like this: * - * P(a,b,c) = P(a,b) * [f + (1-f)*P(c)] + * P(a,b,c) = f * P(a,b) + (1-f) * P(a,b) * P(c) * * assuming (a,b=>c) is the strongest dependency. */ @@ -1023,17 +1194,20 @@ dependencies_clauselist_selectivity(PlannerInfo *root, Selectivity s1 = 1.0; ListCell *l; Bitmapset *clauses_attnums = NULL; - Bitmapset **list_attnums; + AttrNumber *list_attnums; int listidx; - MVDependencies **dependencies = NULL; - int ndependencies = 0; + MVDependencies **func_dependencies; + int nfunc_dependencies; + int total_ndeps; + MVDependency **dependencies; + int ndependencies; int i; /* check if there's any stats that might be useful for us. */ if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) return 1.0; - list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) * + list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * list_length(clauses)); /* @@ -1056,11 +1230,11 @@ dependencies_clauselist_selectivity(PlannerInfo *root, if (!bms_is_member(listidx, *estimatedclauses) && dependency_is_compatible_clause(clause, rel->relid, &attnum)) { - list_attnums[listidx] = bms_make_singleton(attnum); + list_attnums[listidx] = attnum; clauses_attnums = bms_add_member(clauses_attnums, attnum); } else - list_attnums[listidx] = NULL; + list_attnums[listidx] = InvalidAttrNumber; listidx++; } @@ -1072,6 +1246,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root, */ if (bms_num_members(clauses_attnums) < 2) { + bms_free(clauses_attnums); pfree(list_attnums); return 1.0; } @@ -1083,19 +1258,20 @@ dependencies_clauselist_selectivity(PlannerInfo *root, * * To not waste cycles and memory, we deserialize dependencies only for * statistics that match at least two attributes. The array is allocated - * with the assumption that all objects match - we could grow the array - * to make it just the right size, but it's likely wasteful anyway thanks - * to moving the freed chunks to freelists etc. + * with the assumption that all objects match - we could grow the array to + * make it just the right size, but it's likely wasteful anyway thanks to + * moving the freed chunks to freelists etc. */ - ndependencies = 0; - dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) * - list_length(rel->statlist)); + func_dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) * + list_length(rel->statlist)); + nfunc_dependencies = 0; + total_ndeps = 0; - foreach(l,rel->statlist) + foreach(l, rel->statlist) { - StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l); - Bitmapset *matched; - int num_matched; + StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l); + Bitmapset *matched; + int num_matched; /* skip statistics that are not of the correct type */ if (stat->kind != STATS_EXT_DEPENDENCIES) @@ -1109,104 +1285,65 @@ dependencies_clauselist_selectivity(PlannerInfo *root, if (num_matched < 2) continue; - dependencies[ndependencies++] + func_dependencies[nfunc_dependencies] = statext_dependencies_load(stat->statOid); + + total_ndeps += func_dependencies[nfunc_dependencies]->ndeps; + nfunc_dependencies++; } /* if no matching stats could be found then we've nothing to do */ - if (!ndependencies) + if (nfunc_dependencies == 0) { + pfree(func_dependencies); + bms_free(clauses_attnums); pfree(list_attnums); return 1.0; } /* - * Apply the dependencies recursively, starting with the widest/strongest - * ones, and proceeding to the smaller/weaker ones. At the end of each - * round we factor in the selectivity of clauses on the implied attribute, - * and remove the clauses from the list. + * Work out which dependencies we can apply, starting with the + * widest/stongest ones, and proceeding to smaller/weaker ones. */ + dependencies = (MVDependency **) palloc(sizeof(MVDependency *) * + total_ndeps); + ndependencies = 0; + while (true) { - Selectivity s2 = 1.0; MVDependency *dependency; + AttrNumber attnum; /* the widest/strongest dependency, fully matched by clauses */ - dependency = find_strongest_dependency(dependencies, ndependencies, + dependency = find_strongest_dependency(func_dependencies, + nfunc_dependencies, clauses_attnums); - - /* if no suitable dependency was found, we're done */ if (!dependency) break; - /* - * We found an applicable dependency, so find all the clauses on the - * implied attribute - with dependency (a,b => c) we look for clauses - * on 'c'. - */ - listidx = -1; - foreach(l, clauses) - { - Node *clause; - AttrNumber attnum; + dependencies[ndependencies++] = dependency; - listidx++; - - /* - * Skip incompatible clauses, and ones we've already estimated on. - */ - if (!list_attnums[listidx]) - continue; - - /* - * We expect the bitmaps ton contain a single attribute number. - */ - attnum = bms_singleton_member(list_attnums[listidx]); - - /* - * Technically we could find more than one clause for a given - * attnum. Since these clauses must be equality clauses, we choose - * to only take the selectivity estimate from the final clause in - * the list for this attnum. If the attnum happens to be compared - * to a different Const in another clause then no rows will match - * anyway. If it happens to be compared to the same Const, then - * ignoring the additional clause is just the thing to do. - */ - if (dependency_implies_attribute(dependency, attnum)) - { - clause = (Node *) lfirst(l); - - s2 = clause_selectivity(root, clause, varRelid, jointype, - sjinfo); - - /* mark this one as done, so we don't touch it again. */ - *estimatedclauses = bms_add_member(*estimatedclauses, listidx); - - /* - * Mark that we've got and used the dependency on this clause. - * We'll want to ignore this when looking for the next - * strongest dependency above. - */ - clauses_attnums = bms_del_member(clauses_attnums, attnum); - } - } - - /* - * Now factor in the selectivity for all the "implied" clauses into - * the final one, using this formula: - * - * P(a,b) = P(a) * (f + (1-f) * P(b)) - * - * where 'f' is the degree of validity of the dependency. - */ - s1 *= (dependency->degree + (1 - dependency->degree) * s2); + /* Ignore dependencies using this implied attribute in later loops */ + attnum = dependency->attributes[dependency->nattributes - 1]; + clauses_attnums = bms_del_member(clauses_attnums, attnum); } + /* + * If we found applicable dependencies, use them to estimate all + * compatible clauses on attributes that they refer to. + */ + if (ndependencies != 0) + s1 = clauselist_apply_dependencies(root, clauses, varRelid, jointype, + sjinfo, dependencies, ndependencies, + list_attnums, estimatedclauses); + /* free deserialized functional dependencies (and then the array) */ - for (i = 0; i < ndependencies; i++) - pfree(dependencies[i]); + for (i = 0; i < nfunc_dependencies; i++) + pfree(func_dependencies[i]); pfree(dependencies); + pfree(func_dependencies); + bms_free(clauses_attnums); pfree(list_attnums); return s1; diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index bd6d8be434..f2f4008283 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -440,6 +440,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE 8 | 200 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + estimated | actual +-----------+-------- + 4 | 100 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); estimated | actual -----------+-------- @@ -600,6 +606,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE 200 | 200 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); estimated | actual -----------+-------- @@ -719,12 +731,14 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE 1 | 0 (1 row) --- check change of column type doesn't break it +-- changing the type of column c causes its single-column stats to be dropped, +-- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple +-- clauses estimated with functional dependencies does not exceed this ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric; SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1'); estimated | actual -----------+-------- - 50 | 50 + 25 | 50 (1 row) ANALYZE functional_dependencies; diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 897ed3051c..f3b652dc4a 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -280,6 +280,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')'); +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)'); @@ -342,6 +344,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')'); +SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1'''); + SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1'); SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)'); @@ -385,7 +389,9 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])'); --- check change of column type doesn't break it +-- changing the type of column c causes its single-column stats to be dropped, +-- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple +-- clauses estimated with functional dependencies does not exceed this ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric; SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');