Prevent functional dependency estimates from exceeding column estimates.

Formerly we applied a functional dependency "a => b with dependency degree f" using the formula P(a,b) = P(a) * [f + (1-f)*P(b)]. This leads to the possibility that the combined selectivity P(a,b) could exceed P(b), which is not ideal. The addition of support for IN and OR clauses (commits 8f321bd16c and ccaa3569f5) would seem to make this more likely, since the user-supplied values in such clauses are not necessarily compatible with the functional dependency. Mitigate this by using the formula P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b) instead, which guarantees that the combined selectivity is no greater than each column's individual selectivity. Logically, this modifies the part of the formula that accounts for dependent rows to handle cases where P(a) > P(b), whilst not changing the second term which accounts for independent rows. Additionally, this refactors the way that functional dependencies are applied, so now dependencies_clauselist_selectivity() estimates both the implying clauses and the implied clauses for each functional dependency (formerly only the implied clauses were estimated), and now all clauses for each attribute are taken into account (formerly only one clause for each implied attribute was estimated). This removes the previously built-in assumption that only equality clauses will be seen, which is no longer true, and opens up the possibility of applying functional dependencies to more general clauses. Patch by me, reviewed by Tomas Vondra. Discussion: https://postgr.es/m/CAEZATCXaNFZyOhR4XXAfkvj1tibRBEjje6ZbXwqWUB_tqbH%3Drw%40mail.gmail.com Discussion: https://postgr.es/m/20200318002946.6dvblukm3cfmgir2%40development
2020-03-28 12:48:34 +00:00 · 2020-03-28 12:48:34 +00:00 · 87779aa474
parent 145cb16d3b
commit 87779aa474
3 changed files with 266 additions and 109 deletions
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@ -30,6 +30,7 @@
 #include "utils/fmgroids.h"
 #include "utils/fmgrprotos.h"
 #include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
 #include "utils/syscache.h"
 #include "utils/typcache.h"

@ -73,13 +74,18 @@ static double dependency_degree(int numrows, HeapTuple *rows, int k,
 								AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs);
 static bool dependency_is_fully_matched(MVDependency *dependency,
 										Bitmapset *attnums);
-static bool dependency_implies_attribute(MVDependency *dependency,
-										 AttrNumber attnum);
 static bool dependency_is_compatible_clause(Node *clause, Index relid,
 											AttrNumber *attnum);
 static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
 											   int ndependencies,
 											   Bitmapset *attnums);
+static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
+												 int varRelid, JoinType jointype,
+												 SpecialJoinInfo *sjinfo,
+												 MVDependency **dependencies,
+												 int ndependencies,
+												 AttrNumber *list_attnums,
+												 Bitmapset **estimatedclauses);

 static void
 generate_dependencies_recurse(DependencyGenerator state, int index,
@ -613,19 +619,6 @@ dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums)
 	return true;
 }

-/*
- * dependency_implies_attribute
- *		check that the attnum matches is implied by the functional dependency
- */
-static bool
-dependency_implies_attribute(MVDependency *dependency, AttrNumber attnum)
-{
-	if (attnum == dependency->attributes[dependency->nattributes - 1])
-		return true;
-
-	return false;
-}
-
 /*
 * statext_dependencies_load
 *		Load the functional dependencies for the indicated pg_statistic_ext tuple
@ -985,6 +978,183 @@ find_strongest_dependency(MVDependencies **dependencies, int ndependencies,
 	return strongest;
 }

+/*
+ * clauselist_apply_dependencies
+ *		Apply the specified functional dependencies to a list of clauses and
+ *		return the estimated selectivity of the clauses that are compatible
+ *		with any of the given dependencies.
+ *
+ * This will estimate all not-already-estimated clauses that are compatible
+ * with functional dependencies, and which have an attribute mentioned by any
+ * of the given dependencies (either as an implying or implied attribute).
+ *
+ * Given (lists of) clauses on attributes (a,b) and a functional dependency
+ * (a=>b), the per-column selectivities P(a) and P(b) are notionally combined
+ * using the formula
+ *
+ *		P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
+ *
+ * where 'f' is the degree of dependency.  This reflects the fact that we
+ * expect a fraction f of all rows to be consistent with the dependency
+ * (a=>b), and so have a selectivity of P(a), while the remaining rows are
+ * treated as independent.
+ *
+ * In practice, we use a slightly modified version of this formula, which uses
+ * a selectivity of Min(P(a), P(b)) for the dependent rows, since the result
+ * should obviously not exceed either column's individual selectivity.  I.e.,
+ * we actually combine selectivities using the formula
+ *
+ *		P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
+ *
+ * This can make quite a difference if the specific values matching the
+ * clauses are not consistent with the functional dependency.
+ */
+static Selectivity
+clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
+							  int varRelid, JoinType jointype,
+							  SpecialJoinInfo *sjinfo,
+							  MVDependency **dependencies, int ndependencies,
+							  AttrNumber *list_attnums,
+							  Bitmapset **estimatedclauses)
+{
+	Bitmapset  *attnums;
+	int			i;
+	int			j;
+	int			nattrs;
+	Selectivity *attr_sel;
+	int			attidx;
+	int			listidx;
+	ListCell   *l;
+	Selectivity s1;
+
+	/*
+	 * Extract the attnums of all implying and implied attributes from all the
+	 * given dependencies.  Each of these attributes is expected to have at
+	 * least 1 not-already-estimated compatible clause that we will estimate
+	 * here.
+	 */
+	attnums = NULL;
+	for (i = 0; i < ndependencies; i++)
+	{
+		for (j = 0; j < dependencies[i]->nattributes; j++)
+		{
+			AttrNumber	attnum = dependencies[i]->attributes[j];
+
+			attnums = bms_add_member(attnums, attnum);
+		}
+	}
+
+	/*
+	 * Compute per-column selectivity estimates for each of these attributes,
+	 * and mark all the corresponding clauses as estimated.
+	 */
+	nattrs = bms_num_members(attnums);
+	attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs);
+
+	attidx = 0;
+	i = -1;
+	while ((i = bms_next_member(attnums, i)) >= 0)
+	{
+		List	   *attr_clauses = NIL;
+		Selectivity simple_sel;
+
+		listidx = -1;
+		foreach(l, clauses)
+		{
+			Node	   *clause = (Node *) lfirst(l);
+
+			listidx++;
+			if (list_attnums[listidx] == i)
+			{
+				attr_clauses = lappend(attr_clauses, clause);
+				*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
+			}
+		}
+
+		simple_sel = clauselist_selectivity_simple(root, attr_clauses, varRelid,
+												   jointype, sjinfo, NULL);
+		attr_sel[attidx++] = simple_sel;
+	}
+
+	/*
+	 * Now combine these selectivities using the dependency information.  For
+	 * chains of dependencies such as a -> b -> c, the b -> c dependency will
+	 * come before the a -> b dependency in the array, so we traverse the
+	 * array backwards to ensure such chains are computed in the right order.
+	 *
+	 * As explained above, pairs of selectivities are combined using the
+	 * formula
+	 *
+	 * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
+	 *
+	 * to ensure that the combined selectivity is never greater than either
+	 * individual selectivity.
+	 *
+	 * Where multiple dependencies apply (e.g., a -> b -> c), we use
+	 * conditional probabilities to compute the overall result as follows:
+	 *
+	 * P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a)
+	 *
+	 * so we replace the selectivities of all implied attributes with
+	 * conditional probabilities, that are conditional on all their implying
+	 * attributes.  The selectivities of all other non-implied attributes are
+	 * left as they are.
+	 */
+	for (i = ndependencies - 1; i >= 0; i--)
+	{
+		MVDependency *dependency = dependencies[i];
+		AttrNumber	attnum;
+		Selectivity s2;
+		double		f;
+
+		/* Selectivity of all the implying attributes */
+		s1 = 1.0;
+		for (j = 0; j < dependency->nattributes - 1; j++)
+		{
+			attnum = dependency->attributes[j];
+			attidx = bms_member_index(attnums, attnum);
+			s1 *= attr_sel[attidx];
+		}
+
+		/* Original selectivity of the implied attribute */
+		attnum = dependency->attributes[j];
+		attidx = bms_member_index(attnums, attnum);
+		s2 = attr_sel[attidx];
+
+		/*
+		 * Replace s2 with the conditional probability s2 given s1, computed
+		 * using the formula P(b|a) = P(a,b) / P(a), which simplifies to
+		 *
+		 * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b)
+		 *
+		 * where P(a) = s1, the selectivity of the implying attributes, and
+		 * P(b) = s2, the selectivity of the implied attribute.
+		 */
+		f = dependency->degree;
+
+		if (s1 <= s2)
+			attr_sel[attidx] = f + (1 - f) * s2;
+		else
+			attr_sel[attidx] = f * s2 / s1 + (1 - f) * s2;
+	}
+
+	/*
+	 * The overall selectivity of all the clauses on all these attributes is
+	 * then the product of all the original (non-implied) probabilities and
+	 * the new conditional (implied) probabilities.
+	 */
+	s1 = 1.0;
+	for (i = 0; i < nattrs; i++)
+		s1 *= attr_sel[i];
+
+	CLAMP_PROBABILITY(s1);
+
+	pfree(attr_sel);
+	bms_free(attnums);
+
+	return s1;
+}
+
 /*
 * dependencies_clauselist_selectivity
 *		Return the estimated selectivity of (a subset of) the given clauses
@ -999,15 +1169,16 @@ find_strongest_dependency(MVDependencies **dependencies, int ndependencies,
 * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
 * dependency, we then combine the per-clause selectivities using the formula
 *
- *	   P(a,b) = P(a) * [f + (1-f)*P(b)]
+ *	   P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
 *
- * where 'f' is the degree of the dependency.
+ * where 'f' is the degree of the dependency.  (Actually we use a slightly
+ * modified version of this formula -- see clauselist_apply_dependencies()).
 *
 * With clauses on more than two attributes, the dependencies are applied
 * recursively, starting with the widest/strongest dependencies. For example
 * P(a,b,c) is first split like this:
 *
- *	   P(a,b,c) = P(a,b) * [f + (1-f)*P(c)]
+ *	   P(a,b,c) = f * P(a,b) + (1-f) * P(a,b) * P(c)
 *
 * assuming (a,b=>c) is the strongest dependency.
 */
@ -1023,17 +1194,20 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 	Selectivity s1 = 1.0;
 	ListCell   *l;
 	Bitmapset  *clauses_attnums = NULL;
-	Bitmapset **list_attnums;
+	AttrNumber *list_attnums;
 	int			listidx;
-	MVDependencies    **dependencies = NULL;
-	int					ndependencies = 0;
+	MVDependencies **func_dependencies;
+	int			nfunc_dependencies;
+	int			total_ndeps;
+	MVDependency **dependencies;
+	int			ndependencies;
 	int			i;

 	/* check if there's any stats that might be useful for us. */
 	if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
 		return 1.0;

-	list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
+	list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
 										 list_length(clauses));

 	/*
@ -1056,11 +1230,11 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 		if (!bms_is_member(listidx, *estimatedclauses) &&
 			dependency_is_compatible_clause(clause, rel->relid, &attnum))
 		{
-			list_attnums[listidx] = bms_make_singleton(attnum);
+			list_attnums[listidx] = attnum;
 			clauses_attnums = bms_add_member(clauses_attnums, attnum);
 		}
 		else
-			list_attnums[listidx] = NULL;
+			list_attnums[listidx] = InvalidAttrNumber;

 		listidx++;
 	}
@ -1072,6 +1246,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 	 */
 	if (bms_num_members(clauses_attnums) < 2)
 	{
+		bms_free(clauses_attnums);
 		pfree(list_attnums);
 		return 1.0;
 	}
@ -1083,19 +1258,20 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 	 *
 	 * To not waste cycles and memory, we deserialize dependencies only for
 	 * statistics that match at least two attributes. The array is allocated
-	 * with the assumption that all objects match - we could grow the array
-	 * to make it just the right size, but it's likely wasteful anyway thanks
-	 * to moving the freed chunks to freelists etc.
+	 * with the assumption that all objects match - we could grow the array to
+	 * make it just the right size, but it's likely wasteful anyway thanks to
+	 * moving the freed chunks to freelists etc.
 	 */
-	ndependencies = 0;
-	dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) *
-											  list_length(rel->statlist));
+	func_dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) *
+												   list_length(rel->statlist));
+	nfunc_dependencies = 0;
+	total_ndeps = 0;

-	foreach(l,rel->statlist)
+	foreach(l, rel->statlist)
 	{
-		StatisticExtInfo   *stat = (StatisticExtInfo *) lfirst(l);
-		Bitmapset		   *matched;
-		int					num_matched;
+		StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
+		Bitmapset  *matched;
+		int			num_matched;

 		/* skip statistics that are not of the correct type */
 		if (stat->kind != STATS_EXT_DEPENDENCIES)
@ -1109,104 +1285,65 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 		if (num_matched < 2)
 			continue;

-		dependencies[ndependencies++]
+		func_dependencies[nfunc_dependencies]
 			= statext_dependencies_load(stat->statOid);
+
+		total_ndeps += func_dependencies[nfunc_dependencies]->ndeps;
+		nfunc_dependencies++;
 	}

 	/* if no matching stats could be found then we've nothing to do */
-	if (!ndependencies)
+	if (nfunc_dependencies == 0)
 	{
+		pfree(func_dependencies);
+		bms_free(clauses_attnums);
 		pfree(list_attnums);
 		return 1.0;
 	}

 	/*
-	 * Apply the dependencies recursively, starting with the widest/strongest
-	 * ones, and proceeding to the smaller/weaker ones. At the end of each
-	 * round we factor in the selectivity of clauses on the implied attribute,
-	 * and remove the clauses from the list.
+	 * Work out which dependencies we can apply, starting with the
+	 * widest/strongest ones, and proceeding to smaller/weaker ones.
 	 */
+	dependencies = (MVDependency **) palloc(sizeof(MVDependency *) *
+											total_ndeps);
+	ndependencies = 0;
+
 	while (true)
 	{
-		Selectivity s2 = 1.0;
 		MVDependency *dependency;
+		AttrNumber	attnum;

 		/* the widest/strongest dependency, fully matched by clauses */
-		dependency = find_strongest_dependency(dependencies, ndependencies,
+		dependency = find_strongest_dependency(func_dependencies,
+											   nfunc_dependencies,
 											   clauses_attnums);
-
-		/* if no suitable dependency was found, we're done */
 		if (!dependency)
 			break;

-		/*
-		 * We found an applicable dependency, so find all the clauses on the
-		 * implied attribute - with dependency (a,b => c) we look for clauses
-		 * on 'c'.
-		 */
-		listidx = -1;
-		foreach(l, clauses)
-		{
-			Node	   *clause;
-			AttrNumber	attnum;
+		dependencies[ndependencies++] = dependency;

-			listidx++;
-
-			/*
-			 * Skip incompatible clauses, and ones we've already estimated on.
-			 */
-			if (!list_attnums[listidx])
-				continue;
-
-			/*
-			 * We expect the bitmaps ton contain a single attribute number.
-			 */
-			attnum = bms_singleton_member(list_attnums[listidx]);
-
-			/*
-			 * Technically we could find more than one clause for a given
-			 * attnum. Since these clauses must be equality clauses, we choose
-			 * to only take the selectivity estimate from the final clause in
-			 * the list for this attnum. If the attnum happens to be compared
-			 * to a different Const in another clause then no rows will match
-			 * anyway. If it happens to be compared to the same Const, then
-			 * ignoring the additional clause is just the thing to do.
-			 */
-			if (dependency_implies_attribute(dependency, attnum))
-			{
-				clause = (Node *) lfirst(l);
-
-				s2 = clause_selectivity(root, clause, varRelid, jointype,
-										sjinfo);
-
-				/* mark this one as done, so we don't touch it again. */
-				*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
-
-				/*
-				 * Mark that we've got and used the dependency on this clause.
-				 * We'll want to ignore this when looking for the next
-				 * strongest dependency above.
-				 */
-				clauses_attnums = bms_del_member(clauses_attnums, attnum);
-			}
-		}
-
-		/*
-		 * Now factor in the selectivity for all the "implied" clauses into
-		 * the final one, using this formula:
-		 *
-		 * P(a,b) = P(a) * (f + (1-f) * P(b))
-		 *
-		 * where 'f' is the degree of validity of the dependency.
-		 */
-		s1 *= (dependency->degree + (1 - dependency->degree) * s2);
+		/* Ignore dependencies using this implied attribute in later loops */
+		attnum = dependency->attributes[dependency->nattributes - 1];
+		clauses_attnums = bms_del_member(clauses_attnums, attnum);
 	}

+	/*
+	 * If we found applicable dependencies, use them to estimate all
+	 * compatible clauses on attributes that they refer to.
+	 */
+	if (ndependencies != 0)
+		s1 = clauselist_apply_dependencies(root, clauses, varRelid, jointype,
+										   sjinfo, dependencies, ndependencies,
+										   list_attnums, estimatedclauses);
+
 	/* free deserialized functional dependencies (and then the array) */
-	for (i = 0; i < ndependencies; i++)
-		pfree(dependencies[i]);
+	for (i = 0; i < nfunc_dependencies; i++)
+		pfree(func_dependencies[i]);

 	pfree(dependencies);
+	pfree(func_dependencies);
+	bms_free(clauses_attnums);
 	pfree(list_attnums);

 	return s1;
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@ -440,6 +440,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
         8 |    200
 (1 row)

+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         4 |    100
+(1 row)
+
 SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
 estimated | actual 
 -----------+--------
@ -600,6 +606,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
       200 |    200
 (1 row)

+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
 SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
 estimated | actual 
 -----------+--------
@ -719,12 +731,14 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
         1 |      0
 (1 row)

-- check change of column type doesn't break it
+-- changing the type of column c causes its single-column stats to be dropped,
+-- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple
+-- clauses estimated with functional dependencies does not exceed this
 ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
 SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
 estimated | actual 
 -----------+--------
-        50 |     50
+        25 |     50
 (1 row)

 ANALYZE functional_dependencies;
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@ -280,6 +280,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE

 SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');

+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+
 SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');

 SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
@ -342,6 +344,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE

 SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');

+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+
 SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');

 SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
@ -385,7 +389,9 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE

 SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');

-- check change of column type doesn't break it
+-- changing the type of column c causes its single-column stats to be dropped,
+-- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple
+-- clauses estimated with functional dependencies does not exceed this
 ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;

 SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');