Rewrite choose_bitmap_and() to make it more robust in the presence of

competing alternatives for indexes to use in a bitmap scan. The former coding took estimated selectivity as an overriding factor, causing it to sometimes choose indexes that were much slower to scan than ones with a slightly worse selectivity. It was also too narrow-minded about which combinations of indexes to consider ANDing. The rewrite makes it pay more attention to index scan cost than selectivity; this seems sane since it's impossible to have very bad selectivity with low cost, whereas the reverse isn't true. Also, we now consider each index alone, as well as adding each index to an AND-group led by each prior index, for a total of about O(N^2) rather than O(N) combinations considered. This makes the results much less dependent on the exact order in which the indexes are considered. It's still a lot cheaper than an O(2^N) exhaustive search. A prefilter step eliminates all but the cheapest of those indexes using the same set of WHERE conditions, to keep the effective value of N down in scenarios where the DBA has created lots of partially-redundant indexes.
2007-04-17 20:03:03 +00:00 · 2007-04-17 20:03:03 +00:00 · 2e824a8ea9
parent 2b99411df2
commit 2e824a8ea9
1 changed files with 286 additions and 157 deletions
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@ -9,7 +9,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.220 2007/04/15 20:09:28 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.221 2007/04/17 20:03:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -45,6 +45,16 @@
 	((opfamily) == BOOL_BTREE_FAM_OID || (opfamily) == BOOL_HASH_FAM_OID)


+/* Per-path data used within choose_bitmap_and() */
+typedef struct
+{
+	Path	   *path;			/* IndexPath, BitmapAndPath, or BitmapOrPath */
+	List	   *quals;			/* the WHERE clauses it uses */
+	List	   *preds;			/* predicates of its partial index(es) */
+	Bitmapset  *clauseids;		/* quals+preds represented as a bitmapset */
+} PathClauseUsage;
+
+
 static List *find_usable_indexes(PlannerInfo *root, RelOptInfo *rel,
 					List *clauses, List *outer_clauses,
 					bool istoplevel, RelOptInfo *outer_rel,
@ -54,11 +64,15 @@ static List *find_saop_paths(PlannerInfo *root, RelOptInfo *rel,
 				bool istoplevel, RelOptInfo *outer_rel);
 static Path *choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel,
 				  List *paths, RelOptInfo *outer_rel);
-static int	bitmap_path_comparator(const void *a, const void *b);
+static int	path_usage_comparator(const void *a, const void *b);
+static Cost bitmap_scan_cost_est(PlannerInfo *root, RelOptInfo *rel,
+					 Path *ipath, RelOptInfo *outer_rel);
 static Cost bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel,
 					List *paths, RelOptInfo *outer_rel);
+static PathClauseUsage *classify_index_clause_usage(Path *path,
+													List **clauselist);
 static void find_indexpath_quals(Path *bitmapqual, List **quals, List **preds);
-static bool lists_intersect(List *list1, List *list2);
+static int	find_list_position(Node *node, List **nodelist);
 static bool match_clause_to_indexcol(IndexOptInfo *index,
 						 int indexcol, Oid opfamily,
 						 RestrictInfo *rinfo,
@ -562,12 +576,12 @@ choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel,
 				  List *paths, RelOptInfo *outer_rel)
 {
 	int			npaths = list_length(paths);
-	Path	  **patharray;
-	Cost		costsofar;
-	List	   *qualsofar;
-	List	   *firstpred;
-	ListCell   *lastcell;
-	int			i;
+	PathClauseUsage **pathinfoarray;
+	PathClauseUsage *pathinfo;
+	List	   *clauselist;
+	List	   *bestpaths = NIL;
+	Cost		bestcost = 0;
+	int			i, j;
 	ListCell   *l;

 	Assert(npaths > 0);			/* else caller error */
@ -578,156 +592,231 @@ choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel,
 	 * In theory we should consider every nonempty subset of the given paths.
 	 * In practice that seems like overkill, given the crude nature of the
 	 * estimates, not to mention the possible effects of higher-level AND and
-	 * OR clauses.	As a compromise, we sort the paths by selectivity.	We
-	 * always take the first, and sequentially add on paths that result in a
-	 * lower estimated cost.
+	 * OR clauses.  Moreover, it's completely impractical if there are a large
+	 * number of paths, since the work would grow as O(2^N).
 	 *
-	 * We also make some effort to detect directly redundant input paths, as
-	 * can happen if there are multiple possibly usable indexes.  (Another way
-	 * it can happen is that best_inner_indexscan will find the same OR join
-	 * clauses that create_or_index_quals has pulled OR restriction clauses
-	 * out of, and then both versions show up as duplicate paths.)	We
-	 * consider an index redundant if any of its index conditions were already
-	 * used by earlier indexes.  (We could use predicate_implied_by to have a
-	 * more intelligent, but much more expensive, check --- but in most cases
-	 * simple equality should suffice.)
+	 * As a heuristic, we first check for paths using exactly the same
+	 * sets of WHERE clauses + index predicate conditions, and reject all
+	 * but the cheapest-to-scan in any such group.  This primarily gets rid
+	 * of indexes that include the interesting columns but also irrelevant
+	 * columns.  (In situations where the DBA has gone overboard on creating
+	 * variant indexes, this can make for a very large reduction in the number
+	 * of paths considered further.)
 	 *
-	 * You might think the condition for redundancy should be "all index
-	 * conditions already used", not "any", but this turns out to be wrong.
-	 * For example, if we use an index on A, and then come to an index with
-	 * conditions on A and B, the only way that the second index can be later
-	 * in the selectivity-order sort is if the condition on B is completely
-	 * non-selective.  In any case, we'd surely be drastically misestimating
-	 * the selectivity if we count the same condition twice.
+	 * We then sort the surviving paths with the cheapest-to-scan first,
+	 * and for each path, consider using that path alone as the basis for
+	 * a bitmap scan.  Then we consider bitmap AND scans formed from that
+	 * path plus each subsequent (higher-cost) path, adding on a subsequent
+	 * path if it results in a reduction in the estimated total scan cost.
+	 * This means we consider about O(N^2) rather than O(2^N) path
+	 * combinations, which is quite tolerable, especially given than N is
+	 * usually reasonably small because of the prefiltering step.  The
+	 * cheapest of these is returned.
 	 *
-	 * We must also consider index predicate conditions in checking for
-	 * redundancy, because the estimated selectivity of a partial index
-	 * includes its predicate even if the explicit index conditions don't.
-	 * Here we have to work harder than just checking expression equality:
-	 * we check to see if any of the predicate clauses are implied by
-	 * index conditions or predicate clauses of previous paths.  This covers
-	 * cases such as a condition "x = 42" used with a plain index, followed
-	 * by a clauseless scan of a partial index "WHERE x >= 40 AND x < 50".
-	 * Also, we reject indexes that have a qual condition matching any
-	 * previously-used index's predicate (by including predicate conditions
-	 * into qualsofar).  It should be sufficient to check equality in this
-	 * case, not implication, since we've sorted the paths by selectivity
-	 * and so tighter conditions are seen first --- only for exactly equal
-	 * cases might the partial index come first.
+	 * We will only consider AND combinations in which no two indexes use
+	 * the same WHERE clause.  This is a bit of a kluge: it's needed because
+	 * costsize.c and clausesel.c aren't very smart about redundant clauses.
+	 * They will usually double-count the redundant clauses, producing a
+	 * too-small selectivity that makes a redundant AND step look like it
+	 * reduces the total cost.  Perhaps someday that code will be smarter and
+	 * we can remove this limitation.  (But note that this also defends
+	 * against flat-out duplicate input paths, which can happen because
+	 * best_inner_indexscan will find the same OR join clauses that
+	 * create_or_index_quals has pulled OR restriction clauses out of.)
 	 *
-	 * XXX the reason we need all these redundancy checks is that costsize.c
-	 * and clausesel.c aren't very smart about redundant clauses: they will
-	 * usually double-count the redundant clauses, producing a too-small
-	 * selectivity that makes a redundant AND look like it reduces the total
-	 * cost.  Perhaps someday that code will be smarter and we can remove
-	 * these heuristics.
-	 *
-	 * Note: outputting the selected sub-paths in selectivity order is a good
-	 * thing even if we weren't using that as part of the selection method,
-	 * because it makes the short-circuit case in MultiExecBitmapAnd() more
-	 * likely to apply.
+	 * For the same reason, we reject AND combinations in which an index
+	 * predicate clause duplicates another clause.  Here we find it necessary
+	 * to be even stricter: we'll reject a partial index if any of its
+	 * predicate clauses are implied by the set of WHERE clauses and predicate
+	 * clauses used so far.  This covers cases such as a condition "x = 42"
+	 * used with a plain index, followed by a clauseless scan of a partial
+	 * index "WHERE x >= 40 AND x < 50".  The partial index has been accepted
+	 * only because "x = 42" was present, and so allowing it would partially
+	 * double-count selectivity.  (We could use predicate_implied_by on
+	 * regular qual clauses too, to have a more intelligent, but much more
+	 * expensive, check for redundancy --- but in most cases simple equality
+	 * seems to suffice.)
 	 */

-	/* Convert list to array so we can apply qsort */
-	patharray = (Path **) palloc(npaths * sizeof(Path *));
-	i = 0;
+	/*
+	 * Extract clause usage info and detect any paths that use exactly
+	 * the same set of clauses; keep only the cheapest-to-scan of any such
+	 * groups.  The surviving paths are put into an array for qsort'ing.
+	 */
+	pathinfoarray = (PathClauseUsage **)
+		palloc(npaths * sizeof(PathClauseUsage *));
+	clauselist = NIL;
+	npaths = 0;
 	foreach(l, paths)
 	{
-		patharray[i++] = (Path *) lfirst(l);
-	}
-	qsort(patharray, npaths, sizeof(Path *), bitmap_path_comparator);
+		Path   *ipath = (Path *) lfirst(l);

-	paths = list_make1(patharray[0]);
-	costsofar = bitmap_and_cost_est(root, rel, paths, outer_rel);
-	find_indexpath_quals(patharray[0], &qualsofar, &firstpred);
-	qualsofar = list_concat(qualsofar, firstpred);
-	lastcell = list_head(paths);	/* for quick deletions */
-
-	for (i = 1; i < npaths; i++)
-	{
-		Path	   *newpath = patharray[i];
-		List	   *newqual;
-		List	   *newpred;
-		Cost		newcost;
-
-		find_indexpath_quals(newpath, &newqual, &newpred);
-		if (lists_intersect(newqual, qualsofar))
-			continue;			/* consider it redundant */
-		if (newpred)
+		pathinfo = classify_index_clause_usage(ipath, &clauselist);
+		for (i = 0; i < npaths; i++)
 		{
-			bool	redundant = false;
-
-			/* we check each predicate clause separately */
-			foreach(l, newpred)
-			{
-				Node	   *np = (Node *) lfirst(l);
-
-				if (predicate_implied_by(list_make1(np), qualsofar))
-				{
-					redundant = true;
-					break;		/* out of inner loop */
-				}
-			}
-			if (redundant)
-				continue;
+			if (bms_equal(pathinfo->clauseids, pathinfoarray[i]->clauseids))
+				break;
 		}
-		/* tentatively add newpath to paths, so we can estimate cost */
-		paths = lappend(paths, newpath);
-		newcost = bitmap_and_cost_est(root, rel, paths, outer_rel);
-		if (newcost < costsofar)
+		if (i < npaths)
 		{
-			/* keep newpath in paths, update subsidiary variables */
-			costsofar = newcost;
-			qualsofar = list_concat(list_concat(qualsofar, newqual), newpred);
-			lastcell = lnext(lastcell);
+			/* duplicate clauseids, keep the cheaper one */
+			Cost		ncost;
+			Cost		ocost;
+			Selectivity nselec;
+			Selectivity oselec;
+
+			cost_bitmap_tree_node(pathinfo->path, &ncost, &nselec);
+			cost_bitmap_tree_node(pathinfoarray[i]->path, &ocost, &oselec);
+			if (ncost < ocost)
+				pathinfoarray[i] = pathinfo;
 		}
 		else
 		{
-			/* reject newpath, remove it from paths list */
-			paths = list_delete_cell(paths, lnext(lastcell), lastcell);
+			/* not duplicate clauseids, add to array */
+			pathinfoarray[npaths++] = pathinfo;
 		}
-		Assert(lnext(lastcell) == NULL);
 	}

-	if (list_length(paths) == 1)
-		return (Path *) linitial(paths);		/* no need for AND */
-	return (Path *) create_bitmap_and_path(root, rel, paths);
+	/* If only one surviving path, we're done */
+	if (npaths == 1)
+		return pathinfoarray[0]->path;
+
+	/* Sort the surviving paths by index access cost */
+	qsort(pathinfoarray, npaths, sizeof(PathClauseUsage *),
+		  path_usage_comparator);
+
+	/*
+	 * For each surviving index, consider it as an "AND group leader", and
+	 * see whether adding on any of the later indexes results in an AND path
+	 * with cheaper total cost than before.  Then take the cheapest AND group.
+	 */
+	for (i = 0; i < npaths; i++)
+	{
+		Cost		costsofar;
+		List	   *qualsofar;
+		Bitmapset  *clauseidsofar;
+		ListCell   *lastcell;
+
+		pathinfo = pathinfoarray[i];
+		paths = list_make1(pathinfo->path);
+		costsofar = bitmap_scan_cost_est(root, rel, pathinfo->path, outer_rel);
+		qualsofar = list_concat(list_copy(pathinfo->quals),
+								list_copy(pathinfo->preds));
+		clauseidsofar = bms_copy(pathinfo->clauseids);
+		lastcell = list_head(paths);	/* for quick deletions */
+
+		for (j = i+1; j < npaths; j++)
+		{
+			Cost		newcost;
+
+			pathinfo = pathinfoarray[j];
+			/* Check for redundancy */
+			if (bms_overlap(pathinfo->clauseids, clauseidsofar))
+				continue;			/* consider it redundant */
+			if (pathinfo->preds)
+			{
+				bool	redundant = false;
+
+				/* we check each predicate clause separately */
+				foreach(l, pathinfo->preds)
+				{
+					Node	   *np = (Node *) lfirst(l);
+
+					if (predicate_implied_by(list_make1(np), qualsofar))
+					{
+						redundant = true;
+						break;		/* out of inner foreach loop */
+					}
+				}
+				if (redundant)
+					continue;
+			}
+			/* tentatively add new path to paths, so we can estimate cost */
+			paths = lappend(paths, pathinfo->path);
+			newcost = bitmap_and_cost_est(root, rel, paths, outer_rel);
+			if (newcost < costsofar)
+			{
+				/* keep new path in paths, update subsidiary variables */
+				costsofar = newcost;
+				qualsofar = list_concat(qualsofar,
+										list_copy(pathinfo->quals));
+				qualsofar = list_concat(qualsofar,
+										list_copy(pathinfo->preds));
+				clauseidsofar = bms_add_members(clauseidsofar,
+												pathinfo->clauseids);
+				lastcell = lnext(lastcell);
+			}
+			else
+			{
+				/* reject new path, remove it from paths list */
+				paths = list_delete_cell(paths, lnext(lastcell), lastcell);
+			}
+			Assert(lnext(lastcell) == NULL);
+		}
+
+		/* Keep the cheapest AND-group (or singleton) */
+		if (i == 0 || costsofar < bestcost)
+		{
+			bestpaths = paths;
+			bestcost = costsofar;
+		}
+
+		/* some easy cleanup (we don't try real hard though) */
+		list_free(qualsofar);
+	}
+
+	if (list_length(bestpaths) == 1)
+		return (Path *) linitial(bestpaths);		/* no need for AND */
+	return (Path *) create_bitmap_and_path(root, rel, bestpaths);
 }

-/* qsort comparator to sort in increasing selectivity order */
+/* qsort comparator to sort in increasing index access cost order */
 static int
-bitmap_path_comparator(const void *a, const void *b)
+path_usage_comparator(const void *a, const void *b)
 {
-	Path	   *pa = *(Path *const *) a;
-	Path	   *pb = *(Path *const *) b;
+	PathClauseUsage *pa = *(PathClauseUsage *const *) a;
+	PathClauseUsage *pb = *(PathClauseUsage *const *) b;
 	Cost		acost;
 	Cost		bcost;
 	Selectivity aselec;
 	Selectivity bselec;

-	cost_bitmap_tree_node(pa, &acost, &aselec);
-	cost_bitmap_tree_node(pb, &bcost, &bselec);
+	cost_bitmap_tree_node(pa->path, &acost, &aselec);
+	cost_bitmap_tree_node(pb->path, &bcost, &bselec);

 	/*
-	 * If selectivities are the same, sort by cost.  (Note: there used to be
-	 * logic here to do "fuzzy comparison", but that's a bad idea because it
-	 * fails to be transitive, which will confuse qsort terribly.)
+	 * If costs are the same, sort by selectivity.
 	 */
-	if (aselec < bselec)
-		return -1;
-	if (aselec > bselec)
-		return 1;
-
 	if (acost < bcost)
 		return -1;
 	if (acost > bcost)
 		return 1;

+	if (aselec < bselec)
+		return -1;
+	if (aselec > bselec)
+		return 1;
+
 	return 0;
 }

 /*
- * Estimate the cost of actually executing a BitmapAnd with the given
+ * Estimate the cost of actually executing a bitmap scan with a single
+ * index path (no BitmapAnd, at least not at this level).
+ */
+static Cost
+bitmap_scan_cost_est(PlannerInfo *root, RelOptInfo *rel,
+					 Path *ipath, RelOptInfo *outer_rel)
+{
+	Path		bpath;
+
+	cost_bitmap_heap_scan(&bpath, root, rel, ipath, outer_rel);
+
+	return bpath.total_cost;
+}
+
+/*
+ * Estimate the cost of actually executing a BitmapAnd scan with the given
 * inputs.
 */
 static Cost
@ -749,11 +838,65 @@ bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel,
 	return bpath.total_cost;
 }

+
+/*
+ * classify_index_clause_usage
+ *		Construct a PathClauseUsage struct describing the WHERE clauses and
+ *		index predicate clauses used by the given indexscan path.
+ *		We consider two clauses the same if they are equal().
+ *
+ * At some point we might want to migrate this info into the Path data
+ * structure proper, but for the moment it's only needed within
+ * choose_bitmap_and().
+ *
+ * *clauselist is used and expanded as needed to identify all the distinct
+ * clauses seen across successive calls.  Caller must initialize it to NIL
+ * before first call of a set.
+ */
+static PathClauseUsage *
+classify_index_clause_usage(Path *path, List **clauselist)
+{
+	PathClauseUsage *result;
+	Bitmapset  *clauseids;
+	ListCell   *lc;
+
+	result = (PathClauseUsage *) palloc(sizeof(PathClauseUsage));
+	result->path = path;
+
+	/* Recursively find the quals and preds used by the path */
+	result->quals = NIL;
+	result->preds = NIL;
+	find_indexpath_quals(path, &result->quals, &result->preds);
+
+	/* Build up a bitmapset representing the quals and preds */
+	clauseids = NULL;
+	foreach(lc, result->quals)
+	{
+		Node   *node = (Node *) lfirst(lc);
+
+		clauseids = bms_add_member(clauseids,
+								   find_list_position(node, clauselist));
+	}
+	foreach(lc, result->preds)
+	{
+		Node   *node = (Node *) lfirst(lc);
+
+		clauseids = bms_add_member(clauseids,
+								   find_list_position(node, clauselist));
+	}
+	result->clauseids = clauseids;
+
+	return result;
+}
+
+
 /*
 * find_indexpath_quals
 *
 * Given the Path structure for a plain or bitmap indexscan, extract lists
 * of all the indexquals and index predicate conditions used in the Path.
+ * These are appended to the initial contents of *quals and *preds (hence
+ * caller should initialize those to NIL).
 *
 * This is sort of a simplified version of make_restrictinfo_from_bitmapqual;
 * here, we are not trying to produce an accurate representation of the AND/OR
@ -766,45 +909,32 @@ bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel,
 static void
 find_indexpath_quals(Path *bitmapqual, List **quals, List **preds)
 {
-	ListCell   *l;
-
-	*quals = NIL;
-	*preds = NIL;
-
 	if (IsA(bitmapqual, BitmapAndPath))
 	{
 		BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
+		ListCell   *l;

 		foreach(l, apath->bitmapquals)
 		{
-			List	   *subquals;
-			List	   *subpreds;
-
-			find_indexpath_quals((Path *) lfirst(l), &subquals, &subpreds);
-			*quals = list_concat(*quals, subquals);
-			*preds = list_concat(*preds, subpreds);
+			find_indexpath_quals((Path *) lfirst(l), quals, preds);
 		}
 	}
 	else if (IsA(bitmapqual, BitmapOrPath))
 	{
 		BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
+		ListCell   *l;

 		foreach(l, opath->bitmapquals)
 		{
-			List	   *subquals;
-			List	   *subpreds;
-
-			find_indexpath_quals((Path *) lfirst(l), &subquals, &subpreds);
-			*quals = list_concat(*quals, subquals);
-			*preds = list_concat(*preds, subpreds);
+			find_indexpath_quals((Path *) lfirst(l), quals, preds);
 		}
 	}
 	else if (IsA(bitmapqual, IndexPath))
 	{
 		IndexPath  *ipath = (IndexPath *) bitmapqual;

-		*quals = get_actual_clauses(ipath->indexclauses);
-		*preds = list_copy(ipath->indexinfo->indpred);
+		*quals = list_concat(*quals, get_actual_clauses(ipath->indexclauses));
+		*preds = list_concat(*preds, list_copy(ipath->indexinfo->indpred));
 	}
 	else
 		elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
@ -812,31 +942,30 @@ find_indexpath_quals(Path *bitmapqual, List **quals, List **preds)


 /*
- * lists_intersect
- *		Detect whether two lists have a nonempty intersection,
- *		using equal() to compare members.
- *
- * This possibly should go into list.c, but it doesn't yet have any use
- * except in choose_bitmap_and.
+ * find_list_position
+ *		Return the given node's position (counting from 0) in the given
+ *		list of nodes.  If it's not equal() to any existing list member,
+ *		add it at the end, and return that position.
 */
-static bool
-lists_intersect(List *list1, List *list2)
+static int
+find_list_position(Node *node, List **nodelist)
 {
-	ListCell   *cell1;
+	int			i;
+	ListCell   *lc;

-	foreach(cell1, list1)
+	i = 0;
+	foreach(lc, *nodelist)
 	{
-		void	   *datum1 = lfirst(cell1);
-		ListCell   *cell2;
+		Node   *oldnode = (Node *) lfirst(lc);

-		foreach(cell2, list2)
-		{
-			if (equal(lfirst(cell2), datum1))
-				return true;
-		}
+		if (equal(node, oldnode))
+			return i;
+		i++;
 	}

-	return false;
+	*nodelist = lappend(*nodelist, node);
+
+	return i;
 }