From 281f40187f866f0346ba19ccf033afccf0410d45 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 15 Dec 2006 18:42:26 +0000
Subject: [PATCH] Fix some planner bugs exposed by reports from Arjen van der
 Meijden.  These are all in new-in-8.2 logic associated with indexability of
 ScalarArrayOpExpr (IN-clauses) or amortization of indexscan costs across
 repeated indexscans on the inside of a nestloop.  In particular:

Fix some logic errors in the estimation for multiple scans induced by a
ScalarArrayOpExpr indexqual.

Include a small cost component in bitmap index scans to reflect the costs of
manipulating the bitmap itself; this is mainly to prevent a bitmap scan from
appearing to have the same cost as a plain indexscan for fetching a single
tuple.

Also add a per-index-scan-startup CPU cost component; while prior releases
were clearly too pessimistic about the cost of repeated indexscans, the
original 8.2 coding allowed the cost of an indexscan to effectively go to zero
if repeated often enough, which is overly optimistic.

Pay some attention to index correlation when estimating costs for a nestloop
inner indexscan: this is significant when the plan fetches multiple heap
tuples per iteration, since high correlation means those tuples are probably
on the same or adjacent heap pages.
---
 src/backend/optimizer/path/costsize.c | 56 ++++++++++++++++++++-------
 src/backend/utils/adt/selfuncs.c      | 56 ++++++++++++++++++++++-----
 2 files changed, 88 insertions(+), 24 deletions(-)

diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index ffe06f23ff..71685643ef 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -54,7 +54,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.169 2006/11/11 01:14:19 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.170 2006/12/15 18:42:26 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -276,13 +276,12 @@ cost_index(IndexPath *path, PlannerInfo *root,
 	if (outer_rel != NULL && outer_rel->rows > 1)
 	{
 		/*
-		 * For repeated indexscans, scale up the number of tuples fetched in
+		 * For repeated indexscans, the appropriate estimate for the
+		 * uncorrelated case is to scale up the number of tuples fetched in
 		 * the Mackert and Lohman formula by the number of scans, so that we
-		 * estimate the number of pages fetched by all the scans. Then
+		 * estimate the number of pages fetched by all the scans; then
 		 * pro-rate the costs for one scan.  In this case we assume all the
-		 * fetches are random accesses.  XXX it'd be good to include
-		 * correlation in this model, but it's not clear how to do that
-		 * without double-counting cache effects.
+		 * fetches are random accesses.
 		 */
 		double		num_scans = outer_rel->rows;
 
@@ -291,7 +290,27 @@ cost_index(IndexPath *path, PlannerInfo *root,
 											(double) index->pages,
 											root);
 
-		run_cost += (pages_fetched * random_page_cost) / num_scans;
+		max_IO_cost = (pages_fetched * random_page_cost) / num_scans;
+
+		/*
+		 * In the perfectly correlated case, the number of pages touched
+		 * by each scan is selectivity * table_size, and we can use the
+		 * Mackert and Lohman formula at the page level to estimate how
+		 * much work is saved by caching across scans.  We still assume
+		 * all the fetches are random, though, which is an overestimate
+		 * that's hard to correct for without double-counting the cache
+		 * effects.  (But in most cases where such a plan is actually
+		 * interesting, only one page would get fetched per scan anyway,
+		 * so it shouldn't matter much.)
+		 */
+		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
+
+		pages_fetched = index_pages_fetched(pages_fetched * num_scans,
+											baserel->pages,
+											(double) index->pages,
+											root);
+
+		min_IO_cost = (pages_fetched * random_page_cost) / num_scans;
 	}
 	else
 	{
@@ -312,16 +331,16 @@ cost_index(IndexPath *path, PlannerInfo *root,
 		min_IO_cost = random_page_cost;
 		if (pages_fetched > 1)
 			min_IO_cost += (pages_fetched - 1) * seq_page_cost;
-
-		/*
-		 * Now interpolate based on estimated index order correlation to get
-		 * total disk I/O cost for main table accesses.
-		 */
-		csquared = indexCorrelation * indexCorrelation;
-
-		run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
 	}
 
+	/*
+	 * Now interpolate based on estimated index order correlation to get
+	 * total disk I/O cost for main table accesses.
+	 */
+	csquared = indexCorrelation * indexCorrelation;
+
+	run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
+
 	/*
 	 * Estimate CPU costs per tuple.
 	 *
@@ -614,6 +633,13 @@ cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec)
 	{
 		*cost = ((IndexPath *) path)->indextotalcost;
 		*selec = ((IndexPath *) path)->indexselectivity;
+		/*
+		 * Charge a small amount per retrieved tuple to reflect the costs of
+		 * manipulating the bitmap.  This is mostly to make sure that a bitmap
+		 * scan doesn't look to be the same cost as an indexscan to retrieve
+		 * a single tuple.
+		 */
+		*cost += 0.1 * cpu_operator_cost * ((IndexPath *) path)->rows;
 	}
 	else if (IsA(path, BitmapAndPath))
 	{
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 4eed9619b7..230ce549f3 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.214 2006/10/04 00:29:59 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.215 2006/12/15 18:42:26 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -4673,14 +4673,18 @@ genericcostestimate(PlannerInfo *root,
 	 * way in order to get the right answer for partial indexes.
 	 */
 	if (numIndexTuples <= 0.0)
+	{
 		numIndexTuples = *indexSelectivity * index->rel->tuples;
 
-	/*
-	 * The estimate obtained so far counts all the tuples returned by all
-	 * scans of ScalarArrayOpExpr calls.  We want to consider the per-scan
-	 * number, so adjust.  This is a handy place to round to integer, too.
-	 */
-	numIndexTuples = rint(numIndexTuples / num_sa_scans);
+		/*
+		 * The above calculation counts all the tuples visited across all
+		 * scans induced by ScalarArrayOpExpr nodes.  We want to consider the
+		 * average per-indexscan number, so adjust.  This is a handy place to
+		 * round to integer, too.  (If caller supplied tuple estimate, it's
+		 * responsible for handling these considerations.)
+		 */
+		numIndexTuples = rint(numIndexTuples / num_sa_scans);
+	}
 
 	/*
 	 * We can bound the number of tuples by the index size in any case. Also,
@@ -4786,7 +4790,9 @@ genericcostestimate(PlannerInfo *root,
 	 * evaluated once at the start of the scan to reduce them to runtime keys
 	 * to pass to the index AM (see nodeIndexscan.c).  We model the per-tuple
 	 * CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per
-	 * indexqual operator.
+	 * indexqual operator.  Because we have numIndexTuples as a per-scan
+	 * number, we have to multiply by num_sa_scans to get the correct result
+	 * for ScalarArrayOpExpr cases.
 	 *
 	 * Note: this neglects the possible costs of rechecking lossy operators
 	 * and OR-clause expressions.  Detecting that that might be needed seems
@@ -4801,7 +4807,22 @@ genericcostestimate(PlannerInfo *root,
 		qual_arg_cost = 0;
 	*indexStartupCost = qual_arg_cost;
 	*indexTotalCost += qual_arg_cost;
-	*indexTotalCost += numIndexTuples * (cpu_index_tuple_cost + qual_op_cost);
+	*indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost);
+
+	/*
+	 * We also add a CPU-cost component to represent the general costs of 
+	 * starting an indexscan, such as analysis of btree index keys and
+	 * initial tree descent.  This is estimated at 100x cpu_operator_cost,
+	 * which is a bit arbitrary but seems the right order of magnitude.
+	 * (As noted above, we don't charge any I/O for touching upper tree
+	 * levels, but charging nothing at all has been found too optimistic.)
+	 *
+	 * Although this is startup cost with respect to any one scan, we add
+	 * it to the "total" cost component because it's only very interesting
+	 * in the many-ScalarArrayOpExpr-scan case, and there it will be paid
+	 * over the life of the scan node.
+	 */
+	*indexTotalCost += num_sa_scans * 100.0 * cpu_operator_cost;
 
 	/*
 	 * Generic assumption about index correlation: there isn't any.
@@ -4829,6 +4850,7 @@ btcostestimate(PG_FUNCTION_ARGS)
 	int			indexcol;
 	bool		eqQualHere;
 	bool		found_saop;
+	double		num_sa_scans;
 	ListCell   *l;
 
 	/*
@@ -4852,6 +4874,7 @@ btcostestimate(PG_FUNCTION_ARGS)
 	indexcol = 0;
 	eqQualHere = false;
 	found_saop = false;
+	num_sa_scans = 1;
 	foreach(l, indexQuals)
 	{
 		RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
@@ -4928,6 +4951,15 @@ btcostestimate(PG_FUNCTION_ARGS)
 		Assert(op_strategy != 0);		/* not a member of opclass?? */
 		if (op_strategy == BTEqualStrategyNumber)
 			eqQualHere = true;
+		/* count up number of SA scans induced by indexBoundQuals only */
+		if (IsA(clause, ScalarArrayOpExpr))
+		{
+			ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
+			int			alength = estimate_array_length(lsecond(saop->args));
+
+			if (alength > 1)
+				num_sa_scans *= alength;
+		}
 		indexBoundQuals = lappend(indexBoundQuals, rinfo);
 	}
 
@@ -4949,6 +4981,12 @@ btcostestimate(PG_FUNCTION_ARGS)
 												  index->rel->relid,
 												  JOIN_INNER);
 		numIndexTuples = btreeSelectivity * index->rel->tuples;
+		/*
+		 * As in genericcostestimate(), we have to adjust for any
+		 * ScalarArrayOpExpr quals included in indexBoundQuals, and then
+		 * round to integer.
+		 */
+		numIndexTuples = rint(numIndexTuples / num_sa_scans);
 	}
 
 	genericcostestimate(root, index, indexQuals, outer_rel, numIndexTuples,