From 5edc63bda68a77c4d38f0cbeae1c4271f9ef4100 Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Fri, 10 Nov 2017 16:50:50 -0500
Subject: [PATCH] Account for the effect of lossy pages when costing bitmap
 scans.

Dilip Kumar, reviewed by Alexander Kumenkov, Amul Sul, and me.
Some final adjustments by me.

Discussion: http://postgr.es/m/CAFiTN-sYtqUOXQ4SpuhTv0Z9gD0si3YxZGv_PQAAMX8qbOotcg@mail.gmail.com
---
 src/backend/nodes/tidbitmap.c         | 37 +++++++++++------
 src/backend/optimizer/path/costsize.c | 59 ++++++++++++++++++++++-----
 src/include/nodes/tidbitmap.h         |  1 +
 3 files changed, 75 insertions(+), 22 deletions(-)

diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c
index c47d5849ef..acfe6b263c 100644
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -265,7 +265,6 @@ TIDBitmap *
 tbm_create(long maxbytes, dsa_area *dsa)
 {
 	TIDBitmap  *tbm;
-	long		nbuckets;
 
 	/* Create the TIDBitmap struct and zero all its fields */
 	tbm = makeNode(TIDBitmap);
@@ -273,17 +272,7 @@ tbm_create(long maxbytes, dsa_area *dsa)
 	tbm->mcxt = CurrentMemoryContext;
 	tbm->status = TBM_EMPTY;
 
-	/*
-	 * Estimate number of hashtable entries we can have within maxbytes. This
-	 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
-	 * for our purpose.  Also count an extra Pointer per entry for the arrays
-	 * created during iteration readout.
-	 */
-	nbuckets = maxbytes /
-		(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
-	nbuckets = Min(nbuckets, INT_MAX - 1);	/* safety limit */
-	nbuckets = Max(nbuckets, 16);	/* sanity limit */
-	tbm->maxentries = (int) nbuckets;
+	tbm->maxentries = (int) tbm_calculate_entries(maxbytes);
 	tbm->lossify_start = 0;
 	tbm->dsa = dsa;
 	tbm->dsapagetable = InvalidDsaPointer;
@@ -1546,3 +1535,27 @@ pagetable_free(pagetable_hash *pagetable, void *pointer)
 		tbm->dsapagetableold = InvalidDsaPointer;
 	}
 }
+
+/*
+ * tbm_calculate_entries
+ *
+ * Estimate number of hashtable entries we can have within maxbytes.
+ */
+long
+tbm_calculate_entries(double maxbytes)
+{
+	long		nbuckets;
+
+	/*
+	 * Estimate number of hashtable entries we can have within maxbytes. This
+	 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
+	 * for our purpose.  Also count an extra Pointer per entry for the arrays
+	 * created during iteration readout.
+	 */
+	nbuckets = maxbytes /
+		(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
+	nbuckets = Min(nbuckets, INT_MAX - 1);	/* safety limit */
+	nbuckets = Max(nbuckets, 16);	/* sanity limit */
+
+	return nbuckets;
+}
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 98fb16e85a..2d2df60886 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -5171,6 +5171,8 @@ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
 	double		T;
 	double		pages_fetched;
 	double		tuples_fetched;
+	double		heap_pages;
+	long		maxentries;
 
 	/*
 	 * Fetch total cost of obtaining the bitmap, as well as its total
@@ -5185,6 +5187,24 @@ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
 
 	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
 
+	/*
+	 * For a single scan, the number of heap pages that need to be fetched is
+	 * the same as the Mackert and Lohman formula for the case T <= b (ie, no
+	 * re-reads needed).
+	 */
+	pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
+
+	/*
+	 * Calculate the number of pages fetched from the heap.  Then based on
+	 * current work_mem estimate get the estimated maxentries in the bitmap.
+	 * (Note that we always do this calculation based on the number of pages
+	 * that would be fetched in a single iteration, even if loop_count > 1.
+	 * That's correct, because only that number of entries will be stored in
+	 * the bitmap at one time.)
+	 */
+	heap_pages = Min(pages_fetched, baserel->pages);
+	maxentries = tbm_calculate_entries(work_mem * 1024L);
+
 	if (loop_count > 1)
 	{
 		/*
@@ -5199,22 +5219,41 @@ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
 											root);
 		pages_fetched /= loop_count;
 	}
-	else
-	{
-		/*
-		 * For a single scan, the number of heap pages that need to be fetched
-		 * is the same as the Mackert and Lohman formula for the case T <= b
-		 * (ie, no re-reads needed).
-		 */
-		pages_fetched =
-			(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
-	}
 
 	if (pages_fetched >= T)
 		pages_fetched = T;
 	else
 		pages_fetched = ceil(pages_fetched);
 
+	if (maxentries < heap_pages)
+	{
+		double		exact_pages;
+		double		lossy_pages;
+
+		/*
+		 * Crude approximation of the number of lossy pages.  Because of the
+		 * way tbm_lossify() is coded, the number of lossy pages increases
+		 * very sharply as soon as we run short of memory; this formula has
+		 * that property and seems to perform adequately in testing, but it's
+		 * possible we could do better somehow.
+		 */
+		lossy_pages = Max(0, heap_pages - maxentries / 2);
+		exact_pages = heap_pages - lossy_pages;
+
+		/*
+		 * If there are lossy pages then recompute the  number of tuples
+		 * processed by the bitmap heap node.  We assume here that the chance
+		 * of a given tuple coming from an exact page is the same as the
+		 * chance that a given page is exact.  This might not be true, but
+		 * it's not clear how we can do any better.
+		 */
+		if (lossy_pages > 0)
+			tuples_fetched =
+				clamp_row_est(indexSelectivity *
+							  (exact_pages / heap_pages) * baserel->tuples +
+							  (lossy_pages / heap_pages) * baserel->tuples);
+	}
+
 	if (cost)
 		*cost = indexTotalCost;
 	if (tuple)
diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h
index f9a1902da8..d3ad0a5566 100644
--- a/src/include/nodes/tidbitmap.h
+++ b/src/include/nodes/tidbitmap.h
@@ -70,5 +70,6 @@ extern void tbm_end_iterate(TBMIterator *iterator);
 extern void tbm_end_shared_iterate(TBMSharedIterator *iterator);
 extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa,
 						  dsa_pointer dp);
+extern long tbm_calculate_entries(double maxbytes);
 
 #endif							/* TIDBITMAP_H */