From 7c70996ebf0949b142a99c9445061c3c83ce62b3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 1 Nov 2017 17:38:12 -0400 Subject: [PATCH] Allow bitmap scans to operate as index-only scans when possible. If we don't have to return any columns from heap tuples, and there's no need to recheck qual conditions, and the heap page is all-visible, then we can skip fetching the heap page altogether. Skip prefetching pages too, when possible, on the assumption that the recheck flag will remain the same from one page to the next. While that assumption is hardly bulletproof, it seems like a good bet most of the time, and better than prefetching pages we don't need. This commit installs the executor infrastructure, but doesn't change any planner cost estimates, thus possibly causing bitmap scans to not be chosen in cases where this change renders them the best choice. I (tgl) am not entirely convinced that we need to account for this behavior in the planner, because I think typically the bitmap scan would get chosen anyway if it's the best bet. In any case the submitted patch took way too many shortcuts, resulting in too many clearly-bad choices, to be committable. Alexander Kuzmenkov, reviewed by Alexey Chernyshov, and whacked around rather heavily by me. Discussion: https://postgr.es/m/239a8955-c0fc-f506-026d-c837e86c827b@postgrespro.ru --- src/backend/executor/nodeBitmapHeapscan.c | 169 ++++++++++++++++------ src/backend/optimizer/plan/createplan.c | 9 ++ src/include/nodes/execnodes.h | 10 +- src/test/regress/expected/stats.out | 3 + src/test/regress/sql/stats.sql | 3 + 5 files changed, 152 insertions(+), 42 deletions(-) diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 6035b4dfd4..b885f2a3a6 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -39,6 +39,7 @@ #include "access/relscan.h" #include "access/transam.h" +#include "access/visibilitymap.h" #include "executor/execdebug.h" #include "executor/nodeBitmapHeapscan.h" #include "miscadmin.h" @@ -225,9 +226,31 @@ BitmapHeapNext(BitmapHeapScanState *node) } /* - * Fetch the current heap page and identify candidate tuples. + * We can skip fetching the heap page if we don't need any fields + * from the heap, and the bitmap entries don't need rechecking, + * and all tuples on the page are visible to our transaction. */ - bitgetpage(scan, tbmres); + node->skip_fetch = (node->can_skip_fetch && + !tbmres->recheck && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmres->blockno, + &node->vmbuffer)); + + if (node->skip_fetch) + { + /* + * The number of tuples on this page is put into + * scan->rs_ntuples; note we don't fill scan->rs_vistuples. + */ + scan->rs_ntuples = tbmres->ntuples; + } + else + { + /* + * Fetch the current heap page and identify candidate tuples. 
+ */ + bitgetpage(scan, tbmres); + } if (tbmres->ntuples >= 0) node->exact_pages++; @@ -289,45 +312,55 @@ BitmapHeapNext(BitmapHeapScanState *node) */ BitmapPrefetch(node, scan); - /* - * Okay to fetch the tuple - */ - targoffset = scan->rs_vistuples[scan->rs_cindex]; - dp = (Page) BufferGetPage(scan->rs_cbuf); - lp = PageGetItemId(dp, targoffset); - Assert(ItemIdIsNormal(lp)); - - scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); - scan->rs_ctup.t_len = ItemIdGetLength(lp); - scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id; - ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset); - - pgstat_count_heap_fetch(scan->rs_rd); - - /* - * Set up the result slot to point to this tuple. Note that the slot - * acquires a pin on the buffer. - */ - ExecStoreTuple(&scan->rs_ctup, - slot, - scan->rs_cbuf, - false); - - /* - * If we are using lossy info, we have to recheck the qual conditions - * at every tuple. - */ - if (tbmres->recheck) + if (node->skip_fetch) { - econtext->ecxt_scantuple = slot; - ResetExprContext(econtext); + /* + * If we don't have to fetch the tuple, just return nulls. + */ + ExecStoreAllNullTuple(slot); + } + else + { + /* + * Okay to fetch the tuple. + */ + targoffset = scan->rs_vistuples[scan->rs_cindex]; + dp = (Page) BufferGetPage(scan->rs_cbuf); + lp = PageGetItemId(dp, targoffset); + Assert(ItemIdIsNormal(lp)); - if (!ExecQual(node->bitmapqualorig, econtext)) + scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); + scan->rs_ctup.t_len = ItemIdGetLength(lp); + scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id; + ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset); + + pgstat_count_heap_fetch(scan->rs_rd); + + /* + * Set up the result slot to point to this tuple. Note that the + * slot acquires a pin on the buffer. + */ + ExecStoreTuple(&scan->rs_ctup, + slot, + scan->rs_cbuf, + false); + + /* + * If we are using lossy info, we have to recheck the qual + * conditions at every tuple. + */ + if (tbmres->recheck) { - /* Fails recheck, so drop it and loop back for another */ - InstrCountFiltered2(node, 1); - ExecClearTuple(slot); - continue; + econtext->ecxt_scantuple = slot; + ResetExprContext(econtext); + + if (!ExecQual(node->bitmapqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + ExecClearTuple(slot); + continue; + } } } @@ -582,6 +615,7 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan) while (node->prefetch_pages < node->prefetch_target) { TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); + bool skip_fetch; if (tbmpre == NULL) { @@ -591,7 +625,26 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan) break; } node->prefetch_pages++; - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + + /* + * If we expect not to have to actually read this heap page, + * skip this prefetch call, but continue to run the prefetch + * logic normally. (Would it be better not to increment + * prefetch_pages?) + * + * This depends on the assumption that the index AM will + * report the same recheck flag for this future heap page as + * it did for the current heap page; which is not a certainty + * but is true in many cases. + */ + skip_fetch = (node->can_skip_fetch && + (node->tbmres ? 
!node->tbmres->recheck : false) && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmpre->blockno, + &node->pvmbuffer)); + + if (!skip_fetch) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } @@ -608,6 +661,7 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan) { TBMIterateResult *tbmpre; bool do_prefetch = false; + bool skip_fetch; /* * Recheck under the mutex. If some other process has already @@ -633,7 +687,15 @@ BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan) break; } - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + /* As above, skip prefetch if we expect not to need page */ + skip_fetch = (node->can_skip_fetch && + (node->tbmres ? !node->tbmres->recheck : false) && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmpre->blockno, + &node->pvmbuffer)); + + if (!skip_fetch) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } } @@ -687,6 +749,7 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) /* rescan to release any page pin */ heap_rescan(node->ss.ss_currentScanDesc, NULL); + /* release bitmaps and buffers if any */ if (node->tbmiterator) tbm_end_iterate(node->tbmiterator); if (node->prefetch_iterator) @@ -697,6 +760,10 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->tbm) tbm_free(node->tbm); + if (node->vmbuffer != InvalidBuffer) + ReleaseBuffer(node->vmbuffer); + if (node->pvmbuffer != InvalidBuffer) + ReleaseBuffer(node->pvmbuffer); node->tbm = NULL; node->tbmiterator = NULL; node->tbmres = NULL; @@ -704,6 +771,8 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) node->initialized = false; node->shared_tbmiterator = NULL; node->shared_prefetch_iterator = NULL; + node->vmbuffer = InvalidBuffer; + node->pvmbuffer = InvalidBuffer; ExecScanReScan(&node->ss); @@ -748,7 +817,7 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) ExecEndNode(outerPlanState(node)); /* - * release bitmap if any + * release bitmaps and buffers if any */ if (node->tbmiterator) tbm_end_iterate(node->tbmiterator); @@ -760,6 +829,10 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) tbm_end_shared_iterate(node->shared_tbmiterator); if (node->shared_prefetch_iterator) tbm_end_shared_iterate(node->shared_prefetch_iterator); + if (node->vmbuffer != InvalidBuffer) + ReleaseBuffer(node->vmbuffer); + if (node->pvmbuffer != InvalidBuffer) + ReleaseBuffer(node->pvmbuffer); /* * close heap scan @@ -805,6 +878,9 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->tbm = NULL; scanstate->tbmiterator = NULL; scanstate->tbmres = NULL; + scanstate->skip_fetch = false; + scanstate->vmbuffer = InvalidBuffer; + scanstate->pvmbuffer = InvalidBuffer; scanstate->exact_pages = 0; scanstate->lossy_pages = 0; scanstate->prefetch_iterator = NULL; @@ -815,8 +891,19 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->pscan_len = 0; scanstate->initialized = false; scanstate->shared_tbmiterator = NULL; + scanstate->shared_prefetch_iterator = NULL; scanstate->pstate = NULL; + /* + * We can potentially skip fetching heap pages if we do not need any + * columns of the table, either for checking non-indexable quals or for + * returning data. This test is a bit simplistic, as it checks the + * stronger condition that there's no qual or return tlist at all. But in + * most cases it's probably not worth working harder than that. 
+ */ + scanstate->can_skip_fetch = (node->scan.plan.qual == NIL && + node->scan.plan.targetlist == NIL); + /* * Miscellaneous initialization * diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index c802d61c39..4b497486a0 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -807,6 +807,15 @@ use_physical_tlist(PlannerInfo *root, Path *path, int flags) if (IsA(path, CustomPath)) return false; + /* + * If a bitmap scan's tlist is empty, keep it as-is. This may allow the + * executor to skip heap page fetches, and in any case, the benefit of + * using a physical tlist instead would be minimal. + */ + if (IsA(path, BitmapHeapPath) && + path->pathtarget->exprs == NIL) + return false; + /* * Can't do it if any system columns or whole-row Vars are requested. * (This could possibly be fixed but would take some fragile assumptions diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 8698c8a50c..d209ec012c 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -507,7 +507,7 @@ typedef struct EState bool *es_epqTupleSet; /* true if EPQ tuple is provided */ bool *es_epqScanDone; /* true if EPQ tuple has been fetched */ - bool es_use_parallel_mode; /* can we use parallel workers? */ + bool es_use_parallel_mode; /* can we use parallel workers? */ /* The per-query shared memory area to use for parallel execution. */ struct dsa_area *es_query_dsa; @@ -1331,6 +1331,10 @@ typedef struct ParallelBitmapHeapState * tbm bitmap obtained from child index scan(s) * tbmiterator iterator for scanning current pages * tbmres current-page data + * can_skip_fetch can we potentially skip tuple fetches in this scan? + * skip_fetch are we skipping tuple fetches on this page? 
+ * vmbuffer buffer for visibility-map lookups + * pvmbuffer ditto, for prefetched pages * exact_pages total number of exact pages retrieved * lossy_pages total number of lossy pages retrieved * prefetch_iterator iterator for prefetching ahead of current page @@ -1351,6 +1355,10 @@ typedef struct BitmapHeapScanState TIDBitmap *tbm; TBMIterator *tbmiterator; TBMIterateResult *tbmres; + bool can_skip_fetch; + bool skip_fetch; + Buffer vmbuffer; + Buffer pvmbuffer; long exact_pages; long lossy_pages; TBMIterator *prefetch_iterator; diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index fc91f3ce36..991c287b11 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -136,12 +136,15 @@ SELECT count(*) FROM tenk2; (1 row) -- do an indexscan +-- make sure it is not a bitmap scan, which might skip fetching heap tuples +SET enable_bitmapscan TO off; SELECT count(*) FROM tenk2 WHERE unique1 = 1; count ------- 1 (1 row) +RESET enable_bitmapscan; -- We can't just call wait_for_stats() at this point, because we only -- transmit stats when the session goes idle, and we probably didn't -- transmit the last couple of counts yet thanks to the rate-limiting logic diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql index 6e882bf3ac..2be7dde834 100644 --- a/src/test/regress/sql/stats.sql +++ b/src/test/regress/sql/stats.sql @@ -138,7 +138,10 @@ ROLLBACK; -- do a seqscan SELECT count(*) FROM tenk2; -- do an indexscan +-- make sure it is not a bitmap scan, which might skip fetching heap tuples +SET enable_bitmapscan TO off; SELECT count(*) FROM tenk2 WHERE unique1 = 1; +RESET enable_bitmapscan; -- We can't just call wait_for_stats() at this point, because we only -- transmit stats when the session goes idle, and we probably didn't
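
As an illustration of the behavior this patch enables (a sketch only, not part of the committed patch or its regression tests): a count(*) query over an indexed column returns no heap columns and has no non-indexable quals, so can_skip_fetch will be true; if VACUUM has set the visibility-map bits and the bitmap stays exact (non-lossy), the bitmap heap scan can then answer the query from the index and the visibility map alone, without touching the heap pages. The table tenk2 and its unique1 index are the ones used by the stats regression test above; the planner settings below are tweaked only to make a bitmap scan plan more likely and are otherwise not required.

-- Illustrative sketch, assuming the regression database's tenk2 table
-- and its index on unique1.
VACUUM tenk2;                     -- set visibility-map bits for tenk2's pages
SET enable_indexscan TO off;      -- nudge the planner toward a bitmap scan
SET enable_seqscan TO off;
EXPLAIN (ANALYZE, BUFFERS)
  SELECT count(*) FROM tenk2 WHERE unique1 < 1000;
RESET enable_seqscan;
RESET enable_indexscan;

Comparing the EXPLAIN (ANALYZE, BUFFERS) buffer counts for such a query before and after this patch should show noticeably fewer heap block accesses when all the pages involved are all-visible, much as an index-only scan would report.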