tableam: bitmap table scan.

This moves bitmap heap scan support below an optional tableam
callback. It's optional because the whole concept of a bitmap heap
scan is fairly block-specific.

This basically moves the work previously done in bitgetpage() into the
new scan_bitmap_next_block() callback, and the direct poking into the
buffer done in BitmapHeapNext() into the new scan_bitmap_next_tuple()
callback.
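
Roughly, BitmapHeapNext()'s main loop now drives the two callbacks
like this (a simplified sketch, not the committed code; prefetching,
the skip-fetch optimization and qual rechecks are omitted):

    while ((tbmres = tbm_iterate(tbmiterator)) != NULL)
    {
        /* let the AM pin the block and identify candidate tuples */
        if (!table_scan_bitmap_next_block(scan, tbmres))
            continue;           /* nothing to be found on this block */

        /* return the block's visible tuples one at a time */
        while (table_scan_bitmap_next_tuple(scan, tbmres, slot))
        {
            /* recheck quals if the bitmap is lossy, then emit the slot */
        }
    }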

The abstraction is currently somewhat leaky because
nodeBitmapHeapscan.c's prefetching and visibilitymap-based logic
remains - it's likely that we'll later have to move more into the
AM. But it's not trivial to do so without introducing a significant
amount of code duplication between the AMs, so that's a project for
later.
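
As an example of that leakiness, the skip-fetch check that stays in
BitmapHeapNext() (the snippet below is taken from the new
nodeBitmapHeapscan.c code in this diff) still consults the heap's
visibilitymap directly, above the tableam boundary:

    skip_fetch = (node->can_skip_fetch &&
                  !tbmres->recheck &&
                  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
                                 tbmres->blockno,
                                 &node->vmbuffer));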

Note that now nodeBitmapHeapscan.c and the associated node types are a
bit misnamed. But it's not clear whether renaming wouldn't be a cure
worse than the disease. Either way, that'd be best done in a separate
commit.

Author: Andres Freund
Reviewed-By: Robert Haas (in an older version)
Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
Andres Freund 2019-03-31 17:51:49 -07:00
parent 73c954d248
commit bfbcad478f
7 changed files with 293 additions and 187 deletions

src/backend/access/heap/heapam_handler.c

@@ -1952,6 +1952,159 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
* ------------------------------------------------------------------------
*/
static bool
heapam_scan_bitmap_next_block(TableScanDesc scan,
TBMIterateResult *tbmres)
{
HeapScanDesc hscan = (HeapScanDesc) scan;
BlockNumber page = tbmres->blockno;
Buffer buffer;
Snapshot snapshot;
int ntup;
hscan->rs_cindex = 0;
hscan->rs_ntuples = 0;
/*
* Ignore any claimed entries past what we think is the end of the
* relation. It may have been extended after the start of our scan (we
* only hold an AccessShareLock, and it could be inserts from this
* backend).
*/
if (page >= hscan->rs_nblocks)
return false;
/*
* Acquire pin on the target heap page, trading in any pin we held before.
*/
hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf,
scan->rs_rd,
page);
hscan->rs_cblock = page;
buffer = hscan->rs_cbuf;
snapshot = scan->rs_snapshot;
ntup = 0;
/*
* Prune and repair fragmentation for the whole page, if possible.
*/
heap_page_prune_opt(scan->rs_rd, buffer);
/*
* We must hold share lock on the buffer content while examining tuple
* visibility. Afterwards, however, the tuples we have found to be
* visible are guaranteed good as long as we hold the buffer pin.
*/
LockBuffer(buffer, BUFFER_LOCK_SHARE);
/*
* We need two separate strategies for lossy and non-lossy cases.
*/
if (tbmres->ntuples >= 0)
{
/*
* Bitmap is non-lossy, so we just look through the offsets listed in
* tbmres; but we have to follow any HOT chain starting at each such
* offset.
*/
int curslot;
for (curslot = 0; curslot < tbmres->ntuples; curslot++)
{
OffsetNumber offnum = tbmres->offsets[curslot];
ItemPointerData tid;
HeapTupleData heapTuple;
ItemPointerSet(&tid, page, offnum);
if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
&heapTuple, NULL, true))
hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
}
}
else
{
/*
* Bitmap is lossy, so we must examine each item pointer on the page.
* But we can ignore HOT chains, since we'll check each tuple anyway.
*/
Page dp = (Page) BufferGetPage(buffer);
OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
OffsetNumber offnum;
for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
{
ItemId lp;
HeapTupleData loctup;
bool valid;
lp = PageGetItemId(dp, offnum);
if (!ItemIdIsNormal(lp))
continue;
loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
loctup.t_len = ItemIdGetLength(lp);
loctup.t_tableOid = scan->rs_rd->rd_id;
ItemPointerSet(&loctup.t_self, page, offnum);
valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
if (valid)
{
hscan->rs_vistuples[ntup++] = offnum;
PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
}
CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
buffer, snapshot);
}
}
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
Assert(ntup <= MaxHeapTuplesPerPage);
hscan->rs_ntuples = ntup;
return ntup > 0;
}
static bool
heapam_scan_bitmap_next_tuple(TableScanDesc scan,
TBMIterateResult *tbmres,
TupleTableSlot *slot)
{
HeapScanDesc hscan = (HeapScanDesc) scan;
OffsetNumber targoffset;
Page dp;
ItemId lp;
/*
* Out of range? If so, nothing more to look at on this page
*/
if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples)
return false;
targoffset = hscan->rs_vistuples[hscan->rs_cindex];
dp = (Page) BufferGetPage(hscan->rs_cbuf);
lp = PageGetItemId(dp, targoffset);
Assert(ItemIdIsNormal(lp));
hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
hscan->rs_ctup.t_len = ItemIdGetLength(lp);
hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset);
pgstat_count_heap_fetch(scan->rs_rd);
/*
* Set up the result slot to point to this tuple. Note that the slot
* acquires a pin on the buffer.
*/
ExecStoreBufferHeapTuple(&hscan->rs_ctup,
slot,
hscan->rs_cbuf);
hscan->rs_cindex++;
return true;
}
static bool
heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
{
@@ -2266,6 +2419,8 @@ static const TableAmRoutine heapam_methods = {
.relation_estimate_size = heapam_estimate_rel_size,
.scan_bitmap_next_block = heapam_scan_bitmap_next_block,
.scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
.scan_sample_next_block = heapam_scan_sample_next_block,
.scan_sample_next_tuple = heapam_scan_sample_next_tuple
};

src/backend/access/table/tableamapi.c

@@ -89,6 +89,9 @@ GetTableAmRoutine(Oid amhandler)
Assert(routine->index_validate_scan != NULL);
Assert(routine->relation_estimate_size != NULL);
/* optional, but one callback implies presence of the other */
Assert((routine->scan_bitmap_next_block == NULL) ==
(routine->scan_bitmap_next_tuple == NULL));
Assert(routine->scan_sample_next_block != NULL);
Assert(routine->scan_sample_next_tuple != NULL);

src/backend/executor/nodeBitmapHeapscan.c

@@ -37,7 +37,6 @@
#include <math.h>
#include "access/heapam.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/transam.h"
@@ -55,7 +54,6 @@
static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
static inline void BitmapDoneInitializingSharedState(
ParallelBitmapHeapState *pstate);
static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
@@ -78,12 +76,10 @@ BitmapHeapNext(BitmapHeapScanState *node)
{
ExprContext *econtext;
TableScanDesc scan;
HeapScanDesc hscan;
TIDBitmap *tbm;
TBMIterator *tbmiterator = NULL;
TBMSharedIterator *shared_tbmiterator = NULL;
TBMIterateResult *tbmres;
OffsetNumber targoffset;
TupleTableSlot *slot;
ParallelBitmapHeapState *pstate = node->pstate;
dsa_area *dsa = node->ss.ps.state->es_query_dsa;
@@ -94,7 +90,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
econtext = node->ss.ps.ps_ExprContext;
slot = node->ss.ss_ScanTupleSlot;
scan = node->ss.ss_currentScanDesc;
hscan = (HeapScanDesc) scan;
tbm = node->tbm;
if (pstate == NULL)
tbmiterator = node->tbmiterator;
@@ -194,8 +189,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
for (;;)
{
Page dp;
ItemId lp;
bool skip_fetch;
CHECK_FOR_INTERRUPTS();
@@ -216,43 +210,35 @@ BitmapHeapNext(BitmapHeapScanState *node)
BitmapAdjustPrefetchIterator(node, tbmres);
/*
* Ignore any claimed entries past what we think is the end of the
* relation. (This is probably not necessary given that we got at
* least AccessShareLock on the table before performing any of the
* indexscans, but let's be safe.)
*/
if (tbmres->blockno >= hscan->rs_nblocks)
{
node->tbmres = tbmres = NULL;
continue;
}
/*
* We can skip fetching the heap page if we don't need any fields
* from the heap, and the bitmap entries don't need rechecking,
* and all tuples on the page are visible to our transaction.
*
* XXX: It's a layering violation that we do these checks above
* tableam, they should probably be moved below it at some point.
*/
- node->skip_fetch = (node->can_skip_fetch &&
- !tbmres->recheck &&
- VM_ALL_VISIBLE(node->ss.ss_currentRelation,
- tbmres->blockno,
- &node->vmbuffer));
+ skip_fetch = (node->can_skip_fetch &&
+ !tbmres->recheck &&
+ VM_ALL_VISIBLE(node->ss.ss_currentRelation,
+ tbmres->blockno,
+ &node->vmbuffer));
- if (node->skip_fetch)
+ if (skip_fetch)
{
/* can't be lossy in the skip_fetch case */
Assert(tbmres->ntuples >= 0);
/*
* The number of tuples on this page is put into
- * scan->rs_ntuples; note we don't fill scan->rs_vistuples.
+ * node->return_empty_tuples.
*/
- hscan->rs_ntuples = tbmres->ntuples;
+ node->return_empty_tuples = tbmres->ntuples;
}
- else
+ else if (!table_scan_bitmap_next_block(scan, tbmres))
{
- /*
- * Fetch the current heap page and identify candidate tuples.
- */
- bitgetpage(hscan, tbmres);
+ /* AM doesn't think this block is valid, skip */
+ continue;
}
if (tbmres->ntuples >= 0)
@@ -260,20 +246,14 @@ BitmapHeapNext(BitmapHeapScanState *node)
else
node->lossy_pages++;
/*
* Set rs_cindex to first slot to examine
*/
hscan->rs_cindex = 0;
/* Adjust the prefetch target */
BitmapAdjustPrefetchTarget(node);
}
else
{
/*
- * Continuing in previously obtained page; advance rs_cindex
+ * Continuing in previously obtained page.
*/
- hscan->rs_cindex++;
#ifdef USE_PREFETCH
@@ -297,55 +277,42 @@ BitmapHeapNext(BitmapHeapScanState *node)
#endif /* USE_PREFETCH */
}
/*
* Out of range? If so, nothing more to look at on this page
*/
if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples)
{
node->tbmres = tbmres = NULL;
continue;
}
/*
* We issue prefetch requests *after* fetching the current page to try
* to avoid having prefetching interfere with the main I/O. Also, this
* should happen only when we have determined there is still something
* to do on the current page, else we may uselessly prefetch the same
* page we are just about to request for real.
*
* XXX: It's a layering violation that we do these checks above
* tableam, they should probably be moved below it at some point.
*/
BitmapPrefetch(node, scan);
- if (node->skip_fetch)
+ if (node->return_empty_tuples > 0)
{
/*
* If we don't have to fetch the tuple, just return nulls.
*/
ExecStoreAllNullTuple(slot);
if (--node->return_empty_tuples == 0)
{
/* no more tuples to return in the next round */
node->tbmres = tbmres = NULL;
}
}
else
{
/*
- * Okay to fetch the tuple.
+ * Attempt to fetch tuple from AM.
*/
- targoffset = hscan->rs_vistuples[hscan->rs_cindex];
- dp = (Page) BufferGetPage(hscan->rs_cbuf);
- lp = PageGetItemId(dp, targoffset);
- Assert(ItemIdIsNormal(lp));
- hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
- hscan->rs_ctup.t_len = ItemIdGetLength(lp);
- hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
- ItemPointerSet(&hscan->rs_ctup.t_self, tbmres->blockno, targoffset);
- pgstat_count_heap_fetch(scan->rs_rd);
- /*
- * Set up the result slot to point to this tuple. Note that the
- * slot acquires a pin on the buffer.
- */
- ExecStoreBufferHeapTuple(&hscan->rs_ctup,
- slot,
- hscan->rs_cbuf);
+ if (!table_scan_bitmap_next_tuple(scan, tbmres, slot))
+ {
+ /* nothing more to look at on this page */
+ node->tbmres = tbmres = NULL;
+ continue;
+ }
/*
* If we are using lossy info, we have to recheck the qual
@@ -374,110 +341,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
return ExecClearTuple(slot);
}
/*
* bitgetpage - subroutine for BitmapHeapNext()
*
* This routine reads and pins the specified page of the relation, then
* builds an array indicating which tuples on the page are both potentially
* interesting according to the bitmap, and visible according to the snapshot.
*/
static void
bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
{
BlockNumber page = tbmres->blockno;
Buffer buffer;
Snapshot snapshot;
int ntup;
/*
* Acquire pin on the target heap page, trading in any pin we held before.
*/
Assert(page < scan->rs_nblocks);
scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
scan->rs_base.rs_rd,
page);
buffer = scan->rs_cbuf;
snapshot = scan->rs_base.rs_snapshot;
ntup = 0;
/*
* Prune and repair fragmentation for the whole page, if possible.
*/
heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
/*
* We must hold share lock on the buffer content while examining tuple
* visibility. Afterwards, however, the tuples we have found to be
* visible are guaranteed good as long as we hold the buffer pin.
*/
LockBuffer(buffer, BUFFER_LOCK_SHARE);
/*
* We need two separate strategies for lossy and non-lossy cases.
*/
if (tbmres->ntuples >= 0)
{
/*
* Bitmap is non-lossy, so we just look through the offsets listed in
* tbmres; but we have to follow any HOT chain starting at each such
* offset.
*/
int curslot;
for (curslot = 0; curslot < tbmres->ntuples; curslot++)
{
OffsetNumber offnum = tbmres->offsets[curslot];
ItemPointerData tid;
HeapTupleData heapTuple;
ItemPointerSet(&tid, page, offnum);
if (heap_hot_search_buffer(&tid, scan->rs_base.rs_rd, buffer,
snapshot, &heapTuple, NULL, true))
scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
}
}
else
{
/*
* Bitmap is lossy, so we must examine each item pointer on the page.
* But we can ignore HOT chains, since we'll check each tuple anyway.
*/
Page dp = (Page) BufferGetPage(buffer);
OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
OffsetNumber offnum;
for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
{
ItemId lp;
HeapTupleData loctup;
bool valid;
lp = PageGetItemId(dp, offnum);
if (!ItemIdIsNormal(lp))
continue;
loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
loctup.t_len = ItemIdGetLength(lp);
loctup.t_tableOid = scan->rs_base.rs_rd->rd_id;
ItemPointerSet(&loctup.t_self, page, offnum);
valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
if (valid)
{
scan->rs_vistuples[ntup++] = offnum;
PredicateLockTuple(scan->rs_base.rs_rd, &loctup, snapshot);
}
CheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
&loctup, buffer, snapshot);
}
}
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
Assert(ntup <= MaxHeapTuplesPerPage);
scan->rs_ntuples = ntup;
}
/*
* BitmapDoneInitializingSharedState - Shared state is initialized
*
@@ -869,7 +732,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
scanstate->tbm = NULL;
scanstate->tbmiterator = NULL;
scanstate->tbmres = NULL;
- scanstate->skip_fetch = false;
+ scanstate->return_empty_tuples = 0;
scanstate->vmbuffer = InvalidBuffer;
scanstate->pvmbuffer = InvalidBuffer;
scanstate->exact_pages = 0;
@@ -951,10 +814,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
scanstate->ss.ss_currentRelation = currentRelation;
/*
* Even though we aren't going to do a conventional seqscan, it is useful
* to create a HeapScanDesc --- most of the fields in it are usable.
*/
scanstate->ss.ss_currentScanDesc = table_beginscan_bm(currentRelation,
estate->es_snapshot,
0,

src/backend/optimizer/util/plancat.c

@@ -272,7 +272,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
info->amsearchnulls = amroutine->amsearchnulls;
info->amcanparallel = amroutine->amcanparallel;
info->amhasgettuple = (amroutine->amgettuple != NULL);
- info->amhasgetbitmap = (amroutine->amgetbitmap != NULL);
+ info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
+ relation->rd_tableam->scan_bitmap_next_block != NULL;
info->amcostestimate = amroutine->amcostestimate;
Assert(info->amcostestimate != NULL);

src/include/access/tableam.h

@@ -31,6 +31,7 @@ struct BulkInsertStateData;
struct IndexInfo;
struct IndexBuildCallback;
struct SampleScanState;
struct TBMIterateResult;
struct VacuumParams;
struct ValidateIndexState;
@@ -527,8 +528,58 @@ typedef struct TableAmRoutine
*/
/*
- * Acquire the next block in a sample scan. Return false if the sample
- * scan is finished, true otherwise.
+ * Prepare to fetch / check / return tuples from `tbmres->blockno` as part
+ * of a bitmap table scan. `scan` was started via table_beginscan_bm().
+ * Return false if there are no tuples to be found on the page, true
+ * otherwise.
*
* This will typically read and pin the target block, and do the necessary
* work to allow scan_bitmap_next_tuple() to return tuples (e.g. it might
* make sense to perform tuple visibility checks at this time). For some
* AMs it will make more sense to do all the work referencing `tbmres`
* contents here, for others it might be better to defer more work to
* scan_bitmap_next_tuple.
*
* If `tbmres->ntuples` is -1, this is a lossy scan and all visible tuples
* on the page have to be returned, otherwise the tuples at offsets in
* `tbmres->offsets` need to be returned.
*
* XXX: Currently this may only be implemented if the AM uses md.c as its
* storage manager, and uses ItemPointer->ip_blkid in a manner that maps
* blockids directly to the underlying storage. nodeBitmapHeapscan.c
* performs prefetching directly using that interface. This probably
* needs to be rectified at a later point.
*
* XXX: Currently this may only be implemented if the AM uses the
* visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to
* perform prefetching. This probably needs to be rectified at a later
* point.
*
* Optional callback, but either both scan_bitmap_next_block and
* scan_bitmap_next_tuple need to exist, or neither.
*/
bool (*scan_bitmap_next_block) (TableScanDesc scan,
struct TBMIterateResult *tbmres);
/*
* Fetch the next tuple of a bitmap table scan into `slot` and return true
* if a visible tuple was found, false otherwise.
*
* For some AMs it will make more sense to do all the work referencing
* `tbmres` contents in scan_bitmap_next_block, for others it might be
* better to defer more work to this callback.
*
* Optional callback, but either both scan_bitmap_next_block and
* scan_bitmap_next_tuple need to exist, or neither.
*/
bool (*scan_bitmap_next_tuple) (TableScanDesc scan,
struct TBMIterateResult *tbmres,
TupleTableSlot *slot);
/*
* Prepare to fetch tuples from the next block in a sample scan. Return
* false if the sample scan is finished, true otherwise. `scan` was
* started via table_beginscan_sampling().
*
* Typically this will first determine the target block by calling the
* TsmRoutine's NextSampleBlock() callback if not NULL, or alternatively
@@ -1396,8 +1447,44 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths,
*/
/*
- * Acquire the next block in a sample scan. Returns false if the sample scan
- * is finished, true otherwise.
+ * Prepare to fetch / check / return tuples from `tbmres->blockno` as part of
+ * a bitmap table scan. `scan` needs to have been started via
+ * table_beginscan_bm(). Returns false if there are no tuples to be found on
+ * the page, true otherwise.
*
* Note that this is an optionally implemented callback, so it should only be
* used after verifying its presence (at plan time or such).
*/
static inline bool
table_scan_bitmap_next_block(TableScanDesc scan,
struct TBMIterateResult *tbmres)
{
return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan,
tbmres);
}
/*
* Fetch the next tuple of a bitmap table scan into `slot` and return true if
* a visible tuple was found, false otherwise.
* table_scan_bitmap_next_block() needs to previously have selected a
* block (i.e. returned true), and no previous
* table_scan_bitmap_next_tuple() for the same block may have
* returned false.
*/
static inline bool
table_scan_bitmap_next_tuple(TableScanDesc scan,
struct TBMIterateResult *tbmres,
TupleTableSlot *slot)
{
return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan,
tbmres,
slot);
}
/*
* Prepare to fetch tuples from the next block in a sample scan. Returns false
* if the sample scan is finished, true otherwise. `scan` needs to have been
* started via table_beginscan_sampling().
*
* This will call the TsmRoutine's NextSampleBlock() callback if necessary
* (i.e. NextSampleBlock is not NULL), or perform a sequential scan over the
@@ -1413,7 +1500,8 @@ table_scan_sample_next_block(TableScanDesc scan,
/*
* Fetch the next sample tuple into `slot` and return true if a visible tuple
* was found, false otherwise. table_scan_sample_next_block() needs to
* previously have selected a block (i.e. returned true).
* previously have selected a block (i.e. returned true), and no previous
* table_scan_sample_next_tuple() for the same block may have returned false.
*
* This will call the TsmRoutine's NextSampleTuple() callback.
*/

src/include/nodes/execnodes.h

@@ -1507,7 +1507,7 @@ typedef struct ParallelBitmapHeapState
* tbmiterator iterator for scanning current pages
* tbmres current-page data
* can_skip_fetch can we potentially skip tuple fetches in this scan?
- * skip_fetch are we skipping tuple fetches on this page?
+ * return_empty_tuples number of empty tuples to return
* vmbuffer buffer for visibility-map lookups
* pvmbuffer ditto, for prefetched pages
* exact_pages total number of exact pages retrieved
@@ -1531,7 +1531,7 @@ typedef struct BitmapHeapScanState
TBMIterator *tbmiterator;
TBMIterateResult *tbmres;
bool can_skip_fetch;
- bool skip_fetch;
+ int return_empty_tuples;
Buffer vmbuffer;
Buffer pvmbuffer;
long exact_pages;

src/include/nodes/tidbitmap.h

@@ -37,7 +37,7 @@ typedef struct TBMIterator TBMIterator;
typedef struct TBMSharedIterator TBMSharedIterator;
/* Result structure for tbm_iterate */
- typedef struct
+ typedef struct TBMIterateResult
{
BlockNumber blockno; /* page number containing tuples */
int ntuples; /* -1 indicates lossy result */