603 lines
15 KiB
C
603 lines
15 KiB
C
/*-------------------------------------------------------------------------
 *
 * nodeSamplescan.c
 *	  Support routines for sample scans of relations (table sampling).
 *
 * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeSamplescan.c
 *
 *-------------------------------------------------------------------------
 */
|
|
#include "postgres.h"
|
|
|
|
#include "access/hash.h"
|
|
#include "access/relscan.h"
|
|
#include "access/tsmapi.h"
|
|
#include "executor/executor.h"
|
|
#include "executor/nodeSamplescan.h"
|
|
#include "miscadmin.h"
|
|
#include "pgstat.h"
|
|
#include "storage/predicate.h"
|
|
#include "utils/rel.h"
|
|
#include "utils/tqual.h"
|
|
|
|
static void InitScanRelation(SampleScanState *node, EState *estate, int eflags);
|
|
static TupleTableSlot *SampleNext(SampleScanState *node);
|
|
static void tablesample_init(SampleScanState *scanstate);
|
|
static HeapTuple tablesample_getnext(SampleScanState *scanstate);
|
|
static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset,
|
|
HeapScanDesc scan);
|
|
|
|
/* ----------------------------------------------------------------
 *						Scan Support
 * ----------------------------------------------------------------
 */
|
|
|
|
/* ----------------------------------------------------------------
|
|
* SampleNext
|
|
*
|
|
* This is a workhorse for ExecSampleScan
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
static TupleTableSlot *
|
|
SampleNext(SampleScanState *node)
|
|
{
|
|
HeapTuple tuple;
|
|
TupleTableSlot *slot;
|
|
|
|
/*
|
|
* if this is first call within a scan, initialize
|
|
*/
|
|
if (!node->begun)
|
|
tablesample_init(node);
|
|
|
|
/*
|
|
* get the next tuple, and store it in our result slot
|
|
*/
|
|
tuple = tablesample_getnext(node);
|
|
|
|
slot = node->ss.ss_ScanTupleSlot;
|
|
|
|
if (tuple)
|
|
ExecStoreTuple(tuple, /* tuple to store */
|
|
slot, /* slot to store in */
|
|
node->ss.ss_currentScanDesc->rs_cbuf, /* tuple's buffer */
|
|
false); /* don't pfree this pointer */
|
|
else
|
|
ExecClearTuple(slot);
|
|
|
|
return slot;
|
|
}
|
|
|
|
/*
|
|
* SampleRecheck -- access method routine to recheck a tuple in EvalPlanQual
|
|
*/
|
|
static bool
|
|
SampleRecheck(SampleScanState *node, TupleTableSlot *slot)
|
|
{
|
|
/*
|
|
* No need to recheck for SampleScan, since like SeqScan we don't pass any
|
|
* checkable keys to heap_beginscan.
|
|
*/
|
|
return true;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecSampleScan(node)
|
|
*
|
|
* Scans the relation using the sampling method and returns
|
|
* the next qualifying tuple.
|
|
* We call the ExecScan() routine and pass it the appropriate
|
|
* access method functions.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
TupleTableSlot *
|
|
ExecSampleScan(SampleScanState *node)
|
|
{
|
|
return ExecScan((ScanState *) node,
|
|
(ExecScanAccessMtd) SampleNext,
|
|
(ExecScanRecheckMtd) SampleRecheck);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* InitScanRelation
|
|
*
|
|
* Set up to access the scan relation.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
static void
|
|
InitScanRelation(SampleScanState *node, EState *estate, int eflags)
|
|
{
|
|
Relation currentRelation;
|
|
|
|
/*
|
|
* get the relation object id from the relid'th entry in the range table,
|
|
* open that relation and acquire appropriate lock on it.
|
|
*/
|
|
currentRelation = ExecOpenScanRelation(estate,
|
|
((SampleScan *) node->ss.ps.plan)->scan.scanrelid,
|
|
eflags);
|
|
|
|
node->ss.ss_currentRelation = currentRelation;
|
|
|
|
/* we won't set up the HeapScanDesc till later */
|
|
node->ss.ss_currentScanDesc = NULL;
|
|
|
|
/* and report the scan tuple slot's rowtype */
|
|
ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation));
|
|
}
|
|
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecInitSampleScan
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
SampleScanState *
|
|
ExecInitSampleScan(SampleScan *node, EState *estate, int eflags)
|
|
{
|
|
SampleScanState *scanstate;
|
|
TableSampleClause *tsc = node->tablesample;
|
|
TsmRoutine *tsm;
|
|
|
|
Assert(outerPlan(node) == NULL);
|
|
Assert(innerPlan(node) == NULL);
|
|
|
|
/*
|
|
* create state structure
|
|
*/
|
|
scanstate = makeNode(SampleScanState);
|
|
scanstate->ss.ps.plan = (Plan *) node;
|
|
scanstate->ss.ps.state = estate;
|
|
|
|
/*
|
|
* Miscellaneous initialization
|
|
*
|
|
* create expression context for node
|
|
*/
|
|
ExecAssignExprContext(estate, &scanstate->ss.ps);
|
|
|
|
/*
|
|
* initialize child expressions
|
|
*/
|
|
scanstate->ss.ps.targetlist = (List *)
|
|
ExecInitExpr((Expr *) node->scan.plan.targetlist,
|
|
(PlanState *) scanstate);
|
|
scanstate->ss.ps.qual = (List *)
|
|
ExecInitExpr((Expr *) node->scan.plan.qual,
|
|
(PlanState *) scanstate);
|
|
|
|
scanstate->args = (List *)
|
|
ExecInitExpr((Expr *) tsc->args,
|
|
(PlanState *) scanstate);
|
|
scanstate->repeatable =
|
|
ExecInitExpr(tsc->repeatable,
|
|
(PlanState *) scanstate);
|
|
|
|
/*
|
|
* tuple table initialization
|
|
*/
|
|
ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
|
|
ExecInitScanTupleSlot(estate, &scanstate->ss);
|
|
|
|
/*
|
|
* initialize scan relation
|
|
*/
|
|
InitScanRelation(scanstate, estate, eflags);
|
|
|
|
scanstate->ss.ps.ps_TupFromTlist = false;
|
|
|
|
/*
|
|
* Initialize result tuple type and projection info.
|
|
*/
|
|
ExecAssignResultTypeFromTL(&scanstate->ss.ps);
|
|
ExecAssignScanProjectionInfo(&scanstate->ss);
|
|
|
|
/*
|
|
* If we don't have a REPEATABLE clause, select a random seed. We want to
|
|
* do this just once, since the seed shouldn't change over rescans.
|
|
*/
|
|
if (tsc->repeatable == NULL)
|
|
scanstate->seed = random();
|
|
|
|
/*
|
|
* Finally, initialize the TABLESAMPLE method handler.
|
|
*/
|
|
tsm = GetTsmRoutine(tsc->tsmhandler);
|
|
scanstate->tsmroutine = tsm;
|
|
scanstate->tsm_state = NULL;
|
|
|
|
if (tsm->InitSampleScan)
|
|
tsm->InitSampleScan(scanstate, eflags);
|
|
|
|
/* We'll do BeginSampleScan later; we can't evaluate params yet */
|
|
scanstate->begun = false;
|
|
|
|
return scanstate;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecEndSampleScan
|
|
*
|
|
* frees any storage allocated through C routines.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void
|
|
ExecEndSampleScan(SampleScanState *node)
|
|
{
|
|
/*
|
|
* Tell sampling function that we finished the scan.
|
|
*/
|
|
if (node->tsmroutine->EndSampleScan)
|
|
node->tsmroutine->EndSampleScan(node);
|
|
|
|
/*
|
|
* Free the exprcontext
|
|
*/
|
|
ExecFreeExprContext(&node->ss.ps);
|
|
|
|
/*
|
|
* clean out the tuple table
|
|
*/
|
|
ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
|
|
ExecClearTuple(node->ss.ss_ScanTupleSlot);
|
|
|
|
/*
|
|
* close heap scan
|
|
*/
|
|
if (node->ss.ss_currentScanDesc)
|
|
heap_endscan(node->ss.ss_currentScanDesc);
|
|
|
|
/*
|
|
* close the heap relation.
|
|
*/
|
|
ExecCloseScanRelation(node->ss.ss_currentRelation);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecReScanSampleScan
|
|
*
|
|
* Rescans the relation.
|
|
*
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void
|
|
ExecReScanSampleScan(SampleScanState *node)
|
|
{
|
|
/* Remember we need to do BeginSampleScan again (if we did it at all) */
|
|
node->begun = false;
|
|
|
|
ExecScanReScan(&node->ss);
|
|
}
|
|
|
|
|
|
/*
|
|
* Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan.
|
|
*/
|
|
static void
|
|
tablesample_init(SampleScanState *scanstate)
|
|
{
|
|
TsmRoutine *tsm = scanstate->tsmroutine;
|
|
ExprContext *econtext = scanstate->ss.ps.ps_ExprContext;
|
|
Datum *params;
|
|
Datum datum;
|
|
bool isnull;
|
|
uint32 seed;
|
|
bool allow_sync;
|
|
int i;
|
|
ListCell *arg;
|
|
|
|
params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum));
|
|
|
|
i = 0;
|
|
foreach(arg, scanstate->args)
|
|
{
|
|
ExprState *argstate = (ExprState *) lfirst(arg);
|
|
|
|
params[i] = ExecEvalExprSwitchContext(argstate,
|
|
econtext,
|
|
&isnull,
|
|
NULL);
|
|
if (isnull)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
|
|
errmsg("TABLESAMPLE parameter cannot be null")));
|
|
i++;
|
|
}
|
|
|
|
if (scanstate->repeatable)
|
|
{
|
|
datum = ExecEvalExprSwitchContext(scanstate->repeatable,
|
|
econtext,
|
|
&isnull,
|
|
NULL);
|
|
if (isnull)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT),
|
|
errmsg("TABLESAMPLE REPEATABLE parameter cannot be null")));
|
|
|
|
/*
|
|
* The REPEATABLE parameter has been coerced to float8 by the parser.
|
|
* The reason for using float8 at the SQL level is that it will
|
|
* produce unsurprising results both for users used to databases that
|
|
* accept only integers in the REPEATABLE clause and for those who
|
|
* might expect that REPEATABLE works like setseed() (a float in the
|
|
* range from -1 to 1).
|
|
*
|
|
* We use hashfloat8() to convert the supplied value into a suitable
|
|
* seed. For regression-testing purposes, that has the convenient
|
|
* property that REPEATABLE(0) gives a machine-independent result.
|
|
*/
|
|
seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum));
|
|
}
|
|
else
|
|
{
|
|
/* Use the seed selected by ExecInitSampleScan */
|
|
seed = scanstate->seed;
|
|
}
|
|
|
|
/* Set default values for params that BeginSampleScan can adjust */
|
|
scanstate->use_bulkread = true;
|
|
scanstate->use_pagemode = true;
|
|
|
|
/* Let tablesample method do its thing */
|
|
tsm->BeginSampleScan(scanstate,
|
|
params,
|
|
list_length(scanstate->args),
|
|
seed);
|
|
|
|
/* We'll use syncscan if there's no NextSampleBlock function */
|
|
allow_sync = (tsm->NextSampleBlock == NULL);
|
|
|
|
/* Now we can create or reset the HeapScanDesc */
|
|
if (scanstate->ss.ss_currentScanDesc == NULL)
|
|
{
|
|
scanstate->ss.ss_currentScanDesc =
|
|
heap_beginscan_sampling(scanstate->ss.ss_currentRelation,
|
|
scanstate->ss.ps.state->es_snapshot,
|
|
0, NULL,
|
|
scanstate->use_bulkread,
|
|
allow_sync,
|
|
scanstate->use_pagemode);
|
|
}
|
|
else
|
|
{
|
|
heap_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL,
|
|
scanstate->use_bulkread,
|
|
allow_sync,
|
|
scanstate->use_pagemode);
|
|
}
|
|
|
|
pfree(params);
|
|
|
|
/* And we're initialized. */
|
|
scanstate->begun = true;
|
|
}
|
|
|
|
/*
|
|
* Get next tuple from TABLESAMPLE method.
|
|
*
|
|
* Note: an awful lot of this is copied-and-pasted from heapam.c. It would
|
|
* perhaps be better to refactor to share more code.
|
|
*/
|
|
static HeapTuple
|
|
tablesample_getnext(SampleScanState *scanstate)
|
|
{
|
|
TsmRoutine *tsm = scanstate->tsmroutine;
|
|
HeapScanDesc scan = scanstate->ss.ss_currentScanDesc;
|
|
HeapTuple tuple = &(scan->rs_ctup);
|
|
Snapshot snapshot = scan->rs_snapshot;
|
|
bool pagemode = scan->rs_pageatatime;
|
|
BlockNumber blockno;
|
|
Page page;
|
|
bool all_visible;
|
|
OffsetNumber maxoffset;
|
|
|
|
if (!scan->rs_inited)
|
|
{
|
|
/*
|
|
* return null immediately if relation is empty
|
|
*/
|
|
if (scan->rs_nblocks == 0)
|
|
{
|
|
Assert(!BufferIsValid(scan->rs_cbuf));
|
|
tuple->t_data = NULL;
|
|
return NULL;
|
|
}
|
|
if (tsm->NextSampleBlock)
|
|
{
|
|
blockno = tsm->NextSampleBlock(scanstate);
|
|
if (!BlockNumberIsValid(blockno))
|
|
{
|
|
tuple->t_data = NULL;
|
|
return NULL;
|
|
}
|
|
}
|
|
else
|
|
blockno = scan->rs_startblock;
|
|
Assert(blockno < scan->rs_nblocks);
|
|
heapgetpage(scan, blockno);
|
|
scan->rs_inited = true;
|
|
}
|
|
else
|
|
{
|
|
/* continue from previously returned page/tuple */
|
|
blockno = scan->rs_cblock; /* current page */
|
|
}
|
|
|
|
/*
|
|
* When not using pagemode, we must lock the buffer during tuple
|
|
* visibility checks.
|
|
*/
|
|
if (!pagemode)
|
|
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
|
|
|
|
page = (Page) BufferGetPage(scan->rs_cbuf);
|
|
all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
|
|
maxoffset = PageGetMaxOffsetNumber(page);
|
|
|
|
for (;;)
|
|
{
|
|
OffsetNumber tupoffset;
|
|
bool finished;
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
/* Ask the tablesample method which tuples to check on this page. */
|
|
tupoffset = tsm->NextSampleTuple(scanstate,
|
|
blockno,
|
|
maxoffset);
|
|
|
|
if (OffsetNumberIsValid(tupoffset))
|
|
{
|
|
ItemId itemid;
|
|
bool visible;
|
|
|
|
/* Skip invalid tuple pointers. */
|
|
itemid = PageGetItemId(page, tupoffset);
|
|
if (!ItemIdIsNormal(itemid))
|
|
continue;
|
|
|
|
tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
|
|
tuple->t_len = ItemIdGetLength(itemid);
|
|
ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
|
|
|
|
if (all_visible)
|
|
visible = true;
|
|
else
|
|
visible = SampleTupleVisible(tuple, tupoffset, scan);
|
|
|
|
/* in pagemode, heapgetpage did this for us */
|
|
if (!pagemode)
|
|
CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
|
|
scan->rs_cbuf, snapshot);
|
|
|
|
if (visible)
|
|
{
|
|
/* Found visible tuple, return it. */
|
|
if (!pagemode)
|
|
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
/* Try next tuple from same page. */
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* if we get here, it means we've exhausted the items on this page and
|
|
* it's time to move to the next.
|
|
*/
|
|
if (!pagemode)
|
|
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
|
|
|
|
if (tsm->NextSampleBlock)
|
|
{
|
|
blockno = tsm->NextSampleBlock(scanstate);
|
|
Assert(!scan->rs_syncscan);
|
|
finished = !BlockNumberIsValid(blockno);
|
|
}
|
|
else
|
|
{
|
|
/* Without NextSampleBlock, just do a plain forward seqscan. */
|
|
blockno++;
|
|
if (blockno >= scan->rs_nblocks)
|
|
blockno = 0;
|
|
|
|
/*
|
|
* Report our new scan position for synchronization purposes.
|
|
*
|
|
* Note: we do this before checking for end of scan so that the
|
|
* final state of the position hint is back at the start of the
|
|
* rel. That's not strictly necessary, but otherwise when you run
|
|
* the same query multiple times the starting position would shift
|
|
* a little bit backwards on every invocation, which is confusing.
|
|
* We don't guarantee any specific ordering in general, though.
|
|
*/
|
|
if (scan->rs_syncscan)
|
|
ss_report_location(scan->rs_rd, blockno);
|
|
|
|
finished = (blockno == scan->rs_startblock);
|
|
}
|
|
|
|
/*
|
|
* Reached end of scan?
|
|
*/
|
|
if (finished)
|
|
{
|
|
if (BufferIsValid(scan->rs_cbuf))
|
|
ReleaseBuffer(scan->rs_cbuf);
|
|
scan->rs_cbuf = InvalidBuffer;
|
|
scan->rs_cblock = InvalidBlockNumber;
|
|
tuple->t_data = NULL;
|
|
scan->rs_inited = false;
|
|
return NULL;
|
|
}
|
|
|
|
Assert(blockno < scan->rs_nblocks);
|
|
heapgetpage(scan, blockno);
|
|
|
|
/* Re-establish state for new page */
|
|
if (!pagemode)
|
|
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
|
|
|
|
page = (Page) BufferGetPage(scan->rs_cbuf);
|
|
all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
|
|
maxoffset = PageGetMaxOffsetNumber(page);
|
|
}
|
|
|
|
/* Count successfully-fetched tuples as heap fetches */
|
|
pgstat_count_heap_getnext(scan->rs_rd);
|
|
|
|
return &(scan->rs_ctup);
|
|
}
|
|
|
|
/*
|
|
* Check visibility of the tuple.
|
|
*/
|
|
static bool
|
|
SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan)
|
|
{
|
|
if (scan->rs_pageatatime)
|
|
{
|
|
/*
|
|
* In pageatatime mode, heapgetpage() already did visibility checks,
|
|
* so just look at the info it left in rs_vistuples[].
|
|
*
|
|
* We use a binary search over the known-sorted array. Note: we could
|
|
* save some effort if we insisted that NextSampleTuple select tuples
|
|
* in increasing order, but it's not clear that there would be enough
|
|
* gain to justify the restriction.
|
|
*/
|
|
int start = 0,
|
|
end = scan->rs_ntuples - 1;
|
|
|
|
while (start <= end)
|
|
{
|
|
int mid = (start + end) / 2;
|
|
OffsetNumber curoffset = scan->rs_vistuples[mid];
|
|
|
|
if (tupoffset == curoffset)
|
|
return true;
|
|
else if (tupoffset < curoffset)
|
|
end = mid - 1;
|
|
else
|
|
start = mid + 1;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
else
|
|
{
|
|
/* Otherwise, we have to check the tuple individually. */
|
|
return HeapTupleSatisfiesVisibility(tuple,
|
|
scan->rs_snapshot,
|
|
scan->rs_cbuf);
|
|
}
|
|
}
|