diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 0eb991cdf0..59b8a2e2b3 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -2297,6 +2297,7 @@ JumbleRangeTable(pgssJumbleState *jstate, List *rtable) { case RTE_RELATION: APP_JUMB(rte->relid); + JumbleExpr(jstate, (Node *) rte->tablesample); break; case RTE_SUBQUERY: JumbleQuery(jstate, rte->subquery); @@ -2767,6 +2768,15 @@ JumbleExpr(pgssJumbleState *jstate, Node *node) JumbleExpr(jstate, rtfunc->funcexpr); } break; + case T_TableSampleClause: + { + TableSampleClause *tsc = (TableSampleClause *) node; + + APP_JUMB(tsc->tsmhandler); + JumbleExpr(jstate, (Node *) tsc->args); + JumbleExpr(jstate, (Node *) tsc->repeatable); + } + break; default: /* Only a warning, since we can stumble along anyway */ elog(WARNING, "unrecognized node type: %d", diff --git a/contrib/tsm_system_rows/Makefile b/contrib/tsm_system_rows/Makefile index 700ab276db..609af463c5 100644 --- a/contrib/tsm_system_rows/Makefile +++ b/contrib/tsm_system_rows/Makefile @@ -1,8 +1,8 @@ -# src/test/modules/tsm_system_rows/Makefile +# contrib/tsm_system_rows/Makefile MODULE_big = tsm_system_rows OBJS = tsm_system_rows.o $(WIN32RES) -PGFILEDESC = "tsm_system_rows - SYSTEM TABLESAMPLE method which accepts number of rows as a limit" +PGFILEDESC = "tsm_system_rows - TABLESAMPLE method which accepts number of rows as a limit" EXTENSION = tsm_system_rows DATA = tsm_system_rows--1.0.sql diff --git a/contrib/tsm_system_rows/expected/tsm_system_rows.out b/contrib/tsm_system_rows/expected/tsm_system_rows.out index 7e0f72b02b..87b4a8fc64 100644 --- a/contrib/tsm_system_rows/expected/tsm_system_rows.out +++ b/contrib/tsm_system_rows/expected/tsm_system_rows.out @@ -1,31 +1,83 @@ CREATE EXTENSION tsm_system_rows; -CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); -- force smaller pages so we don't have to load too 
much data to get multiple pages -INSERT INTO test_tablesample SELECT i, repeat(i::text, 1000) FROM generate_series(0, 30) s(i) ORDER BY i; +CREATE TABLE test_tablesample (id int, name text); +INSERT INTO test_tablesample SELECT i, repeat(i::text, 1000) + FROM generate_series(0, 30) s(i); ANALYZE test_tablesample; -SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (1000); +SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (0); + count +------- + 0 +(1 row) + +SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (1); + count +------- + 1 +(1 row) + +SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (10); + count +------- + 10 +(1 row) + +SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (100); count ------- 31 (1 row) -SELECT id FROM test_tablesample TABLESAMPLE system_rows (8) REPEATABLE (5432); - id ----- - 7 - 14 - 21 - 28 - 4 - 11 - 18 - 25 -(8 rows) +-- bad parameters should get through planning, but not execution: +EXPLAIN (COSTS OFF) +SELECT id FROM test_tablesample TABLESAMPLE system_rows (-1); + QUERY PLAN +---------------------------------------- + Sample Scan on test_tablesample + Sampling: system_rows ('-1'::bigint) +(2 rows) -EXPLAIN SELECT id FROM test_tablesample TABLESAMPLE system_rows (20) REPEATABLE (10); - QUERY PLAN ------------------------------------------------------------------------------------ - Sample Scan (system_rows) on test_tablesample (cost=0.00..80.20 rows=20 width=4) +SELECT id FROM test_tablesample TABLESAMPLE system_rows (-1); +ERROR: sample size must not be negative +-- fail, this method is not repeatable: +SELECT * FROM test_tablesample TABLESAMPLE system_rows (10) REPEATABLE (0); +ERROR: tablesample method system_rows does not support REPEATABLE +LINE 1: SELECT * FROM test_tablesample TABLESAMPLE system_rows (10) ... 
+ ^ +-- but a join should be allowed: +EXPLAIN (COSTS OFF) +SELECT * FROM + (VALUES (0),(10),(100)) v(nrows), + LATERAL (SELECT count(*) FROM test_tablesample + TABLESAMPLE system_rows (nrows)) ss; + QUERY PLAN +---------------------------------------------------------- + Nested Loop + -> Values Scan on "*VALUES*" + -> Aggregate + -> Sample Scan on test_tablesample + Sampling: system_rows ("*VALUES*".column1) +(5 rows) + +SELECT * FROM + (VALUES (0),(10),(100)) v(nrows), + LATERAL (SELECT count(*) FROM test_tablesample + TABLESAMPLE system_rows (nrows)) ss; + nrows | count +-------+------- + 0 | 0 + 10 | 10 + 100 | 31 +(3 rows) + +CREATE VIEW vv AS + SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (20); +SELECT * FROM vv; + count +------- + 20 (1 row) --- done -DROP TABLE test_tablesample CASCADE; +DROP EXTENSION tsm_system_rows; -- fail, view depends on extension +ERROR: cannot drop extension tsm_system_rows because other objects depend on it +DETAIL: view vv depends on function system_rows(internal) +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
diff --git a/contrib/tsm_system_rows/sql/tsm_system_rows.sql b/contrib/tsm_system_rows/sql/tsm_system_rows.sql index bd812220ed..e3ab4204ee 100644 --- a/contrib/tsm_system_rows/sql/tsm_system_rows.sql +++ b/contrib/tsm_system_rows/sql/tsm_system_rows.sql @@ -1,14 +1,39 @@ CREATE EXTENSION tsm_system_rows; -CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); -- force smaller pages so we don't have to load too much data to get multiple pages - -INSERT INTO test_tablesample SELECT i, repeat(i::text, 1000) FROM generate_series(0, 30) s(i) ORDER BY i; +CREATE TABLE test_tablesample (id int, name text); +INSERT INTO test_tablesample SELECT i, repeat(i::text, 1000) + FROM generate_series(0, 30) s(i); ANALYZE test_tablesample; -SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (1000); -SELECT id FROM test_tablesample TABLESAMPLE system_rows (8) REPEATABLE (5432); +SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (0); +SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (1); +SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (10); +SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (100); -EXPLAIN SELECT id FROM test_tablesample TABLESAMPLE system_rows (20) REPEATABLE (10); +-- bad parameters should get through planning, but not execution: +EXPLAIN (COSTS OFF) +SELECT id FROM test_tablesample TABLESAMPLE system_rows (-1); --- done -DROP TABLE test_tablesample CASCADE; +SELECT id FROM test_tablesample TABLESAMPLE system_rows (-1); + +-- fail, this method is not repeatable: +SELECT * FROM test_tablesample TABLESAMPLE system_rows (10) REPEATABLE (0); + +-- but a join should be allowed: +EXPLAIN (COSTS OFF) +SELECT * FROM + (VALUES (0),(10),(100)) v(nrows), + LATERAL (SELECT count(*) FROM test_tablesample + TABLESAMPLE system_rows (nrows)) ss; + +SELECT * FROM + (VALUES (0),(10),(100)) v(nrows), + LATERAL (SELECT count(*) FROM test_tablesample + TABLESAMPLE system_rows (nrows)) ss; + +CREATE VIEW vv 
AS + SELECT count(*) FROM test_tablesample TABLESAMPLE system_rows (20); + +SELECT * FROM vv; + +DROP EXTENSION tsm_system_rows; -- fail, view depends on extension diff --git a/contrib/tsm_system_rows/tsm_system_rows--1.0.sql b/contrib/tsm_system_rows/tsm_system_rows--1.0.sql index 1a29c584b5..de508ed726 100644 --- a/contrib/tsm_system_rows/tsm_system_rows--1.0.sql +++ b/contrib/tsm_system_rows/tsm_system_rows--1.0.sql @@ -1,44 +1,9 @@ -/* src/test/modules/tablesample/tsm_system_rows--1.0.sql */ +/* contrib/tsm_system_rows/tsm_system_rows--1.0.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION tsm_system_rows" to load this file. \quit -CREATE FUNCTION tsm_system_rows_init(internal, int4, int4) -RETURNS void -AS 'MODULE_PATHNAME' +CREATE FUNCTION system_rows(internal) +RETURNS tsm_handler +AS 'MODULE_PATHNAME', 'tsm_system_rows_handler' LANGUAGE C STRICT; - -CREATE FUNCTION tsm_system_rows_nextblock(internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -CREATE FUNCTION tsm_system_rows_nexttuple(internal, int4, int2) -RETURNS int2 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -CREATE FUNCTION tsm_system_rows_examinetuple(internal, int4, internal, bool) -RETURNS bool -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -CREATE FUNCTION tsm_system_rows_end(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -CREATE FUNCTION tsm_system_rows_reset(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -CREATE FUNCTION tsm_system_rows_cost(internal, internal, internal, internal, internal, internal, internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -INSERT INTO pg_tablesample_method VALUES('system_rows', false, true, - 'tsm_system_rows_init', 'tsm_system_rows_nextblock', - 'tsm_system_rows_nexttuple', 'tsm_system_rows_examinetuple', - 'tsm_system_rows_end', 'tsm_system_rows_reset', 'tsm_system_rows_cost'); diff --git a/contrib/tsm_system_rows/tsm_system_rows.c 
b/contrib/tsm_system_rows/tsm_system_rows.c index e325eaff49..f251e3e5e0 100644 --- a/contrib/tsm_system_rows/tsm_system_rows.c +++ b/contrib/tsm_system_rows/tsm_system_rows.c @@ -1,240 +1,356 @@ /*------------------------------------------------------------------------- * * tsm_system_rows.c - * interface routines for system_rows tablesample method + * support routines for SYSTEM_ROWS tablesample method * + * The desire here is to produce a random sample with a given number of rows + * (or the whole relation, if that is fewer rows). We use a block-sampling + * approach. To ensure that the whole relation will be visited if necessary, + * we start at a randomly chosen block and then advance with a stride that + * is randomly chosen but is relatively prime to the relation's nblocks. * - * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Because of the dependence on nblocks, this method cannot be repeatable + * across queries. (Even if the user hasn't explicitly changed the relation, + * maintenance activities such as autovacuum might change nblocks.) However, + * we can at least make it repeatable across scans, by determining the + * sampling pattern only once on the first scan. This means that rescans + * won't visit blocks added after the first scan, but that is fine since + * such blocks shouldn't contain any visible tuples anyway. 
+ * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * contrib/tsm_system_rows_rowlimit/tsm_system_rows.c + * contrib/tsm_system_rows/tsm_system_rows.c * *------------------------------------------------------------------------- */ #include "postgres.h" -#include "fmgr.h" - -#include "access/tablesample.h" #include "access/relscan.h" +#include "access/tsmapi.h" +#include "catalog/pg_type.h" #include "miscadmin.h" -#include "nodes/execnodes.h" -#include "nodes/relation.h" #include "optimizer/clauses.h" -#include "storage/bufmgr.h" +#include "optimizer/cost.h" #include "utils/sampling.h" PG_MODULE_MAGIC; -/* - * State - */ +PG_FUNCTION_INFO_V1(tsm_system_rows_handler); + + +/* Private state */ typedef struct { - SamplerRandomState randstate; uint32 seed; /* random seed */ - BlockNumber nblocks; /* number of block in relation */ - int32 ntuples; /* number of tuples to return */ - int32 donetuples; /* tuples already returned */ + int64 ntuples; /* number of tuples to return */ + int64 donetuples; /* number of tuples already returned */ OffsetNumber lt; /* last tuple returned from current block */ - BlockNumber step; /* step size */ + BlockNumber doneblocks; /* number of already-scanned blocks */ BlockNumber lb; /* last block visited */ - BlockNumber doneblocks; /* number of already returned blocks */ -} SystemSamplerData; - - -PG_FUNCTION_INFO_V1(tsm_system_rows_init); -PG_FUNCTION_INFO_V1(tsm_system_rows_nextblock); -PG_FUNCTION_INFO_V1(tsm_system_rows_nexttuple); -PG_FUNCTION_INFO_V1(tsm_system_rows_examinetuple); -PG_FUNCTION_INFO_V1(tsm_system_rows_end); -PG_FUNCTION_INFO_V1(tsm_system_rows_reset); -PG_FUNCTION_INFO_V1(tsm_system_rows_cost); + /* these three values are not changed during a rescan: */ + BlockNumber nblocks; /* number of blocks in relation */ + BlockNumber firstblock; /* first block to sample from */ + BlockNumber step; /* step size, 
or 0 if not set yet */ +} SystemRowsSamplerData; +static void system_rows_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); +static void system_rows_initsamplescan(SampleScanState *node, + int eflags); +static void system_rows_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); +static BlockNumber system_rows_nextsampleblock(SampleScanState *node); +static OffsetNumber system_rows_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); +static bool SampleOffsetVisible(OffsetNumber tupoffset, HeapScanDesc scan); static uint32 random_relative_prime(uint32 n, SamplerRandomState randstate); + /* - * Initializes the state. + * Create a TsmRoutine descriptor for the SYSTEM_ROWS method. */ Datum -tsm_system_rows_init(PG_FUNCTION_ARGS) +tsm_system_rows_handler(PG_FUNCTION_ARGS) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - uint32 seed = PG_GETARG_UINT32(1); - int32 ntuples = PG_ARGISNULL(2) ? -1 : PG_GETARG_INT32(2); - HeapScanDesc scan = tsdesc->heapScan; - SystemSamplerData *sampler; + TsmRoutine *tsm = makeNode(TsmRoutine); - if (ntuples < 1) + tsm->parameterTypes = list_make1_oid(INT8OID); + + /* See notes at head of file */ + tsm->repeatable_across_queries = false; + tsm->repeatable_across_scans = true; + + tsm->SampleScanGetSampleSize = system_rows_samplescangetsamplesize; + tsm->InitSampleScan = system_rows_initsamplescan; + tsm->BeginSampleScan = system_rows_beginsamplescan; + tsm->NextSampleBlock = system_rows_nextsampleblock; + tsm->NextSampleTuple = system_rows_nextsampletuple; + tsm->EndSampleScan = NULL; + + PG_RETURN_POINTER(tsm); +} + +/* + * Sample size estimation. 
+ */ +static void +system_rows_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples) +{ + Node *limitnode; + int64 ntuples; + double npages; + + /* Try to extract an estimate for the limit rowcount */ + limitnode = (Node *) linitial(paramexprs); + limitnode = estimate_expression_value(root, limitnode); + + if (IsA(limitnode, Const) && + !((Const *) limitnode)->constisnull) + { + ntuples = DatumGetInt64(((Const *) limitnode)->constvalue); + if (ntuples < 0) + { + /* Default ntuples if the value is bogus */ + ntuples = 1000; + } + } + else + { + /* Default ntuples if we didn't obtain a non-null Const */ + ntuples = 1000; + } + + /* Clamp to the estimated relation size */ + if (ntuples > baserel->tuples) + ntuples = (int64) baserel->tuples; + ntuples = clamp_row_est(ntuples); + + if (baserel->tuples > 0 && baserel->pages > 0) + { + /* Estimate number of pages visited based on tuple density */ + double density = baserel->tuples / (double) baserel->pages; + + npages = ntuples / density; + } + else + { + /* For lack of data, assume one tuple per page */ + npages = ntuples; + } + + /* Clamp to sane value */ + npages = clamp_row_est(Min((double) baserel->pages, npages)); + + *pages = npages; + *tuples = ntuples; +} + +/* + * Initialize during executor setup. + */ +static void +system_rows_initsamplescan(SampleScanState *node, int eflags) +{ + node->tsm_state = palloc0(sizeof(SystemRowsSamplerData)); + /* Note the above leaves tsm_state->step equal to zero */ +} + +/* + * Examine parameters and prepare for a sample scan. 
+ */ +static void +system_rows_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed) +{ + SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state; + int64 ntuples = DatumGetInt64(params[0]); + + if (ntuples < 0) ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("invalid sample size"), - errhint("Sample size must be positive integer value."))); + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("sample size must not be negative"))); - sampler = palloc0(sizeof(SystemSamplerData)); - - /* Remember initial values for reinit */ sampler->seed = seed; - sampler->nblocks = scan->rs_nblocks; sampler->ntuples = ntuples; sampler->donetuples = 0; sampler->lt = InvalidOffsetNumber; sampler->doneblocks = 0; - - sampler_random_init_state(sampler->seed, sampler->randstate); - - /* Find relative prime as step size for linear probing. */ - sampler->step = random_relative_prime(sampler->nblocks, sampler->randstate); + /* lb will be initialized during first NextSampleBlock call */ + /* we intentionally do not change nblocks/firstblock/step here */ /* - * Randomize start position so that blocks close to step size don't have - * higher probability of being chosen on very short scan. + * We *must* use pagemode visibility checking in this module, so force + * that even though it's currently default. */ - sampler->lb = sampler_random_fract(sampler->randstate) * - (sampler->nblocks / sampler->step); - - tsdesc->tsmdata = (void *) sampler; - - PG_RETURN_VOID(); + node->use_pagemode = true; } /* - * Get next block number or InvalidBlockNumber when we're done. + * Select next block to sample. * * Uses linear probing algorithm for picking next block. 
*/ -Datum -tsm_system_rows_nextblock(PG_FUNCTION_ARGS) +static BlockNumber +system_rows_nextsampleblock(SampleScanState *node) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state; + HeapScanDesc scan = node->ss.ss_currentScanDesc; - sampler->lb = (sampler->lb + sampler->step) % sampler->nblocks; - sampler->doneblocks++; + /* First call within scan? */ + if (sampler->doneblocks == 0) + { + /* First scan within query? */ + if (sampler->step == 0) + { + /* Initialize now that we have scan descriptor */ + SamplerRandomState randstate; - /* All blocks have been read, we're done */ - if (sampler->doneblocks > sampler->nblocks || + /* If relation is empty, there's nothing to scan */ + if (scan->rs_nblocks == 0) + return InvalidBlockNumber; + + /* We only need an RNG during this setup step */ + sampler_random_init_state(sampler->seed, randstate); + + /* Compute nblocks/firstblock/step only once per query */ + sampler->nblocks = scan->rs_nblocks; + + /* Choose random starting block within the relation */ + /* (Actually this is the predecessor of the first block visited) */ + sampler->firstblock = sampler_random_fract(randstate) * + sampler->nblocks; + + /* Find relative prime as step size for linear probing */ + sampler->step = random_relative_prime(sampler->nblocks, randstate); + } + + /* Reinitialize lb */ + sampler->lb = sampler->firstblock; + } + + /* If we've read all blocks or returned all needed tuples, we're done */ + if (++sampler->doneblocks > sampler->nblocks || sampler->donetuples >= sampler->ntuples) - PG_RETURN_UINT32(InvalidBlockNumber); + return InvalidBlockNumber; - PG_RETURN_UINT32(sampler->lb); + /* + * It's probably impossible for scan->rs_nblocks to decrease between scans + * within a query; but just in case, loop until we select a block number + * less than scan->rs_nblocks. 
We don't care if scan->rs_nblocks has + * increased since the first scan. + */ + do + { + /* Advance lb, using uint64 arithmetic to forestall overflow */ + sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks; + } while (sampler->lb >= scan->rs_nblocks); + + return sampler->lb; } /* - * Get next tuple offset in current block or InvalidOffsetNumber if we are done - * with this block. + * Select next sampled tuple in current block. + * + * In block sampling, we just want to sample all the tuples in each selected + * block. + * + * When we reach end of the block, return InvalidOffsetNumber which tells + * SampleScan to go to next block. */ -Datum -tsm_system_rows_nexttuple(PG_FUNCTION_ARGS) +static OffsetNumber +system_rows_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - OffsetNumber maxoffset = PG_GETARG_UINT16(2); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + SystemRowsSamplerData *sampler = (SystemRowsSamplerData *) node->tsm_state; + HeapScanDesc scan = node->ss.ss_currentScanDesc; OffsetNumber tupoffset = sampler->lt; - if (tupoffset == InvalidOffsetNumber) - tupoffset = FirstOffsetNumber; - else - tupoffset++; + /* Quit if we've returned all needed tuples */ + if (sampler->donetuples >= sampler->ntuples) + return InvalidOffsetNumber; - if (tupoffset > maxoffset || - sampler->donetuples >= sampler->ntuples) - tupoffset = InvalidOffsetNumber; + /* + * Because we should only count visible tuples as being returned, we need + * to search for a visible tuple rather than just let the core code do it. + */ + + /* We rely on the data accumulated in pagemode access */ + Assert(scan->rs_pageatatime); + for (;;) + { + /* Advance to next possible offset on page */ + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + /* Done? 
*/ + if (tupoffset > maxoffset) + { + tupoffset = InvalidOffsetNumber; + break; + } + + /* Found a candidate? */ + if (SampleOffsetVisible(tupoffset, scan)) + { + sampler->donetuples++; + break; + } + } sampler->lt = tupoffset; - PG_RETURN_UINT16(tupoffset); + return tupoffset; } /* - * Examine tuple and decide if it should be returned. + * Check if tuple offset is visible + * + * In pageatatime mode, heapgetpage() already did visibility checks, + * so just look at the info it left in rs_vistuples[]. */ -Datum -tsm_system_rows_examinetuple(PG_FUNCTION_ARGS) +static bool +SampleOffsetVisible(OffsetNumber tupoffset, HeapScanDesc scan) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - bool visible = PG_GETARG_BOOL(3); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + int start = 0, + end = scan->rs_ntuples - 1; - if (!visible) - PG_RETURN_BOOL(false); - - sampler->donetuples++; - - PG_RETURN_BOOL(true); -} - -/* - * Cleanup method. - */ -Datum -tsm_system_rows_end(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - - pfree(tsdesc->tsmdata); - - PG_RETURN_VOID(); -} - -/* - * Reset state (called by ReScan). - */ -Datum -tsm_system_rows_reset(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; - - sampler->lt = InvalidOffsetNumber; - sampler->donetuples = 0; - sampler->doneblocks = 0; - - sampler_random_init_state(sampler->seed, sampler->randstate); - sampler->step = random_relative_prime(sampler->nblocks, sampler->randstate); - sampler->lb = sampler_random_fract(sampler->randstate) * (sampler->nblocks / sampler->step); - - PG_RETURN_VOID(); -} - -/* - * Costing function. 
- */ -Datum -tsm_system_rows_cost(PG_FUNCTION_ARGS) -{ - PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); - Path *path = (Path *) PG_GETARG_POINTER(1); - RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); - List *args = (List *) PG_GETARG_POINTER(3); - BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); - double *tuples = (double *) PG_GETARG_POINTER(5); - Node *limitnode; - int32 ntuples; - - limitnode = linitial(args); - limitnode = estimate_expression_value(root, limitnode); - - if (IsA(limitnode, RelabelType)) - limitnode = (Node *) ((RelabelType *) limitnode)->arg; - - if (IsA(limitnode, Const)) - ntuples = DatumGetInt32(((Const *) limitnode)->constvalue); - else + while (start <= end) { - /* Default ntuples if the estimation didn't return Const. */ - ntuples = 1000; + int mid = (start + end) / 2; + OffsetNumber curoffset = scan->rs_vistuples[mid]; + + if (tupoffset == curoffset) + return true; + else if (tupoffset < curoffset) + end = mid - 1; + else + start = mid + 1; } - *pages = Min(baserel->pages, ntuples); - *tuples = ntuples; - path->rows = *tuples; - - PG_RETURN_VOID(); + return false; } - +/* + * Compute greatest common divisor of two uint32's. + */ static uint32 gcd(uint32 a, uint32 b) { @@ -250,22 +366,29 @@ gcd(uint32 a, uint32 b) return b; } +/* + * Pick a random value less than and relatively prime to n, if possible + * (else return 1). + */ static uint32 random_relative_prime(uint32 n, SamplerRandomState randstate) { - /* Pick random starting number, with some limits on what it can be. */ - uint32 r = (uint32) sampler_random_fract(randstate) * n / 2 + n / 4, - t; + uint32 r; + + /* Safety check to avoid infinite loop or zero result for small n. */ + if (n <= 1) + return 1; /* * This should only take 2 or 3 iterations as the probability of 2 numbers - * being relatively prime is ~61%. + * being relatively prime is ~61%; but just in case, we'll include a + * CHECK_FOR_INTERRUPTS in the loop. 
*/ - while ((t = gcd(r, n)) > 1) + do { CHECK_FOR_INTERRUPTS(); - r /= t; - } + r = (uint32) (sampler_random_fract(randstate) * n); + } while (r == 0 || gcd(r, n) > 1); return r; } diff --git a/contrib/tsm_system_rows/tsm_system_rows.control b/contrib/tsm_system_rows/tsm_system_rows.control index 84ea7adb49..4bd0232f97 100644 --- a/contrib/tsm_system_rows/tsm_system_rows.control +++ b/contrib/tsm_system_rows/tsm_system_rows.control @@ -1,5 +1,5 @@ # tsm_system_rows extension -comment = 'SYSTEM TABLESAMPLE method which accepts number rows as a limit' +comment = 'TABLESAMPLE method which accepts number of rows as a limit' default_version = '1.0' module_pathname = '$libdir/tsm_system_rows' relocatable = true diff --git a/contrib/tsm_system_time/Makefile b/contrib/tsm_system_time/Makefile index c42c1c6bb6..168becf54e 100644 --- a/contrib/tsm_system_time/Makefile +++ b/contrib/tsm_system_time/Makefile @@ -1,8 +1,8 @@ -# src/test/modules/tsm_system_time/Makefile +# contrib/tsm_system_time/Makefile MODULE_big = tsm_system_time OBJS = tsm_system_time.o $(WIN32RES) -PGFILEDESC = "tsm_system_time - SYSTEM TABLESAMPLE method which accepts number rows of as a limit" +PGFILEDESC = "tsm_system_time - TABLESAMPLE method which accepts time in milliseconds as a limit" EXTENSION = tsm_system_time DATA = tsm_system_time--1.0.sql diff --git a/contrib/tsm_system_time/expected/tsm_system_time.out b/contrib/tsm_system_time/expected/tsm_system_time.out index 32ad03c4bd..ac44f30be9 100644 --- a/contrib/tsm_system_time/expected/tsm_system_time.out +++ b/contrib/tsm_system_time/expected/tsm_system_time.out @@ -1,54 +1,100 @@ CREATE EXTENSION tsm_system_time; -CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); -- force smaller pages so we don't have to load too much data to get multiple pages -INSERT INTO test_tablesample SELECT i, repeat(i::text, 1000) FROM generate_series(0, 30) s(i) ORDER BY i; +CREATE TABLE test_tablesample (id int, name text); +INSERT INTO 
test_tablesample SELECT i, repeat(i::text, 1000) + FROM generate_series(0, 30) s(i); ANALYZE test_tablesample; -SELECT count(*) FROM test_tablesample TABLESAMPLE system_time (1000); +-- It's a bit tricky to test SYSTEM_TIME in a platform-independent way. +-- We can test the zero-time corner case ... +SELECT count(*) FROM test_tablesample TABLESAMPLE system_time (0); + count +------- + 0 +(1 row) + +-- ... and we assume that this will finish before running out of time: +SELECT count(*) FROM test_tablesample TABLESAMPLE system_time (100000); count ------- 31 (1 row) -SELECT id FROM test_tablesample TABLESAMPLE system_time (1000) REPEATABLE (5432); - id ----- - 7 - 14 - 21 - 28 - 4 - 11 - 18 - 25 - 1 - 8 - 15 - 22 - 29 - 5 - 12 - 19 - 26 - 2 - 9 - 16 - 23 - 30 - 6 - 13 - 20 - 27 - 3 - 10 - 17 - 24 - 0 -(31 rows) +-- bad parameters should get through planning, but not execution: +EXPLAIN (COSTS OFF) +SELECT id FROM test_tablesample TABLESAMPLE system_time (-1); + QUERY PLAN +-------------------------------------------------- + Sample Scan on test_tablesample + Sampling: system_time ('-1'::double precision) +(2 rows) -EXPLAIN SELECT id FROM test_tablesample TABLESAMPLE system_time (100) REPEATABLE (10); - QUERY PLAN ------------------------------------------------------------------------------------- - Sample Scan (system_time) on test_tablesample (cost=0.00..100.25 rows=25 width=4) -(1 row) +SELECT id FROM test_tablesample TABLESAMPLE system_time (-1); +ERROR: sample collection time must not be negative +-- fail, this method is not repeatable: +SELECT * FROM test_tablesample TABLESAMPLE system_time (10) REPEATABLE (0); +ERROR: tablesample method system_time does not support REPEATABLE +LINE 1: SELECT * FROM test_tablesample TABLESAMPLE system_time (10) ... 
+ ^ +-- since it's not repeatable, we expect a Materialize node in these plans: +EXPLAIN (COSTS OFF) +SELECT * FROM + (VALUES (0),(100000)) v(time), + LATERAL (SELECT COUNT(*) FROM test_tablesample + TABLESAMPLE system_time (100000)) ss; + QUERY PLAN +------------------------------------------------------------------------ + Nested Loop + -> Aggregate + -> Materialize + -> Sample Scan on test_tablesample + Sampling: system_time ('100000'::double precision) + -> Values Scan on "*VALUES*" +(6 rows) --- done -DROP TABLE test_tablesample CASCADE; +SELECT * FROM + (VALUES (0),(100000)) v(time), + LATERAL (SELECT COUNT(*) FROM test_tablesample + TABLESAMPLE system_time (100000)) ss; + time | count +--------+------- + 0 | 31 + 100000 | 31 +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM + (VALUES (0),(100000)) v(time), + LATERAL (SELECT COUNT(*) FROM test_tablesample + TABLESAMPLE system_time (time)) ss; + QUERY PLAN +---------------------------------------------------------------- + Nested Loop + -> Values Scan on "*VALUES*" + -> Aggregate + -> Materialize + -> Sample Scan on test_tablesample + Sampling: system_time ("*VALUES*".column1) +(6 rows) + +SELECT * FROM + (VALUES (0),(100000)) v(time), + LATERAL (SELECT COUNT(*) FROM test_tablesample + TABLESAMPLE system_time (time)) ss; + time | count +--------+------- + 0 | 0 + 100000 | 31 +(2 rows) + +CREATE VIEW vv AS + SELECT * FROM test_tablesample TABLESAMPLE system_time (20); +EXPLAIN (COSTS OFF) SELECT * FROM vv; + QUERY PLAN +-------------------------------------------------- + Sample Scan on test_tablesample + Sampling: system_time ('20'::double precision) +(2 rows) + +DROP EXTENSION tsm_system_time; -- fail, view depends on extension +ERROR: cannot drop extension tsm_system_time because other objects depend on it +DETAIL: view vv depends on function system_time(internal) +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
diff --git a/contrib/tsm_system_time/sql/tsm_system_time.sql b/contrib/tsm_system_time/sql/tsm_system_time.sql index 68dbbf98af..117de163d8 100644 --- a/contrib/tsm_system_time/sql/tsm_system_time.sql +++ b/contrib/tsm_system_time/sql/tsm_system_time.sql @@ -1,14 +1,51 @@ CREATE EXTENSION tsm_system_time; -CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); -- force smaller pages so we don't have to load too much data to get multiple pages - -INSERT INTO test_tablesample SELECT i, repeat(i::text, 1000) FROM generate_series(0, 30) s(i) ORDER BY i; +CREATE TABLE test_tablesample (id int, name text); +INSERT INTO test_tablesample SELECT i, repeat(i::text, 1000) + FROM generate_series(0, 30) s(i); ANALYZE test_tablesample; -SELECT count(*) FROM test_tablesample TABLESAMPLE system_time (1000); -SELECT id FROM test_tablesample TABLESAMPLE system_time (1000) REPEATABLE (5432); +-- It's a bit tricky to test SYSTEM_TIME in a platform-independent way. +-- We can test the zero-time corner case ... +SELECT count(*) FROM test_tablesample TABLESAMPLE system_time (0); +-- ... 
and we assume that this will finish before running out of time: +SELECT count(*) FROM test_tablesample TABLESAMPLE system_time (100000); -EXPLAIN SELECT id FROM test_tablesample TABLESAMPLE system_time (100) REPEATABLE (10); +-- bad parameters should get through planning, but not execution: +EXPLAIN (COSTS OFF) +SELECT id FROM test_tablesample TABLESAMPLE system_time (-1); --- done -DROP TABLE test_tablesample CASCADE; +SELECT id FROM test_tablesample TABLESAMPLE system_time (-1); + +-- fail, this method is not repeatable: +SELECT * FROM test_tablesample TABLESAMPLE system_time (10) REPEATABLE (0); + +-- since it's not repeatable, we expect a Materialize node in these plans: +EXPLAIN (COSTS OFF) +SELECT * FROM + (VALUES (0),(100000)) v(time), + LATERAL (SELECT COUNT(*) FROM test_tablesample + TABLESAMPLE system_time (100000)) ss; + +SELECT * FROM + (VALUES (0),(100000)) v(time), + LATERAL (SELECT COUNT(*) FROM test_tablesample + TABLESAMPLE system_time (100000)) ss; + +EXPLAIN (COSTS OFF) +SELECT * FROM + (VALUES (0),(100000)) v(time), + LATERAL (SELECT COUNT(*) FROM test_tablesample + TABLESAMPLE system_time (time)) ss; + +SELECT * FROM + (VALUES (0),(100000)) v(time), + LATERAL (SELECT COUNT(*) FROM test_tablesample + TABLESAMPLE system_time (time)) ss; + +CREATE VIEW vv AS + SELECT * FROM test_tablesample TABLESAMPLE system_time (20); + +EXPLAIN (COSTS OFF) SELECT * FROM vv; + +DROP EXTENSION tsm_system_time; -- fail, view depends on extension diff --git a/contrib/tsm_system_time/tsm_system_time--1.0.sql b/contrib/tsm_system_time/tsm_system_time--1.0.sql index 1f390d6ed7..c59d2e84ef 100644 --- a/contrib/tsm_system_time/tsm_system_time--1.0.sql +++ b/contrib/tsm_system_time/tsm_system_time--1.0.sql @@ -1,39 +1,9 @@ -/* src/test/modules/tablesample/tsm_system_time--1.0.sql */ +/* contrib/tsm_system_time/tsm_system_time--1.0.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION tsm_system_time" to load this 
file. \quit -CREATE FUNCTION tsm_system_time_init(internal, int4, int4) -RETURNS void -AS 'MODULE_PATHNAME' +CREATE FUNCTION system_time(internal) +RETURNS tsm_handler +AS 'MODULE_PATHNAME', 'tsm_system_time_handler' LANGUAGE C STRICT; - -CREATE FUNCTION tsm_system_time_nextblock(internal) -RETURNS int4 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -CREATE FUNCTION tsm_system_time_nexttuple(internal, int4, int2) -RETURNS int2 -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -CREATE FUNCTION tsm_system_time_end(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -CREATE FUNCTION tsm_system_time_reset(internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -CREATE FUNCTION tsm_system_time_cost(internal, internal, internal, internal, internal, internal, internal) -RETURNS void -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -INSERT INTO pg_tablesample_method VALUES('system_time', false, true, - 'tsm_system_time_init', 'tsm_system_time_nextblock', - 'tsm_system_time_nexttuple', '-', 'tsm_system_time_end', - 'tsm_system_time_reset', 'tsm_system_time_cost'); diff --git a/contrib/tsm_system_time/tsm_system_time.c b/contrib/tsm_system_time/tsm_system_time.c index 7708fc0761..83f1455c5f 100644 --- a/contrib/tsm_system_time/tsm_system_time.c +++ b/contrib/tsm_system_time/tsm_system_time.c @@ -1,286 +1,320 @@ /*------------------------------------------------------------------------- * * tsm_system_time.c - * interface routines for system_time tablesample method + * support routines for SYSTEM_TIME tablesample method * + * The desire here is to produce a random sample with as many rows as possible + * in no more than the specified amount of time. We use a block-sampling + * approach. To ensure that the whole relation will be visited if necessary, + * we start at a randomly chosen block and then advance with a stride that + * is randomly chosen but is relatively prime to the relation's nblocks. 
* - * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Because of the time dependence, this method is necessarily unrepeatable. + * However, we do what we can to reduce surprising behavior by selecting + * the sampling pattern just once per query, much as in tsm_system_rows. + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * contrib/tsm_system_time_rowlimit/tsm_system_time.c + * contrib/tsm_system_time/tsm_system_time.c * *------------------------------------------------------------------------- */ #include "postgres.h" -#include "fmgr.h" +#ifdef _MSC_VER +#include <float.h> /* for _isnan */ +#endif +#include <math.h> -#include "access/tablesample.h" #include "access/relscan.h" +#include "access/tsmapi.h" +#include "catalog/pg_type.h" #include "miscadmin.h" -#include "nodes/execnodes.h" -#include "nodes/relation.h" #include "optimizer/clauses.h" -#include "storage/bufmgr.h" +#include "optimizer/cost.h" #include "utils/sampling.h" #include "utils/spccache.h" -#include "utils/timestamp.h" PG_MODULE_MAGIC; -/* - * State - */ +PG_FUNCTION_INFO_V1(tsm_system_time_handler); + + +/* Private state */ typedef struct { - SamplerRandomState randstate; uint32 seed; /* random seed */ - BlockNumber nblocks; /* number of block in relation */ - int32 time; /* time limit for sampling */ - TimestampTz start_time; /* start time of sampling */ - TimestampTz end_time; /* end time of sampling */ + double millis; /* time limit for sampling */ + instr_time start_time; /* scan start time */ OffsetNumber lt; /* last tuple returned from current block */ - BlockNumber step; /* step size */ + BlockNumber doneblocks; /* number of already-scanned blocks */ BlockNumber lb; /* last block visited */ - BlockNumber estblocks; /* estimated number of returned blocks - * (moving) */ - BlockNumber doneblocks; /* number of already returned blocks */ -} SystemSamplerData; - -
-PG_FUNCTION_INFO_V1(tsm_system_time_init); -PG_FUNCTION_INFO_V1(tsm_system_time_nextblock); -PG_FUNCTION_INFO_V1(tsm_system_time_nexttuple); -PG_FUNCTION_INFO_V1(tsm_system_time_end); -PG_FUNCTION_INFO_V1(tsm_system_time_reset); -PG_FUNCTION_INFO_V1(tsm_system_time_cost); + /* these three values are not changed during a rescan: */ + BlockNumber nblocks; /* number of blocks in relation */ + BlockNumber firstblock; /* first block to sample from */ + BlockNumber step; /* step size, or 0 if not set yet */ +} SystemTimeSamplerData; +static void system_time_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); +static void system_time_initsamplescan(SampleScanState *node, + int eflags); +static void system_time_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); +static BlockNumber system_time_nextsampleblock(SampleScanState *node); +static OffsetNumber system_time_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); static uint32 random_relative_prime(uint32 n, SamplerRandomState randstate); + /* - * Initializes the state. + * Create a TsmRoutine descriptor for the SYSTEM_TIME method. */ Datum -tsm_system_time_init(PG_FUNCTION_ARGS) +tsm_system_time_handler(PG_FUNCTION_ARGS) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - uint32 seed = PG_GETARG_UINT32(1); - int32 time = PG_ARGISNULL(2) ? 
-1 : PG_GETARG_INT32(2); - HeapScanDesc scan = tsdesc->heapScan; - SystemSamplerData *sampler; + TsmRoutine *tsm = makeNode(TsmRoutine); - if (time < 1) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("invalid time limit"), - errhint("Time limit must be positive integer value."))); + tsm->parameterTypes = list_make1_oid(FLOAT8OID); - sampler = palloc0(sizeof(SystemSamplerData)); + /* See notes at head of file */ + tsm->repeatable_across_queries = false; + tsm->repeatable_across_scans = false; - /* Remember initial values for reinit */ - sampler->seed = seed; - sampler->nblocks = scan->rs_nblocks; - sampler->lt = InvalidOffsetNumber; - sampler->estblocks = 2; - sampler->doneblocks = 0; - sampler->time = time; - sampler->start_time = GetCurrentTimestamp(); - sampler->end_time = TimestampTzPlusMilliseconds(sampler->start_time, - sampler->time); + tsm->SampleScanGetSampleSize = system_time_samplescangetsamplesize; + tsm->InitSampleScan = system_time_initsamplescan; + tsm->BeginSampleScan = system_time_beginsamplescan; + tsm->NextSampleBlock = system_time_nextsampleblock; + tsm->NextSampleTuple = system_time_nextsampletuple; + tsm->EndSampleScan = NULL; - sampler_random_init_state(sampler->seed, sampler->randstate); - - /* Find relative prime as step size for linear probing. */ - sampler->step = random_relative_prime(sampler->nblocks, sampler->randstate); - - /* - * Randomize start position so that blocks close to step size don't have - * higher probability of being chosen on very short scan. - */ - sampler->lb = sampler_random_fract(sampler->randstate) * (sampler->nblocks / sampler->step); - - tsdesc->tsmdata = (void *) sampler; - - PG_RETURN_VOID(); + PG_RETURN_POINTER(tsm); } /* - * Get next block number or InvalidBlockNumber when we're done. - * - * Uses linear probing algorithm for picking next block. + * Sample size estimation. 
*/ -Datum -tsm_system_time_nextblock(PG_FUNCTION_ARGS) +static void +system_time_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; - - sampler->lb = (sampler->lb + sampler->step) % sampler->nblocks; - sampler->doneblocks++; - - /* All blocks have been read, we're done */ - if (sampler->doneblocks > sampler->nblocks) - PG_RETURN_UINT32(InvalidBlockNumber); - - /* - * Update the estimations for time limit at least 10 times per estimated - * number of returned blocks to handle variations in block read speed. - */ - if (sampler->doneblocks % Max(sampler->estblocks / 10, 1) == 0) - { - TimestampTz now = GetCurrentTimestamp(); - long secs; - int usecs; - int usecs_remaining; - int time_per_block; - - TimestampDifference(sampler->start_time, now, &secs, &usecs); - usecs += (int) secs *1000000; - - time_per_block = usecs / sampler->doneblocks; - - /* No time left, end. */ - TimestampDifference(now, sampler->end_time, &secs, &usecs); - if (secs <= 0 && usecs <= 0) - PG_RETURN_UINT32(InvalidBlockNumber); - - /* Remaining microseconds */ - usecs_remaining = usecs + (int) secs *1000000; - - /* Recalculate estimated returned number of blocks */ - if (time_per_block < usecs_remaining && time_per_block > 0) - sampler->estblocks = sampler->time * time_per_block; - } - - PG_RETURN_UINT32(sampler->lb); -} - -/* - * Get next tuple offset in current block or InvalidOffsetNumber if we are done - * with this block. 
- */ -Datum -tsm_system_time_nexttuple(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - OffsetNumber maxoffset = PG_GETARG_UINT16(2); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; - OffsetNumber tupoffset = sampler->lt; - - if (tupoffset == InvalidOffsetNumber) - tupoffset = FirstOffsetNumber; - else - tupoffset++; - - if (tupoffset > maxoffset) - tupoffset = InvalidOffsetNumber; - - sampler->lt = tupoffset; - - PG_RETURN_UINT16(tupoffset); -} - -/* - * Cleanup method. - */ -Datum -tsm_system_time_end(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - - pfree(tsdesc->tsmdata); - - PG_RETURN_VOID(); -} - -/* - * Reset state (called by ReScan). - */ -Datum -tsm_system_time_reset(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; - - sampler->lt = InvalidOffsetNumber; - sampler->start_time = GetCurrentTimestamp(); - sampler->end_time = TimestampTzPlusMilliseconds(sampler->start_time, - sampler->time); - sampler->estblocks = 2; - sampler->doneblocks = 0; - - sampler_random_init_state(sampler->seed, sampler->randstate); - sampler->step = random_relative_prime(sampler->nblocks, sampler->randstate); - sampler->lb = sampler_random_fract(sampler->randstate) * (sampler->nblocks / sampler->step); - - PG_RETURN_VOID(); -} - -/* - * Costing function. 
- */ -Datum -tsm_system_time_cost(PG_FUNCTION_ARGS) -{ - PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); - Path *path = (Path *) PG_GETARG_POINTER(1); - RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); - List *args = (List *) PG_GETARG_POINTER(3); - BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); - double *tuples = (double *) PG_GETARG_POINTER(5); Node *limitnode; - int32 time; - BlockNumber relpages; - double reltuples; - double density; + double millis; double spc_random_page_cost; + double npages; + double ntuples; - limitnode = linitial(args); + /* Try to extract an estimate for the limit time spec */ + limitnode = (Node *) linitial(paramexprs); limitnode = estimate_expression_value(root, limitnode); - if (IsA(limitnode, RelabelType)) - limitnode = (Node *) ((RelabelType *) limitnode)->arg; - - if (IsA(limitnode, Const)) - time = DatumGetInt32(((Const *) limitnode)->constvalue); + if (IsA(limitnode, Const) && + !((Const *) limitnode)->constisnull) + { + millis = DatumGetFloat8(((Const *) limitnode)->constvalue); + if (millis < 0 || isnan(millis)) + { + /* Default millis if the value is bogus */ + millis = 1000; + } + } else { - /* Default time (1s) if the estimation didn't return Const. */ - time = 1000; + /* Default millis if we didn't obtain a non-null Const */ + millis = 1000; } - relpages = baserel->pages; - reltuples = baserel->tuples; - - /* estimate the tuple density */ - if (relpages > 0) - density = reltuples / (double) relpages; - else - density = (BLCKSZ - SizeOfPageHeaderData) / baserel->width; - - /* - * We equal random page cost value to number of ms it takes to read the - * random page here which is far from accurate but we don't have anything - * better to base our predicted page reads. 
- */ + /* Get the planner's idea of cost per page read */ get_tablespace_page_costs(baserel->reltablespace, &spc_random_page_cost, NULL); /* - * Assumption here is that we'll never read less than 1% of table pages, - * this is here mainly because it is much less bad to overestimate than - * underestimate and using just spc_random_page_cost will probably lead to - * underestimations in general. + * Estimate the number of pages we can read by assuming that the cost + * figure is expressed in milliseconds. This is completely, unmistakably + * bogus, but we have to do something to produce an estimate and there's + * no better answer. */ - *pages = Min(baserel->pages, Max(time / spc_random_page_cost, baserel->pages / 100)); - *tuples = rint(density * (double) *pages * path->rows / baserel->tuples); - path->rows = *tuples; + if (spc_random_page_cost > 0) + npages = millis / spc_random_page_cost; + else + npages = millis; /* even more bogus, but whatcha gonna do? */ - PG_RETURN_VOID(); + /* Clamp to sane value */ + npages = clamp_row_est(Min((double) baserel->pages, npages)); + + if (baserel->tuples > 0 && baserel->pages > 0) + { + /* Estimate number of tuples returned based on tuple density */ + double density = baserel->tuples / (double) baserel->pages; + + ntuples = npages * density; + } + else + { + /* For lack of data, assume one tuple per page */ + ntuples = npages; + } + + /* Clamp to the estimated relation size */ + ntuples = clamp_row_est(Min(baserel->tuples, ntuples)); + + *pages = npages; + *tuples = ntuples; } +/* + * Initialize during executor setup. + */ +static void +system_time_initsamplescan(SampleScanState *node, int eflags) +{ + node->tsm_state = palloc0(sizeof(SystemTimeSamplerData)); + /* Note the above leaves tsm_state->step equal to zero */ +} + +/* + * Examine parameters and prepare for a sample scan. 
+ */ +static void +system_time_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed) +{ + SystemTimeSamplerData *sampler = (SystemTimeSamplerData *) node->tsm_state; + double millis = DatumGetFloat8(params[0]); + + if (millis < 0 || isnan(millis)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("sample collection time must not be negative"))); + + sampler->seed = seed; + sampler->millis = millis; + sampler->lt = InvalidOffsetNumber; + sampler->doneblocks = 0; + /* start_time, lb will be initialized during first NextSampleBlock call */ + /* we intentionally do not change nblocks/firstblock/step here */ +} + +/* + * Select next block to sample. + * + * Uses linear probing algorithm for picking next block. + */ +static BlockNumber +system_time_nextsampleblock(SampleScanState *node) +{ + SystemTimeSamplerData *sampler = (SystemTimeSamplerData *) node->tsm_state; + HeapScanDesc scan = node->ss.ss_currentScanDesc; + instr_time cur_time; + + /* First call within scan? */ + if (sampler->doneblocks == 0) + { + /* First scan within query? 
*/ + if (sampler->step == 0) + { + /* Initialize now that we have scan descriptor */ + SamplerRandomState randstate; + + /* If relation is empty, there's nothing to scan */ + if (scan->rs_nblocks == 0) + return InvalidBlockNumber; + + /* We only need an RNG during this setup step */ + sampler_random_init_state(sampler->seed, randstate); + + /* Compute nblocks/firstblock/step only once per query */ + sampler->nblocks = scan->rs_nblocks; + + /* Choose random starting block within the relation */ + /* (Actually this is the predecessor of the first block visited) */ + sampler->firstblock = sampler_random_fract(randstate) * + sampler->nblocks; + + /* Find relative prime as step size for linear probing */ + sampler->step = random_relative_prime(sampler->nblocks, randstate); + } + + /* Reinitialize lb and start_time */ + sampler->lb = sampler->firstblock; + INSTR_TIME_SET_CURRENT(sampler->start_time); + } + + /* If we've read all blocks in relation, we're done */ + if (++sampler->doneblocks > sampler->nblocks) + return InvalidBlockNumber; + + /* If we've used up all the allotted time, we're done */ + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, sampler->start_time); + if (INSTR_TIME_GET_MILLISEC(cur_time) >= sampler->millis) + return InvalidBlockNumber; + + /* + * It's probably impossible for scan->rs_nblocks to decrease between scans + * within a query; but just in case, loop until we select a block number + * less than scan->rs_nblocks. We don't care if scan->rs_nblocks has + * increased since the first scan. + */ + do + { + /* Advance lb, using uint64 arithmetic to forestall overflow */ + sampler->lb = ((uint64) sampler->lb + sampler->step) % sampler->nblocks; + } while (sampler->lb >= scan->rs_nblocks); + + return sampler->lb; +} + +/* + * Select next sampled tuple in current block. + * + * In block sampling, we just want to sample all the tuples in each selected + * block. 
+ * + * When we reach end of the block, return InvalidOffsetNumber which tells + * SampleScan to go to next block. + */ +static OffsetNumber +system_time_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset) +{ + SystemTimeSamplerData *sampler = (SystemTimeSamplerData *) node->tsm_state; + OffsetNumber tupoffset = sampler->lt; + + /* Advance to next possible offset on page */ + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + /* Done? */ + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + return tupoffset; +} + +/* + * Compute greatest common divisor of two uint32's. + */ static uint32 gcd(uint32 a, uint32 b) { @@ -296,22 +330,29 @@ gcd(uint32 a, uint32 b) return b; } +/* + * Pick a random value less than and relatively prime to n, if possible + * (else return 1). + */ static uint32 random_relative_prime(uint32 n, SamplerRandomState randstate) { - /* Pick random starting number, with some limits on what it can be. */ - uint32 r = (uint32) sampler_random_fract(randstate) * n / 2 + n / 4, - t; + uint32 r; + + /* Safety check to avoid infinite loop or zero result for small n. */ + if (n <= 1) + return 1; /* * This should only take 2 or 3 iterations as the probability of 2 numbers - * being relatively prime is ~61%. + * being relatively prime is ~61%; but just in case, we'll include a + * CHECK_FOR_INTERRUPTS in the loop. 
*/ - while ((t = gcd(r, n)) > 1) + do { CHECK_FOR_INTERRUPTS(); - r /= t; - } + r = (uint32) (sampler_random_fract(randstate) * n); + } while (r == 0 || gcd(r, n) > 1); return r; } diff --git a/contrib/tsm_system_time/tsm_system_time.control b/contrib/tsm_system_time/tsm_system_time.control index ebcee19d23..c247987c66 100644 --- a/contrib/tsm_system_time/tsm_system_time.control +++ b/contrib/tsm_system_time/tsm_system_time.control @@ -1,5 +1,5 @@ # tsm_system_time extension -comment = 'SYSTEM TABLESAMPLE method which accepts time in milliseconds as a limit' +comment = 'TABLESAMPLE method which accepts time in milliseconds as a limit' default_version = '1.0' module_pathname = '$libdir/tsm_system_time' relocatable = true diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 2c2190f13d..9096ee5d51 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -278,11 +278,6 @@ planner statistics - - pg_tablesample_method - table sampling methods - - pg_tablespace tablespaces within this database cluster @@ -6132,121 +6127,6 @@ - - <structname>pg_tabesample_method</structname> - - - pg_am - - - - The catalog pg_tablesample_method stores - information about table sampling methods which can be used in - TABLESAMPLE clause of a SELECT - statement. - - - - <structname>pg_tablesample_method</> Columns - - - - - Name - Type - References - Description - - - - - - oid - oid - - Row identifier (hidden attribute; must be explicitly selected) - - - - tsmname - name - - Name of the sampling method - - - - tsmseqscan - bool - - If true, the sampling method scans the whole table sequentially. - - - - - tsmpagemode - bool - - If true, the sampling method always reads the pages completely. 
- - - - - tsminit - regproc - pg_proc.oid - Initialize the sampling scan function - - - - tsmnextblock - regproc - pg_proc.oid - Get next block number function - - - - tsmnexttuple - regproc - pg_proc.oid - Get next tuple offset function - - - - tsmexaminetuple - regproc - pg_proc.oid - Function which examines the tuple contents and decides if to - return it, or zero if none - - - - tsmend - regproc - pg_proc.oid - End the sampling scan function - - - - tsmreset - regproc - pg_proc.oid - Restart the state of sampling scan function - - - - tsmcost - regproc - pg_proc.oid - Costing function - - - - -
- -
- - <structname>pg_tablespace</structname> diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index 8e13555a3a..8113ddf817 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -4346,7 +4346,7 @@ SET xmloption TO { DOCUMENT | CONTENT }; an object identifier. There are also several alias types for oid: regproc, regprocedure, regoper, regoperator, regclass, - regtype, regrole, regnamespace, + regtype, regrole, regnamespace, regconfig, and regdictionary. shows an overview. @@ -4622,6 +4622,10 @@ SELECT * FROM pg_attribute fdw_handler + + tsm_handler + + cstring @@ -4716,6 +4720,11 @@ SELECT * FROM pg_attribute A foreign-data wrapper handler is declared to return fdw_handler.
+ + tsm_handler + A tablesample method handler is declared to return tsm_handler. + + record Identifies a function returning an unspecified row type. diff --git a/doc/src/sgml/postgres.sgml b/doc/src/sgml/postgres.sgml index d1703e9c01..7e82cdc3b1 100644 --- a/doc/src/sgml/postgres.sgml +++ b/doc/src/sgml/postgres.sgml @@ -243,6 +243,7 @@ &nls; &plhandler; &fdwhandler; + &tablesample-method; &custom-scan; &geqo; &indexam; @@ -250,7 +251,6 @@ &spgist; &gin; &brin; - &tablesample-method; &storage; &bki; &planstats; diff --git a/doc/src/sgml/ref/select.sgml b/doc/src/sgml/ref/select.sgml index 632d7935cb..44810f4909 100644 --- a/doc/src/sgml/ref/select.sgml +++ b/doc/src/sgml/ref/select.sgml @@ -49,7 +49,8 @@ SELECT [ ALL | DISTINCT [ ON ( expressionwhere from_item can be one of: - [ ONLY ] table_name [ * ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] [ TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] ] + [ ONLY ] table_name [ * ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] + [ TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] ] [ LATERAL ] ( select ) [ AS ] alias [ ( column_alias [, ...] ) ] with_query_name [ [ AS ] alias [ ( column_alias [, ...] ) ] ] [ LATERAL ] function_name ( [ argument [, ...] ] ) @@ -325,50 +326,6 @@ TABLE [ ONLY ] table_name [ * ] - - TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] - - - Table sample clause after - table_name indicates that - a sampling_method should - be used to retrieve subset of rows in the table. - The sampling_method can be - any sampling method installed in the database. There are currently two - sampling methods available in the standard - PostgreSQL distribution: - - - SYSTEM - - - BERNOULLI - - - Both of these sampling methods currently accept only single argument - which is the percent (floating point from 0 to 100) of the rows to - be returned. 
- The SYSTEM sampling method does block level - sampling with each block having the same chance of being selected and - returns all rows from each selected block. - The BERNOULLI scans whole table and returns - individual rows with equal probability. Additional sampling methods - may be installed in the database via extensions. - - - The optional parameter REPEATABLE uses the seed - parameter, which can be a number or expression producing a number, as - a random seed for sampling. Note that subsequent commands may return - different results even if same REPEATABLE clause was - specified. This happens because DML statements and - maintenance operations such as VACUUM may affect physical - distribution of data. The setseed() function will not - affect the sampling result when the REPEATABLE - parameter is used. - - - - alias @@ -387,6 +344,61 @@ TABLE [ ONLY ] table_name [ * ] + + TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] + + + A TABLESAMPLE clause after + a table_name indicates that the + specified sampling_method + should be used to retrieve a subset of the rows in that table. + This sampling precedes the application of any other filters such + as WHERE clauses. + The standard PostgreSQL distribution + includes two sampling methods, BERNOULLI + and SYSTEM, and other sampling methods can be + installed in the database via extensions. + + + + The BERNOULLI and SYSTEM sampling methods + each accept a single argument + which is the fraction of the table to sample, expressed as a + percentage between 0 and 100. This argument can be + any real-valued expression. (Other sampling methods might + accept more or different arguments.) These two methods each return + a randomly-chosen sample of the table that will contain + approximately the specified percentage of the table's rows. + The BERNOULLI method scans the whole table and + selects or ignores individual rows independently with the specified + probability. 
+ The SYSTEM method does block-level sampling with + each block having the specified chance of being selected; all rows + in each selected block are returned. + The SYSTEM method is significantly faster than + the BERNOULLI method when small sampling + percentages are specified, but it may return a less-random sample of + the table as a result of clustering effects. + + + + The optional REPEATABLE clause specifies + a seed number or expression to use + for generating random numbers within the sampling method. The seed + value can be any non-null floating-point value. Two queries that + specify the same seed and argument + values will select the same sample of the table, if the table has + not been changed meanwhile. But different seed values will usually + produce different samples. + If REPEATABLE is not given then a new random + sample is selected for each query. + Note that some add-on sampling methods do not + accept REPEATABLE, and will always produce new + samples on each use. + + + + select @@ -1870,6 +1882,16 @@ SELECT distributors.* WHERE distributors.name = 'Westward'; + + <literal>TABLESAMPLE</literal> Clause Restrictions + + + The TABLESAMPLE clause is currently accepted only on + regular tables and materialized views. According to the SQL standard + it should be possible to apply it to any FROM item. + + + Function Calls in <literal>FROM</literal> @@ -1993,19 +2015,5 @@ SELECT distributors.* WHERE distributors.name = 'Westward'; - - <literal>TABLESAMPLE</literal> clause - - - The TABLESAMPLE clause is currently accepted only on physical - relations and materialized views. - - - - Additional modules allow you to install custom sampling methods and use - them instead of the SQL standard methods. 
- - - diff --git a/doc/src/sgml/tablesample-method.sgml b/doc/src/sgml/tablesample-method.sgml index 48eb7fe84e..22f8bbe19a 100644 --- a/doc/src/sgml/tablesample-method.sgml +++ b/doc/src/sgml/tablesample-method.sgml @@ -1,139 +1,301 @@ - Writing A TABLESAMPLE Sampling Method + Writing A Table Sampling Method - tablesample method + table sampling method + + + + TABLESAMPLE method - The TABLESAMPLE clause implementation in - PostgreSQL supports creating a custom sampling methods. - These methods control what sample of the table will be returned when the - TABLESAMPLE clause is used. + PostgreSQL's implementation of the TABLESAMPLE + clause supports custom table sampling methods, in addition to + the BERNOULLI and SYSTEM methods that are required + by the SQL standard. The sampling method determines which rows of the + table will be selected when the TABLESAMPLE clause is used. - - Tablesample Method Functions + + At the SQL level, a table sampling method is represented by a single SQL + function, typically implemented in C, having the signature + +method_name(internal) RETURNS tsm_handler + + The name of the function is the same method name appearing in the + TABLESAMPLE clause. The internal argument is a dummy + (always having value zero) that simply serves to prevent this function from + being called directly from a SQL command. + The result of the function must be a palloc'd struct of + type TsmRoutine, which contains pointers to support functions for + the sampling method. These support functions are plain C functions and + are not visible or callable at the SQL level. The support functions are + described in . + + + + In addition to function pointers, the TsmRoutine struct must + provide these additional fields: + + + + + List *parameterTypes + + + This is an OID list containing the data type OIDs of the parameter(s) + that will be accepted by the TABLESAMPLE clause when this + sampling method is used. 
For example, for the built-in methods, this + list contains a single item with value FLOAT4OID, which + represents the sampling percentage. Custom sampling methods can have + more or different parameters. + + + + + + bool repeatable_across_queries + + + If true, the sampling method can deliver identical samples + across successive queries, if the same parameters + and REPEATABLE seed value are supplied each time and the + table contents have not changed. When this is false, + the REPEATABLE clause is not accepted for use with the + sampling method. + + + + + + bool repeatable_across_scans + + + If true, the sampling method can deliver identical samples + across successive scans in the same query (assuming unchanging + parameters, seed value, and snapshot). + When this is false, the planner will not select plans that + would require scanning the sampled table more than once, since that + might result in inconsistent query output. + + + + + + + The TsmRoutine struct type is declared + in src/include/access/tsmapi.h, which see for additional + details. + + + + The table sampling methods included in the standard distribution are good + references when trying to write your own. Look into + the src/backend/access/tablesample subdirectory of the source + tree for the built-in sampling methods, and into the contrib + subdirectory for add-on methods. + + + + Sampling Method Support Functions - The tablesample method must provide following set of functions: + The TSM handler function returns a palloc'd TsmRoutine struct + containing pointers to the support functions described below. Most of + the functions are required, but some are optional, and those pointers can + be NULL. void -tsm_init (TableSampleDesc *desc, - uint32 seed, ...); +SampleScanGetSampleSize (PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); - Initialize the tablesample scan. The function is called at the beginning - of each relation scan. 
+ + This function is called during planning. It must estimate the number of + relation pages that will be read during a sample scan, and the number of + tuples that will be selected by the scan. (For example, these might be + determined by estimating the sampling fraction, and then multiplying + the baserel->pages and baserel->tuples + numbers by that, being sure to round the results to integral values.) + The paramexprs list holds the expression(s) that are + parameters to the TABLESAMPLE clause. It is recommended to + use estimate_expression_value() to try to reduce these + expressions to constants, if their values are needed for estimation + purposes; but the function must provide size estimates even if they cannot + be reduced, and it should not fail even if the values appear invalid + (remember that they're only estimates of what the run-time values will be). + The pages and tuples parameters are outputs. + - Note that the first two parameters are required but you can specify - additional parameters which then will be used by the TABLESAMPLE - clause to determine the required user input in the query itself. - This means that if your function will specify additional float4 parameter - named percent, the user will have to call the tablesample method with - expression which evaluates (or can be coerced) to float4. - For example this definition: -tsm_init (TableSampleDesc *desc, - uint32 seed, float4 pct); +void +InitSampleScan (SampleScanState *node, + int eflags); -Will lead to SQL call like this: + + Initialize for execution of a SampleScan plan node. + This is called during executor startup. + It should perform any initialization needed before processing can start. + The SampleScanState node has already been created, but + its tsm_state field is NULL. + The InitSampleScan function can palloc whatever internal + state data is needed by the sampling method, and store a pointer to + it in node->tsm_state. 
+ Information about the table to scan is accessible through other fields + of the SampleScanState node (but note that the + node->ss.ss_currentScanDesc scan descriptor is not set + up yet). + eflags contains flag bits describing the executor's + operating mode for this plan node. + + + + When (eflags & EXEC_FLAG_EXPLAIN_ONLY) is true, + the scan will not actually be performed, so this function should only do + the minimum required to make the node state valid for EXPLAIN + and EndSampleScan. + + + + This function can be omitted (set the pointer to NULL), in which case + BeginSampleScan must perform all initialization needed + by the sampling method. + + + -... TABLESAMPLE yourmethod(0.5) ... +void +BeginSampleScan (SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); + + Begin execution of a sampling scan. + This is called just before the first attempt to fetch a tuple, and + may be called again if the scan needs to be restarted. + Information about the table to scan is accessible through fields + of the SampleScanState node (but note that the + node->ss.ss_currentScanDesc scan descriptor is not set + up yet). + The params array, of length nparams, contains the + values of the parameters supplied in the TABLESAMPLE clause. + These will have the number and types specified in the sampling + method's parameterTypes list, and have been checked + to not be null. + seed contains a seed to use for any random numbers generated + within the sampling method; it is either a hash derived from the + REPEATABLE value if one was given, or the result + of random() if not. + + + + This function may adjust the fields node->use_bulkread + and node->use_pagemode. + If node->use_bulkread is true, which it is by + default, the scan will use a buffer access strategy that encourages + recycling buffers after use. It might be reasonable to set this + to false if the scan will visit only a small fraction of the + table's pages. 
+ If node->use_pagemode is true, which it is by + default, the scan will perform visibility checking in a single pass for + all tuples on each visited page. It might be reasonable to set this + to false if the scan will select only a small fraction of the + tuples on each visited page. That will result in fewer tuple visibility + checks being performed, though each one will be more expensive because it + will require more locking. + + + + If the sampling method is + marked repeatable_across_scans, it must be able to + select the same set of tuples during a rescan as it did originally, that is + a fresh call of BeginSampleScan must lead to selecting the + same tuples as before (if the TABLESAMPLE parameters + and seed don't change). BlockNumber -tsm_nextblock (TableSampleDesc *desc); +NextSampleBlock (SampleScanState *node); - Returns the block number of next page to be scanned. InvalidBlockNumber - should be returned if the sampling has reached end of the relation. + + Returns the block number of the next page to be scanned, or + InvalidBlockNumber if no pages remain to be scanned. + + + + This function can be omitted (set the pointer to NULL), in which case + the core code will perform a sequential scan of the entire relation. + Such a scan can use synchronized scanning, so that the sampling method + cannot assume that the relation pages are visited in the same order on + each scan. OffsetNumber -tsm_nexttuple (TableSampleDesc *desc, BlockNumber blockno, - OffsetNumber maxoffset); +NextSampleTuple (SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); - Return next tuple offset for the current page. InvalidOffsetNumber should - be returned if the sampling has reached end of the page. + + Returns the offset number of the next tuple to be sampled on the + specified page, or InvalidOffsetNumber if no tuples remain to + be sampled. maxoffset is the largest offset number in use + on the page. 
+ + + NextSampleTuple is not explicitly told which of the offset + numbers in the range 1 .. maxoffset actually contain valid + tuples. This is not normally a problem since the core code ignores + requests to sample missing or invisible tuples; that should not result in + any bias in the sample. However, if necessary, the function can + examine node->ss.ss_currentScanDesc->rs_vistuples[] + to identify which tuples are valid and visible. (This + requires node->use_pagemode to be true.) + + + + + + NextSampleTuple must not assume + that blockno is the same page number returned by the most + recent NextSampleBlock call. It was returned by some + previous NextSampleBlock call, but the core code is allowed + to call NextSampleBlock in advance of actually scanning + pages, so as to support prefetching. It is OK to assume that once + sampling of a given page begins, successive NextSampleTuple + calls all refer to the same page until InvalidOffsetNumber is + returned. + + + void -tsm_end (TableSampleDesc *desc); +EndSampleScan (SampleScanState *node); - The scan has finished, cleanup any left over state. + + End the scan and release resources. It is normally not important + to release palloc'd memory, but any externally-visible resources + should be cleaned up. + This function can be omitted (set the pointer to NULL) in the common + case where no such resources exist. - - -void -tsm_reset (TableSampleDesc *desc); - - The scan needs to rescan the relation again, reset any tablesample method - state. - - - - -void -tsm_cost (PlannerInfo *root, Path *path, RelOptInfo *baserel, - List *args, BlockNumber *pages, double *tuples); - - This function is used by optimizer to decide best plan and is also used - for output of EXPLAIN. - - - - There is one more function which tablesampling method can implement in order - to gain more fine grained control over sampling. 
This function is optional: - - - - -bool -tsm_examinetuple (TableSampleDesc *desc, BlockNumber blockno, - HeapTuple tuple, bool visible); - - Function that enables the sampling method to examine contents of the tuple - (for example to collect some internal statistics). The return value of this - function is used to determine if the tuple should be returned to client. - Note that this function will receive even invisible tuples but it is not - allowed to return true for such tuple (if it does, - PostgreSQL will raise an error). - - - - As you can see most of the tablesample method interfaces get the - TableSampleDesc as a first parameter. This structure holds - state of the current scan and also provides storage for the tablesample - method's state. It is defined as following: - -typedef struct TableSampleDesc { - HeapScanDesc heapScan; - TupleDesc tupDesc; - - void *tsmdata; -} TableSampleDesc; - - Where heapScan is the descriptor of the physical table scan. - It's possible to get table size info from it. The tupDesc - represents the tuple descriptor of the tuples returned by the scan and passed - to the tsm_examinetuple() interface. The tsmdata - can be used by tablesample method itself to store any state info it might - need during the scan. If used by the method, it should be pfreed - in tsm_end() function. - diff --git a/doc/src/sgml/tsm-system-rows.sgml b/doc/src/sgml/tsm-system-rows.sgml index 0c2f1779c9..93aa536664 100644 --- a/doc/src/sgml/tsm-system-rows.sgml +++ b/doc/src/sgml/tsm-system-rows.sgml @@ -8,24 +8,37 @@ - The tsm_system_rows module provides the tablesample method - SYSTEM_ROWS, which can be used inside the - TABLESAMPLE clause of a SELECT. + The tsm_system_rows module provides the table sampling method + SYSTEM_ROWS, which can be used in + the TABLESAMPLE clause of a + SELECT command.
- This tablesample method uses a linear probing algorithm to read sample - of a table and uses actual number of rows as limit (unlike the - SYSTEM tablesample method which limits by percentage - of a table). + This table sampling method accepts a single integer argument that is the + maximum number of rows to read. The resulting sample will always contain + exactly that many rows, unless the table does not contain enough rows, in + which case the whole table is selected. + + + + Like the built-in SYSTEM sampling + method, SYSTEM_ROWS performs block-level sampling, so + that the sample is not completely random but may be subject to clustering + effects, especially if only a small number of rows are requested. + + + + SYSTEM_ROWS does not support + the REPEATABLE clause. Examples - Here is an example of selecting sample of a table with - SYSTEM_ROWS. First install the extension: + Here is an example of selecting a sample of a table with + SYSTEM_ROWS. First install the extension: @@ -33,8 +46,7 @@ CREATE EXTENSION tsm_system_rows; - Then you can use it in SELECT command same way as other - tablesample methods: + Then you can use it in a SELECT command, for instance: SELECT * FROM my_table TABLESAMPLE SYSTEM_ROWS(100); @@ -42,8 +54,9 @@ SELECT * FROM my_table TABLESAMPLE SYSTEM_ROWS(100); - The above command will return a sample of 100 rows from the table my_table - (less if the table does not have 100 visible rows). + This command will return a sample of 100 rows from the + table my_table (unless the table does not have 100 + visible rows, in which case all its rows are returned). diff --git a/doc/src/sgml/tsm-system-time.sgml b/doc/src/sgml/tsm-system-time.sgml index 2343ab16d4..3f8ff1a026 100644 --- a/doc/src/sgml/tsm-system-time.sgml +++ b/doc/src/sgml/tsm-system-time.sgml @@ -8,25 +8,39 @@ - The tsm_system_time module provides the tablesample method - SYSTEM_TIME, which can be used inside the - TABLESAMPLE clause of a SELECT. 
+ The tsm_system_time module provides the table sampling method + SYSTEM_TIME, which can be used in + the TABLESAMPLE clause of a + SELECT command. - This tablesample method uses a linear probing algorithm to read sample - of a table and uses time in milliseconds as limit (unlike the - SYSTEM tablesample method which limits by percentage - of a table). This gives you some control over the length of execution - of your query. + This table sampling method accepts a single floating-point argument that + is the maximum number of milliseconds to spend reading the table. This + gives you direct control over how long the query takes, at the price that + the size of the sample becomes hard to predict. The resulting sample will + contain as many rows as could be read in the specified time, unless the + whole table has been read first. + + + + Like the built-in SYSTEM sampling + method, SYSTEM_TIME performs block-level sampling, so + that the sample is not completely random but may be subject to clustering + effects, especially if only a small number of rows are selected. + + + + SYSTEM_TIME does not support + the REPEATABLE clause. Examples - Here is an example of selecting sample of a table with - SYSTEM_TIME. First install the extension: + Here is an example of selecting a sample of a table with + SYSTEM_TIME. First install the extension: @@ -34,8 +48,7 @@ CREATE EXTENSION tsm_system_time; - Then you can use it in a SELECT command the same way as - other tablesample methods: + Then you can use it in a SELECT command, for instance: SELECT * FROM my_table TABLESAMPLE SYSTEM_TIME(1000); @@ -43,8 +56,9 @@ SELECT * FROM my_table TABLESAMPLE SYSTEM_TIME(1000); - The above command will return as large a sample of my_table as it can read in - 1 second (or less if it reads whole table faster). + This command will return as large a sample of my_table as + it can read in 1 second (1000 milliseconds).
Of course, if the whole + table can be read in under 1 second, all its rows will be returned. diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 6f4ff2718f..050efdc480 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -80,8 +80,11 @@ bool synchronize_seqscans = true; static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_sync, bool allow_pagemode, - bool is_bitmapscan, bool is_samplescan, + bool allow_strat, + bool allow_sync, + bool allow_pagemode, + bool is_bitmapscan, + bool is_samplescan, bool temp_snap); static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options); @@ -207,7 +210,7 @@ static const int MultiXactStatusLock[MaxMultiXactStatus + 1] = * ---------------- */ static void -initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) +initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) { bool allow_strat; bool allow_sync; @@ -257,12 +260,12 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) scan->rs_strategy = NULL; } - if (is_rescan) + if (keep_startblock) { /* - * If rescan, keep the previous startblock setting so that rewinding a - * cursor doesn't generate surprising results. Reset the syncscan - * setting, though. + * When rescanning, we want to keep the previous startblock setting, + * so that rewinding a cursor doesn't generate surprising results. + * Reset the active syncscan setting, though. */ scan->rs_syncscan = (allow_sync && synchronize_seqscans); } @@ -1313,6 +1316,10 @@ heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode, /* ---------------- * heap_beginscan - begin relation scan * + * heap_beginscan is the "standard" case. + * + * heap_beginscan_catalog differs in setting up its own temporary snapshot. 
+ * * heap_beginscan_strat offers an extended API that lets the caller control * whether a nondefault buffer access strategy can be used, and whether * syncscan can be chosen (possibly resulting in the scan not starting from @@ -1323,8 +1330,11 @@ heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode, * really quite unlike a standard seqscan, there is just enough commonality * to make it worth using the same data structure. * - * heap_beginscan_samplingscan is alternate entry point for setting up a - * HeapScanDesc for a TABLESAMPLE scan. + * heap_beginscan_sampling is an alternative entry point for setting up a + * HeapScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth + * using the same data structure although the behavior is rather different. + * In addition to the options offered by heap_beginscan_strat, this call + * also allows control of whether page-mode visibility checking is used. * ---------------- */ HeapScanDesc @@ -1366,18 +1376,22 @@ heap_beginscan_bm(Relation relation, Snapshot snapshot, HeapScanDesc heap_beginscan_sampling(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_pagemode) + bool allow_strat, bool allow_sync, bool allow_pagemode) { return heap_beginscan_internal(relation, snapshot, nkeys, key, - allow_strat, false, allow_pagemode, + allow_strat, allow_sync, allow_pagemode, false, true, false); } static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_sync, bool allow_pagemode, - bool is_bitmapscan, bool is_samplescan, bool temp_snap) + bool allow_strat, + bool allow_sync, + bool allow_pagemode, + bool is_bitmapscan, + bool is_samplescan, + bool temp_snap) { HeapScanDesc scan; @@ -1461,6 +1475,27 @@ heap_rescan(HeapScanDesc scan, initscan(scan, key, true); } +/* ---------------- + * heap_rescan_set_params - restart a relation scan after changing params + * + * This call allows changing the 
buffer strategy, syncscan, and pagemode + * options before starting a fresh scan. Note that although the actual use + * of syncscan might change (effectively, enabling or disabling reporting), + * the previously selected startblock will be kept. + * ---------------- + */ +void +heap_rescan_set_params(HeapScanDesc scan, ScanKey key, + bool allow_strat, bool allow_sync, bool allow_pagemode) +{ + /* adjust parameters */ + scan->rs_allow_strat = allow_strat; + scan->rs_allow_sync = allow_sync; + scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(scan->rs_snapshot); + /* ... and rescan */ + heap_rescan(scan, key); +} + /* ---------------- * heap_endscan - end relation scan * diff --git a/src/backend/access/tablesample/Makefile b/src/backend/access/tablesample/Makefile index 46eeb59f9c..68d9ab2814 100644 --- a/src/backend/access/tablesample/Makefile +++ b/src/backend/access/tablesample/Makefile @@ -1,10 +1,10 @@ #------------------------------------------------------------------------- # # Makefile-- -# Makefile for utils/tablesample +# Makefile for access/tablesample # # IDENTIFICATION -# src/backend/utils/tablesample/Makefile +# src/backend/access/tablesample/Makefile # #------------------------------------------------------------------------- @@ -12,6 +12,6 @@ subdir = src/backend/access/tablesample top_builddir = ../../../.. 
include $(top_builddir)/src/Makefile.global -OBJS = tablesample.o system.o bernoulli.o +OBJS = bernoulli.o system.o tablesample.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c index 0a53900822..cf88f95e75 100644 --- a/src/backend/access/tablesample/bernoulli.c +++ b/src/backend/access/tablesample/bernoulli.c @@ -1,233 +1,231 @@ /*------------------------------------------------------------------------- * * bernoulli.c - * interface routines for BERNOULLI tablesample method + * support routines for BERNOULLI tablesample method * - * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * To ensure repeatability of samples, it is necessary that selection of a + * given tuple be history-independent; otherwise syncscanning would break + * repeatability, to say nothing of logically-irrelevant maintenance such + * as physical extension or shortening of the relation. + * + * To achieve that, we proceed by hashing each candidate TID together with + * the active seed, and then selecting it if the hash is less than the + * cutoff value computed from the selection probability by BeginSampleScan. 
+ * + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * src/backend/utils/tablesample/bernoulli.c + * src/backend/access/tablesample/bernoulli.c * *------------------------------------------------------------------------- */ #include "postgres.h" -#include "fmgr.h" +#ifdef _MSC_VER +#include /* for _isnan */ +#endif +#include -#include "access/tablesample.h" -#include "access/relscan.h" -#include "nodes/execnodes.h" -#include "nodes/relation.h" +#include "access/hash.h" +#include "access/tsmapi.h" +#include "catalog/pg_type.h" #include "optimizer/clauses.h" -#include "storage/bufmgr.h" -#include "utils/sampling.h" +#include "optimizer/cost.h" +#include "utils/builtins.h" -/* tsdesc */ +/* Private state */ typedef struct { + uint64 cutoff; /* select tuples with hash less than this */ uint32 seed; /* random seed */ - BlockNumber startblock; /* starting block, we use ths for syncscan - * support */ - BlockNumber nblocks; /* number of blocks */ - BlockNumber blockno; /* current block */ - float4 probability; /* probabilty that tuple will be returned - * (0.0-1.0) */ OffsetNumber lt; /* last tuple returned from current block */ - SamplerRandomState randstate; /* random generator tsdesc */ } BernoulliSamplerData; + +static void bernoulli_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); +static void bernoulli_initsamplescan(SampleScanState *node, + int eflags); +static void bernoulli_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); +static OffsetNumber bernoulli_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); + + /* - * Initialize the state. + * Create a TsmRoutine descriptor for the BERNOULLI method. 
*/ Datum -tsm_bernoulli_init(PG_FUNCTION_ARGS) +tsm_bernoulli_handler(PG_FUNCTION_ARGS) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - uint32 seed = PG_GETARG_UINT32(1); - float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2); - HeapScanDesc scan = tsdesc->heapScan; - BernoulliSamplerData *sampler; + TsmRoutine *tsm = makeNode(TsmRoutine); - if (percent < 0 || percent > 100) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("invalid sample size"), - errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); + tsm->parameterTypes = list_make1_oid(FLOAT4OID); + tsm->repeatable_across_queries = true; + tsm->repeatable_across_scans = true; + tsm->SampleScanGetSampleSize = bernoulli_samplescangetsamplesize; + tsm->InitSampleScan = bernoulli_initsamplescan; + tsm->BeginSampleScan = bernoulli_beginsamplescan; + tsm->NextSampleBlock = NULL; + tsm->NextSampleTuple = bernoulli_nextsampletuple; + tsm->EndSampleScan = NULL; - sampler = palloc0(sizeof(BernoulliSamplerData)); - - /* Remember initial values for reinit */ - sampler->seed = seed; - sampler->startblock = scan->rs_startblock; - sampler->nblocks = scan->rs_nblocks; - sampler->blockno = InvalidBlockNumber; - sampler->probability = percent / 100; - sampler->lt = InvalidOffsetNumber; - sampler_random_init_state(sampler->seed, sampler->randstate); - - tsdesc->tsmdata = (void *) sampler; - - PG_RETURN_VOID(); + PG_RETURN_POINTER(tsm); } /* - * Get next block number to read or InvalidBlockNumber if we are at the - * end of the relation. + * Sample size estimation. 
*/ -Datum -tsm_bernoulli_nextblock(PG_FUNCTION_ARGS) +static void +bernoulli_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata; + Node *pctnode; + float4 samplefract; - /* - * Bernoulli sampling scans all blocks on the table and supports syncscan - * so loop from startblock to startblock instead of from 0 to nblocks. - */ - if (sampler->blockno == InvalidBlockNumber) - sampler->blockno = sampler->startblock; + /* Try to extract an estimate for the sample percentage */ + pctnode = (Node *) linitial(paramexprs); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, Const) && + !((Const *) pctnode)->constisnull) + { + samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue); + if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract)) + samplefract /= 100.0f; + else + { + /* Default samplefract if the value is bogus */ + samplefract = 0.1f; + } + } else { - sampler->blockno++; - - if (sampler->blockno >= sampler->nblocks) - sampler->blockno = 0; - - if (sampler->blockno == sampler->startblock) - PG_RETURN_UINT32(InvalidBlockNumber); + /* Default samplefract if we didn't obtain a non-null Const */ + samplefract = 0.1f; } - PG_RETURN_UINT32(sampler->blockno); + /* We'll visit all pages of the baserel */ + *pages = baserel->pages; + + *tuples = clamp_row_est(baserel->tuples * samplefract); } /* - * Get next tuple from current block. + * Initialize during executor setup. + */ +static void +bernoulli_initsamplescan(SampleScanState *node, int eflags) +{ + node->tsm_state = palloc0(sizeof(BernoulliSamplerData)); +} + +/* + * Examine parameters and prepare for a sample scan. 
+ */ +static void +bernoulli_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed) +{ + BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state; + double percent = DatumGetFloat4(params[0]); + + if (percent < 0 || percent > 100 || isnan(percent)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("sample percentage must be between 0 and 100"))); + + /* + * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to + * store that as a uint64, of course. Note that this gives strictly + * correct behavior at the limits of zero or one probability. + */ + sampler->cutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100); + sampler->seed = seed; + sampler->lt = InvalidOffsetNumber; + + /* + * Use bulkread, since we're scanning all pages. But pagemode visibility + * checking is a win only at larger sampling fractions. The 25% cutoff + * here is based on very limited experimentation. + */ + node->use_bulkread = true; + node->use_pagemode = (percent >= 25); +} + +/* + * Select next sampled tuple in current block. * - * This method implements the main logic in bernoulli sampling. - * The algorithm simply generates new random number (in 0.0-1.0 range) and if - * it falls within user specified probability (in the same range) return the - * tuple offset. + * It is OK here to return an offset without knowing if the tuple is visible + * (or even exists). The reason is that we do the coinflip for every tuple + * offset in the table. Since all tuples have the same probability of being + * returned, it doesn't matter if we do extra coinflips for invisible tuples. * - * It is ok here to return tuple offset without knowing if tuple is visible - * and not check it via examinetuple. The reason for that is that we do the - * coinflip (random number generation) for every tuple in the table. 
Since all - * tuples have same probability of being returned the visible and invisible - * tuples will be returned in same ratio as they have in the actual table. - * This means that there is no skew towards either visible or invisible tuples - * and the number of visible tuples returned from the executor node should - * match the fraction of visible tuples which was specified by user. - * - * This is faster than doing the coinflip in examinetuple because we don't - * have to do visibility checks on uninteresting tuples. - * - * If we reach end of the block return InvalidOffsetNumber which tells + * When we reach end of the block, return InvalidOffsetNumber which tells * SampleScan to go to next block. */ -Datum -tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS) +static OffsetNumber +bernoulli_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - OffsetNumber maxoffset = PG_GETARG_UINT16(2); - BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata; + BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state; OffsetNumber tupoffset = sampler->lt; - float4 probability = sampler->probability; + uint32 hashinput[3]; + /* Advance to first/next tuple in block */ if (tupoffset == InvalidOffsetNumber) tupoffset = FirstOffsetNumber; else tupoffset++; /* - * Loop over tuple offsets until the random generator returns value that - * is within the probability of returning the tuple or until we reach end - * of the block. + * We compute the hash by applying hash_any to an array of 3 uint32's + * containing the block, offset, and seed. This is efficient to set up, + * and with the current implementation of hash_any, it gives + * machine-independent results, which is a nice property for regression + * testing. 
* - * (This is our implementation of bernoulli trial) + * These words in the hash input are the same throughout the block: */ - while (sampler_random_fract(sampler->randstate) > probability) - { - tupoffset++; + hashinput[0] = blockno; + hashinput[2] = sampler->seed; - if (tupoffset > maxoffset) + /* + * Loop over tuple offsets until finding suitable TID or reaching end of + * block. + */ + for (; tupoffset <= maxoffset; tupoffset++) + { + uint32 hash; + + hashinput[1] = tupoffset; + + hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, + (int) sizeof(hashinput))); + if (hash < sampler->cutoff) break; } if (tupoffset > maxoffset) - /* Tell SampleScan that we want next block. */ tupoffset = InvalidOffsetNumber; sampler->lt = tupoffset; - PG_RETURN_UINT16(tupoffset); -} - -/* - * Cleanup method. - */ -Datum -tsm_bernoulli_end(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - - pfree(tsdesc->tsmdata); - - PG_RETURN_VOID(); -} - -/* - * Reset tsdesc (called by ReScan). - */ -Datum -tsm_bernoulli_reset(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata; - - sampler->blockno = InvalidBlockNumber; - sampler->lt = InvalidOffsetNumber; - sampler_random_init_state(sampler->seed, sampler->randstate); - - PG_RETURN_VOID(); -} - -/* - * Costing function. 
- */ -Datum -tsm_bernoulli_cost(PG_FUNCTION_ARGS) -{ - PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); - Path *path = (Path *) PG_GETARG_POINTER(1); - RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); - List *args = (List *) PG_GETARG_POINTER(3); - BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); - double *tuples = (double *) PG_GETARG_POINTER(5); - Node *pctnode; - float4 samplesize; - - *pages = baserel->pages; - - pctnode = linitial(args); - pctnode = estimate_expression_value(root, pctnode); - - if (IsA(pctnode, RelabelType)) - pctnode = (Node *) ((RelabelType *) pctnode)->arg; - - if (IsA(pctnode, Const)) - { - samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); - samplesize /= 100.0; - } - else - { - /* Default samplesize if the estimation didn't return Const. */ - samplesize = 0.1f; - } - - *tuples = path->rows * samplesize; - path->rows = *tuples; - - PG_RETURN_VOID(); + return tupoffset; } diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c index 1d834369a4..43c5dab716 100644 --- a/src/backend/access/tablesample/system.c +++ b/src/backend/access/tablesample/system.c @@ -1,186 +1,260 @@ /*------------------------------------------------------------------------- * * system.c - * interface routines for system tablesample method + * support routines for SYSTEM tablesample method + * + * To ensure repeatability of samples, it is necessary that selection of a + * given tuple be history-independent; otherwise syncscanning would break + * repeatability, to say nothing of logically-irrelevant maintenance such + * as physical extension or shortening of the relation. + * + * To achieve that, we proceed by hashing each candidate block number together + * with the active seed, and then selecting it if the hash is less than the + * cutoff value computed from the selection probability by BeginSampleScan. 
* * - * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * src/backend/utils/tablesample/system.c + * src/backend/access/tablesample/system.c * *------------------------------------------------------------------------- */ #include "postgres.h" -#include "fmgr.h" +#ifdef _MSC_VER +#include /* for _isnan */ +#endif +#include -#include "access/tablesample.h" +#include "access/hash.h" #include "access/relscan.h" -#include "nodes/execnodes.h" -#include "nodes/relation.h" +#include "access/tsmapi.h" +#include "catalog/pg_type.h" #include "optimizer/clauses.h" -#include "storage/bufmgr.h" -#include "utils/sampling.h" +#include "optimizer/cost.h" +#include "utils/builtins.h" -/* - * State - */ +/* Private state */ typedef struct { - BlockSamplerData bs; + uint64 cutoff; /* select blocks with hash less than this */ uint32 seed; /* random seed */ - BlockNumber nblocks; /* number of block in relation */ - int samplesize; /* number of blocks to return */ + BlockNumber nextblock; /* next block to consider sampling */ OffsetNumber lt; /* last tuple returned from current block */ } SystemSamplerData; +static void system_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); +static void system_initsamplescan(SampleScanState *node, + int eflags); +static void system_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); +static BlockNumber system_nextsampleblock(SampleScanState *node); +static OffsetNumber system_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); + + /* - * Initializes the state. + * Create a TsmRoutine descriptor for the SYSTEM method. 
*/ Datum -tsm_system_init(PG_FUNCTION_ARGS) +tsm_system_handler(PG_FUNCTION_ARGS) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - uint32 seed = PG_GETARG_UINT32(1); - float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2); - HeapScanDesc scan = tsdesc->heapScan; - SystemSamplerData *sampler; + TsmRoutine *tsm = makeNode(TsmRoutine); - if (percent < 0 || percent > 100) + tsm->parameterTypes = list_make1_oid(FLOAT4OID); + tsm->repeatable_across_queries = true; + tsm->repeatable_across_scans = true; + tsm->SampleScanGetSampleSize = system_samplescangetsamplesize; + tsm->InitSampleScan = system_initsamplescan; + tsm->BeginSampleScan = system_beginsamplescan; + tsm->NextSampleBlock = system_nextsampleblock; + tsm->NextSampleTuple = system_nextsampletuple; + tsm->EndSampleScan = NULL; + + PG_RETURN_POINTER(tsm); +} + +/* + * Sample size estimation. + */ +static void +system_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples) +{ + Node *pctnode; + float4 samplefract; + + /* Try to extract an estimate for the sample percentage */ + pctnode = (Node *) linitial(paramexprs); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, Const) && + !((Const *) pctnode)->constisnull) + { + samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue); + if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract)) + samplefract /= 100.0f; + else + { + /* Default samplefract if the value is bogus */ + samplefract = 0.1f; + } + } + else + { + /* Default samplefract if we didn't obtain a non-null Const */ + samplefract = 0.1f; + } + + /* We'll visit a sample of the pages ... */ + *pages = clamp_row_est(baserel->pages * samplefract); + + /* ... and hopefully get a representative number of tuples from them */ + *tuples = clamp_row_est(baserel->tuples * samplefract); +} + +/* + * Initialize during executor setup. 
+ */ +static void +system_initsamplescan(SampleScanState *node, int eflags) +{ + node->tsm_state = palloc0(sizeof(SystemSamplerData)); +} + +/* + * Examine parameters and prepare for a sample scan. + */ +static void +system_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed) +{ + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; + double percent = DatumGetFloat4(params[0]); + + if (percent < 0 || percent > 100 || isnan(percent)) ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("invalid sample size"), - errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("sample percentage must be between 0 and 100"))); - sampler = palloc0(sizeof(SystemSamplerData)); - - /* Remember initial values for reinit */ + /* + * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to + * store that as a uint64, of course. Note that this gives strictly + * correct behavior at the limits of zero or one probability. + */ + sampler->cutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100); sampler->seed = seed; - sampler->nblocks = scan->rs_nblocks; - sampler->samplesize = 1 + (int) (sampler->nblocks * (percent / 100.0)); + sampler->nextblock = 0; sampler->lt = InvalidOffsetNumber; - BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, - sampler->seed); - - tsdesc->tsmdata = (void *) sampler; - - PG_RETURN_VOID(); + /* + * Bulkread buffer access strategy probably makes sense unless we're + * scanning a very small fraction of the table. The 1% cutoff here is a + * guess. We should use pagemode visibility checking, since we scan all + * tuples on each selected page. + */ + node->use_bulkread = (percent >= 1); + node->use_pagemode = true; } /* - * Get next block number or InvalidBlockNumber when we're done. + * Select next block to sample. 
+ */ +static BlockNumber +system_nextsampleblock(SampleScanState *node) +{ + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; + HeapScanDesc scan = node->ss.ss_currentScanDesc; + BlockNumber nextblock = sampler->nextblock; + uint32 hashinput[2]; + + /* + * We compute the hash by applying hash_any to an array of 2 uint32's + * containing the block number and seed. This is efficient to set up, and + * with the current implementation of hash_any, it gives + * machine-independent results, which is a nice property for regression + * testing. + * + * These words in the hash input are the same throughout the block: + */ + hashinput[1] = sampler->seed; + + /* + * Loop over block numbers until finding suitable block or reaching end of + * relation. + */ + for (; nextblock < scan->rs_nblocks; nextblock++) + { + uint32 hash; + + hashinput[0] = nextblock; + + hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, + (int) sizeof(hashinput))); + if (hash < sampler->cutoff) + break; + } + + if (nextblock < scan->rs_nblocks) + { + /* Found a suitable block; remember where we should start next time */ + sampler->nextblock = nextblock + 1; + return nextblock; + } + + /* Done, but let's reset nextblock to 0 for safety. */ + sampler->nextblock = 0; + return InvalidBlockNumber; +} + +/* + * Select next sampled tuple in current block. * - * Uses the same logic as ANALYZE for picking the random blocks. + * In block sampling, we just want to sample all the tuples in each selected + * block. + * + * It is OK here to return an offset without knowing if the tuple is visible + * (or even exists); nodeSamplescan.c will deal with that. + * + * When we reach end of the block, return InvalidOffsetNumber which tells + * SampleScan to go to next block. 
*/ -Datum -tsm_system_nextblock(PG_FUNCTION_ARGS) +static OffsetNumber +system_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; - BlockNumber blockno; - - if (!BlockSampler_HasMore(&sampler->bs)) - PG_RETURN_UINT32(InvalidBlockNumber); - - blockno = BlockSampler_Next(&sampler->bs); - - PG_RETURN_UINT32(blockno); -} - -/* - * Get next tuple offset in current block or InvalidOffsetNumber if we are done - * with this block. - */ -Datum -tsm_system_nexttuple(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - OffsetNumber maxoffset = PG_GETARG_UINT16(2); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; OffsetNumber tupoffset = sampler->lt; + /* Advance to next possible offset on page */ if (tupoffset == InvalidOffsetNumber) tupoffset = FirstOffsetNumber; else tupoffset++; + /* Done? */ if (tupoffset > maxoffset) tupoffset = InvalidOffsetNumber; sampler->lt = tupoffset; - PG_RETURN_UINT16(tupoffset); -} - -/* - * Cleanup method. - */ -Datum -tsm_system_end(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - - pfree(tsdesc->tsmdata); - - PG_RETURN_VOID(); -} - -/* - * Reset state (called by ReScan). - */ -Datum -tsm_system_reset(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; - - sampler->lt = InvalidOffsetNumber; - BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, - sampler->seed); - - PG_RETURN_VOID(); -} - -/* - * Costing function. 
- */ -Datum -tsm_system_cost(PG_FUNCTION_ARGS) -{ - PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); - Path *path = (Path *) PG_GETARG_POINTER(1); - RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); - List *args = (List *) PG_GETARG_POINTER(3); - BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); - double *tuples = (double *) PG_GETARG_POINTER(5); - Node *pctnode; - float4 samplesize; - - pctnode = linitial(args); - pctnode = estimate_expression_value(root, pctnode); - - if (IsA(pctnode, RelabelType)) - pctnode = (Node *) ((RelabelType *) pctnode)->arg; - - if (IsA(pctnode, Const)) - { - samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); - samplesize /= 100.0; - } - else - { - /* Default samplesize if the estimation didn't return Const. */ - samplesize = 0.1f; - } - - *pages = baserel->pages * samplesize; - *tuples = path->rows * samplesize; - path->rows = *tuples; - - PG_RETURN_VOID(); + return tupoffset; } diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c index f21d42c8e3..b8ad7ced74 100644 --- a/src/backend/access/tablesample/tablesample.c +++ b/src/backend/access/tablesample/tablesample.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tablesample.c - * TABLESAMPLE internal API + * Support functions for TABLESAMPLE feature * * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -10,356 +10,31 @@ * IDENTIFICATION * src/backend/access/tablesample/tablesample.c * - * TABLESAMPLE is the SQL standard clause for sampling the relations. - * - * The API is interface between the Executor and the TABLESAMPLE Methods. - * - * TABLESAMPLE Methods are implementations of actual sampling algorithms which - * can be used for returning a sample of the source relation. 
- * Methods don't read the table directly but are asked for block number and - * tuple offset which they want to examine (or return) and the tablesample - * interface implemented here does the reading for them. - * - * We currently only support sampling of the physical relations, but in the - * future we might extend the API to support subqueries as well. - * * ------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/tablesample.h" - -#include "catalog/pg_tablesample_method.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "storage/bufmgr.h" -#include "storage/predicate.h" -#include "utils/rel.h" -#include "utils/tqual.h" - - -static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan); +#include "access/tsmapi.h" /* - * Initialize the TABLESAMPLE Descriptor and the TABLESAMPLE Method. + * GetTsmRoutine --- get a TsmRoutine struct by invoking the handler. + * + * This is a convenience routine that's just meant to check for errors. 
*/ -TableSampleDesc * -tablesample_init(SampleScanState *scanstate, TableSampleClause *tablesample) +TsmRoutine * +GetTsmRoutine(Oid tsmhandler) { - FunctionCallInfoData fcinfo; - int i; - List *args = tablesample->args; - ListCell *arg; - ExprContext *econtext = scanstate->ss.ps.ps_ExprContext; - TableSampleDesc *tsdesc = (TableSampleDesc *) palloc0(sizeof(TableSampleDesc)); + Datum datum; + TsmRoutine *routine; - /* Load functions */ - fmgr_info(tablesample->tsminit, &(tsdesc->tsminit)); - fmgr_info(tablesample->tsmnextblock, &(tsdesc->tsmnextblock)); - fmgr_info(tablesample->tsmnexttuple, &(tsdesc->tsmnexttuple)); - if (OidIsValid(tablesample->tsmexaminetuple)) - fmgr_info(tablesample->tsmexaminetuple, &(tsdesc->tsmexaminetuple)); - else - tsdesc->tsmexaminetuple.fn_oid = InvalidOid; - fmgr_info(tablesample->tsmreset, &(tsdesc->tsmreset)); - fmgr_info(tablesample->tsmend, &(tsdesc->tsmend)); + datum = OidFunctionCall1(tsmhandler, PointerGetDatum(NULL)); + routine = (TsmRoutine *) DatumGetPointer(datum); - InitFunctionCallInfoData(fcinfo, &tsdesc->tsminit, - list_length(args) + 2, - InvalidOid, NULL, NULL); + if (routine == NULL || !IsA(routine, TsmRoutine)) + elog(ERROR, "tablesample handler function %u did not return a TsmRoutine struct", + tsmhandler); - tsdesc->tupDesc = scanstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; - tsdesc->heapScan = scanstate->ss.ss_currentScanDesc; - - /* First argument for init function is always TableSampleDesc */ - fcinfo.arg[0] = PointerGetDatum(tsdesc); - fcinfo.argnull[0] = false; - - /* - * Second arg for init function is always REPEATABLE. - * - * If tablesample->repeatable is NULL then REPEATABLE clause was not - * specified, and we insert a random value as default. - * - * When specified, the expression cannot evaluate to NULL. 
- */ - if (tablesample->repeatable) - { - ExprState *argstate = ExecInitExpr((Expr *) tablesample->repeatable, - (PlanState *) scanstate); - - fcinfo.arg[1] = ExecEvalExpr(argstate, econtext, - &fcinfo.argnull[1], NULL); - if (fcinfo.argnull[1]) - ereport(ERROR, - (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), - errmsg("REPEATABLE clause must be NOT NULL numeric value"))); - } - else - { - fcinfo.arg[1] = UInt32GetDatum(random()); - fcinfo.argnull[1] = false; - } - - /* Rest of the arguments come from user. */ - i = 2; - foreach(arg, args) - { - Expr *argexpr = (Expr *) lfirst(arg); - ExprState *argstate = ExecInitExpr(argexpr, (PlanState *) scanstate); - - fcinfo.arg[i] = ExecEvalExpr(argstate, econtext, - &fcinfo.argnull[i], NULL); - i++; - } - Assert(i == fcinfo.nargs); - - (void) FunctionCallInvoke(&fcinfo); - - return tsdesc; -} - -/* - * Get next tuple from TABLESAMPLE Method. - */ -HeapTuple -tablesample_getnext(TableSampleDesc *desc) -{ - HeapScanDesc scan = desc->heapScan; - HeapTuple tuple = &(scan->rs_ctup); - bool pagemode = scan->rs_pageatatime; - BlockNumber blockno; - Page page; - bool page_all_visible; - ItemId itemid; - OffsetNumber tupoffset, - maxoffset; - - if (!scan->rs_inited) - { - /* - * return null immediately if relation is empty - */ - if (scan->rs_nblocks == 0) - { - Assert(!BufferIsValid(scan->rs_cbuf)); - tuple->t_data = NULL; - return NULL; - } - blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, - PointerGetDatum(desc))); - if (!BlockNumberIsValid(blockno)) - { - tuple->t_data = NULL; - return NULL; - } - - heapgetpage(scan, blockno); - scan->rs_inited = true; - } - else - { - /* continue from previously returned page/tuple */ - blockno = scan->rs_cblock; /* current page */ - } - - /* - * When pagemode is disabled, the scan will do visibility checks for each - * tuple it finds so the buffer needs to be locked. 
- */ - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - - page = (Page) BufferGetPage(scan->rs_cbuf); - page_all_visible = PageIsAllVisible(page); - maxoffset = PageGetMaxOffsetNumber(page); - - for (;;) - { - CHECK_FOR_INTERRUPTS(); - - tupoffset = DatumGetUInt16(FunctionCall3(&desc->tsmnexttuple, - PointerGetDatum(desc), - UInt32GetDatum(blockno), - UInt16GetDatum(maxoffset))); - - if (OffsetNumberIsValid(tupoffset)) - { - bool visible; - bool found; - - /* Skip invalid tuple pointers. */ - itemid = PageGetItemId(page, tupoffset); - if (!ItemIdIsNormal(itemid)) - continue; - - tuple->t_data = (HeapTupleHeader) PageGetItem((Page) page, itemid); - tuple->t_len = ItemIdGetLength(itemid); - ItemPointerSet(&(tuple->t_self), blockno, tupoffset); - - if (page_all_visible) - visible = true; - else - visible = SampleTupleVisible(tuple, tupoffset, scan); - - /* - * Let the sampling method examine the actual tuple and decide if - * we should return it. - * - * Note that we let it examine even invisible tuples for - * statistical purposes, but not return them since user should - * never see invisible tuples. - */ - if (OidIsValid(desc->tsmexaminetuple.fn_oid)) - { - found = DatumGetBool(FunctionCall4(&desc->tsmexaminetuple, - PointerGetDatum(desc), - UInt32GetDatum(blockno), - PointerGetDatum(tuple), - BoolGetDatum(visible))); - /* Should not happen if sampling method is well written. */ - if (found && !visible) - elog(ERROR, "Sampling method wanted to return invisible tuple"); - } - else - found = visible; - - /* Found visible tuple, return it. */ - if (found) - { - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - break; - } - else - { - /* Try next tuple from same page. */ - continue; - } - } - - - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, - PointerGetDatum(desc))); - - /* - * Report our new scan position for synchronization purposes. 
We don't - * do that when moving backwards, however. That would just mess up any - * other forward-moving scanners. - * - * Note: we do this before checking for end of scan so that the final - * state of the position hint is back at the start of the rel. That's - * not strictly necessary, but otherwise when you run the same query - * multiple times the starting position would shift a little bit - * backwards on every invocation, which is confusing. We don't - * guarantee any specific ordering in general, though. - */ - if (scan->rs_syncscan) - ss_report_location(scan->rs_rd, BlockNumberIsValid(blockno) ? - blockno : scan->rs_startblock); - - /* - * Reached end of scan. - */ - if (!BlockNumberIsValid(blockno)) - { - if (BufferIsValid(scan->rs_cbuf)) - ReleaseBuffer(scan->rs_cbuf); - scan->rs_cbuf = InvalidBuffer; - scan->rs_cblock = InvalidBlockNumber; - tuple->t_data = NULL; - scan->rs_inited = false; - return NULL; - } - - heapgetpage(scan, blockno); - - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - - page = (Page) BufferGetPage(scan->rs_cbuf); - page_all_visible = PageIsAllVisible(page); - maxoffset = PageGetMaxOffsetNumber(page); - } - - pgstat_count_heap_getnext(scan->rs_rd); - - return &(scan->rs_ctup); -} - -/* - * Reset the sampling to starting state - */ -void -tablesample_reset(TableSampleDesc *desc) -{ - (void) FunctionCall1(&desc->tsmreset, PointerGetDatum(desc)); -} - -/* - * Signal the sampling method that the scan has finished. - */ -void -tablesample_end(TableSampleDesc *desc) -{ - (void) FunctionCall1(&desc->tsmend, PointerGetDatum(desc)); -} - -/* - * Check visibility of the tuple. - */ -static bool -SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan) -{ - /* - * If this scan is reading whole pages at a time, there is already - * visibility info present in rs_vistuples so we can just search it for - * the tupoffset. 
- */ - if (scan->rs_pageatatime) - { - int start = 0, - end = scan->rs_ntuples - 1; - - /* - * Do the binary search over rs_vistuples, it's already sorted by - * OffsetNumber so we don't need to do any sorting ourselves here. - * - * We could use bsearch() here but it's slower for integers because of - * the function call overhead and because it needs boiler plate code - * it would not save us anything code-wise anyway. - */ - while (start <= end) - { - int mid = start + (end - start) / 2; - OffsetNumber curoffset = scan->rs_vistuples[mid]; - - if (curoffset == tupoffset) - return true; - else if (curoffset > tupoffset) - end = mid - 1; - else - start = mid + 1; - } - - return false; - } - else - { - /* No pagemode, we have to check the tuple itself. */ - Snapshot snapshot = scan->rs_snapshot; - Buffer buffer = scan->rs_cbuf; - - bool visible = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); - - CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, buffer, - snapshot); - - return visible; - } + return routine; } diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 3d1139b5ba..25130ecf12 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -40,8 +40,9 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ pg_ts_parser.h pg_ts_template.h pg_extension.h \ pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \ pg_foreign_table.h pg_policy.h pg_replication_origin.h \ - pg_tablesample_method.h pg_default_acl.h pg_seclabel.h pg_shseclabel.h \ - pg_collation.h pg_range.h pg_transform.h toasting.h indexing.h \ + pg_default_acl.h pg_seclabel.h pg_shseclabel.h \ + pg_collation.h pg_range.h pg_transform.h \ + toasting.h indexing.h \ ) # location of Catalog.pm diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 5d7c441739..90b1cd835f 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -1911,6 +1911,14 @@ 
find_expr_references_walker(Node *node, context->addrs); } } + else if (IsA(node, TableSampleClause)) + { + TableSampleClause *tsc = (TableSampleClause *) node; + + add_object_address(OCLASS_PROC, tsc->tsmhandler, 0, + context->addrs); + /* fall through to examine arguments */ + } return expression_tree_walker(node, find_expr_references_walker, (void *) context); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 0d1ecc2a3e..5d06fa4ea6 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -96,6 +96,8 @@ static void show_sort_group_keys(PlanState *planstate, const char *qlabel, List *ancestors, ExplainState *es); static void show_sortorder_options(StringInfo buf, Node *sortexpr, Oid sortOperator, Oid collation, bool nullsFirst); +static void show_tablesample(TableSampleClause *tsc, PlanState *planstate, + List *ancestors, ExplainState *es); static void show_sort_info(SortState *sortstate, ExplainState *es); static void show_hash_info(HashState *hashstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, @@ -116,7 +118,7 @@ static void ExplainMemberNodes(List *plans, PlanState **planstates, static void ExplainSubPlans(List *plans, List *ancestors, const char *relationship, ExplainState *es); static void ExplainCustomChildren(CustomScanState *css, - List *ancestors, ExplainState *es); + List *ancestors, ExplainState *es); static void ExplainProperty(const char *qlabel, const char *value, bool numeric, ExplainState *es); static void ExplainOpenGroup(const char *objtype, const char *labelname, @@ -730,6 +732,7 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) switch (nodeTag(plan)) { case T_SeqScan: + case T_SampleScan: case T_IndexScan: case T_IndexOnlyScan: case T_BitmapHeapScan: @@ -739,7 +742,6 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) case T_ValuesScan: case T_CteScan: case T_WorkTableScan: - case T_SampleScan: *rels_used = 
bms_add_member(*rels_used, ((Scan *) plan)->scanrelid); break; @@ -935,6 +937,9 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_SeqScan: pname = sname = "Seq Scan"; break; + case T_SampleScan: + pname = sname = "Sample Scan"; + break; case T_IndexScan: pname = sname = "Index Scan"; break; @@ -976,23 +981,6 @@ ExplainNode(PlanState *planstate, List *ancestors, else pname = sname; break; - case T_SampleScan: - { - /* - * Fetch the tablesample method name from RTE. - * - * It would be nice to also show parameters, but since we - * support arbitrary expressions as parameter it might get - * quite messy. - */ - RangeTblEntry *rte; - - rte = rt_fetch(((SampleScan *) plan)->scanrelid, es->rtable); - custom_name = get_tablesample_method_name(rte->tablesample->tsmid); - pname = psprintf("Sample Scan (%s)", custom_name); - sname = "Sample Scan"; - } - break; case T_Material: pname = sname = "Materialize"; break; @@ -1101,6 +1089,7 @@ ExplainNode(PlanState *planstate, List *ancestors, switch (nodeTag(plan)) { case T_SeqScan: + case T_SampleScan: case T_BitmapHeapScan: case T_TidScan: case T_SubqueryScan: @@ -1115,9 +1104,6 @@ ExplainNode(PlanState *planstate, List *ancestors, if (((Scan *) plan)->scanrelid > 0) ExplainScanTarget((Scan *) plan, es); break; - case T_SampleScan: - ExplainScanTarget((Scan *) plan, es); - break; case T_IndexScan: { IndexScan *indexscan = (IndexScan *) plan; @@ -1363,12 +1349,15 @@ ExplainNode(PlanState *planstate, List *ancestors, if (es->analyze) show_tidbitmap_info((BitmapHeapScanState *) planstate, es); break; + case T_SampleScan: + show_tablesample(((SampleScan *) plan)->tablesample, + planstate, ancestors, es); + /* FALL THRU to print additional fields the same as SeqScan */ case T_SeqScan: case T_ValuesScan: case T_CteScan: case T_WorkTableScan: case T_SubqueryScan: - case T_SampleScan: show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); if (plan->qual) show_instrumentation_count("Rows Removed by Filter", 1, @@ 
-2109,6 +2098,72 @@ show_sortorder_options(StringInfo buf, Node *sortexpr, } } +/* + * Show TABLESAMPLE properties + */ +static void +show_tablesample(TableSampleClause *tsc, PlanState *planstate, + List *ancestors, ExplainState *es) +{ + List *context; + bool useprefix; + char *method_name; + List *params = NIL; + char *repeatable; + ListCell *lc; + + /* Set up deparsing context */ + context = set_deparse_context_planstate(es->deparse_cxt, + (Node *) planstate, + ancestors); + useprefix = list_length(es->rtable) > 1; + + /* Get the tablesample method name */ + method_name = get_func_name(tsc->tsmhandler); + + /* Deparse parameter expressions */ + foreach(lc, tsc->args) + { + Node *arg = (Node *) lfirst(lc); + + params = lappend(params, + deparse_expression(arg, context, + useprefix, false)); + } + if (tsc->repeatable) + repeatable = deparse_expression((Node *) tsc->repeatable, context, + useprefix, false); + else + repeatable = NULL; + + /* Print results */ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + bool first = true; + + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, "Sampling: %s (", method_name); + foreach(lc, params) + { + if (!first) + appendStringInfoString(es->str, ", "); + appendStringInfoString(es->str, (const char *) lfirst(lc)); + first = false; + } + appendStringInfoChar(es->str, ')'); + if (repeatable) + appendStringInfo(es->str, " REPEATABLE (%s)", repeatable); + appendStringInfoChar(es->str, '\n'); + } + else + { + ExplainPropertyText("Sampling Method", method_name, es); + ExplainPropertyList("Sampling Parameters", params, es); + if (repeatable) + ExplainPropertyText("Repeatable Seed", repeatable, es); + } +} + /* * If it's EXPLAIN ANALYZE, show tuplesort stats for a sort node */ @@ -2366,13 +2421,13 @@ ExplainTargetRel(Plan *plan, Index rti, ExplainState *es) switch (nodeTag(plan)) { case T_SeqScan: + case T_SampleScan: case T_IndexScan: case T_IndexOnlyScan: case T_BitmapHeapScan: case T_TidScan: case 
T_ForeignScan: case T_CustomScan: - case T_SampleScan: case T_ModifyTable: /* Assert it's on a real relation */ Assert(rte->rtekind == RTE_RELATION); @@ -2663,9 +2718,9 @@ ExplainCustomChildren(CustomScanState *css, List *ancestors, ExplainState *es) { ListCell *cell; const char *label = - (list_length(css->custom_ps) != 1 ? "children" : "child"); + (list_length(css->custom_ps) != 1 ? "children" : "child"); - foreach (cell, css->custom_ps) + foreach(cell, css->custom_ps) ExplainNode((PlanState *) lfirst(cell), ancestors, label, NULL, es); } diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 04073d3f9f..93e1e9a691 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -463,6 +463,10 @@ ExecSupportsBackwardScan(Plan *node) case T_CteScan: return TargetListSupportsBackwardScan(node->targetlist); + case T_SampleScan: + /* Simplify life for tablesample methods by disallowing this */ + return false; + case T_IndexScan: return IndexSupportsBackwardScan(((IndexScan *) node)->indexid) && TargetListSupportsBackwardScan(node->targetlist); @@ -485,9 +489,6 @@ ExecSupportsBackwardScan(Plan *node) } return false; - case T_SampleScan: - return false; - case T_Material: case T_Sort: /* these don't evaluate tlist */ diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c index 4c1c5237b7..dbe84b0baa 100644 --- a/src/backend/executor/nodeSamplescan.c +++ b/src/backend/executor/nodeSamplescan.c @@ -3,7 +3,7 @@ * nodeSamplescan.c * Support routines for sample scans of relations (table sampling). 
* - * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -14,22 +14,23 @@ */ #include "postgres.h" -#include "access/tablesample.h" +#include "access/hash.h" +#include "access/relscan.h" +#include "access/tsmapi.h" #include "executor/executor.h" #include "executor/nodeSamplescan.h" #include "miscadmin.h" -#include "parser/parsetree.h" #include "pgstat.h" -#include "storage/bufmgr.h" #include "storage/predicate.h" #include "utils/rel.h" -#include "utils/syscache.h" #include "utils/tqual.h" -static void InitScanRelation(SampleScanState *node, EState *estate, - int eflags, TableSampleClause *tablesample); +static void InitScanRelation(SampleScanState *node, EState *estate, int eflags); static TupleTableSlot *SampleNext(SampleScanState *node); - +static void tablesample_init(SampleScanState *scanstate); +static HeapTuple tablesample_getnext(SampleScanState *scanstate); +static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, + HeapScanDesc scan); /* ---------------------------------------------------------------- * Scan Support @@ -45,23 +46,26 @@ static TupleTableSlot *SampleNext(SampleScanState *node); static TupleTableSlot * SampleNext(SampleScanState *node) { - TupleTableSlot *slot; - TableSampleDesc *tsdesc; HeapTuple tuple; + TupleTableSlot *slot; /* - * get information from the scan state + * if this is first call within a scan, initialize */ - slot = node->ss.ss_ScanTupleSlot; - tsdesc = node->tsdesc; + if (!node->begun) + tablesample_init(node); - tuple = tablesample_getnext(tsdesc); + /* + * get the next tuple, and store it in our result slot + */ + tuple = tablesample_getnext(node); + + slot = node->ss.ss_ScanTupleSlot; if (tuple) ExecStoreTuple(tuple, /* tuple to store */ slot, /* slot to store in */ - tsdesc->heapScan->rs_cbuf, /* buffer associated - * with this tuple */ + 
node->ss.ss_currentScanDesc->rs_cbuf, /* tuple's buffer */ false); /* don't pfree this pointer */ else ExecClearTuple(slot); @@ -75,7 +79,10 @@ SampleNext(SampleScanState *node) static bool SampleRecheck(SampleScanState *node, TupleTableSlot *slot) { - /* No need to recheck for SampleScan */ + /* + * No need to recheck for SampleScan, since like SeqScan we don't pass any + * checkable keys to heap_beginscan. + */ return true; } @@ -103,8 +110,7 @@ ExecSampleScan(SampleScanState *node) * ---------------------------------------------------------------- */ static void -InitScanRelation(SampleScanState *node, EState *estate, int eflags, - TableSampleClause *tablesample) +InitScanRelation(SampleScanState *node, EState *estate, int eflags) { Relation currentRelation; @@ -113,19 +119,13 @@ InitScanRelation(SampleScanState *node, EState *estate, int eflags, * open that relation and acquire appropriate lock on it. */ currentRelation = ExecOpenScanRelation(estate, - ((SampleScan *) node->ss.ps.plan)->scanrelid, + ((SampleScan *) node->ss.ps.plan)->scan.scanrelid, eflags); node->ss.ss_currentRelation = currentRelation; - /* - * Even though we aren't going to do a conventional seqscan, it is useful - * to create a HeapScanDesc --- many of the fields in it are usable. 
- */ - node->ss.ss_currentScanDesc = - heap_beginscan_sampling(currentRelation, estate->es_snapshot, 0, NULL, - tablesample->tsmseqscan, - tablesample->tsmpagemode); + /* we won't set up the HeapScanDesc till later */ + node->ss.ss_currentScanDesc = NULL; /* and report the scan tuple slot's rowtype */ ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation)); @@ -140,12 +140,11 @@ SampleScanState * ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) { SampleScanState *scanstate; - RangeTblEntry *rte = rt_fetch(node->scanrelid, - estate->es_range_table); + TableSampleClause *tsc = node->tablesample; + TsmRoutine *tsm; Assert(outerPlan(node) == NULL); Assert(innerPlan(node) == NULL); - Assert(rte->tablesample != NULL); /* * create state structure @@ -165,10 +164,17 @@ ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) * initialize child expressions */ scanstate->ss.ps.targetlist = (List *) - ExecInitExpr((Expr *) node->plan.targetlist, + ExecInitExpr((Expr *) node->scan.plan.targetlist, (PlanState *) scanstate); scanstate->ss.ps.qual = (List *) - ExecInitExpr((Expr *) node->plan.qual, + ExecInitExpr((Expr *) node->scan.plan.qual, + (PlanState *) scanstate); + + scanstate->args = (List *) + ExecInitExpr((Expr *) tsc->args, + (PlanState *) scanstate); + scanstate->repeatable = + ExecInitExpr(tsc->repeatable, (PlanState *) scanstate); /* @@ -180,7 +186,7 @@ ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) /* * initialize scan relation */ - InitScanRelation(scanstate, estate, eflags, rte->tablesample); + InitScanRelation(scanstate, estate, eflags); scanstate->ss.ps.ps_TupFromTlist = false; @@ -190,7 +196,25 @@ ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) ExecAssignResultTypeFromTL(&scanstate->ss.ps); ExecAssignScanProjectionInfo(&scanstate->ss); - scanstate->tsdesc = tablesample_init(scanstate, rte->tablesample); + /* + * If we don't have a REPEATABLE clause, select a random seed. 
We want to + * do this just once, since the seed shouldn't change over rescans. + */ + if (tsc->repeatable == NULL) + scanstate->seed = random(); + + /* + * Finally, initialize the TABLESAMPLE method handler. + */ + tsm = GetTsmRoutine(tsc->tsmhandler); + scanstate->tsmroutine = tsm; + scanstate->tsm_state = NULL; + + if (tsm->InitSampleScan) + tsm->InitSampleScan(scanstate, eflags); + + /* We'll do BeginSampleScan later; we can't evaluate params yet */ + scanstate->begun = false; return scanstate; } @@ -207,7 +231,8 @@ ExecEndSampleScan(SampleScanState *node) /* * Tell sampling function that we finished the scan. */ - tablesample_end(node->tsdesc); + if (node->tsmroutine->EndSampleScan) + node->tsmroutine->EndSampleScan(node); /* * Free the exprcontext @@ -223,7 +248,8 @@ ExecEndSampleScan(SampleScanState *node) /* * close heap scan */ - heap_endscan(node->ss.ss_currentScanDesc); + if (node->ss.ss_currentScanDesc) + heap_endscan(node->ss.ss_currentScanDesc); /* * close the heap relation. @@ -231,11 +257,6 @@ ExecEndSampleScan(SampleScanState *node) ExecCloseScanRelation(node->ss.ss_currentRelation); } -/* ---------------------------------------------------------------- - * Join Support - * ---------------------------------------------------------------- - */ - /* ---------------------------------------------------------------- * ExecReScanSampleScan * @@ -246,12 +267,336 @@ ExecEndSampleScan(SampleScanState *node) void ExecReScanSampleScan(SampleScanState *node) { - heap_rescan(node->ss.ss_currentScanDesc, NULL); - - /* - * Tell sampling function to reset its state for rescan. - */ - tablesample_reset(node->tsdesc); + /* Remember we need to do BeginSampleScan again (if we did it at all) */ + node->begun = false; ExecScanReScan(&node->ss); } + + +/* + * Initialize the TABLESAMPLE method: evaluate params and call BeginSampleScan. 
+ */ +static void +tablesample_init(SampleScanState *scanstate) +{ + TsmRoutine *tsm = scanstate->tsmroutine; + ExprContext *econtext = scanstate->ss.ps.ps_ExprContext; + Datum *params; + Datum datum; + bool isnull; + uint32 seed; + bool allow_sync; + int i; + ListCell *arg; + + params = (Datum *) palloc(list_length(scanstate->args) * sizeof(Datum)); + + i = 0; + foreach(arg, scanstate->args) + { + ExprState *argstate = (ExprState *) lfirst(arg); + + params[i] = ExecEvalExprSwitchContext(argstate, + econtext, + &isnull, + NULL); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("TABLESAMPLE parameter cannot be null"))); + i++; + } + + if (scanstate->repeatable) + { + datum = ExecEvalExprSwitchContext(scanstate->repeatable, + econtext, + &isnull, + NULL); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_REPEAT), + errmsg("TABLESAMPLE REPEATABLE parameter cannot be null"))); + + /* + * The REPEATABLE parameter has been coerced to float8 by the parser. + * The reason for using float8 at the SQL level is that it will + * produce unsurprising results both for users used to databases that + * accept only integers in the REPEATABLE clause and for those who + * might expect that REPEATABLE works like setseed() (a float in the + * range from -1 to 1). + * + * We use hashfloat8() to convert the supplied value into a suitable + * seed. For regression-testing purposes, that has the convenient + * property that REPEATABLE(0) gives a machine-independent result. 
+ */ + seed = DatumGetUInt32(DirectFunctionCall1(hashfloat8, datum)); + } + else + { + /* Use the seed selected by ExecInitSampleScan */ + seed = scanstate->seed; + } + + /* Set default values for params that BeginSampleScan can adjust */ + scanstate->use_bulkread = true; + scanstate->use_pagemode = true; + + /* Let tablesample method do its thing */ + tsm->BeginSampleScan(scanstate, + params, + list_length(scanstate->args), + seed); + + /* We'll use syncscan if there's no NextSampleBlock function */ + allow_sync = (tsm->NextSampleBlock == NULL); + + /* Now we can create or reset the HeapScanDesc */ + if (scanstate->ss.ss_currentScanDesc == NULL) + { + scanstate->ss.ss_currentScanDesc = + heap_beginscan_sampling(scanstate->ss.ss_currentRelation, + scanstate->ss.ps.state->es_snapshot, + 0, NULL, + scanstate->use_bulkread, + allow_sync, + scanstate->use_pagemode); + } + else + { + heap_rescan_set_params(scanstate->ss.ss_currentScanDesc, NULL, + scanstate->use_bulkread, + allow_sync, + scanstate->use_pagemode); + } + + pfree(params); + + /* And we're initialized. */ + scanstate->begun = true; +} + +/* + * Get next tuple from TABLESAMPLE method. + * + * Note: an awful lot of this is copied-and-pasted from heapam.c. It would + * perhaps be better to refactor to share more code. 
+ */ +static HeapTuple +tablesample_getnext(SampleScanState *scanstate) +{ + TsmRoutine *tsm = scanstate->tsmroutine; + HeapScanDesc scan = scanstate->ss.ss_currentScanDesc; + HeapTuple tuple = &(scan->rs_ctup); + Snapshot snapshot = scan->rs_snapshot; + bool pagemode = scan->rs_pageatatime; + BlockNumber blockno; + Page page; + bool all_visible; + OffsetNumber maxoffset; + + if (!scan->rs_inited) + { + /* + * return null immediately if relation is empty + */ + if (scan->rs_nblocks == 0) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return NULL; + } + if (tsm->NextSampleBlock) + { + blockno = tsm->NextSampleBlock(scanstate); + if (!BlockNumberIsValid(blockno)) + { + tuple->t_data = NULL; + return NULL; + } + } + else + blockno = scan->rs_startblock; + Assert(blockno < scan->rs_nblocks); + heapgetpage(scan, blockno); + scan->rs_inited = true; + } + else + { + /* continue from previously returned page/tuple */ + blockno = scan->rs_cblock; /* current page */ + } + + /* + * When not using pagemode, we must lock the buffer during tuple + * visibility checks. + */ + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; + maxoffset = PageGetMaxOffsetNumber(page); + + for (;;) + { + OffsetNumber tupoffset; + bool finished; + + CHECK_FOR_INTERRUPTS(); + + /* Ask the tablesample method which tuples to check on this page. */ + tupoffset = tsm->NextSampleTuple(scanstate, + blockno, + maxoffset); + + if (OffsetNumberIsValid(tupoffset)) + { + ItemId itemid; + bool visible; + + /* Skip invalid tuple pointers. 
*/ + itemid = PageGetItemId(page, tupoffset); + if (!ItemIdIsNormal(itemid)) + continue; + + tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple->t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tuple->t_self), blockno, tupoffset); + + if (all_visible) + visible = true; + else + visible = SampleTupleVisible(tuple, tupoffset, scan); + + /* in pagemode, heapgetpage did this for us */ + if (!pagemode) + CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, + scan->rs_cbuf, snapshot); + + if (visible) + { + /* Found visible tuple, return it. */ + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + break; + } + else + { + /* Try next tuple from same page. */ + continue; + } + } + + /* + * if we get here, it means we've exhausted the items on this page and + * it's time to move to the next. + */ + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + if (tsm->NextSampleBlock) + { + blockno = tsm->NextSampleBlock(scanstate); + Assert(!scan->rs_syncscan); + finished = !BlockNumberIsValid(blockno); + } + else + { + /* Without NextSampleBlock, just do a plain forward seqscan. */ + blockno++; + if (blockno >= scan->rs_nblocks) + blockno = 0; + + /* + * Report our new scan position for synchronization purposes. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. + */ + if (scan->rs_syncscan) + ss_report_location(scan->rs_rd, blockno); + + finished = (blockno == scan->rs_startblock); + } + + /* + * Reached end of scan? 
+ */ + if (finished) + { + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + tuple->t_data = NULL; + scan->rs_inited = false; + return NULL; + } + + Assert(blockno < scan->rs_nblocks); + heapgetpage(scan, blockno); + + /* Re-establish state for new page */ + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; + maxoffset = PageGetMaxOffsetNumber(page); + } + + /* Count successfully-fetched tuples as heap fetches */ + pgstat_count_heap_getnext(scan->rs_rd); + + return &(scan->rs_ctup); +} + +/* + * Check visibility of the tuple. + */ +static bool +SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan) +{ + if (scan->rs_pageatatime) + { + /* + * In pageatatime mode, heapgetpage() already did visibility checks, + * so just look at the info it left in rs_vistuples[]. + * + * We use a binary search over the known-sorted array. Note: we could + * save some effort if we insisted that NextSampleTuple select tuples + * in increasing order, but it's not clear that there would be enough + * gain to justify the restriction. + */ + int start = 0, + end = scan->rs_ntuples - 1; + + while (start <= end) + { + int mid = (start + end) / 2; + OffsetNumber curoffset = scan->rs_vistuples[mid]; + + if (tupoffset == curoffset) + return true; + else if (tupoffset < curoffset) + end = mid - 1; + else + start = mid + 1; + } + + return false; + } + else + { + /* Otherwise, we have to check the tuple individually. 
*/ + return HeapTupleSatisfiesVisibility(tuple, + scan->rs_snapshot, + scan->rs_cbuf); + } +} diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 6a08c2db21..7248440ead 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -359,6 +359,27 @@ _copySeqScan(const SeqScan *from) return newnode; } +/* + * _copySampleScan + */ +static SampleScan * +_copySampleScan(const SampleScan *from) +{ + SampleScan *newnode = makeNode(SampleScan); + + /* + * copy node superclass fields + */ + CopyScanFields((const Scan *) from, (Scan *) newnode); + + /* + * copy remainder of node + */ + COPY_NODE_FIELD(tablesample); + + return newnode; +} + /* * _copyIndexScan */ @@ -641,22 +662,6 @@ _copyCustomScan(const CustomScan *from) return newnode; } -/* - * _copySampleScan - */ -static SampleScan * -_copySampleScan(const SampleScan *from) -{ - SampleScan *newnode = makeNode(SampleScan); - - /* - * copy node superclass fields - */ - CopyScanFields((const Scan *) from, (Scan *) newnode); - - return newnode; -} - /* * CopyJoinFields * @@ -2143,6 +2148,18 @@ _copyRangeTblFunction(const RangeTblFunction *from) return newnode; } +static TableSampleClause * +_copyTableSampleClause(const TableSampleClause *from) +{ + TableSampleClause *newnode = makeNode(TableSampleClause); + + COPY_SCALAR_FIELD(tsmhandler); + COPY_NODE_FIELD(args); + COPY_NODE_FIELD(repeatable); + + return newnode; +} + static WithCheckOption * _copyWithCheckOption(const WithCheckOption *from) { @@ -2271,40 +2288,6 @@ _copyCommonTableExpr(const CommonTableExpr *from) return newnode; } -static RangeTableSample * -_copyRangeTableSample(const RangeTableSample *from) -{ - RangeTableSample *newnode = makeNode(RangeTableSample); - - COPY_NODE_FIELD(relation); - COPY_STRING_FIELD(method); - COPY_NODE_FIELD(repeatable); - COPY_NODE_FIELD(args); - - return newnode; -} - -static TableSampleClause * -_copyTableSampleClause(const TableSampleClause *from) -{ - TableSampleClause *newnode = 
makeNode(TableSampleClause); - - COPY_SCALAR_FIELD(tsmid); - COPY_SCALAR_FIELD(tsmseqscan); - COPY_SCALAR_FIELD(tsmpagemode); - COPY_SCALAR_FIELD(tsminit); - COPY_SCALAR_FIELD(tsmnextblock); - COPY_SCALAR_FIELD(tsmnexttuple); - COPY_SCALAR_FIELD(tsmexaminetuple); - COPY_SCALAR_FIELD(tsmend); - COPY_SCALAR_FIELD(tsmreset); - COPY_SCALAR_FIELD(tsmcost); - COPY_NODE_FIELD(repeatable); - COPY_NODE_FIELD(args); - - return newnode; -} - static A_Expr * _copyAExpr(const A_Expr *from) { @@ -2532,6 +2515,20 @@ _copyRangeFunction(const RangeFunction *from) return newnode; } +static RangeTableSample * +_copyRangeTableSample(const RangeTableSample *from) +{ + RangeTableSample *newnode = makeNode(RangeTableSample); + + COPY_NODE_FIELD(relation); + COPY_NODE_FIELD(method); + COPY_NODE_FIELD(args); + COPY_NODE_FIELD(repeatable); + COPY_LOCATION_FIELD(location); + + return newnode; +} + static TypeCast * _copyTypeCast(const TypeCast *from) { @@ -4237,6 +4234,9 @@ copyObject(const void *from) case T_SeqScan: retval = _copySeqScan(from); break; + case T_SampleScan: + retval = _copySampleScan(from); + break; case T_IndexScan: retval = _copyIndexScan(from); break; @@ -4273,9 +4273,6 @@ copyObject(const void *from) case T_CustomScan: retval = _copyCustomScan(from); break; - case T_SampleScan: - retval = _copySampleScan(from); - break; case T_Join: retval = _copyJoin(from); break; @@ -4897,6 +4894,9 @@ copyObject(const void *from) case T_RangeFunction: retval = _copyRangeFunction(from); break; + case T_RangeTableSample: + retval = _copyRangeTableSample(from); + break; case T_TypeName: retval = _copyTypeName(from); break; @@ -4921,6 +4921,9 @@ copyObject(const void *from) case T_RangeTblFunction: retval = _copyRangeTblFunction(from); break; + case T_TableSampleClause: + retval = _copyTableSampleClause(from); + break; case T_WithCheckOption: retval = _copyWithCheckOption(from); break; @@ -4948,12 +4951,6 @@ copyObject(const void *from) case T_CommonTableExpr: retval = 
_copyCommonTableExpr(from); break; - case T_RangeTableSample: - retval = _copyRangeTableSample(from); - break; - case T_TableSampleClause: - retval = _copyTableSampleClause(from); - break; case T_FuncWithArgs: retval = _copyFuncWithArgs(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index faf5eedab4..6597dbc33e 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2290,6 +2290,18 @@ _equalRangeFunction(const RangeFunction *a, const RangeFunction *b) return true; } +static bool +_equalRangeTableSample(const RangeTableSample *a, const RangeTableSample *b) +{ + COMPARE_NODE_FIELD(relation); + COMPARE_NODE_FIELD(method); + COMPARE_NODE_FIELD(args); + COMPARE_NODE_FIELD(repeatable); + COMPARE_LOCATION_FIELD(location); + + return true; +} + static bool _equalIndexElem(const IndexElem *a, const IndexElem *b) { @@ -2428,6 +2440,16 @@ _equalRangeTblFunction(const RangeTblFunction *a, const RangeTblFunction *b) return true; } +static bool +_equalTableSampleClause(const TableSampleClause *a, const TableSampleClause *b) +{ + COMPARE_SCALAR_FIELD(tsmhandler); + COMPARE_NODE_FIELD(args); + COMPARE_NODE_FIELD(repeatable); + + return true; +} + static bool _equalWithCheckOption(const WithCheckOption *a, const WithCheckOption *b) { @@ -2538,36 +2560,6 @@ _equalCommonTableExpr(const CommonTableExpr *a, const CommonTableExpr *b) return true; } -static bool -_equalRangeTableSample(const RangeTableSample *a, const RangeTableSample *b) -{ - COMPARE_NODE_FIELD(relation); - COMPARE_STRING_FIELD(method); - COMPARE_NODE_FIELD(repeatable); - COMPARE_NODE_FIELD(args); - - return true; -} - -static bool -_equalTableSampleClause(const TableSampleClause *a, const TableSampleClause *b) -{ - COMPARE_SCALAR_FIELD(tsmid); - COMPARE_SCALAR_FIELD(tsmseqscan); - COMPARE_SCALAR_FIELD(tsmpagemode); - COMPARE_SCALAR_FIELD(tsminit); - COMPARE_SCALAR_FIELD(tsmnextblock); - COMPARE_SCALAR_FIELD(tsmnexttuple); - 
COMPARE_SCALAR_FIELD(tsmexaminetuple); - COMPARE_SCALAR_FIELD(tsmend); - COMPARE_SCALAR_FIELD(tsmreset); - COMPARE_SCALAR_FIELD(tsmcost); - COMPARE_NODE_FIELD(repeatable); - COMPARE_NODE_FIELD(args); - - return true; -} - static bool _equalXmlSerialize(const XmlSerialize *a, const XmlSerialize *b) { @@ -3260,6 +3252,9 @@ equal(const void *a, const void *b) case T_RangeFunction: retval = _equalRangeFunction(a, b); break; + case T_RangeTableSample: + retval = _equalRangeTableSample(a, b); + break; case T_TypeName: retval = _equalTypeName(a, b); break; @@ -3284,6 +3279,9 @@ equal(const void *a, const void *b) case T_RangeTblFunction: retval = _equalRangeTblFunction(a, b); break; + case T_TableSampleClause: + retval = _equalTableSampleClause(a, b); + break; case T_WithCheckOption: retval = _equalWithCheckOption(a, b); break; @@ -3311,12 +3309,6 @@ equal(const void *a, const void *b) case T_CommonTableExpr: retval = _equalCommonTableExpr(a, b); break; - case T_RangeTableSample: - retval = _equalRangeTableSample(a, b); - break; - case T_TableSampleClause: - retval = _equalTableSampleClause(a, b); - break; case T_FuncWithArgs: retval = _equalFuncWithArgs(a, b); break; diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index b1e3e6e489..c517dfd9d6 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -1486,6 +1486,9 @@ exprLocation(const Node *expr) case T_WindowDef: loc = ((const WindowDef *) expr)->location; break; + case T_RangeTableSample: + loc = ((const RangeTableSample *) expr)->location; + break; case T_TypeName: loc = ((const TypeName *) expr)->location; break; @@ -1995,6 +1998,17 @@ expression_tree_walker(Node *node, return walker(((PlaceHolderInfo *) node)->ph_var, context); case T_RangeTblFunction: return walker(((RangeTblFunction *) node)->funcexpr, context); + case T_TableSampleClause: + { + TableSampleClause *tsc = (TableSampleClause *) node; + + if (expression_tree_walker((Node *) tsc->args, + walker, 
context)) + return true; + if (walker((Node *) tsc->repeatable, context)) + return true; + } + break; default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); @@ -2082,13 +2096,8 @@ range_table_walker(List *rtable, switch (rte->rtekind) { case RTE_RELATION: - if (rte->tablesample) - { - if (walker(rte->tablesample->args, context)) - return true; - if (walker(rte->tablesample->repeatable, context)) - return true; - } + if (walker(rte->tablesample, context)) + return true; break; case RTE_CTE: /* nothing to do */ @@ -2782,6 +2791,17 @@ expression_tree_mutator(Node *node, return (Node *) newnode; } break; + case T_TableSampleClause: + { + TableSampleClause *tsc = (TableSampleClause *) node; + TableSampleClause *newnode; + + FLATCOPY(newnode, tsc, TableSampleClause); + MUTATE(newnode->args, tsc->args, List *); + MUTATE(newnode->repeatable, tsc->repeatable, Expr *); + return (Node *) newnode; + } + break; default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); @@ -2868,20 +2888,12 @@ range_table_mutator(List *rtable, switch (rte->rtekind) { case RTE_RELATION: - if (rte->tablesample) - { - CHECKFLATCOPY(newrte->tablesample, rte->tablesample, - TableSampleClause); - MUTATE(newrte->tablesample->args, - newrte->tablesample->args, - List *); - MUTATE(newrte->tablesample->repeatable, - newrte->tablesample->repeatable, - Node *); - } + MUTATE(newrte->tablesample, rte->tablesample, + TableSampleClause *); + /* we don't bother to copy eref, aliases, etc; OK? */ break; case RTE_CTE: - /* we don't bother to copy eref, aliases, etc; OK? 
*/ + /* nothing to do */ break; case RTE_SUBQUERY: if (!(flags & QTW_IGNORE_RT_SUBQUERIES)) @@ -3316,6 +3328,19 @@ raw_expression_tree_walker(Node *node, return true; } break; + case T_RangeTableSample: + { + RangeTableSample *rts = (RangeTableSample *) node; + + if (walker(rts->relation, context)) + return true; + /* method name is deemed uninteresting */ + if (walker(rts->args, context)) + return true; + if (walker(rts->repeatable, context)) + return true; + } + break; case T_TypeName: { TypeName *tn = (TypeName *) node; @@ -3380,18 +3405,6 @@ raw_expression_tree_walker(Node *node, break; case T_CommonTableExpr: return walker(((CommonTableExpr *) node)->ctequery, context); - case T_RangeTableSample: - { - RangeTableSample *rts = (RangeTableSample *) node; - - if (walker(rts->relation, context)) - return true; - if (walker(rts->repeatable, context)) - return true; - if (walker(rts->args, context)) - return true; - } - break; default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 87304ba9bf..81725d6e59 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -444,6 +444,16 @@ _outSeqScan(StringInfo str, const SeqScan *node) _outScanInfo(str, (const Scan *) node); } +static void +_outSampleScan(StringInfo str, const SampleScan *node) +{ + WRITE_NODE_TYPE("SAMPLESCAN"); + + _outScanInfo(str, (const Scan *) node); + + WRITE_NODE_FIELD(tablesample); +} + static void _outIndexScan(StringInfo str, const IndexScan *node) { @@ -591,14 +601,6 @@ _outCustomScan(StringInfo str, const CustomScan *node) node->methods->TextOutCustomScan(str, node); } -static void -_outSampleScan(StringInfo str, const SampleScan *node) -{ - WRITE_NODE_TYPE("SAMPLESCAN"); - - _outScanInfo(str, (const Scan *) node); -} - static void _outJoin(StringInfo str, const Join *node) { @@ -2478,36 +2480,6 @@ _outCommonTableExpr(StringInfo str, const CommonTableExpr *node) 
WRITE_NODE_FIELD(ctecolcollations); } -static void -_outRangeTableSample(StringInfo str, const RangeTableSample *node) -{ - WRITE_NODE_TYPE("RANGETABLESAMPLE"); - - WRITE_NODE_FIELD(relation); - WRITE_STRING_FIELD(method); - WRITE_NODE_FIELD(repeatable); - WRITE_NODE_FIELD(args); -} - -static void -_outTableSampleClause(StringInfo str, const TableSampleClause *node) -{ - WRITE_NODE_TYPE("TABLESAMPLECLAUSE"); - - WRITE_OID_FIELD(tsmid); - WRITE_BOOL_FIELD(tsmseqscan); - WRITE_BOOL_FIELD(tsmpagemode); - WRITE_OID_FIELD(tsminit); - WRITE_OID_FIELD(tsmnextblock); - WRITE_OID_FIELD(tsmnexttuple); - WRITE_OID_FIELD(tsmexaminetuple); - WRITE_OID_FIELD(tsmend); - WRITE_OID_FIELD(tsmreset); - WRITE_OID_FIELD(tsmcost); - WRITE_NODE_FIELD(repeatable); - WRITE_NODE_FIELD(args); -} - static void _outSetOperationStmt(StringInfo str, const SetOperationStmt *node) { @@ -2594,6 +2566,16 @@ _outRangeTblFunction(StringInfo str, const RangeTblFunction *node) WRITE_BITMAPSET_FIELD(funcparams); } +static void +_outTableSampleClause(StringInfo str, const TableSampleClause *node) +{ + WRITE_NODE_TYPE("TABLESAMPLECLAUSE"); + + WRITE_OID_FIELD(tsmhandler); + WRITE_NODE_FIELD(args); + WRITE_NODE_FIELD(repeatable); +} + static void _outAExpr(StringInfo str, const A_Expr *node) { @@ -2845,6 +2827,18 @@ _outRangeFunction(StringInfo str, const RangeFunction *node) WRITE_NODE_FIELD(coldeflist); } +static void +_outRangeTableSample(StringInfo str, const RangeTableSample *node) +{ + WRITE_NODE_TYPE("RANGETABLESAMPLE"); + + WRITE_NODE_FIELD(relation); + WRITE_NODE_FIELD(method); + WRITE_NODE_FIELD(args); + WRITE_NODE_FIELD(repeatable); + WRITE_LOCATION_FIELD(location); +} + static void _outConstraint(StringInfo str, const Constraint *node) { @@ -3002,6 +2996,9 @@ _outNode(StringInfo str, const void *obj) case T_SeqScan: _outSeqScan(str, obj); break; + case T_SampleScan: + _outSampleScan(str, obj); + break; case T_IndexScan: _outIndexScan(str, obj); break; @@ -3038,9 +3035,6 @@ _outNode(StringInfo 
str, const void *obj) case T_CustomScan: _outCustomScan(str, obj); break; - case T_SampleScan: - _outSampleScan(str, obj); - break; case T_Join: _outJoin(str, obj); break; @@ -3393,12 +3387,6 @@ _outNode(StringInfo str, const void *obj) case T_CommonTableExpr: _outCommonTableExpr(str, obj); break; - case T_RangeTableSample: - _outRangeTableSample(str, obj); - break; - case T_TableSampleClause: - _outTableSampleClause(str, obj); - break; case T_SetOperationStmt: _outSetOperationStmt(str, obj); break; @@ -3408,6 +3396,9 @@ _outNode(StringInfo str, const void *obj) case T_RangeTblFunction: _outRangeTblFunction(str, obj); break; + case T_TableSampleClause: + _outTableSampleClause(str, obj); + break; case T_A_Expr: _outAExpr(str, obj); break; @@ -3450,6 +3441,9 @@ _outNode(StringInfo str, const void *obj) case T_RangeFunction: _outRangeFunction(str, obj); break; + case T_RangeTableSample: + _outRangeTableSample(str, obj); + break; case T_Constraint: _outConstraint(str, obj); break; diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index f5a40fbfb4..71be840eac 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -367,46 +367,6 @@ _readCommonTableExpr(void) READ_DONE(); } -/* - * _readRangeTableSample - */ -static RangeTableSample * -_readRangeTableSample(void) -{ - READ_LOCALS(RangeTableSample); - - READ_NODE_FIELD(relation); - READ_STRING_FIELD(method); - READ_NODE_FIELD(repeatable); - READ_NODE_FIELD(args); - - READ_DONE(); -} - -/* - * _readTableSampleClause - */ -static TableSampleClause * -_readTableSampleClause(void) -{ - READ_LOCALS(TableSampleClause); - - READ_OID_FIELD(tsmid); - READ_BOOL_FIELD(tsmseqscan); - READ_BOOL_FIELD(tsmpagemode); - READ_OID_FIELD(tsminit); - READ_OID_FIELD(tsmnextblock); - READ_OID_FIELD(tsmnexttuple); - READ_OID_FIELD(tsmexaminetuple); - READ_OID_FIELD(tsmend); - READ_OID_FIELD(tsmreset); - READ_OID_FIELD(tsmcost); - READ_NODE_FIELD(repeatable); - READ_NODE_FIELD(args); - - 
READ_DONE(); -} - /* * _readSetOperationStmt */ @@ -1391,6 +1351,21 @@ _readRangeTblFunction(void) READ_DONE(); } +/* + * _readTableSampleClause + */ +static TableSampleClause * +_readTableSampleClause(void) +{ + READ_LOCALS(TableSampleClause); + + READ_OID_FIELD(tsmhandler); + READ_NODE_FIELD(args); + READ_NODE_FIELD(repeatable); + + READ_DONE(); +} + /* * parseNodeString @@ -1426,10 +1401,6 @@ parseNodeString(void) return_value = _readRowMarkClause(); else if (MATCH("COMMONTABLEEXPR", 15)) return_value = _readCommonTableExpr(); - else if (MATCH("RANGETABLESAMPLE", 16)) - return_value = _readRangeTableSample(); - else if (MATCH("TABLESAMPLECLAUSE", 17)) - return_value = _readTableSampleClause(); else if (MATCH("SETOPERATIONSTMT", 16)) return_value = _readSetOperationStmt(); else if (MATCH("ALIAS", 5)) @@ -1528,6 +1499,8 @@ parseNodeString(void) return_value = _readRangeTblEntry(); else if (MATCH("RANGETBLFUNCTION", 16)) return_value = _readRangeTblFunction(); + else if (MATCH("TABLESAMPLECLAUSE", 17)) + return_value = _readTableSampleClause(); else if (MATCH("NOTIFY", 6)) return_value = _readNotifyStmt(); else if (MATCH("DECLARECURSOR", 13)) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 888eeac515..1590be1167 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -18,6 +18,7 @@ #include #include "access/sysattr.h" +#include "access/tsmapi.h" #include "catalog/pg_class.h" #include "catalog/pg_operator.h" #include "foreign/fdwapi.h" @@ -390,7 +391,7 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, } else if (rte->tablesample != NULL) { - /* Build sample scan on relation */ + /* Sampled relation */ set_tablesample_rel_pathlist(root, rel, rte); } else @@ -480,11 +481,40 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) /* * set_tablesample_rel_size - * Set size estimates for a sampled relation. 
+ * Set size estimates for a sampled relation */ static void set_tablesample_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { + TableSampleClause *tsc = rte->tablesample; + TsmRoutine *tsm; + BlockNumber pages; + double tuples; + + /* + * Test any partial indexes of rel for applicability. We must do this + * first since partial unique indexes can affect size estimates. + */ + check_partial_indexes(root, rel); + + /* + * Call the sampling method's estimation function to estimate the number + * of pages it will read and the number of tuples it will return. (Note: + * we assume the function returns sane values.) + */ + tsm = GetTsmRoutine(tsc->tsmhandler); + tsm->SampleScanGetSampleSize(root, rel, tsc->args, + &pages, &tuples); + + /* + * For the moment, because we will only consider a SampleScan path for the + * rel, it's okay to just overwrite the pages and tuples estimates for the + * whole relation. If we ever consider multiple path types for sampled + * rels, we'll need more complication. + */ + rel->pages = pages; + rel->tuples = tuples; + /* Mark rel with estimated output rows, width, etc */ set_baserel_size_estimates(root, rel); } @@ -492,8 +522,6 @@ set_tablesample_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) /* * set_tablesample_rel_pathlist * Build access paths for a sampled relation - * - * There is only one possible path - sampling scan */ static void set_tablesample_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) @@ -502,15 +530,41 @@ set_tablesample_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry * Path *path; /* - * We don't support pushing join clauses into the quals of a seqscan, but - * it could still have required parameterization due to LATERAL refs in - * its tlist. + * We don't support pushing join clauses into the quals of a samplescan, + * but it could still have required parameterization due to LATERAL refs + * in its tlist or TABLESAMPLE arguments. 
*/ required_outer = rel->lateral_relids; - /* We only do sample scan if it was requested */ + /* Consider sampled scan */ path = create_samplescan_path(root, rel, required_outer); - rel->pathlist = list_make1(path); + + /* + * If the sampling method does not support repeatable scans, we must avoid + * plans that would scan the rel multiple times. Ideally, we'd simply + * avoid putting the rel on the inside of a nestloop join; but adding such + * a consideration to the planner seems like a great deal of complication + * to support an uncommon usage of second-rate sampling methods. Instead, + * if there is a risk that the query might perform an unsafe join, just + * wrap the SampleScan in a Materialize node. We can check for joins by + * counting the membership of all_baserels (note that this correctly + * counts inheritance trees as single rels). If we're inside a subquery, + * we can't easily check whether a join might occur in the outer query, so + * just assume one is possible. + * + * GetTsmRoutine is relatively expensive compared to the other tests here, + * so check repeatable_across_scans last, even though that's a bit odd. 
+ */ + if ((root->query_level > 1 || + bms_membership(root->all_baserels) != BMS_SINGLETON) && + !(GetTsmRoutine(rte->tablesample->tsmhandler)->repeatable_across_scans)) + { + path = (Path *) create_material_path(rel, path); + } + + add_path(rel, path); + + /* For the moment, at least, there are no other paths to consider */ } /* @@ -2450,7 +2504,33 @@ print_path(PlannerInfo *root, Path *path, int indent) switch (nodeTag(path)) { case T_Path: - ptype = "SeqScan"; + switch (path->pathtype) + { + case T_SeqScan: + ptype = "SeqScan"; + break; + case T_SampleScan: + ptype = "SampleScan"; + break; + case T_SubqueryScan: + ptype = "SubqueryScan"; + break; + case T_FunctionScan: + ptype = "FunctionScan"; + break; + case T_ValuesScan: + ptype = "ValuesScan"; + break; + case T_CteScan: + ptype = "CteScan"; + break; + case T_WorkTableScan: + ptype = "WorkTableScan"; + break; + default: + ptype = "???Path"; + break; + } break; case T_IndexPath: ptype = "IdxScan"; diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 0d302f66be..7069f60411 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -74,6 +74,7 @@ #include #include "access/htup_details.h" +#include "access/tsmapi.h" #include "executor/executor.h" #include "executor/nodeHash.h" #include "miscadmin.h" @@ -223,64 +224,66 @@ cost_seqscan(Path *path, PlannerInfo *root, * cost_samplescan * Determines and returns the cost of scanning a relation using sampling. * - * From planner/optimizer perspective, we don't care all that much about cost - * itself since there is always only one scan path to consider when sampling - * scan is present, but number of rows estimation is still important. 
- * * 'baserel' is the relation to be scanned * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL */ void -cost_samplescan(Path *path, PlannerInfo *root, RelOptInfo *baserel) +cost_samplescan(Path *path, PlannerInfo *root, + RelOptInfo *baserel, ParamPathInfo *param_info) { Cost startup_cost = 0; Cost run_cost = 0; + RangeTblEntry *rte; + TableSampleClause *tsc; + TsmRoutine *tsm; double spc_seq_page_cost, spc_random_page_cost, spc_page_cost; QualCost qpqual_cost; Cost cpu_per_tuple; - BlockNumber pages; - double tuples; - RangeTblEntry *rte = planner_rt_fetch(baserel->relid, root); - TableSampleClause *tablesample = rte->tablesample; - /* Should only be applied to base relations */ + /* Should only be applied to base relations with tablesample clauses */ Assert(baserel->relid > 0); - Assert(baserel->rtekind == RTE_RELATION); + rte = planner_rt_fetch(baserel->relid, root); + Assert(rte->rtekind == RTE_RELATION); + tsc = rte->tablesample; + Assert(tsc != NULL); + tsm = GetTsmRoutine(tsc->tsmhandler); /* Mark the path with the correct row estimate */ - if (path->param_info) - path->rows = path->param_info->ppi_rows; + if (param_info) + path->rows = param_info->ppi_rows; else path->rows = baserel->rows; - /* Call the sampling method's costing function. */ - OidFunctionCall6(tablesample->tsmcost, PointerGetDatum(root), - PointerGetDatum(path), PointerGetDatum(baserel), - PointerGetDatum(tablesample->args), - PointerGetDatum(&pages), PointerGetDatum(&tuples)); - /* fetch estimated page cost for tablespace containing table */ get_tablespace_page_costs(baserel->reltablespace, &spc_random_page_cost, &spc_seq_page_cost); - - spc_page_cost = tablesample->tsmseqscan ? spc_seq_page_cost : - spc_random_page_cost; + /* if NextSampleBlock is used, assume random access, else sequential */ + spc_page_cost = (tsm->NextSampleBlock != NULL) ? 
+ spc_random_page_cost : spc_seq_page_cost; /* - * disk costs + * disk costs (recall that baserel->pages has already been set to the + * number of pages the sampling method will visit) */ - run_cost += spc_page_cost * pages; + run_cost += spc_page_cost * baserel->pages; - /* CPU costs */ - get_restriction_qual_cost(root, baserel, path->param_info, &qpqual_cost); + /* + * CPU costs (recall that baserel->tuples has already been set to the + * number of tuples the sampling method will select). Note that we ignore + * execution cost of the TABLESAMPLE parameter expressions; they will be + * evaluated only once per scan, and in most usages they'll likely be + * simple constants anyway. We also don't charge anything for the + * calculations the sampling method might do internally. + */ + get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; - run_cost += cpu_per_tuple * tuples; + run_cost += cpu_per_tuple * baserel->tuples; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 8d15c8ede9..f461586e08 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -102,7 +102,8 @@ static List *order_qual_clauses(PlannerInfo *root, List *clauses); static void copy_path_costsize(Plan *dest, Path *src); static void copy_plan_costsize(Plan *dest, Plan *src); static SeqScan *make_seqscan(List *qptlist, List *qpqual, Index scanrelid); -static SampleScan *make_samplescan(List *qptlist, List *qpqual, Index scanrelid); +static SampleScan *make_samplescan(List *qptlist, List *qpqual, Index scanrelid, + TableSampleClause *tsc); static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig, List *indexorderby, List *indexorderbyorig, @@ -1148,7 +1149,7 
@@ create_seqscan_plan(PlannerInfo *root, Path *best_path, /* * create_samplescan_plan - * Returns a samplecan plan for the base relation scanned by 'best_path' + * Returns a samplescan plan for the base relation scanned by 'best_path' * with restriction clauses 'scan_clauses' and targetlist 'tlist'. */ static SampleScan * @@ -1157,11 +1158,15 @@ create_samplescan_plan(PlannerInfo *root, Path *best_path, { SampleScan *scan_plan; Index scan_relid = best_path->parent->relid; + RangeTblEntry *rte; + TableSampleClause *tsc; - /* it should be a base rel with tablesample clause... */ + /* it should be a base rel with a tablesample clause... */ Assert(scan_relid > 0); - Assert(best_path->parent->rtekind == RTE_RELATION); - Assert(best_path->pathtype == T_SampleScan); + rte = planner_rt_fetch(scan_relid, root); + Assert(rte->rtekind == RTE_RELATION); + tsc = rte->tablesample; + Assert(tsc != NULL); /* Sort clauses into best execution order */ scan_clauses = order_qual_clauses(root, scan_clauses); @@ -1174,13 +1179,16 @@ create_samplescan_plan(PlannerInfo *root, Path *best_path, { scan_clauses = (List *) replace_nestloop_params(root, (Node *) scan_clauses); + tsc = (TableSampleClause *) + replace_nestloop_params(root, (Node *) tsc); } scan_plan = make_samplescan(tlist, scan_clauses, - scan_relid); + scan_relid, + tsc); - copy_path_costsize(&scan_plan->plan, best_path); + copy_path_costsize(&scan_plan->scan.plan, best_path); return scan_plan; } @@ -2161,9 +2169,9 @@ create_customscan_plan(PlannerInfo *root, CustomPath *best_path, ListCell *lc; /* Recursively transform child paths. 
*/ - foreach (lc, best_path->custom_paths) + foreach(lc, best_path->custom_paths) { - Plan *plan = create_plan_recurse(root, (Path *) lfirst(lc)); + Plan *plan = create_plan_recurse(root, (Path *) lfirst(lc)); custom_plans = lappend(custom_plans, plan); } @@ -3437,17 +3445,19 @@ make_seqscan(List *qptlist, static SampleScan * make_samplescan(List *qptlist, List *qpqual, - Index scanrelid) + Index scanrelid, + TableSampleClause *tsc) { SampleScan *node = makeNode(SampleScan); - Plan *plan = &node->plan; + Plan *plan = &node->scan.plan; /* cost should be inserted by caller */ plan->targetlist = qptlist; plan->qual = qpqual; plan->lefttree = NULL; plan->righttree = NULL; - node->scanrelid = scanrelid; + node->scan.scanrelid = scanrelid; + node->tablesample = tsc; return node; } diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 00b2625d34..701b99254d 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -306,7 +306,9 @@ extract_lateral_references(PlannerInfo *root, RelOptInfo *brel, Index rtindex) return; /* Fetch the appropriate variables */ - if (rte->rtekind == RTE_SUBQUERY) + if (rte->rtekind == RTE_RELATION) + vars = pull_vars_of_level((Node *) rte->tablesample, 0); + else if (rte->rtekind == RTE_SUBQUERY) vars = pull_vars_of_level((Node *) rte->subquery, 1); else if (rte->rtekind == RTE_FUNCTION) vars = pull_vars_of_level((Node *) rte->functions, 0); diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index a6ce96efc4..b95cc95e5d 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -505,14 +505,10 @@ subquery_planner(PlannerGlobal *glob, Query *parse, if (rte->rtekind == RTE_RELATION) { if (rte->tablesample) - { - rte->tablesample->args = (List *) - preprocess_expression(root, (Node *) rte->tablesample->args, + rte->tablesample = (TableSampleClause *) + preprocess_expression(root, + (Node *) 
rte->tablesample, EXPRKIND_TABLESAMPLE); - rte->tablesample->repeatable = (Node *) - preprocess_expression(root, rte->tablesample->repeatable, - EXPRKIND_TABLESAMPLE); - } } else if (rte->rtekind == RTE_SUBQUERY) { @@ -697,11 +693,14 @@ preprocess_expression(PlannerInfo *root, Node *expr, int kind) * If the query has any join RTEs, replace join alias variables with * base-relation variables. We must do this before sublink processing, * else sublinks expanded out from join aliases would not get processed. - * We can skip it in non-lateral RTE functions and VALUES lists, however, - * since they can't contain any Vars of the current query level. + * We can skip it in non-lateral RTE functions, VALUES lists, and + * TABLESAMPLE clauses, however, since they can't contain any Vars of the + * current query level. */ if (root->hasJoinRTEs && - !(kind == EXPRKIND_RTFUNC || kind == EXPRKIND_VALUES)) + !(kind == EXPRKIND_RTFUNC || + kind == EXPRKIND_VALUES || + kind == EXPRKIND_TABLESAMPLE)) expr = flatten_join_alias_vars(root, expr); /* diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 258e541754..ea185d4b4c 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -372,9 +372,8 @@ flatten_rtes_walker(Node *node, PlannerGlobal *glob) * * In the flat rangetable, we zero out substructure pointers that are not * needed by the executor; this reduces the storage space and copying cost - * for cached plans. We keep only the tablesample field (which we'd otherwise - * have to put in the plan tree, anyway); the ctename, alias and eref Alias - * fields, which are needed by EXPLAIN; and the selectedCols, insertedCols and + * for cached plans. We keep only the ctename, alias and eref Alias fields, + * which are needed by EXPLAIN, and the selectedCols, insertedCols and * updatedCols bitmaps, which are needed for executor-startup permissions * checking and for trigger event checking. 
*/ @@ -388,6 +387,7 @@ add_rte_to_flat_rtable(PlannerGlobal *glob, RangeTblEntry *rte) memcpy(newrte, rte, sizeof(RangeTblEntry)); /* zap unneeded sub-structure */ + newrte->tablesample = NULL; newrte->subquery = NULL; newrte->joinaliasvars = NIL; newrte->functions = NIL; @@ -456,11 +456,13 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) { SampleScan *splan = (SampleScan *) plan; - splan->scanrelid += rtoffset; - splan->plan.targetlist = - fix_scan_list(root, splan->plan.targetlist, rtoffset); - splan->plan.qual = - fix_scan_list(root, splan->plan.qual, rtoffset); + splan->scan.scanrelid += rtoffset; + splan->scan.plan.targetlist = + fix_scan_list(root, splan->scan.plan.targetlist, rtoffset); + splan->scan.plan.qual = + fix_scan_list(root, splan->scan.plan.qual, rtoffset); + splan->tablesample = (TableSampleClause *) + fix_scan_expr(root, (Node *) splan->tablesample, rtoffset); } break; case T_IndexScan: diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 4708b87f33..f3038cdffd 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2216,7 +2216,12 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, break; case T_SeqScan: + context.paramids = bms_add_members(context.paramids, scan_params); + break; + case T_SampleScan: + finalize_primnode((Node *) ((SampleScan *) plan)->tablesample, + &context); context.paramids = bms_add_members(context.paramids, scan_params); break; @@ -2384,7 +2389,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, scan_params); /* child nodes if any */ - foreach (lc, cscan->custom_plans) + foreach(lc, cscan->custom_plans) { context.paramids = bms_add_members(context.paramids, diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 92b0562843..34144ccaf0 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ 
b/src/backend/optimizer/prep/prepjointree.c @@ -1091,12 +1091,15 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, switch (child_rte->rtekind) { + case RTE_RELATION: + if (child_rte->tablesample) + child_rte->lateral = true; + break; case RTE_SUBQUERY: case RTE_FUNCTION: case RTE_VALUES: child_rte->lateral = true; break; - case RTE_RELATION: case RTE_JOIN: case RTE_CTE: /* these can't contain any lateral references */ @@ -1909,6 +1912,13 @@ replace_vars_in_jointree(Node *jtnode, { switch (rte->rtekind) { + case RTE_RELATION: + /* shouldn't be marked LATERAL unless tablesample */ + Assert(rte->tablesample); + rte->tablesample = (TableSampleClause *) + pullup_replace_vars((Node *) rte->tablesample, + context); + break; case RTE_SUBQUERY: rte->subquery = pullup_replace_vars_subquery(rte->subquery, @@ -1924,7 +1934,6 @@ replace_vars_in_jointree(Node *jtnode, pullup_replace_vars((Node *) rte->values_lists, context); break; - case RTE_RELATION: case RTE_JOIN: case RTE_CTE: /* these shouldn't be marked LATERAL */ diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index f7f33bbe77..935bc2b966 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -713,7 +713,7 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer) /* * create_samplescan_path - * Like seqscan but uses sampling function while scanning. + * Creates a path node for a sampled table scan. 
*/ Path * create_samplescan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer) @@ -726,7 +726,7 @@ create_samplescan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer required_outer); pathnode->pathkeys = NIL; /* samplescan has unordered result */ - cost_samplescan(pathnode, root, rel); + cost_samplescan(pathnode, root, rel, pathnode->param_info); return pathnode; } @@ -1773,6 +1773,8 @@ reparameterize_path(PlannerInfo *root, Path *path, { case T_SeqScan: return create_seqscan_path(root, rel, required_outer); + case T_SampleScan: + return (Path *) create_samplescan_path(root, rel, required_outer); case T_IndexScan: case T_IndexOnlyScan: { @@ -1805,8 +1807,6 @@ reparameterize_path(PlannerInfo *root, Path *path, case T_SubqueryScan: return create_subqueryscan_path(root, rel, path->pathkeys, required_outer); - case T_SampleScan: - return (Path *) create_samplescan_path(root, rel, required_outer); default: break; } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 2b02a2e523..8f053e47e8 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -457,8 +457,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type joined_table %type relation_expr %type relation_expr_opt_alias +%type tablesample_clause opt_repeatable_clause %type target_el single_set_clause set_target insert_column_item -%type relation_expr_tablesample tablesample_clause opt_repeatable_clause %type generic_option_name %type generic_option_arg @@ -10491,9 +10491,13 @@ table_ref: relation_expr opt_alias_clause $1->alias = $2; $$ = (Node *) $1; } - | relation_expr_tablesample + | relation_expr opt_alias_clause tablesample_clause { - $$ = (Node *) $1; + RangeTableSample *n = (RangeTableSample *) $3; + $1->alias = $2; + /* relation_expr goes inside the RangeTableSample node */ + n->relation = (Node *) $1; + $$ = (Node *) n; } | func_table func_alias_clause { @@ -10820,23 +10824,18 @@ relation_expr_opt_alias: 
relation_expr %prec UMINUS } ; - -relation_expr_tablesample: relation_expr opt_alias_clause tablesample_clause - { - RangeTableSample *n = (RangeTableSample *) $3; - n->relation = $1; - n->relation->alias = $2; - $$ = (Node *) n; - } - ; - +/* + * TABLESAMPLE decoration in a FROM item + */ tablesample_clause: - TABLESAMPLE ColId '(' expr_list ')' opt_repeatable_clause + TABLESAMPLE func_name '(' expr_list ')' opt_repeatable_clause { RangeTableSample *n = makeNode(RangeTableSample); + /* n->relation will be filled in later */ n->method = $2; n->args = $4; n->repeatable = $6; + n->location = @2; $$ = (Node *) n; } ; diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c index e90e1d68e3..4e490b23b4 100644 --- a/src/backend/parser/parse_clause.c +++ b/src/backend/parser/parse_clause.c @@ -18,8 +18,8 @@ #include "miscadmin.h" #include "access/heapam.h" +#include "access/tsmapi.h" #include "catalog/catalog.h" -#include "access/htup_details.h" #include "catalog/heap.h" #include "catalog/pg_constraint.h" #include "catalog/pg_type.h" @@ -43,7 +43,7 @@ #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/rel.h" -#include "utils/syscache.h" + /* Convenience macro for the most common makeNamespaceItem() case */ #define makeDefaultNSItem(rte) makeNamespaceItem(rte, true, true, false, true) @@ -63,6 +63,8 @@ static RangeTblEntry *transformRangeSubselect(ParseState *pstate, RangeSubselect *r); static RangeTblEntry *transformRangeFunction(ParseState *pstate, RangeFunction *r); +static TableSampleClause *transformRangeTableSample(ParseState *pstate, + RangeTableSample *rts); static Node *transformFromClauseItem(ParseState *pstate, Node *n, RangeTblEntry **top_rte, int *top_rti, List **namespace); @@ -423,40 +425,6 @@ transformJoinOnClause(ParseState *pstate, JoinExpr *j, List *namespace) return result; } -static RangeTblEntry * -transformTableSampleEntry(ParseState *pstate, RangeTableSample *rv) -{ - RangeTblEntry *rte = NULL; - 
CommonTableExpr *cte = NULL; - TableSampleClause *tablesample = NULL; - - /* if relation has an unqualified name, it might be a CTE reference */ - if (!rv->relation->schemaname) - { - Index levelsup; - - cte = scanNameSpaceForCTE(pstate, rv->relation->relname, &levelsup); - } - - /* We first need to build a range table entry */ - if (!cte) - rte = transformTableEntry(pstate, rv->relation); - - if (!rte || - (rte->relkind != RELKIND_RELATION && - rte->relkind != RELKIND_MATVIEW)) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("TABLESAMPLE clause can only be used on tables and materialized views"), - parser_errposition(pstate, rv->relation->location))); - - tablesample = ParseTableSample(pstate, rv->method, rv->repeatable, - rv->args, rv->relation->location); - rte->tablesample = tablesample; - - return rte; -} - /* * transformTableEntry --- transform a RangeVar (simple relation reference) */ @@ -748,6 +716,109 @@ transformRangeFunction(ParseState *pstate, RangeFunction *r) return rte; } +/* + * transformRangeTableSample --- transform a TABLESAMPLE clause + * + * Caller has already transformed rts->relation, we just have to validate + * the remaining fields and create a TableSampleClause node. + */ +static TableSampleClause * +transformRangeTableSample(ParseState *pstate, RangeTableSample *rts) +{ + TableSampleClause *tablesample; + Oid handlerOid; + Oid funcargtypes[1]; + TsmRoutine *tsm; + List *fargs; + ListCell *larg, + *ltyp; + + /* + * To validate the sample method name, look up the handler function, which + * has the same name, one dummy INTERNAL argument, and a result type of + * tsm_handler. (Note: tablesample method names are not schema-qualified + * in the SQL standard; but since they are just functions to us, we allow + * schema qualification to resolve any potential ambiguity.) 
+ */ + funcargtypes[0] = INTERNALOID; + + handlerOid = LookupFuncName(rts->method, 1, funcargtypes, true); + + /* we want error to complain about no-such-method, not no-such-function */ + if (!OidIsValid(handlerOid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("tablesample method %s does not exist", + NameListToString(rts->method)), + parser_errposition(pstate, rts->location))); + + /* check that handler has correct return type */ + if (get_func_rettype(handlerOid) != TSM_HANDLEROID) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("function %s must return type \"tsm_handler\"", + NameListToString(rts->method)), + parser_errposition(pstate, rts->location))); + + /* OK, run the handler to get TsmRoutine, for argument type info */ + tsm = GetTsmRoutine(handlerOid); + + tablesample = makeNode(TableSampleClause); + tablesample->tsmhandler = handlerOid; + + /* check user provided the expected number of arguments */ + if (list_length(rts->args) != list_length(tsm->parameterTypes)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg_plural("tablesample method %s requires %d argument, not %d", + "tablesample method %s requires %d arguments, not %d", + list_length(tsm->parameterTypes), + NameListToString(rts->method), + list_length(tsm->parameterTypes), + list_length(rts->args)), + parser_errposition(pstate, rts->location))); + + /* + * Transform the arguments, typecasting them as needed. Note we must also + * assign collations now, because assign_query_collations() doesn't + * examine any substructure of RTEs. 
+ */ + fargs = NIL; + forboth(larg, rts->args, ltyp, tsm->parameterTypes) + { + Node *arg = (Node *) lfirst(larg); + Oid argtype = lfirst_oid(ltyp); + + arg = transformExpr(pstate, arg, EXPR_KIND_FROM_FUNCTION); + arg = coerce_to_specific_type(pstate, arg, argtype, "TABLESAMPLE"); + assign_expr_collations(pstate, arg); + fargs = lappend(fargs, arg); + } + tablesample->args = fargs; + + /* Process REPEATABLE (seed) */ + if (rts->repeatable != NULL) + { + Node *arg; + + if (!tsm->repeatable_across_queries) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("tablesample method %s does not support REPEATABLE", + NameListToString(rts->method)), + parser_errposition(pstate, rts->location))); + + arg = transformExpr(pstate, rts->repeatable, EXPR_KIND_FROM_FUNCTION); + arg = coerce_to_specific_type(pstate, arg, FLOAT8OID, "REPEATABLE"); + assign_expr_collations(pstate, arg); + tablesample->repeatable = (Expr *) arg; + } + else + tablesample->repeatable = NULL; + + return tablesample; +} + /* * transformFromClauseItem - @@ -844,6 +915,33 @@ transformFromClauseItem(ParseState *pstate, Node *n, rtr->rtindex = rtindex; return (Node *) rtr; } + else if (IsA(n, RangeTableSample)) + { + /* TABLESAMPLE clause (wrapping some other valid FROM node) */ + RangeTableSample *rts = (RangeTableSample *) n; + Node *rel; + RangeTblRef *rtr; + RangeTblEntry *rte; + + /* Recursively transform the contained relation */ + rel = transformFromClauseItem(pstate, rts->relation, + top_rte, top_rti, namespace); + /* Currently, grammar could only return a RangeVar as contained rel */ + Assert(IsA(rel, RangeTblRef)); + rtr = (RangeTblRef *) rel; + rte = rt_fetch(rtr->rtindex, pstate->p_rtable); + /* We only support this on plain relations and matviews */ + if (rte->relkind != RELKIND_RELATION && + rte->relkind != RELKIND_MATVIEW) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("TABLESAMPLE clause can only be applied to tables and materialized views"), + 
parser_errposition(pstate, exprLocation(rts->relation)))); + + /* Transform TABLESAMPLE details and attach to the RTE */ + rte->tablesample = transformRangeTableSample(pstate, rts); + return (Node *) rtr; + } else if (IsA(n, JoinExpr)) { /* A newfangled join expression */ @@ -1165,26 +1263,6 @@ transformFromClauseItem(ParseState *pstate, Node *n, return (Node *) j; } - else if (IsA(n, RangeTableSample)) - { - /* Tablesample reference */ - RangeTableSample *rv = (RangeTableSample *) n; - RangeTblRef *rtr; - RangeTblEntry *rte = NULL; - int rtindex; - - rte = transformTableSampleEntry(pstate, rv); - - /* assume new rte is at end */ - rtindex = list_length(pstate->p_rtable); - Assert(rte == rt_fetch(rtindex, pstate->p_rtable)); - *top_rte = rte; - *top_rti = rtindex; - *namespace = list_make1(makeDefaultNSItem(rte)); - rtr = makeNode(RangeTblRef); - rtr->rtindex = rtindex; - return (Node *) rtr; - } else elog(ERROR, "unrecognized node type: %d", (int) nodeTag(n)); return NULL; /* can't get here, keep compiler quiet */ diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index 430baff116..554ca9d8c4 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -18,7 +18,6 @@ #include "catalog/pg_aggregate.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" -#include "catalog/pg_tablesample_method.h" #include "funcapi.h" #include "lib/stringinfo.h" #include "nodes/makefuncs.h" @@ -27,7 +26,6 @@ #include "parser/parse_clause.h" #include "parser/parse_coerce.h" #include "parser/parse_func.h" -#include "parser/parse_expr.h" #include "parser/parse_relation.h" #include "parser/parse_target.h" #include "parser/parse_type.h" @@ -769,148 +767,6 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs, } -/* - * ParseTableSample - * - * Parse TABLESAMPLE clause and process the arguments - */ -TableSampleClause * -ParseTableSample(ParseState *pstate, char *samplemethod, Node *repeatable, - List *sampleargs, 
int location) -{ - HeapTuple tuple; - Form_pg_tablesample_method tsm; - Form_pg_proc procform; - TableSampleClause *tablesample; - List *fargs; - ListCell *larg; - int nargs, - initnargs; - Oid init_arg_types[FUNC_MAX_ARGS]; - - /* Load the tablesample method */ - tuple = SearchSysCache1(TABLESAMPLEMETHODNAME, PointerGetDatum(samplemethod)); - if (!HeapTupleIsValid(tuple)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("tablesample method \"%s\" does not exist", - samplemethod), - parser_errposition(pstate, location))); - - tablesample = makeNode(TableSampleClause); - tablesample->tsmid = HeapTupleGetOid(tuple); - - tsm = (Form_pg_tablesample_method) GETSTRUCT(tuple); - - tablesample->tsmseqscan = tsm->tsmseqscan; - tablesample->tsmpagemode = tsm->tsmpagemode; - tablesample->tsminit = tsm->tsminit; - tablesample->tsmnextblock = tsm->tsmnextblock; - tablesample->tsmnexttuple = tsm->tsmnexttuple; - tablesample->tsmexaminetuple = tsm->tsmexaminetuple; - tablesample->tsmend = tsm->tsmend; - tablesample->tsmreset = tsm->tsmreset; - tablesample->tsmcost = tsm->tsmcost; - - ReleaseSysCache(tuple); - - /* Validate the parameters against init function definition. */ - tuple = SearchSysCache1(PROCOID, - ObjectIdGetDatum(tablesample->tsminit)); - - if (!HeapTupleIsValid(tuple)) /* should not happen */ - elog(ERROR, "cache lookup failed for function %u", - tablesample->tsminit); - - procform = (Form_pg_proc) GETSTRUCT(tuple); - initnargs = procform->pronargs; - Assert(initnargs >= 3); - - /* - * First parameter is used to pass the SampleScanState, second is seed - * (REPEATABLE), skip the processing for them here, just assert that the - * types are correct. 
- */ - Assert(procform->proargtypes.values[0] == INTERNALOID); - Assert(procform->proargtypes.values[1] == INT4OID); - initnargs -= 2; - memcpy(init_arg_types, procform->proargtypes.values + 2, - initnargs * sizeof(Oid)); - - /* Now we are done with the catalog */ - ReleaseSysCache(tuple); - - /* Process repeatable (seed) */ - if (repeatable != NULL) - { - Node *arg = repeatable; - - if (arg && IsA(arg, A_Const)) - { - A_Const *con = (A_Const *) arg; - - if (con->val.type == T_Null) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("REPEATABLE clause must be NOT NULL numeric value"), - parser_errposition(pstate, con->location))); - - } - - arg = transformExpr(pstate, arg, EXPR_KIND_FROM_FUNCTION); - arg = coerce_to_specific_type(pstate, arg, INT4OID, "REPEATABLE"); - tablesample->repeatable = arg; - } - else - tablesample->repeatable = NULL; - - /* Check user provided expected number of arguments. */ - if (list_length(sampleargs) != initnargs) - ereport(ERROR, - (errcode(ERRCODE_TOO_MANY_ARGUMENTS), - errmsg_plural("tablesample method \"%s\" expects %d argument got %d", - "tablesample method \"%s\" expects %d arguments got %d", - initnargs, - samplemethod, - initnargs, list_length(sampleargs)), - parser_errposition(pstate, location))); - - /* Transform the arguments, typecasting them as needed. 
*/ - fargs = NIL; - nargs = 0; - foreach(larg, sampleargs) - { - Node *inarg = (Node *) lfirst(larg); - Node *arg = transformExpr(pstate, inarg, EXPR_KIND_FROM_FUNCTION); - Oid argtype = exprType(arg); - - if (argtype != init_arg_types[nargs]) - { - if (!can_coerce_type(1, &argtype, &init_arg_types[nargs], - COERCION_IMPLICIT)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("wrong parameter %d for tablesample method \"%s\"", - nargs + 1, samplemethod), - errdetail("Expected type %s got %s.", - format_type_be(init_arg_types[nargs]), - format_type_be(argtype)), - parser_errposition(pstate, exprLocation(inarg)))); - - arg = coerce_type(pstate, arg, argtype, init_arg_types[nargs], -1, - COERCION_IMPLICIT, COERCE_IMPLICIT_CAST, -1); - } - - fargs = lappend(fargs, arg); - nargs++; - } - - /* Pass the arguments down */ - tablesample->args = fargs; - - return tablesample; -} - /* func_match_argtypes() * * Given a list of candidate functions (having the right name and number diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index bbd6b77c5e..1734e48241 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -418,6 +418,10 @@ rewriteRuleAction(Query *parsetree, switch (rte->rtekind) { + case RTE_RELATION: + sub_action->hasSubLinks = + checkExprHasSubLink((Node *) rte->tablesample); + break; case RTE_FUNCTION: sub_action->hasSubLinks = checkExprHasSubLink((Node *) rte->functions); diff --git a/src/backend/utils/adt/pseudotypes.c b/src/backend/utils/adt/pseudotypes.c index 9ad460abfb..5b809aa7d4 100644 --- a/src/backend/utils/adt/pseudotypes.c +++ b/src/backend/utils/adt/pseudotypes.c @@ -373,6 +373,33 @@ fdw_handler_out(PG_FUNCTION_ARGS) } +/* + * tsm_handler_in - input routine for pseudo-type TSM_HANDLER. 
+ */ +Datum +tsm_handler_in(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type tsm_handler"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * tsm_handler_out - output routine for pseudo-type TSM_HANDLER. + */ +Datum +tsm_handler_out(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot display a value of type tsm_handler"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + + /* * internal_in - input routine for pseudo-type INTERNAL. */ diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 5112cac901..51391f6a4e 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -32,7 +32,6 @@ #include "catalog/pg_opclass.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" -#include "catalog/pg_tablesample_method.h" #include "catalog/pg_trigger.h" #include "catalog/pg_type.h" #include "commands/defrem.h" @@ -349,8 +348,6 @@ static void make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, int prettyFlags); static void make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, int prettyFlags, int wrapColumn); -static void get_tablesample_def(TableSampleClause *tablesample, - deparse_context *context); static void get_query_def(Query *query, StringInfo buf, List *parentnamespace, TupleDesc resultDesc, int prettyFlags, int wrapColumn, int startIndent); @@ -416,6 +413,8 @@ static void get_column_alias_list(deparse_columns *colinfo, static void get_from_clause_coldeflist(RangeTblFunction *rtfunc, deparse_columns *colinfo, deparse_context *context); +static void get_tablesample_def(TableSampleClause *tablesample, + deparse_context *context); static void get_opclass_name(Oid opclass, Oid actual_datatype, StringInfo buf); static Node *processIndirection(Node *node, deparse_context *context, @@ -4235,50 +4234,6 @@ make_viewdef(StringInfo buf, HeapTuple 
ruletup, TupleDesc rulettc, heap_close(ev_relation, AccessShareLock); } -/* ---------- - * get_tablesample_def - Convert TableSampleClause back to SQL - * ---------- - */ -static void -get_tablesample_def(TableSampleClause *tablesample, deparse_context *context) -{ - StringInfo buf = context->buf; - HeapTuple tuple; - Form_pg_tablesample_method tsm; - char *tsmname; - int nargs; - ListCell *l; - - /* Load the tablesample method */ - tuple = SearchSysCache1(TABLESAMPLEMETHODOID, ObjectIdGetDatum(tablesample->tsmid)); - if (!HeapTupleIsValid(tuple)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("cache lookup failed for tablesample method %u", - tablesample->tsmid))); - - tsm = (Form_pg_tablesample_method) GETSTRUCT(tuple); - tsmname = NameStr(tsm->tsmname); - appendStringInfo(buf, " TABLESAMPLE %s (", quote_identifier(tsmname)); - - ReleaseSysCache(tuple); - - nargs = 0; - foreach(l, tablesample->args) - { - if (nargs++ > 0) - appendStringInfoString(buf, ", "); - get_rule_expr((Node *) lfirst(l), context, true); - } - appendStringInfoChar(buf, ')'); - - if (tablesample->repeatable != NULL) - { - appendStringInfoString(buf, " REPEATABLE ("); - get_rule_expr(tablesample->repeatable, context, true); - appendStringInfoChar(buf, ')'); - } -} /* ---------- * get_query_def - Parse back one query parsetree @@ -8781,9 +8736,6 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) only_marker(rte), generate_relation_name(rte->relid, context->namespaces)); - - if (rte->tablesample) - get_tablesample_def(rte->tablesample, context); break; case RTE_SUBQUERY: /* Subquery RTE */ @@ -8963,6 +8915,10 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) /* Else print column aliases as needed */ get_column_alias_list(colinfo, context); } + + /* Tablesample clause must go after any alias */ + if (rte->rtekind == RTE_RELATION && rte->tablesample) + get_tablesample_def(rte->tablesample, context); } else if (IsA(jtnode, 
JoinExpr)) { @@ -9162,6 +9118,44 @@ get_from_clause_coldeflist(RangeTblFunction *rtfunc, appendStringInfoChar(buf, ')'); } +/* + * get_tablesample_def - print a TableSampleClause + */ +static void +get_tablesample_def(TableSampleClause *tablesample, deparse_context *context) +{ + StringInfo buf = context->buf; + Oid argtypes[1]; + int nargs; + ListCell *l; + + /* + * We should qualify the handler's function name if it wouldn't be + * resolved by lookup in the current search path. + */ + argtypes[0] = INTERNALOID; + appendStringInfo(buf, " TABLESAMPLE %s (", + generate_function_name(tablesample->tsmhandler, 1, + NIL, argtypes, + false, NULL, EXPR_KIND_NONE)); + + nargs = 0; + foreach(l, tablesample->args) + { + if (nargs++ > 0) + appendStringInfoString(buf, ", "); + get_rule_expr((Node *) lfirst(l), context, false); + } + appendStringInfoChar(buf, ')'); + + if (tablesample->repeatable != NULL) + { + appendStringInfoString(buf, " REPEATABLE ("); + get_rule_expr((Node *) tablesample->repeatable, context, false); + appendStringInfoChar(buf, ')'); + } +} + /* * get_opclass_name - fetch name of an index operator class * diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 7b32247d34..1dc293297d 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -32,7 +32,6 @@ #include "catalog/pg_range.h" #include "catalog/pg_statistic.h" #include "catalog/pg_transform.h" -#include "catalog/pg_tablesample_method.h" #include "catalog/pg_type.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -2997,29 +2996,3 @@ get_range_subtype(Oid rangeOid) else return InvalidOid; } - -/* ---------- PG_TABLESAMPLE_METHOD CACHE ---------- */ - -/* - * get_tablesample_method_name - given a tablesample method OID, - * look up the name or NULL if not found - */ -char * -get_tablesample_method_name(Oid tsmid) -{ - HeapTuple tuple; - - tuple = SearchSysCache1(TABLESAMPLEMETHODOID, ObjectIdGetDatum(tsmid)); - if 
(HeapTupleIsValid(tuple)) - { - Form_pg_tablesample_method tup = - (Form_pg_tablesample_method) GETSTRUCT(tuple); - char *result; - - result = pstrdup(NameStr(tup->tsmname)); - ReleaseSysCache(tuple); - return result; - } - else - return NULL; -} diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index b6333e362f..efce7b9a3d 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -56,7 +56,6 @@ #include "catalog/pg_shseclabel.h" #include "catalog/pg_replication_origin.h" #include "catalog/pg_statistic.h" -#include "catalog/pg_tablesample_method.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" #include "catalog/pg_ts_config.h" @@ -667,28 +666,6 @@ static const struct cachedesc cacheinfo[] = { }, 128 }, - {TableSampleMethodRelationId, /* TABLESAMPLEMETHODNAME */ - TableSampleMethodNameIndexId, - 1, - { - Anum_pg_tablesample_method_tsmname, - 0, - 0, - 0, - }, - 2 - }, - {TableSampleMethodRelationId, /* TABLESAMPLEMETHODOID */ - TableSampleMethodOidIndexId, - 1, - { - ObjectIdAttributeNumber, - 0, - 0, - 0, - }, - 2 - }, {TableSpaceRelationId, /* TABLESPACEOID */ TablespaceOidIndexId, 1, diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt index 6cc3ed96c4..7b97d45a53 100644 --- a/src/backend/utils/errcodes.txt +++ b/src/backend/utils/errcodes.txt @@ -177,6 +177,8 @@ Section: Class 22 - Data Exception 2201B E ERRCODE_INVALID_REGULAR_EXPRESSION invalid_regular_expression 2201W E ERRCODE_INVALID_ROW_COUNT_IN_LIMIT_CLAUSE invalid_row_count_in_limit_clause 2201X E ERRCODE_INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE invalid_row_count_in_result_offset_clause +2202H E ERRCODE_INVALID_TABLESAMPLE_ARGUMENT invalid_tablesample_argument +2202G E ERRCODE_INVALID_TABLESAMPLE_REPEAT invalid_tablesample_repeat 22009 E ERRCODE_INVALID_TIME_ZONE_DISPLACEMENT_VALUE invalid_time_zone_displacement_value 2200C E ERRCODE_INVALID_USE_OF_ESCAPE_CHARACTER 
invalid_use_of_escape_character 2200G E ERRCODE_MOST_SPECIFIC_TYPE_MISMATCH most_specific_type_mismatch diff --git a/src/backend/utils/misc/sampling.c b/src/backend/utils/misc/sampling.c index 6191f79734..4142e01123 100644 --- a/src/backend/utils/misc/sampling.c +++ b/src/backend/utils/misc/sampling.c @@ -228,7 +228,7 @@ reservoir_get_next_S(ReservoirState rs, double t, int n) void sampler_random_init_state(long seed, SamplerRandomState randstate) { - randstate[0] = RAND48_SEED_0; + randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */ randstate[1] = (unsigned short) seed; randstate[2] = (unsigned short) (seed >> 16); } diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 9596af6a7b..ece0515549 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -738,13 +738,15 @@ static const SchemaQuery Query_for_list_of_matviews = { " WHERE substring(pg_catalog.quote_ident(evtname),1,%d)='%s'" #define Query_for_list_of_tablesample_methods \ -" SELECT pg_catalog.quote_ident(tsmname) "\ -" FROM pg_catalog.pg_tablesample_method "\ -" WHERE substring(pg_catalog.quote_ident(tsmname),1,%d)='%s'" +" SELECT pg_catalog.quote_ident(proname) "\ +" FROM pg_catalog.pg_proc "\ +" WHERE prorettype = 'pg_catalog.tsm_handler'::pg_catalog.regtype AND "\ +" proargtypes[0] = 'pg_catalog.internal'::pg_catalog.regtype AND "\ +" substring(pg_catalog.quote_ident(proname),1,%d)='%s'" #define Query_for_list_of_policies \ " SELECT pg_catalog.quote_ident(polname) "\ -" FROM pg_catalog.pg_policy " \ +" FROM pg_catalog.pg_policy "\ " WHERE substring(pg_catalog.quote_ident(polname),1,%d)='%s'" #define Query_for_list_of_tables_for_policy \ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 31139cbd0c..75e6b72f9e 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -116,11 +116,13 @@ extern HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot, int nkeys, ScanKey key); extern 
HeapScanDesc heap_beginscan_sampling(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_pagemode); + bool allow_strat, bool allow_sync, bool allow_pagemode); extern void heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber endBlk); extern void heapgetpage(HeapScanDesc scan, BlockNumber page); extern void heap_rescan(HeapScanDesc scan, ScanKey key); +extern void heap_rescan_set_params(HeapScanDesc scan, ScanKey key, + bool allow_strat, bool allow_sync, bool allow_pagemode); extern void heap_endscan(HeapScanDesc scan); extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction); diff --git a/src/include/access/tablesample.h b/src/include/access/tablesample.h deleted file mode 100644 index a02e93d322..0000000000 --- a/src/include/access/tablesample.h +++ /dev/null @@ -1,61 +0,0 @@ -/*------------------------------------------------------------------------- - * - * tablesample.h - * Public header file for TABLESAMPLE clause interface - * - * - * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * src/include/access/tablesample.h - * - *------------------------------------------------------------------------- - */ -#ifndef TABLESAMPLE_H -#define TABLESAMPLE_H - -#include "access/relscan.h" -#include "executor/executor.h" - -typedef struct TableSampleDesc -{ - HeapScanDesc heapScan; - TupleDesc tupDesc; /* Mostly useful for tsmexaminetuple */ - - void *tsmdata; /* private method data */ - - /* These point to he function of the TABLESAMPLE Method. 
*/ - FmgrInfo tsminit; - FmgrInfo tsmnextblock; - FmgrInfo tsmnexttuple; - FmgrInfo tsmexaminetuple; - FmgrInfo tsmreset; - FmgrInfo tsmend; -} TableSampleDesc; - - -extern TableSampleDesc *tablesample_init(SampleScanState *scanstate, - TableSampleClause *tablesample); -extern HeapTuple tablesample_getnext(TableSampleDesc *desc); -extern void tablesample_reset(TableSampleDesc *desc); -extern void tablesample_end(TableSampleDesc *desc); -extern HeapTuple tablesample_source_getnext(TableSampleDesc *desc); -extern HeapTuple tablesample_source_gettup(TableSampleDesc *desc, ItemPointer tid, - bool *visible); - -extern Datum tsm_system_init(PG_FUNCTION_ARGS); -extern Datum tsm_system_nextblock(PG_FUNCTION_ARGS); -extern Datum tsm_system_nexttuple(PG_FUNCTION_ARGS); -extern Datum tsm_system_end(PG_FUNCTION_ARGS); -extern Datum tsm_system_reset(PG_FUNCTION_ARGS); -extern Datum tsm_system_cost(PG_FUNCTION_ARGS); - -extern Datum tsm_bernoulli_init(PG_FUNCTION_ARGS); -extern Datum tsm_bernoulli_nextblock(PG_FUNCTION_ARGS); -extern Datum tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS); -extern Datum tsm_bernoulli_end(PG_FUNCTION_ARGS); -extern Datum tsm_bernoulli_reset(PG_FUNCTION_ARGS); -extern Datum tsm_bernoulli_cost(PG_FUNCTION_ARGS); - - -#endif diff --git a/src/include/access/tsmapi.h b/src/include/access/tsmapi.h new file mode 100644 index 0000000000..4b59ffabd6 --- /dev/null +++ b/src/include/access/tsmapi.h @@ -0,0 +1,81 @@ +/*------------------------------------------------------------------------- + * + * tsmapi.h + * API for tablesample methods + * + * Copyright (c) 2015, PostgreSQL Global Development Group + * + * src/include/access/tsmapi.h + * + *------------------------------------------------------------------------- + */ +#ifndef TSMAPI_H +#define TSMAPI_H + +#include "nodes/execnodes.h" +#include "nodes/relation.h" + + +/* + * Callback function signatures --- see tablesample-method.sgml for more info. 
+ */ + +typedef void (*SampleScanGetSampleSize_function) (PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); + +typedef void (*InitSampleScan_function) (SampleScanState *node, + int eflags); + +typedef void (*BeginSampleScan_function) (SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); + +typedef BlockNumber (*NextSampleBlock_function) (SampleScanState *node); + +typedef OffsetNumber (*NextSampleTuple_function) (SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); + +typedef void (*EndSampleScan_function) (SampleScanState *node); + +/* + * TsmRoutine is the struct returned by a tablesample method's handler + * function. It provides pointers to the callback functions needed by the + * planner and executor, as well as additional information about the method. + * + * More function pointers are likely to be added in the future. + * Therefore it's recommended that the handler initialize the struct with + * makeNode(TsmRoutine) so that all fields are set to NULL. This will + * ensure that no fields are accidentally left undefined. + */ +typedef struct TsmRoutine +{ + NodeTag type; + + /* List of datatype OIDs for the arguments of the TABLESAMPLE clause */ + List *parameterTypes; + + /* Can method produce repeatable samples across, or even within, queries? 
*/ + bool repeatable_across_queries; + bool repeatable_across_scans; + + /* Functions for planning a SampleScan on a physical table */ + SampleScanGetSampleSize_function SampleScanGetSampleSize; + + /* Functions for executing a SampleScan on a physical table */ + InitSampleScan_function InitSampleScan; /* can be NULL */ + BeginSampleScan_function BeginSampleScan; + NextSampleBlock_function NextSampleBlock; /* can be NULL */ + NextSampleTuple_function NextSampleTuple; + EndSampleScan_function EndSampleScan; /* can be NULL */ +} TsmRoutine; + + +/* Functions in access/tablesample/tablesample.c */ +extern TsmRoutine *GetTsmRoutine(Oid tsmhandler); + +#endif /* TSMAPI_H */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 8f6685fd0c..0e98327931 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201507171 +#define CATALOG_VERSION_NO 201507252 #endif diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h index 748aadde94..c38958d6c5 100644 --- a/src/include/catalog/indexing.h +++ b/src/include/catalog/indexing.h @@ -316,11 +316,6 @@ DECLARE_UNIQUE_INDEX(pg_replication_origin_roiident_index, 6001, on pg_replicati DECLARE_UNIQUE_INDEX(pg_replication_origin_roname_index, 6002, on pg_replication_origin using btree(roname text_pattern_ops)); #define ReplicationOriginNameIndex 6002 -DECLARE_UNIQUE_INDEX(pg_tablesample_method_name_index, 3331, on pg_tablesample_method using btree(tsmname name_ops)); -#define TableSampleMethodNameIndexId 3331 -DECLARE_UNIQUE_INDEX(pg_tablesample_method_oid_index, 3332, on pg_tablesample_method using btree(oid oid_ops)); -#define TableSampleMethodOidIndexId 3332 - /* last step of initialization script: build the indexes declared above */ BUILD_INDICES diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 1d68ad7209..09bf1439c4 100644 --- 
a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -3734,6 +3734,16 @@ DATA(insert OID = 3116 ( fdw_handler_in PGNSP PGUID 12 1 0 0 0 f f f f f f i 1 DESCR("I/O"); DATA(insert OID = 3117 ( fdw_handler_out PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2275 "3115" _null_ _null_ _null_ _null_ _null_ fdw_handler_out _null_ _null_ _null_ )); DESCR("I/O"); +DATA(insert OID = 3311 ( tsm_handler_in PGNSP PGUID 12 1 0 0 0 f f f f f f i 1 0 3310 "2275" _null_ _null_ _null_ _null_ _null_ tsm_handler_in _null_ _null_ _null_ )); +DESCR("I/O"); +DATA(insert OID = 3312 ( tsm_handler_out PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2275 "3310" _null_ _null_ _null_ _null_ _null_ tsm_handler_out _null_ _null_ _null_ )); +DESCR("I/O"); + +/* tablesample method handlers */ +DATA(insert OID = 3313 ( bernoulli PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 3310 "2281" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_handler _null_ _null_ _null_ )); +DESCR("BERNOULLI tablesample method handler"); +DATA(insert OID = 3314 ( system PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 3310 "2281" _null_ _null_ _null_ _null_ _null_ tsm_system_handler _null_ _null_ _null_ )); +DESCR("SYSTEM tablesample method handler"); /* cryptographic */ DATA(insert OID = 2311 ( md5 PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 25 "25" _null_ _null_ _null_ _null_ _null_ md5_text _null_ _null_ _null_ )); @@ -5321,33 +5331,6 @@ DESCR("get an individual replication origin's replication progress"); DATA(insert OID = 6014 ( pg_show_replication_origin_status PGNSP PGUID 12 1 100 0 0 f f f f f t v 0 0 2249 "" "{26,25,3220,3220}" "{o,o,o,o}" "{local_id, external_id, remote_lsn, local_lsn}" _null_ _null_ pg_show_replication_origin_status _null_ _null_ _null_ )); DESCR("get progress for all replication origins"); -/* tablesample */ -DATA(insert OID = 3335 ( tsm_system_init PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2278 "2281 23 700" _null_ _null_ _null_ _null_ _null_ tsm_system_init _null_ _null_ _null_ )); 
-DESCR("tsm_system_init(internal)"); -DATA(insert OID = 3336 ( tsm_system_nextblock PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 23 "2281 16" _null_ _null_ _null_ _null_ _null_ tsm_system_nextblock _null_ _null_ _null_ )); -DESCR("tsm_system_nextblock(internal)"); -DATA(insert OID = 3337 ( tsm_system_nexttuple PGNSP PGUID 12 1 0 0 0 f f f f t f v 4 0 21 "2281 23 21 16" _null_ _null_ _null_ _null_ _null_ tsm_system_nexttuple _null_ _null_ _null_ )); -DESCR("tsm_system_nexttuple(internal)"); -DATA(insert OID = 3338 ( tsm_system_end PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ tsm_system_end _null_ _null_ _null_ )); -DESCR("tsm_system_end(internal)"); -DATA(insert OID = 3339 ( tsm_system_reset PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ tsm_system_reset _null_ _null_ _null_ )); -DESCR("tsm_system_reset(internal)"); -DATA(insert OID = 3340 ( tsm_system_cost PGNSP PGUID 12 1 0 0 0 f f f f t f v 7 0 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ _null_ tsm_system_cost _null_ _null_ _null_ )); -DESCR("tsm_system_cost(internal)"); - -DATA(insert OID = 3341 ( tsm_bernoulli_init PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2278 "2281 23 700" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_init _null_ _null_ _null_ )); -DESCR("tsm_bernoulli_init(internal)"); -DATA(insert OID = 3342 ( tsm_bernoulli_nextblock PGNSP PGUID 12 1 0 0 0 f f f f t f v 2 0 23 "2281 16" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_nextblock _null_ _null_ _null_ )); -DESCR("tsm_bernoulli_nextblock(internal)"); -DATA(insert OID = 3343 ( tsm_bernoulli_nexttuple PGNSP PGUID 12 1 0 0 0 f f f f t f v 4 0 21 "2281 23 21 16" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_nexttuple _null_ _null_ _null_ )); -DESCR("tsm_bernoulli_nexttuple(internal)"); -DATA(insert OID = 3344 ( tsm_bernoulli_end PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_end _null_ 
_null_ _null_ )); -DESCR("tsm_bernoulli_end(internal)"); -DATA(insert OID = 3345 ( tsm_bernoulli_reset PGNSP PGUID 12 1 0 0 0 f f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_reset _null_ _null_ _null_ )); -DESCR("tsm_bernoulli_reset(internal)"); -DATA(insert OID = 3346 ( tsm_bernoulli_cost PGNSP PGUID 12 1 0 0 0 f f f f t f v 7 0 2278 "2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ _null_ tsm_bernoulli_cost _null_ _null_ _null_ )); -DESCR("tsm_bernoulli_cost(internal)"); - /* * Symbolic values for provolatile column: these indicate whether the result * of a function is dependent *only* on the values of its explicit arguments, diff --git a/src/include/catalog/pg_tablesample_method.h b/src/include/catalog/pg_tablesample_method.h deleted file mode 100644 index b422414d08..0000000000 --- a/src/include/catalog/pg_tablesample_method.h +++ /dev/null @@ -1,81 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pg_tablesample_method.h - * definition of the table scan methods. - * - * - * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * src/include/catalog/pg_tablesample_method.h - * - * - *------------------------------------------------------------------------- - */ -#ifndef PG_TABLESAMPLE_METHOD_H -#define PG_TABLESAMPLE_METHOD_H - -#include "catalog/genbki.h" -#include "catalog/objectaddress.h" - -/* ---------------- - * pg_tablesample_method definition. cpp turns this into - * typedef struct FormData_pg_tablesample_method - * ---------------- - */ -#define TableSampleMethodRelationId 3330 - -CATALOG(pg_tablesample_method,3330) -{ - NameData tsmname; /* tablesample method name */ - bool tsmseqscan; /* does this method scan whole table - * sequentially? */ - bool tsmpagemode; /* does this method scan page at a time? 
*/ - regproc tsminit; /* init scan function */ - regproc tsmnextblock; /* function returning next block to sample or - * InvalidBlockOffset if finished */ - regproc tsmnexttuple; /* function returning next tuple offset from - * current block or InvalidOffsetNumber if end - * of the block was reacher */ - regproc tsmexaminetuple;/* optional function which can examine tuple - * contents and decide if tuple should be - * returned or not */ - regproc tsmend; /* end scan function */ - regproc tsmreset; /* reset state - used by rescan */ - regproc tsmcost; /* costing function */ -} FormData_pg_tablesample_method; - -/* ---------------- - * Form_pg_tablesample_method corresponds to a pointer to a tuple with - * the format of pg_tablesample_method relation. - * ---------------- - */ -typedef FormData_pg_tablesample_method *Form_pg_tablesample_method; - -/* ---------------- - * compiler constants for pg_tablesample_method - * ---------------- - */ -#define Natts_pg_tablesample_method 10 -#define Anum_pg_tablesample_method_tsmname 1 -#define Anum_pg_tablesample_method_tsmseqscan 2 -#define Anum_pg_tablesample_method_tsmpagemode 3 -#define Anum_pg_tablesample_method_tsminit 4 -#define Anum_pg_tablesample_method_tsmnextblock 5 -#define Anum_pg_tablesample_method_tsmnexttuple 6 -#define Anum_pg_tablesample_method_tsmexaminetuple 7 -#define Anum_pg_tablesample_method_tsmend 8 -#define Anum_pg_tablesample_method_tsmreset 9 -#define Anum_pg_tablesample_method_tsmcost 10 - -/* ---------------- - * initial contents of pg_tablesample_method - * ---------------- - */ - -DATA(insert OID = 3333 ( system false true tsm_system_init tsm_system_nextblock tsm_system_nexttuple - tsm_system_end tsm_system_reset tsm_system_cost )); -DESCR("SYSTEM table sampling method"); -DATA(insert OID = 3334 ( bernoulli true false tsm_bernoulli_init tsm_bernoulli_nextblock tsm_bernoulli_nexttuple - tsm_bernoulli_end tsm_bernoulli_reset tsm_bernoulli_cost )); -DESCR("BERNOULLI table sampling method"); - 
-#endif /* PG_TABLESAMPLE_METHOD_H */ diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h index da123f6c49..7dc95c8d2c 100644 --- a/src/include/catalog/pg_type.h +++ b/src/include/catalog/pg_type.h @@ -694,6 +694,8 @@ DATA(insert OID = 3500 ( anyenum PGNSP PGUID 4 t p P f t \054 0 0 0 anyenum_in #define ANYENUMOID 3500 DATA(insert OID = 3115 ( fdw_handler PGNSP PGUID 4 t p P f t \054 0 0 0 fdw_handler_in fdw_handler_out - - - - - i p f 0 -1 0 0 _null_ _null_ _null_ )); #define FDW_HANDLEROID 3115 +DATA(insert OID = 3310 ( tsm_handler PGNSP PGUID 4 t p P f t \054 0 0 0 tsm_handler_in tsm_handler_out - - - - - i p f 0 -1 0 0 _null_ _null_ _null_ )); +#define TSM_HANDLEROID 3310 DATA(insert OID = 3831 ( anyrange PGNSP PGUID -1 f p P f t \054 0 0 0 anyrange_in anyrange_out - - - - - d x f 0 -1 0 0 _null_ _null_ _null_ )); #define ANYRANGEOID 3831 diff --git a/src/include/executor/nodeSamplescan.h b/src/include/executor/nodeSamplescan.h index 4b769daec8..a0cc6ce467 100644 --- a/src/include/executor/nodeSamplescan.h +++ b/src/include/executor/nodeSamplescan.h @@ -4,7 +4,7 @@ * * * - * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/include/executor/nodeSamplescan.h diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 541ee18735..303fc3c1c7 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1257,13 +1257,22 @@ typedef struct ScanState */ typedef ScanState SeqScanState; -/* - * SampleScan +/* ---------------- + * SampleScanState information + * ---------------- */ typedef struct SampleScanState { ScanState ss; - struct TableSampleDesc *tsdesc; + List *args; /* expr states for TABLESAMPLE params */ + ExprState *repeatable; /* expr state for REPEATABLE expr */ + /* use struct pointer to avoid including tsmapi.h here */ + 
struct TsmRoutine *tsmroutine; /* descriptor for tablesample method */ + void *tsm_state; /* tablesample method can keep state here */ + bool use_bulkread; /* use bulkread buffer access strategy? */ + bool use_pagemode; /* use page-at-a-time visibility checking? */ + bool begun; /* false means need to call BeginSampleScan */ + uint32 seed; /* random seed */ } SampleScanState; /* diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index f8acda4eed..748e434a27 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -51,6 +51,7 @@ typedef enum NodeTag T_BitmapOr, T_Scan, T_SeqScan, + T_SampleScan, T_IndexScan, T_IndexOnlyScan, T_BitmapIndexScan, @@ -61,7 +62,6 @@ typedef enum NodeTag T_ValuesScan, T_CteScan, T_WorkTableScan, - T_SampleScan, T_ForeignScan, T_CustomScan, T_Join, @@ -400,6 +400,7 @@ typedef enum NodeTag T_WindowDef, T_RangeSubselect, T_RangeFunction, + T_RangeTableSample, T_TypeName, T_ColumnDef, T_IndexElem, @@ -407,6 +408,7 @@ typedef enum NodeTag T_DefElem, T_RangeTblEntry, T_RangeTblFunction, + T_TableSampleClause, T_WithCheckOption, T_SortGroupClause, T_GroupingSet, @@ -425,8 +427,6 @@ typedef enum NodeTag T_OnConflictClause, T_CommonTableExpr, T_RoleSpec, - T_RangeTableSample, - T_TableSampleClause, /* * TAGS FOR REPLICATION GRAMMAR PARSE NODES (replnodes.h) @@ -452,7 +452,8 @@ typedef enum NodeTag T_WindowObjectData, /* private in nodeWindowAgg.c */ T_TIDBitmap, /* in nodes/tidbitmap.h */ T_InlineCodeBlock, /* in nodes/parsenodes.h */ - T_FdwRoutine /* in foreign/fdwapi.h */ + T_FdwRoutine, /* in foreign/fdwapi.h */ + T_TsmRoutine /* in access/tsmapi.h */ } NodeTag; /* diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index b336ff9c6a..151c93a078 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -337,26 +337,6 @@ typedef struct FuncCall int location; /* token location, or -1 if unknown */ } FuncCall; -/* - * TableSampleClause - a sampling method 
information - */ -typedef struct TableSampleClause -{ - NodeTag type; - Oid tsmid; - bool tsmseqscan; - bool tsmpagemode; - Oid tsminit; - Oid tsmnextblock; - Oid tsmnexttuple; - Oid tsmexaminetuple; - Oid tsmend; - Oid tsmreset; - Oid tsmcost; - Node *repeatable; - List *args; -} TableSampleClause; - /* * A_Star - '*' representing all columns of a table or compound field * @@ -558,19 +538,23 @@ typedef struct RangeFunction } RangeFunction; /* - * RangeTableSample - represents TABLESAMPLE () REPEATABLE () + * RangeTableSample - TABLESAMPLE appearing in a raw FROM clause * - * SQL Standard specifies only one parameter which is percentage. But we allow - * custom tablesample methods which may need different input arguments so we - * accept list of arguments. + * This node, appearing only in raw parse trees, represents + * <relation> TABLESAMPLE <method> (<params>) REPEATABLE (<num>) + * Currently, the <relation> can only be a RangeVar, but we might in future + * allow RangeSubselect and other options. Note that the RangeTableSample + * is wrapped around the node representing the <relation>, rather than being + * a subfield of it. 
*/ typedef struct RangeTableSample { NodeTag type; - RangeVar *relation; - char *method; /* sampling method */ - Node *repeatable; - List *args; /* arguments for sampling method */ + Node *relation; /* relation to be sampled */ + List *method; /* sampling method name (possibly qualified) */ + List *args; /* argument(s) for sampling method */ + Node *repeatable; /* REPEATABLE expression, or NULL if none */ + int location; /* method name location, or -1 if unknown */ } RangeTableSample; /* @@ -810,7 +794,7 @@ typedef struct RangeTblEntry */ Oid relid; /* OID of the relation */ char relkind; /* relation kind (see pg_class.relkind) */ - TableSampleClause *tablesample; /* sampling method and parameters */ + struct TableSampleClause *tablesample; /* sampling info, or NULL */ /* * Fields valid for a subquery RTE (else NULL): @@ -912,6 +896,19 @@ typedef struct RangeTblFunction Bitmapset *funcparams; /* PARAM_EXEC Param IDs affecting this func */ } RangeTblFunction; +/* + * TableSampleClause - TABLESAMPLE appearing in a transformed FROM clause + * + * Unlike RangeTableSample, this is a subnode of the relevant RangeTblEntry. 
+ */ +typedef struct TableSampleClause +{ + NodeTag type; + Oid tsmhandler; /* OID of the tablesample handler function */ + List *args; /* tablesample argument expression(s) */ + Expr *repeatable; /* REPEATABLE expression, or NULL if none */ +} TableSampleClause; + /* * WithCheckOption - * representation of WITH CHECK OPTION checks to be applied to new tuples @@ -2520,7 +2517,7 @@ typedef struct RenameStmt typedef struct AlterObjectSchemaStmt { NodeTag type; - ObjectType objectType; /* OBJECT_TABLE, OBJECT_TYPE, etc */ + ObjectType objectType; /* OBJECT_TABLE, OBJECT_TYPE, etc */ RangeVar *relation; /* in case it's a table */ List *object; /* in case it's some other object */ List *objarg; /* argument types, if applicable */ @@ -2535,7 +2532,7 @@ typedef struct AlterObjectSchemaStmt typedef struct AlterOwnerStmt { NodeTag type; - ObjectType objectType; /* OBJECT_TABLE, OBJECT_TYPE, etc */ + ObjectType objectType; /* OBJECT_TABLE, OBJECT_TYPE, etc */ RangeVar *relation; /* in case it's a table */ List *object; /* in case it's some other object */ List *objarg; /* argument types, if applicable */ diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 5f538f3e8c..0654d0266c 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -287,7 +287,12 @@ typedef Scan SeqScan; * table sample scan node * ---------------- */ -typedef Scan SampleScan; +typedef struct SampleScan +{ + Scan scan; + /* use struct pointer to avoid including parsenodes.h here */ + struct TableSampleClause *tablesample; +} SampleScan; /* ---------------- * index scan node diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 24003ae359..dd43e45d0c 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -68,7 +68,8 @@ extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, double index_pages, PlannerInfo *root); extern void cost_seqscan(Path *path, PlannerInfo *root, RelOptInfo 
*baserel, ParamPathInfo *param_info); -extern void cost_samplescan(Path *path, PlannerInfo *root, RelOptInfo *baserel); +extern void cost_samplescan(Path *path, PlannerInfo *root, RelOptInfo *baserel, + ParamPathInfo *param_info); extern void cost_index(IndexPath *path, PlannerInfo *root, double loop_count); extern void cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, diff --git a/src/include/parser/parse_func.h b/src/include/parser/parse_func.h index 3194da4639..32646918e2 100644 --- a/src/include/parser/parse_func.h +++ b/src/include/parser/parse_func.h @@ -33,11 +33,6 @@ typedef enum extern Node *ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs, FuncCall *fn, int location); -extern TableSampleClause *ParseTableSample(ParseState *pstate, - char *samplemethod, - Node *repeatable, List *args, - int location); - extern FuncDetailCode func_get_detail(List *funcname, List *fargs, List *fargnames, int nargs, Oid *argtypes, diff --git a/src/include/port.h b/src/include/port.h index 71113c0394..3787cbfb76 100644 --- a/src/include/port.h +++ b/src/include/port.h @@ -357,10 +357,6 @@ extern off_t ftello(FILE *stream); #endif #endif -#define RAND48_SEED_0 (0x330e) -#define RAND48_SEED_1 (0xabcd) -#define RAND48_SEED_2 (0x1234) - extern double pg_erand48(unsigned short xseed[3]); extern long pg_lrand48(void); extern void pg_srand48(long seed); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index fcb0bf0ce8..49caa56557 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -566,6 +566,8 @@ extern Datum language_handler_in(PG_FUNCTION_ARGS); extern Datum language_handler_out(PG_FUNCTION_ARGS); extern Datum fdw_handler_in(PG_FUNCTION_ARGS); extern Datum fdw_handler_out(PG_FUNCTION_ARGS); +extern Datum tsm_handler_in(PG_FUNCTION_ARGS); +extern Datum tsm_handler_out(PG_FUNCTION_ARGS); extern Datum internal_in(PG_FUNCTION_ARGS); extern Datum internal_out(PG_FUNCTION_ARGS); extern Datum 
opaque_in(PG_FUNCTION_ARGS); @@ -1213,6 +1215,12 @@ extern Datum ginqueryarrayextract(PG_FUNCTION_ARGS); extern Datum ginarrayconsistent(PG_FUNCTION_ARGS); extern Datum ginarraytriconsistent(PG_FUNCTION_ARGS); +/* access/tablesample/bernoulli.c */ +extern Datum tsm_bernoulli_handler(PG_FUNCTION_ARGS); + +/* access/tablesample/system.c */ +extern Datum tsm_system_handler(PG_FUNCTION_ARGS); + /* access/transam/twophase.c */ extern Datum pg_prepared_xact(PG_FUNCTION_ARGS); diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index a40c9b1273..9711538432 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -156,7 +156,6 @@ extern void free_attstatsslot(Oid atttype, extern char *get_namespace_name(Oid nspid); extern char *get_namespace_name_or_temp(Oid nspid); extern Oid get_range_subtype(Oid rangeOid); -extern char *get_tablesample_method_name(Oid tsmid); #define type_is_array(typid) (get_element_type(typid) != InvalidOid) /* type_is_array_domain accepts both plain arrays and domains over arrays */ diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index f06f03a996..18404e266e 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -81,8 +81,6 @@ enum SysCacheIdentifier REPLORIGNAME, RULERELNAME, STATRELATTINH, - TABLESAMPLEMETHODNAME, - TABLESAMPLEMETHODOID, TABLESPACEOID, TRFOID, TRFTYPELANG, diff --git a/src/port/erand48.c b/src/port/erand48.c index 12efd8193c..9d471197c3 100644 --- a/src/port/erand48.c +++ b/src/port/erand48.c @@ -33,6 +33,9 @@ #include <math.h> +#define RAND48_SEED_0 (0x330e) +#define RAND48_SEED_1 (0xabcd) +#define RAND48_SEED_2 (0x1234) #define RAND48_MULT_0 (0xe66d) #define RAND48_MULT_1 (0xdeec) #define RAND48_MULT_2 (0x0005) diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index 414299a694..e7c242cd22 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ 
-101,15 +101,17 @@ NOTICE: f_leak => great manga 44 | 8 | 1 | rls_regress_user2 | great manga | manga (4 rows) -SELECT * FROM document TABLESAMPLE BERNOULLI (50) REPEATABLE(1) WHERE f_leak(dtitle) ORDER BY did; -NOTICE: f_leak => my first novel +-- try a sampled version +SELECT * FROM document TABLESAMPLE BERNOULLI(50) REPEATABLE(0) + WHERE f_leak(dtitle) ORDER BY did; NOTICE: f_leak => my first manga NOTICE: f_leak => great science fiction +NOTICE: f_leak => great manga did | cid | dlevel | dauthor | dtitle -----+-----+--------+-------------------+----------------------- - 1 | 11 | 1 | rls_regress_user1 | my first novel 4 | 44 | 1 | rls_regress_user1 | my first manga 6 | 22 | 1 | rls_regress_user2 | great science fiction + 8 | 44 | 1 | rls_regress_user2 | great manga (3 rows) -- viewpoint from rls_regress_user2 @@ -156,20 +158,20 @@ NOTICE: f_leak => great manga 44 | 8 | 1 | rls_regress_user2 | great manga | manga (8 rows) -SELECT * FROM document TABLESAMPLE BERNOULLI (50) REPEATABLE(1) WHERE f_leak(dtitle) ORDER BY did; -NOTICE: f_leak => my first novel -NOTICE: f_leak => my second novel +-- try a sampled version +SELECT * FROM document TABLESAMPLE BERNOULLI(50) REPEATABLE(0) + WHERE f_leak(dtitle) ORDER BY did; NOTICE: f_leak => my first manga +NOTICE: f_leak => my second manga NOTICE: f_leak => great science fiction -NOTICE: f_leak => great technology book +NOTICE: f_leak => great manga did | cid | dlevel | dauthor | dtitle -----+-----+--------+-------------------+----------------------- - 1 | 11 | 1 | rls_regress_user1 | my first novel - 2 | 11 | 2 | rls_regress_user1 | my second novel 4 | 44 | 1 | rls_regress_user1 | my first manga + 5 | 44 | 2 | rls_regress_user1 | my second manga 6 | 22 | 1 | rls_regress_user2 | great science fiction - 7 | 33 | 2 | rls_regress_user2 | great technology book -(5 rows) + 8 | 44 | 1 | rls_regress_user2 | great manga +(4 rows) EXPLAIN (COSTS OFF) SELECT * FROM document WHERE f_leak(dtitle); QUERY PLAN diff --git 
a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index cd5337531d..1e5b0b9a2c 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2202,6 +2202,10 @@ street| SELECT r.name, FROM ONLY road r, real_city c WHERE (c.outline ## r.thepath); +test_tablesample_v1| SELECT test_tablesample.id + FROM test_tablesample TABLESAMPLE system ((10 * 2)) REPEATABLE (2); +test_tablesample_v2| SELECT test_tablesample.id + FROM test_tablesample TABLESAMPLE system (99); toyemp| SELECT emp.name, emp.age, emp.location, diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index 14acd16da3..eb0bc88ef1 100644 --- a/src/test/regress/expected/sanity_check.out +++ b/src/test/regress/expected/sanity_check.out @@ -128,7 +128,6 @@ pg_shdepend|t pg_shdescription|t pg_shseclabel|t pg_statistic|t -pg_tablesample_method|t pg_tablespace|t pg_transform|t pg_trigger|t diff --git a/src/test/regress/expected/tablesample.out b/src/test/regress/expected/tablesample.out index 04e5eb8b80..727a835439 100644 --- a/src/test/regress/expected/tablesample.out +++ b/src/test/regress/expected/tablesample.out @@ -1,107 +1,123 @@ -CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); -- force smaller pages so we don't have to load too much data to get multiple pages -INSERT INTO test_tablesample SELECT i, repeat(i::text, 200) FROM generate_series(0, 9) s(i) ORDER BY i; -SELECT t.id FROM test_tablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (10); +CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); +-- use fillfactor so we don't have to load too much data to get multiple pages +INSERT INTO test_tablesample + SELECT i, repeat(i::text, 200) FROM generate_series(0, 9) s(i); +SELECT t.id FROM test_tablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (0); id ---- - 0 - 1 - 2 3 4 5 - 9 -(7 rows) - -SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (100.0/11) REPEATABLE 
(9999); - id ----- 6 7 8 -(3 rows) +(6 rows) +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (100.0/11) REPEATABLE (0); + id +---- +(0 rows) + +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (0); + id +---- + 3 + 4 + 5 + 6 + 7 + 8 +(6 rows) + +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (0); + id +---- + 4 + 5 + 6 + 7 + 8 +(5 rows) + +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (5.5) REPEATABLE (0); + id +---- + 7 +(1 row) + +-- 100% should give repeatable count results (ie, all rows) in any case SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100); count ------- 10 (1 row) -SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (100); - id ----- - 0 - 1 - 2 - 6 - 7 - 8 - 9 -(7 rows) - -SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (100); - id ----- - 0 - 1 - 3 - 4 - 5 -(5 rows) - -SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (5.5) REPEATABLE (1); - id ----- - 0 - 5 -(2 rows) - -CREATE VIEW test_tablesample_v1 AS SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (10*2) REPEATABLE (2); -CREATE VIEW test_tablesample_v2 AS SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (99); -SELECT pg_get_viewdef('test_tablesample_v1'::regclass); - pg_get_viewdef --------------------------------------------------------------------------------- - SELECT test_tablesample.id + - FROM test_tablesample TABLESAMPLE system (((10 * 2))::real) REPEATABLE (2); +SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100) REPEATABLE (1+2); + count +------- + 10 (1 row) -SELECT pg_get_viewdef('test_tablesample_v2'::regclass); - pg_get_viewdef ------------------------------------------------------------ - SELECT test_tablesample.id + - FROM test_tablesample TABLESAMPLE system ((99)::real); +SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100) REPEATABLE (0.4); + count +------- + 10 (1 row) +CREATE VIEW test_tablesample_v1 AS + SELECT id FROM test_tablesample 
TABLESAMPLE SYSTEM (10*2) REPEATABLE (2); +CREATE VIEW test_tablesample_v2 AS + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (99); +\d+ test_tablesample_v1 + View "public.test_tablesample_v1" + Column | Type | Modifiers | Storage | Description +--------+---------+-----------+---------+------------- + id | integer | | plain | +View definition: + SELECT test_tablesample.id + FROM test_tablesample TABLESAMPLE system ((10 * 2)) REPEATABLE (2); + +\d+ test_tablesample_v2 + View "public.test_tablesample_v2" + Column | Type | Modifiers | Storage | Description +--------+---------+-----------+---------+------------- + id | integer | | plain | +View definition: + SELECT test_tablesample.id + FROM test_tablesample TABLESAMPLE system (99); + +-- check a sampled query doesn't affect cursor in progress BEGIN; -DECLARE tablesample_cur CURSOR FOR SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (100); +DECLARE tablesample_cur CURSOR FOR + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (0); FETCH FIRST FROM tablesample_cur; id ---- - 0 + 3 (1 row) FETCH NEXT FROM tablesample_cur; id ---- - 1 + 4 (1 row) FETCH NEXT FROM tablesample_cur; id ---- - 2 + 5 (1 row) -SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (10); +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (0); id ---- - 0 - 1 - 2 3 4 5 - 9 -(7 rows) + 6 + 7 + 8 +(6 rows) FETCH NEXT FROM tablesample_cur; id @@ -124,19 +140,19 @@ FETCH NEXT FROM tablesample_cur; FETCH FIRST FROM tablesample_cur; id ---- - 0 + 3 (1 row) FETCH NEXT FROM tablesample_cur; id ---- - 1 + 4 (1 row) FETCH NEXT FROM tablesample_cur; id ---- - 2 + 5 (1 row) FETCH NEXT FROM tablesample_cur; @@ -159,41 +175,129 @@ FETCH NEXT FROM tablesample_cur; CLOSE tablesample_cur; END; -EXPLAIN SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (10); - QUERY PLAN -------------------------------------------------------------------------------- - Sample Scan (system) on 
test_tablesample (cost=0.00..26.35 rows=635 width=4) +EXPLAIN (COSTS OFF) + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (2); + QUERY PLAN +-------------------------------------------------------------------- + Sample Scan on test_tablesample + Sampling: system ('50'::real) REPEATABLE ('2'::double precision) +(2 rows) + +EXPLAIN (COSTS OFF) + SELECT * FROM test_tablesample_v1; + QUERY PLAN +-------------------------------------------------------------------- + Sample Scan on test_tablesample + Sampling: system ('20'::real) REPEATABLE ('2'::double precision) +(2 rows) + +-- check inheritance behavior +explain (costs off) + select count(*) from person tablesample bernoulli (100); + QUERY PLAN +------------------------------------------------- + Aggregate + -> Append + -> Sample Scan on person + Sampling: bernoulli ('100'::real) + -> Sample Scan on emp + Sampling: bernoulli ('100'::real) + -> Sample Scan on student + Sampling: bernoulli ('100'::real) + -> Sample Scan on stud_emp + Sampling: bernoulli ('100'::real) +(10 rows) + +select count(*) from person tablesample bernoulli (100); + count +------- + 58 (1 row) -EXPLAIN SELECT * FROM test_tablesample_v1; - QUERY PLAN -------------------------------------------------------------------------------- - Sample Scan (system) on test_tablesample (cost=0.00..10.54 rows=254 width=4) +select count(*) from person; + count +------- + 58 +(1 row) + +-- check that collations get assigned within the tablesample arguments +SELECT count(*) FROM test_tablesample TABLESAMPLE bernoulli (('1'::text < '0'::text)::int); + count +------- + 0 +(1 row) + +-- check behavior during rescans, as well as correct handling of min/max pct +select * from + (values (0),(100)) v(pct), + lateral (select count(*) from tenk1 tablesample bernoulli (pct)) ss; + pct | count +-----+------- + 0 | 0 + 100 | 10000 +(2 rows) + +select * from + (values (0),(100)) v(pct), + lateral (select count(*) from tenk1 tablesample system (pct)) ss; + 
pct | count +-----+------- + 0 | 0 + 100 | 10000 +(2 rows) + +explain (costs off) +select pct, count(unique1) from + (values (0),(100)) v(pct), + lateral (select * from tenk1 tablesample bernoulli (pct)) ss + group by pct; + QUERY PLAN +-------------------------------------------------------- + HashAggregate + Group Key: "*VALUES*".column1 + -> Nested Loop + -> Values Scan on "*VALUES*" + -> Sample Scan on tenk1 + Sampling: bernoulli ("*VALUES*".column1) +(6 rows) + +select pct, count(unique1) from + (values (0),(100)) v(pct), + lateral (select * from tenk1 tablesample bernoulli (pct)) ss + group by pct; + pct | count +-----+------- + 100 | 10000 +(1 row) + +select pct, count(unique1) from + (values (0),(100)) v(pct), + lateral (select * from tenk1 tablesample system (pct)) ss + group by pct; + pct | count +-----+------- + 100 | 10000 (1 row) -- errors SELECT id FROM test_tablesample TABLESAMPLE FOOBAR (1); -ERROR: tablesample method "foobar" does not exist +ERROR: tablesample method foobar does not exist LINE 1: SELECT id FROM test_tablesample TABLESAMPLE FOOBAR (1); - ^ + ^ +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (NULL); +ERROR: TABLESAMPLE parameter cannot be null SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (NULL); -ERROR: REPEATABLE clause must be NOT NULL numeric value -LINE 1: ... test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (NULL); - ^ +ERROR: TABLESAMPLE REPEATABLE parameter cannot be null SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (-1); -ERROR: invalid sample size -HINT: Sample size must be numeric value between 0 and 100 (inclusive). +ERROR: sample percentage must be between 0 and 100 SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (200); -ERROR: invalid sample size -HINT: Sample size must be numeric value between 0 and 100 (inclusive). 
+ERROR: sample percentage must be between 0 and 100 SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (-1); -ERROR: invalid sample size -HINT: Sample size must be numeric value between 0 and 100 (inclusive). +ERROR: sample percentage must be between 0 and 100 SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (200); -ERROR: invalid sample size -HINT: Sample size must be numeric value between 0 and 100 (inclusive). +ERROR: sample percentage must be between 0 and 100 SELECT id FROM test_tablesample_v1 TABLESAMPLE BERNOULLI (1); -ERROR: TABLESAMPLE clause can only be used on tables and materialized views +ERROR: TABLESAMPLE clause can only be applied to tables and materialized views LINE 1: SELECT id FROM test_tablesample_v1 TABLESAMPLE BERNOULLI (1)... ^ INSERT INTO test_tablesample_v1 VALUES(1); @@ -202,30 +306,10 @@ DETAIL: Views containing TABLESAMPLE are not automatically updatable. HINT: To enable inserting into the view, provide an INSTEAD OF INSERT trigger or an unconditional ON INSERT DO INSTEAD rule. WITH query_select AS (SELECT * FROM test_tablesample) SELECT * FROM query_select TABLESAMPLE BERNOULLI (5.5) REPEATABLE (1); -ERROR: TABLESAMPLE clause can only be used on tables and materialized views +ERROR: TABLESAMPLE clause can only be applied to tables and materialized views LINE 2: SELECT * FROM query_select TABLESAMPLE BERNOULLI (5.5) REPEA... ^ SELECT q.* FROM (SELECT * FROM test_tablesample) as q TABLESAMPLE BERNOULLI (5); ERROR: syntax error at or near "TABLESAMPLE" LINE 1: ...CT q.* FROM (SELECT * FROM test_tablesample) as q TABLESAMPL... 
^ --- catalog sanity -SELECT * -FROM pg_tablesample_method -WHERE tsminit IS NULL - OR tsmseqscan IS NULL - OR tsmpagemode IS NULL - OR tsmnextblock IS NULL - OR tsmnexttuple IS NULL - OR tsmend IS NULL - OR tsmreset IS NULL - OR tsmcost IS NULL; - tsmname | tsmseqscan | tsmpagemode | tsminit | tsmnextblock | tsmnexttuple | tsmexaminetuple | tsmend | tsmreset | tsmcost ----------+------------+-------------+---------+--------------+--------------+-----------------+--------+----------+--------- -(0 rows) - --- done -DROP TABLE test_tablesample CASCADE; -NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to view test_tablesample_v1 -drop cascades to view test_tablesample_v2 diff --git a/src/test/regress/output/misc.source b/src/test/regress/output/misc.source index 70c9cc356a..9eedb363d0 100644 --- a/src/test/regress/output/misc.source +++ b/src/test/regress/output/misc.source @@ -686,6 +686,9 @@ SELECT user_relns() AS user_relns test_range_excl test_range_gist test_range_spgist + test_tablesample + test_tablesample_v1 + test_tablesample_v2 test_tsvector testjsonb text_tbl @@ -705,7 +708,7 @@ SELECT user_relns() AS user_relns tvvmv varchar_tbl xacttest -(127 rows) +(130 rows) SELECT name(equipment(hobby_construct(text 'skywalking', text 'mer'))); name diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 3a607cff46..15d74d4e6e 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -110,6 +110,7 @@ test: lock test: replica_identity test: rowsecurity test: object_address +test: tablesample test: alter_generic test: alter_operator test: misc @@ -156,4 +157,3 @@ test: with test: xml test: event_trigger test: stats -test: tablesample diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index 039070b85b..e86f814314 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -94,14 +94,18 @@ SET row_security TO ON; SELECT * 
FROM document WHERE f_leak(dtitle) ORDER BY did; SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did; -SELECT * FROM document TABLESAMPLE BERNOULLI (50) REPEATABLE(1) WHERE f_leak(dtitle) ORDER BY did; +-- try a sampled version +SELECT * FROM document TABLESAMPLE BERNOULLI(50) REPEATABLE(0) + WHERE f_leak(dtitle) ORDER BY did; -- viewpoint from rls_regress_user2 SET SESSION AUTHORIZATION rls_regress_user2; SELECT * FROM document WHERE f_leak(dtitle) ORDER BY did; SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle) ORDER BY did; -SELECT * FROM document TABLESAMPLE BERNOULLI (50) REPEATABLE(1) WHERE f_leak(dtitle) ORDER BY did; +-- try a sampled version +SELECT * FROM document TABLESAMPLE BERNOULLI(50) REPEATABLE(0) + WHERE f_leak(dtitle) ORDER BY did; EXPLAIN (COSTS OFF) SELECT * FROM document WHERE f_leak(dtitle); EXPLAIN (COSTS OFF) SELECT * FROM document NATURAL JOIN category WHERE f_leak(dtitle); diff --git a/src/test/regress/sql/tablesample.sql b/src/test/regress/sql/tablesample.sql index 7b3eb9bedf..eec9793496 100644 --- a/src/test/regress/sql/tablesample.sql +++ b/src/test/regress/sql/tablesample.sql @@ -1,26 +1,37 @@ -CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); -- force smaller pages so we don't have to load too much data to get multiple pages +CREATE TABLE test_tablesample (id int, name text) WITH (fillfactor=10); +-- use fillfactor so we don't have to load too much data to get multiple pages -INSERT INTO test_tablesample SELECT i, repeat(i::text, 200) FROM generate_series(0, 9) s(i) ORDER BY i; +INSERT INTO test_tablesample + SELECT i, repeat(i::text, 200) FROM generate_series(0, 9) s(i); -SELECT t.id FROM test_tablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (10); -SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (100.0/11) REPEATABLE (9999); +SELECT t.id FROM test_tablesample AS t TABLESAMPLE SYSTEM (50) REPEATABLE (0); +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (100.0/11) 
REPEATABLE (0); +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (0); +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (0); +SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (5.5) REPEATABLE (0); + +-- 100% should give repeatable count results (ie, all rows) in any case SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100); -SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (100); -SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (50) REPEATABLE (100); -SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (5.5) REPEATABLE (1); +SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100) REPEATABLE (1+2); +SELECT count(*) FROM test_tablesample TABLESAMPLE SYSTEM (100) REPEATABLE (0.4); -CREATE VIEW test_tablesample_v1 AS SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (10*2) REPEATABLE (2); -CREATE VIEW test_tablesample_v2 AS SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (99); -SELECT pg_get_viewdef('test_tablesample_v1'::regclass); -SELECT pg_get_viewdef('test_tablesample_v2'::regclass); +CREATE VIEW test_tablesample_v1 AS + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (10*2) REPEATABLE (2); +CREATE VIEW test_tablesample_v2 AS + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (99); +\d+ test_tablesample_v1 +\d+ test_tablesample_v2 +-- check a sampled query doesn't affect cursor in progress BEGIN; -DECLARE tablesample_cur CURSOR FOR SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (100); +DECLARE tablesample_cur CURSOR FOR + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (0); + FETCH FIRST FROM tablesample_cur; FETCH NEXT FROM tablesample_cur; FETCH NEXT FROM tablesample_cur; -SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (10); +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (0); FETCH NEXT FROM tablesample_cur; FETCH NEXT FROM tablesample_cur; @@ -36,12 +47,45 @@ FETCH NEXT FROM 
tablesample_cur; CLOSE tablesample_cur; END; -EXPLAIN SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (10); -EXPLAIN SELECT * FROM test_tablesample_v1; +EXPLAIN (COSTS OFF) + SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (2); +EXPLAIN (COSTS OFF) + SELECT * FROM test_tablesample_v1; + +-- check inheritance behavior +explain (costs off) + select count(*) from person tablesample bernoulli (100); +select count(*) from person tablesample bernoulli (100); +select count(*) from person; + +-- check that collations get assigned within the tablesample arguments +SELECT count(*) FROM test_tablesample TABLESAMPLE bernoulli (('1'::text < '0'::text)::int); + +-- check behavior during rescans, as well as correct handling of min/max pct +select * from + (values (0),(100)) v(pct), + lateral (select count(*) from tenk1 tablesample bernoulli (pct)) ss; +select * from + (values (0),(100)) v(pct), + lateral (select count(*) from tenk1 tablesample system (pct)) ss; +explain (costs off) +select pct, count(unique1) from + (values (0),(100)) v(pct), + lateral (select * from tenk1 tablesample bernoulli (pct)) ss + group by pct; +select pct, count(unique1) from + (values (0),(100)) v(pct), + lateral (select * from tenk1 tablesample bernoulli (pct)) ss + group by pct; +select pct, count(unique1) from + (values (0),(100)) v(pct), + lateral (select * from tenk1 tablesample system (pct)) ss + group by pct; -- errors SELECT id FROM test_tablesample TABLESAMPLE FOOBAR (1); +SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (NULL); SELECT id FROM test_tablesample TABLESAMPLE SYSTEM (50) REPEATABLE (NULL); SELECT id FROM test_tablesample TABLESAMPLE BERNOULLI (-1); @@ -56,19 +100,3 @@ WITH query_select AS (SELECT * FROM test_tablesample) SELECT * FROM query_select TABLESAMPLE BERNOULLI (5.5) REPEATABLE (1); SELECT q.* FROM (SELECT * FROM test_tablesample) as q TABLESAMPLE BERNOULLI (5); - --- catalog sanity - -SELECT * -FROM pg_tablesample_method -WHERE 
tsminit IS NULL - OR tsmseqscan IS NULL - OR tsmpagemode IS NULL - OR tsmnextblock IS NULL - OR tsmnexttuple IS NULL - OR tsmend IS NULL - OR tsmreset IS NULL - OR tsmcost IS NULL; - --- done -DROP TABLE test_tablesample CASCADE;