postgresql/src/backend/utils/misc/sampling.c

/*-------------------------------------------------------------------------
 *
 * sampling.c
 *	  Relation block sampling routines.
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/misc/sampling.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <math.h>

#include "utils/sampling.h"


/*
 * BlockSampler_Init -- set up a sampler that hands out random block numbers
 *
 * BlockSampler implements the block-level relation sampling scheme that was
 * discussed on pgsql-hackers 2004-04-02 (subject "Large DB").  It picks a
 * random sample of samplesize blocks out of the nblocks blocks in the
 * table.  If the table has fewer than samplesize blocks, every block is
 * picked.
 *
 * The total number of blocks is known up front, so the straightforward
 * Algorithm S from Knuth 3.4.2 is sufficient and Vitter's algorithm is not
 * needed here.
 *
 *	bs			sampler state to initialize
 *	nblocks		measured table size, in blocks
 *	samplesize	number of blocks wanted in the sample
 *	randseed	seed for the sampler's own random state
 */
void
BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
				  long randseed)
{
	/* Nothing has been selected or scanned yet. */
	bs->m = 0;
	bs->t = 0;

	/*
	 * Should we ever want to shrink samplesize for tables that have fewer,
	 * or only slightly more, than samplesize blocks, this is the place.
	 */
	bs->n = samplesize;

	/* Measured table size: the population we sample from. */
	bs->N = nblocks;

	sampler_random_init_state(randseed, bs->randstate);
}

/*
 * BlockSampler_HasMore -- is there another block to hand out?
 *
 * Returns true as long as unscanned blocks remain (t < N) and the sample
 * is not yet complete (m < n).
 */
bool
BlockSampler_HasMore(BlockSampler bs)
{
	/* Every block has already been scanned: nothing left to offer. */
	if (!(bs->t < bs->N))
		return false;

	/* Otherwise we continue only while the sample is still short. */
	return bs->m < bs->n;
}

/*
 * BlockSampler_Next -- choose and return the next block number to sample
 *
 * The caller is expected to have checked BlockSampler_HasMore() first (this
 * is asserted).  Each call returns the block number at bs->t, possibly
 * after skipping some blocks, then advances bs->t past it and counts the
 * block in bs->m.
 */
BlockNumber
BlockSampler_Next(BlockSampler bs)
{
	BlockNumber remaining = bs->N - bs->t;	/* blocks not yet scanned */
	int			needed = bs->n - bs->m; /* blocks still to sample */

	Assert(BlockSampler_HasMore(bs));	/* hence remaining > 0, needed > 0 */

	/*
	 * When we need at least as many blocks as remain, every remaining block
	 * must be taken and no random choice is made.  Otherwise decide how many
	 * blocks to skip before taking one.
	 */
	if (remaining > (BlockNumber) needed)
	{
		double		draw;		/* the one random number used per call */
		double		skip_prob;	/* probability to skip block */

		/*----------
		 * It is not obvious that this code matches Knuth's Algorithm S.
		 * Knuth says to skip the current block with probability
		 * 1 - needed/remaining.  On a skip, t advances (so remaining
		 * shrinks) and the same probabilistic test is repeated for the
		 * next block.  Done naively that costs one sampler_random_fract()
		 * call per block number.  We get away with one call per selected
		 * block instead: whenever the while-test succeeds, draw can be
		 * reinterpreted as a uniform random number in the range 0 to
		 * skip_prob.  So rather than drawing again, we scale skip_prob
		 * down to the appropriate fraction of its former value and let
		 * the next iteration make the appropriate test.
		 *
		 * Initially remaining > needed > 0.  If the loop brings remaining
		 * down to needed, skip_prob becomes exactly zero (assuming no
		 * roundoff error in the division) and the next while-test fails.
		 * (Knuth suggests a "<=" loop condition; "<" is used to be doubly
		 * sure about roundoff error.)  Hence remaining never drops below
		 * needed, and we cannot fail to select enough blocks.
		 *----------
		 */
		draw = sampler_random_fract(bs->randstate);
		skip_prob = 1.0 - (double) needed / (double) remaining;
		while (draw < skip_prob)
		{
			/* skip this block, keeping remaining == N - t */
			remaining--;
			bs->t++;

			/* shrink skip_prob to the cutoff point in the reduced range */
			skip_prob *= 1.0 - (double) needed / (double) remaining;
		}
	}

	/* select the block now at t */
	bs->m++;
	return bs->t++;
}

/*
 * Reservoir sampling: reservoir_init_selection_state and
 * reservoir_get_next_S
 *
 * Together these two routines implement Algorithm Z from "Random sampling
 * with a reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
 * (Mar. 1985), Pages 37-57.  Vitter formulates the algorithm in terms of
 * the count S of records to skip before processing another record.  S is
 * computed primarily from t, the number of records already read.  The only
 * additional state carried between calls is W, a random state variable.
 *
 * reservoir_init_selection_state computes the initial W value.
 *
 * Given that t records have already been read (t >= n),
 * reservoir_get_next_S determines how many records to skip before the next
 * record is processed.
 */
void
reservoir_init_selection_state(ReservoirState rs, int n)
{
	double		u;

	/*
	 * Nothing that uses reservoir sampling needs repeatable results, so the
	 * random state can simply be seeded randomly.
	 */
	sampler_random_init_state(random(), rs->randstate);

	/* Initial value of W (for use when Algorithm Z is first applied) */
	u = sampler_random_fract(rs->randstate);
	rs->W = exp(-log(u) / n);
}

/*
 * reservoir_get_next_S -- number of records to skip before the next one
 * that should be processed
 *
 *	rs	reservoir state; rs->randstate supplies the random numbers, and
 *		rs->W is read and rewritten when the Algorithm Z branch is taken
 *	t	number of records already read (the file's header comment for
 *		these routines states t >= n)
 *	n	passed through from the caller; used as the divisor/exponent in the
 *		formulas below and in the Algorithm X/Z threshold
 *
 * Returns S, the skip count, as a double.  While t <= 22 * n the simpler
 * Algorithm X branch is used and rs->W is left untouched; beyond that
 * threshold the Algorithm Z branch runs and stores an updated W back into
 * rs->W.  t is a by-value parameter, so the increments applied to it below
 * are not visible to the caller.
 */
double
reservoir_get_next_S(ReservoirState rs, double t, int n)
{
	double		S;

	/* The magic constant here is T from Vitter's paper */
	if (t <= (22.0 * n))
	{
		/* Process records using Algorithm X until t is large enough */
		double		V,
					quot;

		V = sampler_random_fract(rs->randstate);		/* Generate V */
		S = 0;
		t += 1;
		/* Note: "num" in Vitter's code is always equal to t - n */
		quot = (t - (double) n) / t;
		/* Find min S satisfying (4.1) */
		/* Each pass skips one more record and folds in the next factor. */
		while (quot > V)
		{
			S += 1;
			t += 1;
			quot *= (t - (double) n) / t;
		}
	}
	else
	{
		/* Now apply Algorithm Z */
		/* Work on a local copy of W; it is written back after the loop. */
		double		W = rs->W;
		double		term = t - (double) n + 1;

		/*
		 * Rejection loop: each iteration proposes a candidate S from W and
		 * a fresh U, and exits via one of the two acceptance tests below.
		 */
		for (;;)
		{
			double		numer,
						numer_lim,
						denom;
			double		U,
						X,
						lhs,
						rhs,
						y,
						tmp;

			/* Generate U and X */
			U = sampler_random_fract(rs->randstate);
			X = t * (W - 1.0);
			S = floor(X);		/* S is tentatively set to floor(X) */
			/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
			tmp = (t + 1) / term;
			lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
			rhs = (((t + X) / (term + S)) * term) / t;
			if (lhs <= rhs)
			{
				/*
				 * Accepted by the first test: the next W is derived from
				 * this iteration's values instead of a new random draw.
				 */
				W = rhs / lhs;
				break;
			}
			/* Test if U <= f(S)/cg(X) */
			y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);

			/*
			 * Pick the bounds for the product loop below.  With these
			 * settings the loop runs n times when n < S and S times
			 * otherwise.
			 */
			if ((double) n < S)
			{
				denom = t;
				numer_lim = term + S;
			}
			else
			{
				denom = t - (double) n + S;
				numer_lim = t + 1;
			}
			/* Multiply y by numer/denom, stepping both down by one. */
			for (numer = t + S; numer >= numer_lim; numer -= 1)
			{
				y *= numer / denom;
				denom -= 1;
			}

			/*
			 * W is redrawn before the second test, so it is fresh both when
			 * we accept (and store it) and when we go around again.
			 */
			W = exp(-log(sampler_random_fract(rs->randstate)) / n);		/* Generate W in advance */
			if (exp(log(y) / n) <= (t + X) / t)
				break;
		}
		/* Persist W for the next call. */
		rs->W = W;
	}
	return S;
}


/*----------
 * Random number generator used by sampling
 *----------
 */

/*
 * sampler_random_init_state -- fill in a sampler random state from a seed
 *
 * randstate[1] receives seed truncated to unsigned short, randstate[2]
 * receives (seed >> 16) truncated the same way, and randstate[0] is set to
 * a fixed constant.
 */
void
sampler_random_init_state(long seed, SamplerRandomState randstate)
{
	unsigned short seed_lo = (unsigned short) seed;
	unsigned short seed_hi = (unsigned short) (seed >> 16);

	randstate[2] = seed_hi;
	randstate[1] = seed_lo;

	/* same as pg_erand48, but could be anything */
	randstate[0] = 0x330e;
}

/*
 * sampler_random_fract -- select a random value R uniformly distributed
 * in (0 - 1), i.e. never exactly zero
 */
double
sampler_random_fract(SamplerRandomState randstate)
{
	for (;;)
	{
		double		candidate = pg_erand48(randstate);

		/*
		 * pg_erand48 returns a value in [0.0 - 1.0); a zero is outside our
		 * range, so draw again when we get one.
		 */
		if (candidate != 0.0)
			return candidate;
	}
}


/*
 * Backwards-compatible API for block sampling
 *
 * This code is now deprecated, but since it's still in use by many FDWs,
 * we should keep it for a while at least.  The functionality is the same as
 * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
 * except that a common random state is used across all callers.
 */
static ReservoirStateData oldrs;

/*
 * Has oldrs.randstate been seeded yet?
 *
 * This is tracked with its own flag instead of testing oldrs.randstate[0]
 * for zero.  The random state array is handed to pg_erand48() on every
 * draw; if that generator stores its updated state back into the array (as
 * POSIX erand48 does), any element, including element 0, can legitimately
 * become zero after seeding.  Using it as an "uninitialized" marker would
 * then cause an occasional, unintended reseed from random().
 * NOTE(review): pg_erand48 is defined outside this file -- confirm that it
 * writes back all three state words.
 */
static bool oldrs_seeded = false;

/*
 * Seed the shared random state from random() the first time any of the
 * deprecated entry points needs it; later calls do nothing.
 */
static void
anl_seed_if_needed(void)
{
	if (!oldrs_seeded)
	{
		sampler_random_init_state(random(), oldrs.randstate);
		oldrs_seeded = true;
	}
}

/*
 * anl_random_fract -- deprecated equivalent of sampler_random_fract
 *
 * Returns a random fraction drawn from the shared random state, seeding
 * that state first if this is the first use.
 */
double
anl_random_fract(void)
{
	/* initialize if first time through */
	anl_seed_if_needed();

	/* and compute a random fraction */
	return sampler_random_fract(oldrs.randstate);
}

/*
 * anl_init_selection_state -- deprecated equivalent of
 * reservoir_init_selection_state
 *
 * Returns the initial W value for a sample of size n, computed from the
 * shared random state.  Unlike reservoir_init_selection_state, the value is
 * returned to the caller rather than stored in a ReservoirState.
 */
double
anl_init_selection_state(int n)
{
	/* initialize if first time through */
	anl_seed_if_needed();

	/* Initial value of W (for use when Algorithm Z is first applied) */
	return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
}

double
anl_get_next_S(double t, int n, double *stateptr)
{
	double		result;

	oldrs.W = *stateptr;
	result = reservoir_get_next_S(&oldrs, t, n);
	*stateptr = oldrs.W;
	return result;
}
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`/*-------------------------------------------------------------------------`
			`*`
			`* sampling.c`
			`* Relation block sampling routines.`
			`*`
Update copyright for 2016 Backpatch certain files through 9.1 2016-01-02 19:33:40 +01:00			`* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`* Portions Copyright (c) 1994, Regents of the University of California`
			`*`
			`*`
			`* IDENTIFICATION`
			`* src/backend/utils/misc/sampling.c`
			`*`
			`*-------------------------------------------------------------------------`
			`*/`

			`#include "postgres.h"`

			`#include <math.h>`

			`#include "utils/sampling.h"`


			`/*`
			`* BlockSampler_Init -- prepare for random sampling of blocknumbers`
			`*`
			`* BlockSampler provides algorithm for block level sampling of a relation`
			`* as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")`
			`* It selects a random sample of samplesize blocks out of`
			`* the nblocks blocks in the table. If the table has less than`
			`* samplesize blocks, all blocks are selected.`
			`*`
			`* Since we know the total number of blocks in advance, we can use the`
			`* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's`
			`* algorithm.`
			`*/`
			`void`
			`BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,`
			`long randseed)`
			`{`
			`bs->N = nblocks; /* measured table size */`

			`/*`
			`* If we decide to reduce samplesize for tables that have less or not much`
			`* more than samplesize blocks, here is the place to do it.`
			`*/`
			`bs->n = samplesize;`
			`bs->t = 0; /* blocks scanned so far */`
			`bs->m = 0; /* blocks selected so far */`
TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs 2015-05-15 20:37:10 +02:00
			`sampler_random_init_state(randseed, bs->randstate);`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`}`

			`bool`
			`BlockSampler_HasMore(BlockSampler bs)`
			`{`
			`return (bs->t < bs->N) && (bs->m < bs->n);`
			`}`

			`BlockNumber`
			`BlockSampler_Next(BlockSampler bs)`
			`{`
			`BlockNumber K = bs->N - bs->t; /* remaining blocks */`
			`int k = bs->n - bs->m; /* blocks still to sample */`
			`double p; /* probability to skip block */`
			`double V; /* random */`

			`Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */`

			`if ((BlockNumber) k >= K)`
			`{`
			`/* need all the rest */`
			`bs->m++;`
			`return bs->t++;`
			`}`

			`/*----------`
			`* It is not obvious that this code matches Knuth's Algorithm S.`
			`* Knuth says to skip the current block with probability 1 - k/K.`
			`* If we are to skip, we should advance t (hence decrease K), and`
			`* repeat the same probabilistic test for the next block. The naive`
Collection of typo fixes. Use "a" and "an" correctly, mostly in comments. Two error messages were also fixed (they were just elogs, so no translation work required). Two function comments in pg_proc.h were also fixed. Etsuro Fujita reported one of these, but I found a lot more with grep. Also fix a few other typos spotted while grepping for the a/an typos. For example, "consists out of ..." -> "consists of ...". Plus a "though"/ "through" mixup reported by Euler Taveira. Many of these typos were in old code, which would be nice to backpatch to make future backpatching easier. But much of the code was new, and I didn't feel like crafting separate patches for each branch. So no backpatching. 2015-05-20 15:18:11 +02:00			`* implementation thus requires a sampler_random_fract() call for each`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`* block number. But we can reduce this to one sampler_random_fract()`
			`* call per selected block, by noting that each time the while-test`
			`* succeeds, we can reinterpret V as a uniform random number in the range`
			`* 0 to p. Therefore, instead of choosing a new V, we just adjust p to be`
			`* the appropriate fraction of its former value, and our next loop`
			`* makes the appropriate probabilistic test.`
			`*`
			`* We have initially K > k > 0. If the loop reduces K to equal k,`
			`* the next while-test must fail since p will become exactly zero`
			`* (we assume there will not be roundoff error in the division).`
			`* (Note: Knuth suggests a "<=" loop condition, but we use "<" just`
			`* to be doubly sure about roundoff error.) Therefore K cannot become`
			`* less than k, which means that we cannot fail to select enough blocks.`
			`*----------`
			`*/`
TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs 2015-05-15 20:37:10 +02:00			`V = sampler_random_fract(bs->randstate);`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`p = 1.0 - (double) k / (double) K;`
			`while (V < p)`
			`{`
			`/* skip */`
			`bs->t++;`
			`K--; /* keep K == N - t */`

			`/* adjust p to be new cutoff point in reduced range */`
			`p *= 1.0 - (double) k / (double) K;`
			`}`

			`/* select */`
			`bs->m++;`
			`return bs->t++;`
			`}`

			`/*`
			`* These two routines embody Algorithm Z from "Random sampling with a`
			`* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1`
			`* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms`
			`* of the count S of records to skip before processing another record.`
			`* It is computed primarily based on t, the number of records already read.`
			`* The only extra state needed between calls is W, a random state variable.`
			`*`
			`* reservoir_init_selection_state computes the initial W value.`
			`*`
			`* Given that we've already read t records (t >= n), reservoir_get_next_S`
			`* determines the number of records to skip before the next record is`
			`* processed.`
			`*/`
			`void`
			`reservoir_init_selection_state(ReservoirState rs, int n)`
			`{`
TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs 2015-05-15 20:37:10 +02:00			`/*`
			`* Reservoir sampling is not used anywhere where it would need to return`
			`* repeatable results so we can initialize it randomly.`
			`*/`
			`sampler_random_init_state(random(), rs->randstate);`

Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`/* Initial value of W (for use when Algorithm Z is first applied) */`
TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs 2015-05-15 20:37:10 +02:00			`rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`}`

			`double`
			`reservoir_get_next_S(ReservoirState rs, double t, int n)`
			`{`
			`double S;`

			`/* The magic constant here is T from Vitter's paper */`
			`if (t <= (22.0 * n))`
			`{`
			`/* Process records using Algorithm X until t is large enough */`
			`double V,`
			`quot;`

pgindent run for 9.5 2015-05-24 03:35:49 +02:00			`V = sampler_random_fract(rs->randstate); /* Generate V */`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`S = 0;`
			`t += 1;`
			`/* Note: "num" in Vitter's code is always equal to t - n */`
			`quot = (t - (double) n) / t;`
			`/* Find min S satisfying (4.1) */`
			`while (quot > V)`
			`{`
			`S += 1;`
			`t += 1;`
			`quot *= (t - (double) n) / t;`
			`}`
			`}`
			`else`
			`{`
			`/* Now apply Algorithm Z */`
TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs 2015-05-15 20:37:10 +02:00			`double W = rs->W;`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`double term = t - (double) n + 1;`

			`for (;;)`
			`{`
			`double numer,`
			`numer_lim,`
			`denom;`
			`double U,`
			`X,`
			`lhs,`
			`rhs,`
			`y,`
			`tmp;`

			`/* Generate U and X */`
TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs 2015-05-15 20:37:10 +02:00			`U = sampler_random_fract(rs->randstate);`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`X = t * (W - 1.0);`
			`S = floor(X); /* S is tentatively set to floor(X) */`
			`/* Test if U <= h(S)/cg(X) in the manner of (6.3) */`
			`tmp = (t + 1) / term;`
			`lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);`
			`rhs = (((t + X) / (term + S)) * term) / t;`
			`if (lhs <= rhs)`
			`{`
			`W = rhs / lhs;`
			`break;`
			`}`
			`/* Test if U <= f(S)/cg(X) */`
			`y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);`
			`if ((double) n < S)`
			`{`
			`denom = t;`
			`numer_lim = term + S;`
			`}`
			`else`
			`{`
			`denom = t - (double) n + S;`
			`numer_lim = t + 1;`
			`}`
			`for (numer = t + S; numer >= numer_lim; numer -= 1)`
			`{`
			`y *= numer / denom;`
			`denom -= 1;`
			`}`
TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs 2015-05-15 20:37:10 +02:00			`W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`if (exp(log(y) / n) <= (t + X) / t)`
			`break;`
			`}`
TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs 2015-05-15 20:37:10 +02:00			`rs->W = W;`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`}`
			`return S;`
			`}`


			`/*----------`
			`* Random number generator used by sampling`
			`*----------`
			`*/`
TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs 2015-05-15 20:37:10 +02:00			`void`
			`sampler_random_init_state(long seed, SamplerRandomState randstate)`
			`{`
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production. 2015-07-25 20:39:00 +02:00			`randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */`
TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs 2015-05-15 20:37:10 +02:00			`randstate[1] = (unsigned short) seed;`
			`randstate[2] = (unsigned short) (seed >> 16);`
			`}`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00
			`/* Select a random value R uniformly distributed in (0 - 1) */`
			`double`
TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs 2015-05-15 20:37:10 +02:00			`sampler_random_fract(SamplerRandomState randstate)`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`{`
Make sampler_random_fract() actually obey its API contract. This function is documented to return a value in the range (0,1), which is what its predecessor anl_random_fract() did. However, the new version depends on pg_erand48() which returns a value in [0,1). The possibility of returning zero creates hazards of division by zero or trying to compute log(0) at some call sites, and it might well break third-party modules using anl_random_fract() too. So let's change it to never return zero. Spotted by Coverity. Michael Paquier, cosmetically adjusted by me 2015-07-02 00:07:48 +02:00			`double res;`

			`/* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */`
			`do`
			`{`
			`res = pg_erand48(randstate);`
			`} while (res == 0.0);`
			`return res;`
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`}`
Put back a backwards-compatible version of sampling support functions. Commit 83e176ec18d2a91dbea1d0d1bd94c38dc47cd77c removed the longstanding support functions for block sampling without any consideration of the impact this would have on third-party FDWs. The new API is not notably more functional for FDWs than the old, so forcing them to change doesn't seem like a good thing. We can provide the old API as a wrapper (more or less) around the new one for a minimal amount of extra code. 2015-05-19 00:34:37 +02:00

			`/*`
			`* Backwards-compatible API for block sampling`
			`*`
			`* This code is now deprecated, but since it's still in use by many FDWs,`
			`* we should keep it for awhile at least. The functionality is the same as`
			`* sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,`
			`* except that a common random state is used across all callers.`
			`*/`
			`static ReservoirStateData oldrs;`

			`double`
			`anl_random_fract(void)`
			`{`
			`/* initialize if first time through */`
			`if (oldrs.randstate[0] == 0)`
			`sampler_random_init_state(random(), oldrs.randstate);`

			`/* and compute a random fraction */`
			`return sampler_random_fract(oldrs.randstate);`
			`}`

			`double`
			`anl_init_selection_state(int n)`
			`{`
			`/* initialize if first time through */`
			`if (oldrs.randstate[0] == 0)`
			`sampler_random_init_state(random(), oldrs.randstate);`

			`/* Initial value of W (for use when Algorithm Z is first applied) */`
			`return exp(-log(sampler_random_fract(oldrs.randstate)) / n);`
			`}`

			`double`
			`anl_get_next_S(double t, int n, double *stateptr)`
			`{`
pgindent run for 9.5 2015-05-24 03:35:49 +02:00			`double result;`
Put back a backwards-compatible version of sampling support functions. Commit 83e176ec18d2a91dbea1d0d1bd94c38dc47cd77c removed the longstanding support functions for block sampling without any consideration of the impact this would have on third-party FDWs. The new API is not notably more functional for FDWs than the old, so forcing them to change doesn't seem like a good thing. We can provide the old API as a wrapper (more or less) around the new one for a minimal amount of extra code. 2015-05-19 00:34:37 +02:00
			`oldrs.W = *stateptr;`
			`result = reservoir_get_next_S(&oldrs, t, n);`
			`*stateptr = oldrs.W;`
			`return result;`
			`}`