postgresql/src/backend/utils/misc/sampling.c

/*-------------------------------------------------------------------------
 *
 * sampling.c
 *	  Relation block sampling routines.
 *
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/misc/sampling.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <math.h>

#include "utils/sampling.h"


/*
 * BlockSampler_Init -- prepare for random sampling of blocknumbers
 *
 * BlockSampler provides algorithm for block level sampling of a relation
 * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
 * It selects a random sample of samplesize blocks out of
 * the nblocks blocks in the table. If the table has less than
 * samplesize blocks, all blocks are selected.
 *
 * Since we know the total number of blocks in advance, we can use the
 * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
 * algorithm.
 */
void
BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
				  long randseed)
{
	bs->N = nblocks;			/* measured table size */

	/*
	 * If we decide to reduce samplesize for tables that have less or not much
	 * more than samplesize blocks, here is the place to do it.
	 */
	bs->n = samplesize;
	bs->t = 0;					/* blocks scanned so far */
	bs->m = 0;					/* blocks selected so far */
}

bool
BlockSampler_HasMore(BlockSampler bs)
{
	return (bs->t < bs->N) && (bs->m < bs->n);
}

BlockNumber
BlockSampler_Next(BlockSampler bs)
{
	BlockNumber K = bs->N - bs->t;		/* remaining blocks */
	int			k = bs->n - bs->m;		/* blocks still to sample */
	double		p;				/* probability to skip block */
	double		V;				/* random */

	Assert(BlockSampler_HasMore(bs));	/* hence K > 0 and k > 0 */

	if ((BlockNumber) k >= K)
	{
		/* need all the rest */
		bs->m++;
		return bs->t++;
	}

	/*----------
	 * It is not obvious that this code matches Knuth's Algorithm S.
	 * Knuth says to skip the current block with probability 1 - k/K.
	 * If we are to skip, we should advance t (hence decrease K), and
	 * repeat the same probabilistic test for the next block.  The naive
	 * implementation thus requires an sampler_random_fract() call for each
	 * block number.  But we can reduce this to one sampler_random_fract()
	 * call per selected block, by noting that each time the while-test
	 * succeeds, we can reinterpret V as a uniform random number in the range
	 * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
	 * the appropriate fraction of its former value, and our next loop
	 * makes the appropriate probabilistic test.
	 *
	 * We have initially K > k > 0.  If the loop reduces K to equal k,
	 * the next while-test must fail since p will become exactly zero
	 * (we assume there will not be roundoff error in the division).
	 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
	 * to be doubly sure about roundoff error.)  Therefore K cannot become
	 * less than k, which means that we cannot fail to select enough blocks.
	 *----------
	 */
	V = sampler_random_fract();
	p = 1.0 - (double) k / (double) K;
	while (V < p)
	{
		/* skip */
		bs->t++;
		K--;					/* keep K == N - t */

		/* adjust p to be new cutoff point in reduced range */
		p *= 1.0 - (double) k / (double) K;
	}

	/* select */
	bs->m++;
	return bs->t++;
}

/*
 * These two routines embody Algorithm Z from "Random sampling with a
 * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
 * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
 * of the count S of records to skip before processing another record.
 * It is computed primarily based on t, the number of records already read.
 * The only extra state needed between calls is W, a random state variable.
 *
 * reservoir_init_selection_state computes the initial W value.
 *
 * Given that we've already read t records (t >= n), reservoir_get_next_S
 * determines the number of records to skip before the next record is
 * processed.
 */
void
reservoir_init_selection_state(ReservoirState rs, int n)
{
	/* Initial value of W (for use when Algorithm Z is first applied) */
	*rs = exp(-log(sampler_random_fract()) / n);
}

double
reservoir_get_next_S(ReservoirState rs, double t, int n)
{
	double		S;

	/* The magic constant here is T from Vitter's paper */
	if (t <= (22.0 * n))
	{
		/* Process records using Algorithm X until t is large enough */
		double		V,
					quot;

		V = sampler_random_fract(); /* Generate V */
		S = 0;
		t += 1;
		/* Note: "num" in Vitter's code is always equal to t - n */
		quot = (t - (double) n) / t;
		/* Find min S satisfying (4.1) */
		while (quot > V)
		{
			S += 1;
			t += 1;
			quot *= (t - (double) n) / t;
		}
	}
	else
	{
		/* Now apply Algorithm Z */
		double		W = *rs;
		double		term = t - (double) n + 1;

		for (;;)
		{
			double		numer,
						numer_lim,
						denom;
			double		U,
						X,
						lhs,
						rhs,
						y,
						tmp;

			/* Generate U and X */
			U = sampler_random_fract();
			X = t * (W - 1.0);
			S = floor(X);		/* S is tentatively set to floor(X) */
			/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
			tmp = (t + 1) / term;
			lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
			rhs = (((t + X) / (term + S)) * term) / t;
			if (lhs <= rhs)
			{
				W = rhs / lhs;
				break;
			}
			/* Test if U <= f(S)/cg(X) */
			y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
			if ((double) n < S)
			{
				denom = t;
				numer_lim = term + S;
			}
			else
			{
				denom = t - (double) n + S;
				numer_lim = t + 1;
			}
			for (numer = t + S; numer >= numer_lim; numer -= 1)
			{
				y *= numer / denom;
				denom -= 1;
			}
			W = exp(-log(sampler_random_fract()) / n);		/* Generate W in advance */
			if (exp(log(y) / n) <= (t + X) / t)
				break;
		}
		*rs = W;
	}
	return S;
}


/*----------
 * Random number generator used by sampling
 *----------
 */

/* Select a random value R uniformly distributed in (0 - 1) */
double
sampler_random_fract()
{
	return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2);
}
Separate block sampling functions Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek 2015-05-15 04:02:54 +02:00			`/*-------------------------------------------------------------------------`
			`*`
			`* sampling.c`
			`* Relation block sampling routines.`
			`*`
			`* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group`
			`* Portions Copyright (c) 1994, Regents of the University of California`
			`*`
			`*`
			`* IDENTIFICATION`
			`* src/backend/utils/misc/sampling.c`
			`*`
			`*-------------------------------------------------------------------------`
			`*/`

			`#include "postgres.h"`

			`#include <math.h>`

			`#include "utils/sampling.h"`


			`/*`
			`* BlockSampler_Init -- prepare for random sampling of blocknumbers`
			`*`
			`* BlockSampler provides algorithm for block level sampling of a relation`
			`* as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")`
			`* It selects a random sample of samplesize blocks out of`
			`* the nblocks blocks in the table. If the table has less than`
			`* samplesize blocks, all blocks are selected.`
			`*`
			`* Since we know the total number of blocks in advance, we can use the`
			`* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's`
			`* algorithm.`
			`*/`
			`void`
			`BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,`
			`long randseed)`
			`{`
			`bs->N = nblocks; /* measured table size */`

			`/*`
			`* If we decide to reduce samplesize for tables that have less or not much`
			`* more than samplesize blocks, here is the place to do it.`
			`*/`
			`bs->n = samplesize;`
			`bs->t = 0; /* blocks scanned so far */`
			`bs->m = 0; /* blocks selected so far */`
			`}`

			`bool`
			`BlockSampler_HasMore(BlockSampler bs)`
			`{`
			`return (bs->t < bs->N) && (bs->m < bs->n);`
			`}`

			`BlockNumber`
			`BlockSampler_Next(BlockSampler bs)`
			`{`
			`BlockNumber K = bs->N - bs->t; /* remaining blocks */`
			`int k = bs->n - bs->m; /* blocks still to sample */`
			`double p; /* probability to skip block */`
			`double V; /* random */`

			`Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */`

			`if ((BlockNumber) k >= K)`
			`{`
			`/* need all the rest */`
			`bs->m++;`
			`return bs->t++;`
			`}`

			`/*----------`
			`* It is not obvious that this code matches Knuth's Algorithm S.`
			`* Knuth says to skip the current block with probability 1 - k/K.`
			`* If we are to skip, we should advance t (hence decrease K), and`
			`* repeat the same probabilistic test for the next block. The naive`
			`* implementation thus requires an sampler_random_fract() call for each`
			`* block number. But we can reduce this to one sampler_random_fract()`
			`* call per selected block, by noting that each time the while-test`
			`* succeeds, we can reinterpret V as a uniform random number in the range`
			`* 0 to p. Therefore, instead of choosing a new V, we just adjust p to be`
			`* the appropriate fraction of its former value, and our next loop`
			`* makes the appropriate probabilistic test.`
			`*`
			`* We have initially K > k > 0. If the loop reduces K to equal k,`
			`* the next while-test must fail since p will become exactly zero`
			`* (we assume there will not be roundoff error in the division).`
			`* (Note: Knuth suggests a "<=" loop condition, but we use "<" just`
			`* to be doubly sure about roundoff error.) Therefore K cannot become`
			`* less than k, which means that we cannot fail to select enough blocks.`
			`*----------`
			`*/`
			`V = sampler_random_fract();`
			`p = 1.0 - (double) k / (double) K;`
			`while (V < p)`
			`{`
			`/* skip */`
			`bs->t++;`
			`K--; /* keep K == N - t */`

			`/* adjust p to be new cutoff point in reduced range */`
			`p *= 1.0 - (double) k / (double) K;`
			`}`

			`/* select */`
			`bs->m++;`
			`return bs->t++;`
			`}`

			`/*`
			`* These two routines embody Algorithm Z from "Random sampling with a`
			`* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1`
			`* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms`
			`* of the count S of records to skip before processing another record.`
			`* It is computed primarily based on t, the number of records already read.`
			`* The only extra state needed between calls is W, a random state variable.`
			`*`
			`* reservoir_init_selection_state computes the initial W value.`
			`*`
			`* Given that we've already read t records (t >= n), reservoir_get_next_S`
			`* determines the number of records to skip before the next record is`
			`* processed.`
			`*/`
			`void`
			`reservoir_init_selection_state(ReservoirState rs, int n)`
			`{`
			`/* Initial value of W (for use when Algorithm Z is first applied) */`
			`*rs = exp(-log(sampler_random_fract()) / n);`
			`}`

			`double`
			`reservoir_get_next_S(ReservoirState rs, double t, int n)`
			`{`
			`double S;`

			`/* The magic constant here is T from Vitter's paper */`
			`if (t <= (22.0 * n))`
			`{`
			`/* Process records using Algorithm X until t is large enough */`
			`double V,`
			`quot;`

			`V = sampler_random_fract(); /* Generate V */`
			`S = 0;`
			`t += 1;`
			`/* Note: "num" in Vitter's code is always equal to t - n */`
			`quot = (t - (double) n) / t;`
			`/* Find min S satisfying (4.1) */`
			`while (quot > V)`
			`{`
			`S += 1;`
			`t += 1;`
			`quot *= (t - (double) n) / t;`
			`}`
			`}`
			`else`
			`{`
			`/* Now apply Algorithm Z */`
			`double W = *rs;`
			`double term = t - (double) n + 1;`

			`for (;;)`
			`{`
			`double numer,`
			`numer_lim,`
			`denom;`
			`double U,`
			`X,`
			`lhs,`
			`rhs,`
			`y,`
			`tmp;`

			`/* Generate U and X */`
			`U = sampler_random_fract();`
			`X = t * (W - 1.0);`
			`S = floor(X); /* S is tentatively set to floor(X) */`
			`/* Test if U <= h(S)/cg(X) in the manner of (6.3) */`
			`tmp = (t + 1) / term;`
			`lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);`
			`rhs = (((t + X) / (term + S)) * term) / t;`
			`if (lhs <= rhs)`
			`{`
			`W = rhs / lhs;`
			`break;`
			`}`
			`/* Test if U <= f(S)/cg(X) */`
			`y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);`
			`if ((double) n < S)`
			`{`
			`denom = t;`
			`numer_lim = term + S;`
			`}`
			`else`
			`{`
			`denom = t - (double) n + S;`
			`numer_lim = t + 1;`
			`}`
			`for (numer = t + S; numer >= numer_lim; numer -= 1)`
			`{`
			`y *= numer / denom;`
			`denom -= 1;`
			`}`
			`W = exp(-log(sampler_random_fract()) / n); /* Generate W in advance */`
			`if (exp(log(y) / n) <= (t + X) / t)`
			`break;`
			`}`
			`*rs = W;`
			`}`
			`return S;`
			`}`


			`/*----------`
			`* Random number generator used by sampling`
			`*----------`
			`*/`

			`/* Select a random value R uniformly distributed in (0 - 1) */`
			`double`
			`sampler_random_fract()`
			`{`
			`return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2);`
			`}`