New two-stage sampling method for ANALYZE, as per discussions a few weeks

ago.  This should give significantly better results when the density of
live tuples is not uniform throughout a table.  Manfred Koizar, with
minor kibitzing from Tom Lane.
This commit is contained in:
Tom Lane 2004-05-23 21:24:02 +00:00
parent 27edff700e
commit 9d6570b8a4
1 changed file with 226 additions and 163 deletions

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.71 2004/05/08 19:09:24 tgl Exp $
* $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.72 2004/05/23 21:24:02 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -39,6 +39,16 @@
#include "utils/tuplesort.h"
/* Data structure for Algorithm S from Knuth 3.4.2 */
typedef struct
{
BlockNumber N; /* number of blocks, known in advance */
int n; /* desired sample size */
BlockNumber t; /* current block number */
int m; /* blocks selected so far */
} BlockSamplerData;
typedef BlockSamplerData *BlockSampler;
/* Per-index data for ANALYZE */
typedef struct AnlIndexData
{
@ -57,6 +67,10 @@ static int elevel = -1;
static MemoryContext anl_context = NULL;
static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
int samplesize);
static bool BlockSampler_HasMore(BlockSampler bs);
static BlockNumber BlockSampler_Next(BlockSampler bs);
static void compute_index_stats(Relation onerel, double totalrows,
AnlIndexData *indexdata, int nindexes,
HeapTuple *rows, int numrows,
@ -66,7 +80,7 @@ static int acquire_sample_rows(Relation onerel, HeapTuple *rows,
int targrows, double *totalrows);
static double random_fract(void);
static double init_selection_state(int n);
static double select_next_random_record(double t, int n, double *stateptr);
static double get_next_S(double t, int n, double *stateptr);
static int compare_rows(const void *a, const void *b);
static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats);
static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
@ -637,16 +651,118 @@ examine_attribute(Relation onerel, int attnum)
return stats;
}
/*
 * BlockSampler_Init -- prepare for random sampling of blocknumbers
 *
 * BlockSampler implements stage one of the two-stage tuple sampling
 * mechanism discussed on pgsql-hackers 2004-04-02 (subject "Large DB").
 * It draws a random sample of samplesize block numbers from the table's
 * nblocks blocks; if the table has fewer than samplesize blocks, every
 * block is selected.
 *
 * Because the total number of blocks is known up front, the simple
 * Algorithm S from Knuth 3.4.2 suffices; Vitter's reservoir algorithm
 * is not needed here.
 */
static void
BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
{
	/* Remember the measured table size. */
	bs->N = nblocks;

	/*
	 * Should we ever decide to shrink samplesize for tables that have
	 * fewer (or not many more) than samplesize blocks, this is the
	 * place to do it.
	 */
	bs->n = samplesize;

	/* Nothing scanned or selected yet. */
	bs->t = 0;
	bs->m = 0;
}
/*
 * BlockSampler_HasMore -- is another block available from the sampler?
 *
 * True while some blocks remain unscanned and the sample is not yet full.
 */
static bool
BlockSampler_HasMore(BlockSampler bs)
{
	if (bs->t >= bs->N)
		return false;			/* entire table already scanned */
	return bs->m < bs->n;		/* sample not yet full? */
}
/*
 * BlockSampler_Next -- return the next randomly-selected block number.
 *
 * One step of Knuth's Algorithm S (TAOCP Vol. 2, Sec. 3.4.2): with K
 * blocks left to consider and k sample slots still to fill, each block
 * is accepted with probability k/K.  The loop below performs the skip
 * decisions for all rejected blocks using a single random_fract() call
 * per accepted block (see the long comment inside).
 *
 * Caller must have checked BlockSampler_HasMore() first.
 */
static BlockNumber
BlockSampler_Next(BlockSampler bs)
{
BlockNumber K = bs->N - bs->t; /* remaining blocks */
int k = bs->n - bs->m; /* blocks still to sample */
double p; /* probability to skip block */
double V; /* random */
Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
if ((BlockNumber) k >= K)
{
/* need all the rest */
bs->m++;
return bs->t++;
}
/*----------
 * It is not obvious that this code matches Knuth's Algorithm S.
 * Knuth says to skip the current block with probability 1 - k/K.
 * If we are to skip, we should advance t (hence decrease K), and
 * repeat the same probabilistic test for the next block. The naive
 * implementation thus requires a random_fract() call for each block
 * number. But we can reduce this to one random_fract() call per
 * selected block, by noting that each time the while-test succeeds,
 * we can reinterpret V as a uniform random number in the range 0 to p.
 * Therefore, instead of choosing a new V, we just adjust p to be
 * the appropriate fraction of its former value, and our next loop
 * makes the appropriate probabilistic test.
 *
 * We have initially K > k > 0. If the loop reduces K to equal k,
 * the next while-test must fail since p will become exactly zero
 * (we assume there will not be roundoff error in the division).
 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
 * to be doubly sure about roundoff error.) Therefore K cannot become
 * less than k, which means that we cannot fail to select enough blocks.
 *----------
 */
V = random_fract();
p = 1.0 - (double) k / (double) K;
while (V < p)
{
/* skip */
bs->t++;
K--; /* keep K == N - t */
/* adjust p to be new cutoff point in reduced range */
p *= 1.0 - (double) k / (double) K;
}
/* select */
bs->m++;
return bs->t++;
}
/*
* acquire_sample_rows -- acquire a random sample of rows from the table
*
* Up to targrows rows are collected (if there are fewer than that many
* rows in the table, all rows are collected). When the table is larger
* than targrows, a truly random sample is collected: every row has an
* equal chance of ending up in the final sample.
* As of May 2004 we use a new two-stage method: Stage one selects up
* to targrows random blocks (or all blocks, if there aren't so many).
* Stage two scans these blocks and uses the Vitter algorithm to create
a random sample of targrows rows (or fewer, if there are fewer in the
* sample of blocks). The two stages are executed simultaneously: each
* block is processed as soon as stage one returns its number and while
* the rows are read stage two controls which ones are to be inserted
* into the sample.
*
* Although every row has an equal chance of ending up in the final
* sample, this sampling method is not perfect: not every possible
* sample has an equal chance of being selected. For large relations
* the number of different blocks represented by the sample tends to be
* too small. We can live with that for now. Improvements are welcome.
*
* We also estimate the total number of rows in the table, and return that
* into *totalrows.
* into *totalrows. An important property of this sampling method is that
* because we do look at a statistically unbiased set of blocks, we should
* get an unbiased estimate of the average number of live rows per block.
* The previous sampling method put too much credence in the row density near
* the start of the table.
*
* The returned list of tuples is in order by physical position in the table.
* (We will rely on this later to derive correlation estimates.)
@ -655,101 +771,27 @@ static int
acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows,
double *totalrows)
{
int numrows = 0;
HeapScanDesc scan;
int numrows = 0; /* # rows collected */
double liverows = 0; /* # rows seen */
double deadrows = 0;
double rowstoskip = -1; /* -1 means not set yet */
BlockNumber totalblocks;
HeapTuple tuple;
ItemPointer lasttuple;
BlockNumber lastblock,
estblock;
OffsetNumber lastoffset;
int numest;
double tuplesperpage;
double t;
BlockSamplerData bs;
double rstate;
Assert(targrows > 1);
/*
* Do a simple linear scan until we reach the target number of rows.
*/
scan = heap_beginscan(onerel, SnapshotNow, 0, NULL);
totalblocks = scan->rs_nblocks; /* grab current relation size */
while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
rows[numrows++] = heap_copytuple(tuple);
if (numrows >= targrows)
break;
vacuum_delay_point();
}
heap_endscan(scan);
totalblocks = RelationGetNumberOfBlocks(onerel);
/*
* If we ran out of tuples then we're done, no matter how few we
* collected. No sort is needed, since they're already in order.
*/
if (!HeapTupleIsValid(tuple))
{
*totalrows = (double) numrows;
ereport(elevel,
(errmsg("\"%s\": %u pages, %d rows sampled, %.0f estimated total rows",
RelationGetRelationName(onerel),
totalblocks, numrows, *totalrows)));
return numrows;
}
/*
* Otherwise, start replacing tuples in the sample until we reach the
* end of the relation. This algorithm is from Jeff Vitter's paper
* (see full citation below). It works by repeatedly computing the
* number of the next tuple we want to fetch, which will replace a
* randomly chosen element of the reservoir (current set of tuples).
* At all times the reservoir is a true random sample of the tuples
* we've passed over so far, so when we fall off the end of the
* relation we're done.
*
* A slight difficulty is that since we don't want to fetch tuples or
* even pages that we skip over, it's not possible to fetch *exactly*
* the N'th tuple at each step --- we don't know how many valid tuples
* are on the skipped pages. We handle this by assuming that the
* average number of valid tuples/page on the pages already scanned
* over holds good for the rest of the relation as well; this lets us
* estimate which page the next tuple should be on and its position in
* the page. Then we fetch the first valid tuple at or after that
* position, being careful not to use the same tuple twice. This
* approach should still give a good random sample, although it's not
* perfect.
*/
lasttuple = &(rows[numrows - 1]->t_self);
lastblock = ItemPointerGetBlockNumber(lasttuple);
lastoffset = ItemPointerGetOffsetNumber(lasttuple);
/*
* If possible, estimate tuples/page using only completely-scanned
* pages.
*/
for (numest = numrows; numest > 0; numest--)
{
if (ItemPointerGetBlockNumber(&(rows[numest - 1]->t_self)) != lastblock)
break;
}
if (numest == 0)
{
numest = numrows; /* don't have a full page? */
estblock = lastblock + 1;
}
else
estblock = lastblock;
tuplesperpage = (double) numest / (double) estblock;
t = (double) numrows; /* t is the # of records processed so far */
/* Prepare for sampling block numbers */
BlockSampler_Init(&bs, totalblocks, targrows);
/* Prepare for sampling rows */
rstate = init_selection_state(targrows);
for (;;)
/* Outer loop over blocks to sample */
while (BlockSampler_HasMore(&bs))
{
double targpos;
BlockNumber targblock;
BlockNumber targblock = BlockSampler_Next(&bs);
Buffer targbuffer;
Page targpage;
OffsetNumber targoffset,
@ -757,28 +799,6 @@ acquire_sample_rows(Relation onerel, HeapTuple *rows, int targrows,
vacuum_delay_point();
t = select_next_random_record(t, targrows, &rstate);
/* Try to read the t'th record in the table */
targpos = t / tuplesperpage;
targblock = (BlockNumber) targpos;
targoffset = ((int) ((targpos - targblock) * tuplesperpage)) +
FirstOffsetNumber;
/* Make sure we are past the last selected record */
if (targblock <= lastblock)
{
targblock = lastblock;
if (targoffset <= lastoffset)
targoffset = lastoffset + 1;
}
/* Loop to find first valid record at or after given position */
pageloop:;
/*
* Have we fallen off the end of the relation?
*/
if (targblock >= totalblocks)
break;
/*
* We must maintain a pin on the target page's buffer to ensure
* that the maxoffset value stays good (else concurrent VACUUM
@ -795,62 +815,109 @@ pageloop:;
maxoffset = PageGetMaxOffsetNumber(targpage);
LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK);
for (;;)
/* Inner loop over all tuples on the selected page */
for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; targoffset++)
{
HeapTupleData targtuple;
Buffer tupbuffer;
if (targoffset > maxoffset)
{
/* Fell off end of this page, try next */
ReleaseBuffer(targbuffer);
targblock++;
targoffset = FirstOffsetNumber;
goto pageloop;
}
ItemPointerSet(&targtuple.t_self, targblock, targoffset);
if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer,
false, NULL))
{
/*
* Found a suitable tuple, so save it, replacing one old
* tuple at random
* The first targrows live rows are simply copied into the
* reservoir.
* Then we start replacing tuples in the sample until
* we reach the end of the relation. This algorithm is
* from Jeff Vitter's paper (see full citation below).
* It works by repeatedly computing the number of tuples
* to skip before selecting a tuple, which replaces a
* randomly chosen element of the reservoir (current
* set of tuples). At all times the reservoir is a true
* random sample of the tuples we've passed over so far,
* so when we fall off the end of the relation we're done.
*/
int k = (int) (targrows * random_fract());
if (numrows < targrows)
rows[numrows++] = heap_copytuple(&targtuple);
else
{
/*
* t in Vitter's paper is the number of records already
* processed. If we need to compute a new S value, we
* must use the not-yet-incremented value of liverows
* as t.
*/
if (rowstoskip < 0)
rowstoskip = get_next_S(liverows, targrows, &rstate);
Assert(k >= 0 && k < targrows);
heap_freetuple(rows[k]);
rows[k] = heap_copytuple(&targtuple);
/* this releases the second pin acquired by heap_fetch: */
if (rowstoskip <= 0)
{
/*
* Found a suitable tuple, so save it,
* replacing one old tuple at random
*/
int k = (int) (targrows * random_fract());
Assert(k >= 0 && k < targrows);
heap_freetuple(rows[k]);
rows[k] = heap_copytuple(&targtuple);
}
rowstoskip -= 1;
}
/* must release the extra pin acquired by heap_fetch */
ReleaseBuffer(tupbuffer);
/* this releases the initial pin: */
ReleaseBuffer(targbuffer);
lastblock = targblock;
lastoffset = targoffset;
break;
liverows += 1;
}
else
{
/*
* Count dead rows, but not empty slots. This information is
* currently not used, but it seems likely we'll want it
* someday.
*/
if (targtuple.t_data != NULL)
deadrows += 1;
}
/* this tuple is dead, so advance to next one on same page */
targoffset++;
}
/* Now release the initial pin on the page */
ReleaseBuffer(targbuffer);
}
/*
* Now we need to sort the collected tuples by position (itempointer).
* If we didn't find as many tuples as we wanted then we're done.
* No sort is needed, since they're already in order.
*
* Otherwise we need to sort the collected tuples by position
* (itempointer). It's not worth worrying about corner cases
* where the tuples are already sorted.
*/
qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
if (numrows == targrows)
qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
/*
* Estimate total number of valid rows in relation.
* Estimate total number of live rows in relation.
*/
*totalrows = floor((double) totalblocks * tuplesperpage + 0.5);
if (bs.m > 0)
*totalrows = floor((liverows * totalblocks) / bs.m + 0.5);
else
*totalrows = 0.0;
/*
* Emit some interesting relation info
*/
ereport(elevel,
(errmsg("\"%s\": %u pages, %d rows sampled, %.0f estimated total rows",
(errmsg("\"%s\": scanned %d of %u pages, "
"containing %.0f live rows and %.0f dead rows; "
"%d rows in sample, %.0f estimated total rows",
RelationGetRelationName(onerel),
totalblocks, numrows, *totalrows)));
bs.m, totalblocks,
liverows, deadrows,
numrows, *totalrows)));
return numrows;
}
@ -872,23 +939,16 @@ random_fract(void)
/*
* These two routines embody Algorithm Z from "Random sampling with a
* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
* (Mar. 1985), Pages 37-57. While Vitter describes his algorithm in terms
* of the count S of records to skip before processing another record,
* it is convenient to work primarily with t, the index (counting from 1)
* of the last record processed and next record to process. The only extra
* state needed between calls is W, a random state variable.
*
* Note: the original algorithm defines t, S, numer, and denom as integers.
* Here we express them as doubles to avoid overflow if the number of rows
* in the table exceeds INT_MAX. The algorithm should work as long as the
* row count does not become so large that it is not represented accurately
* in a double (on IEEE-math machines this would be around 2^52 rows).
* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
* of the count S of records to skip before processing another record.
* It is computed primarily based on t, the number of records already read.
* The only extra state needed between calls is W, a random state variable.
*
* init_selection_state computes the initial W value.
*
* Given that we've already processed t records (t >= n),
* select_next_random_record determines the number of the next record to
* process.
* Given that we've already read t records (t >= n), get_next_S
* determines the number of records to skip before the next record is
* processed.
*/
static double
init_selection_state(int n)
@ -898,8 +958,10 @@ init_selection_state(int n)
}
static double
select_next_random_record(double t, int n, double *stateptr)
get_next_S(double t, int n, double *stateptr)
{
double S;
/* The magic constant here is T from Vitter's paper */
if (t <= (22.0 * n))
{
@ -908,11 +970,14 @@ select_next_random_record(double t, int n, double *stateptr)
quot;
V = random_fract(); /* Generate V */
S = 0;
t += 1;
/* Note: "num" in Vitter's code is always equal to t - n */
quot = (t - (double) n) / t;
/* Find min S satisfying (4.1) */
while (quot > V)
{
S += 1;
t += 1;
quot *= (t - (double) n) / t;
}
@ -922,7 +987,6 @@ select_next_random_record(double t, int n, double *stateptr)
/* Now apply Algorithm Z */
double W = *stateptr;
double term = t - (double) n + 1;
double S;
for (;;)
{
@ -970,10 +1034,9 @@ select_next_random_record(double t, int n, double *stateptr)
if (exp(log(y) / n) <= (t + X) / t)
break;
}
t += S + 1;
*stateptr = W;
}
return t;
return S;
}
/*