postgresql/src/backend/statistics/mvdistinct.c

701 lines
18 KiB
C

/*-------------------------------------------------------------------------
*
* mvdistinct.c
* POSTGRES multivariate ndistinct coefficients
*
* Estimating number of groups in a combination of columns (e.g. for GROUP BY)
* is tricky, and the estimation error is often significant.
* The multivariate ndistinct coefficients address this by storing ndistinct
* estimates for combinations of the user-specified columns. So for example
* given a statistics object on three columns (a,b,c), this module estimates
* and stores n-distinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column
* estimates are already available in pg_statistic.
*
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/statistics/mvdistinct.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <math.h>
#include "access/htup_details.h"
#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_statistic_ext_data.h"
#include "lib/stringinfo.h"
#include "statistics/extended_stats_internal.h"
#include "statistics/statistics.h"
#include "utils/fmgrprotos.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
#include "utils/typcache.h"
static double ndistinct_for_combination(double totalrows, StatsBuildData *data,
int k, int *combination);
static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
static int n_choose_k(int n, int k);
static int num_combinations(int n);
/* size of the struct header fields (magic, type, nitems) */
#define SizeOfHeader (3 * sizeof(uint32))
/* size of a serialized ndistinct item (coefficient, natts, atts) */
#define SizeOfItem(natts) \
(sizeof(double) + sizeof(int) + (natts) * sizeof(AttrNumber))
/* minimal size of a ndistinct item (with two attributes) */
#define MinSizeOfItem SizeOfItem(2)
/* minimal size of mvndistinct, when all items are minimal */
#define MinSizeOfItems(nitems) \
(SizeOfHeader + (nitems) * MinSizeOfItem)
/* Combination generator API */
/* internal state for generator of k-combinations of n elements */
typedef struct CombinationGenerator
{
int k; /* size of the combination */
int n; /* total number of elements */
int current; /* index of the next combination to return */
int ncombinations; /* number of combinations (size of array) */
int *combinations; /* array of pre-built combinations */
} CombinationGenerator;
static CombinationGenerator *generator_init(int n, int k);
static void generator_free(CombinationGenerator *state);
static int *generator_next(CombinationGenerator *state);
static void generate_combinations(CombinationGenerator *state);
/*
* statext_ndistinct_build
* Compute ndistinct coefficient for the combination of attributes.
*
* This computes the ndistinct estimate using the same estimator used
* in analyze.c and then computes the coefficient.
*
* To handle expressions easily, we treat them as system attributes with
* negative attnums, and offset everything by number of expressions to
* allow using Bitmapsets.
*/
MVNDistinct *
statext_ndistinct_build(double totalrows, StatsBuildData *data)
{
MVNDistinct *result;
int k;
int itemcnt;
int numattrs = data->nattnums;
int numcombs = num_combinations(numattrs);
result = palloc(offsetof(MVNDistinct, items) +
numcombs * sizeof(MVNDistinctItem));
result->magic = STATS_NDISTINCT_MAGIC;
result->type = STATS_NDISTINCT_TYPE_BASIC;
result->nitems = numcombs;
itemcnt = 0;
for (k = 2; k <= numattrs; k++)
{
int *combination;
CombinationGenerator *generator;
/* generate combinations of K out of N elements */
generator = generator_init(numattrs, k);
while ((combination = generator_next(generator)))
{
MVNDistinctItem *item = &result->items[itemcnt];
int j;
item->attributes = palloc(sizeof(AttrNumber) * k);
item->nattributes = k;
/* translate the indexes to attnums */
for (j = 0; j < k; j++)
{
item->attributes[j] = data->attnums[combination[j]];
Assert(AttributeNumberIsValid(item->attributes[j]));
}
item->ndistinct =
ndistinct_for_combination(totalrows, data, k, combination);
itemcnt++;
Assert(itemcnt <= result->nitems);
}
generator_free(generator);
}
/* must consume exactly the whole output array */
Assert(itemcnt == result->nitems);
return result;
}
/*
* statext_ndistinct_load
* Load the ndistinct value for the indicated pg_statistic_ext tuple
*/
MVNDistinct *
statext_ndistinct_load(Oid mvoid, bool inh)
{
MVNDistinct *result;
bool isnull;
Datum ndist;
HeapTuple htup;
htup = SearchSysCache2(STATEXTDATASTXOID,
ObjectIdGetDatum(mvoid), BoolGetDatum(inh));
if (!HeapTupleIsValid(htup))
elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
ndist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
Anum_pg_statistic_ext_data_stxdndistinct, &isnull);
if (isnull)
elog(ERROR,
"requested statistics kind \"%c\" is not yet built for statistics object %u",
STATS_EXT_NDISTINCT, mvoid);
result = statext_ndistinct_deserialize(DatumGetByteaPP(ndist));
ReleaseSysCache(htup);
return result;
}
/*
* statext_ndistinct_serialize
* serialize ndistinct to the on-disk bytea format
*/
bytea *
statext_ndistinct_serialize(MVNDistinct *ndistinct)
{
int i;
bytea *output;
char *tmp;
Size len;
Assert(ndistinct->magic == STATS_NDISTINCT_MAGIC);
Assert(ndistinct->type == STATS_NDISTINCT_TYPE_BASIC);
/*
* Base size is size of scalar fields in the struct, plus one base struct
* for each item, including number of items for each.
*/
len = VARHDRSZ + SizeOfHeader;
/* and also include space for the actual attribute numbers */
for (i = 0; i < ndistinct->nitems; i++)
{
int nmembers;
nmembers = ndistinct->items[i].nattributes;
Assert(nmembers >= 2);
len += SizeOfItem(nmembers);
}
output = (bytea *) palloc(len);
SET_VARSIZE(output, len);
tmp = VARDATA(output);
/* Store the base struct values (magic, type, nitems) */
memcpy(tmp, &ndistinct->magic, sizeof(uint32));
tmp += sizeof(uint32);
memcpy(tmp, &ndistinct->type, sizeof(uint32));
tmp += sizeof(uint32);
memcpy(tmp, &ndistinct->nitems, sizeof(uint32));
tmp += sizeof(uint32);
/*
* store number of attributes and attribute numbers for each entry
*/
for (i = 0; i < ndistinct->nitems; i++)
{
MVNDistinctItem item = ndistinct->items[i];
int nmembers = item.nattributes;
memcpy(tmp, &item.ndistinct, sizeof(double));
tmp += sizeof(double);
memcpy(tmp, &nmembers, sizeof(int));
tmp += sizeof(int);
memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers);
tmp += nmembers * sizeof(AttrNumber);
/* protect against overflows */
Assert(tmp <= ((char *) output + len));
}
/* check we used exactly the expected space */
Assert(tmp == ((char *) output + len));
return output;
}
/*
* statext_ndistinct_deserialize
* Read an on-disk bytea format MVNDistinct to in-memory format
*/
MVNDistinct *
statext_ndistinct_deserialize(bytea *data)
{
int i;
Size minimum_size;
MVNDistinct ndist;
MVNDistinct *ndistinct;
char *tmp;
if (data == NULL)
return NULL;
/* we expect at least the basic fields of MVNDistinct struct */
if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
elog(ERROR, "invalid MVNDistinct size %zd (expected at least %zd)",
VARSIZE_ANY_EXHDR(data), SizeOfHeader);
/* initialize pointer to the data part (skip the varlena header) */
tmp = VARDATA_ANY(data);
/* read the header fields and perform basic sanity checks */
memcpy(&ndist.magic, tmp, sizeof(uint32));
tmp += sizeof(uint32);
memcpy(&ndist.type, tmp, sizeof(uint32));
tmp += sizeof(uint32);
memcpy(&ndist.nitems, tmp, sizeof(uint32));
tmp += sizeof(uint32);
if (ndist.magic != STATS_NDISTINCT_MAGIC)
elog(ERROR, "invalid ndistinct magic %08x (expected %08x)",
ndist.magic, STATS_NDISTINCT_MAGIC);
if (ndist.type != STATS_NDISTINCT_TYPE_BASIC)
elog(ERROR, "invalid ndistinct type %d (expected %d)",
ndist.type, STATS_NDISTINCT_TYPE_BASIC);
if (ndist.nitems == 0)
elog(ERROR, "invalid zero-length item array in MVNDistinct");
/* what minimum bytea size do we expect for those parameters */
minimum_size = MinSizeOfItems(ndist.nitems);
if (VARSIZE_ANY_EXHDR(data) < minimum_size)
elog(ERROR, "invalid MVNDistinct size %zd (expected at least %zd)",
VARSIZE_ANY_EXHDR(data), minimum_size);
/*
* Allocate space for the ndistinct items (no space for each item's
* attnos: those live in bitmapsets allocated separately)
*/
ndistinct = palloc0(MAXALIGN(offsetof(MVNDistinct, items)) +
(ndist.nitems * sizeof(MVNDistinctItem)));
ndistinct->magic = ndist.magic;
ndistinct->type = ndist.type;
ndistinct->nitems = ndist.nitems;
for (i = 0; i < ndistinct->nitems; i++)
{
MVNDistinctItem *item = &ndistinct->items[i];
/* ndistinct value */
memcpy(&item->ndistinct, tmp, sizeof(double));
tmp += sizeof(double);
/* number of attributes */
memcpy(&item->nattributes, tmp, sizeof(int));
tmp += sizeof(int);
Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS));
item->attributes
= (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber));
memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes);
tmp += sizeof(AttrNumber) * item->nattributes;
/* still within the bytea */
Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
}
/* we should have consumed the whole bytea exactly */
Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
return ndistinct;
}
/*
* pg_ndistinct_in
* input routine for type pg_ndistinct
*
* pg_ndistinct is real enough to be a table column, but it has no
* operations of its own, and disallows input (just like pg_node_tree).
*/
Datum
pg_ndistinct_in(PG_FUNCTION_ARGS)
{
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot accept a value of type %s", "pg_ndistinct")));
PG_RETURN_VOID(); /* keep compiler quiet */
}
/*
* pg_ndistinct
* output routine for type pg_ndistinct
*
* Produces a human-readable representation of the value.
*/
Datum
pg_ndistinct_out(PG_FUNCTION_ARGS)
{
bytea *data = PG_GETARG_BYTEA_PP(0);
MVNDistinct *ndist = statext_ndistinct_deserialize(data);
int i;
StringInfoData str;
initStringInfo(&str);
appendStringInfoChar(&str, '{');
for (i = 0; i < ndist->nitems; i++)
{
int j;
MVNDistinctItem item = ndist->items[i];
if (i > 0)
appendStringInfoString(&str, ", ");
for (j = 0; j < item.nattributes; j++)
{
AttrNumber attnum = item.attributes[j];
appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum);
}
appendStringInfo(&str, "\": %d", (int) item.ndistinct);
}
appendStringInfoChar(&str, '}');
PG_RETURN_CSTRING(str.data);
}
/*
* pg_ndistinct_recv
* binary input routine for type pg_ndistinct
*/
Datum
pg_ndistinct_recv(PG_FUNCTION_ARGS)
{
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot accept a value of type %s", "pg_ndistinct")));
PG_RETURN_VOID(); /* keep compiler quiet */
}
/*
* pg_ndistinct_send
* binary output routine for type pg_ndistinct
*
* n-distinct is serialized into a bytea value, so let's send that.
*/
Datum
pg_ndistinct_send(PG_FUNCTION_ARGS)
{
return byteasend(fcinfo);
}
/*
* ndistinct_for_combination
* Estimates number of distinct values in a combination of columns.
*
* This uses the same ndistinct estimator as compute_scalar_stats() in
* ANALYZE, i.e.,
* n*d / (n - f1 + f1*n/N)
*
* except that instead of values in a single column we are dealing with
* combination of multiple columns.
*/
static double
ndistinct_for_combination(double totalrows, StatsBuildData *data,
int k, int *combination)
{
int i,
j;
int f1,
cnt,
d;
bool *isnull;
Datum *values;
SortItem *items;
MultiSortSupport mss;
int numrows = data->numrows;
mss = multi_sort_init(k);
/*
* In order to determine the number of distinct elements, create separate
* values[]/isnull[] arrays with all the data we have, then sort them
* using the specified column combination as dimensions. We could try to
* sort in place, but it'd probably be more complex and bug-prone.
*/
items = (SortItem *) palloc(numrows * sizeof(SortItem));
values = (Datum *) palloc0(sizeof(Datum) * numrows * k);
isnull = (bool *) palloc0(sizeof(bool) * numrows * k);
for (i = 0; i < numrows; i++)
{
items[i].values = &values[i * k];
items[i].isnull = &isnull[i * k];
}
/*
* For each dimension, set up sort-support and fill in the values from the
* sample data.
*
* We use the column data types' default sort operators and collations;
* perhaps at some point it'd be worth using column-specific collations?
*/
for (i = 0; i < k; i++)
{
Oid typid;
TypeCacheEntry *type;
Oid collid = InvalidOid;
VacAttrStats *colstat = data->stats[combination[i]];
typid = colstat->attrtypid;
collid = colstat->attrcollid;
type = lookup_type_cache(typid, TYPECACHE_LT_OPR);
if (type->lt_opr == InvalidOid) /* shouldn't happen */
elog(ERROR, "cache lookup failed for ordering operator for type %u",
typid);
/* prepare the sort function for this dimension */
multi_sort_add_dimension(mss, i, type->lt_opr, collid);
/* accumulate all the data for this dimension into the arrays */
for (j = 0; j < numrows; j++)
{
items[j].values[i] = data->values[combination[i]][j];
items[j].isnull[i] = data->nulls[combination[i]][j];
}
}
/* We can sort the array now ... */
qsort_interruptible(items, numrows, sizeof(SortItem),
multi_sort_compare, mss);
/* ... and count the number of distinct combinations */
f1 = 0;
cnt = 1;
d = 1;
for (i = 1; i < numrows; i++)
{
if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
{
if (cnt == 1)
f1 += 1;
d++;
cnt = 0;
}
cnt += 1;
}
if (cnt == 1)
f1 += 1;
return estimate_ndistinct(totalrows, numrows, d, f1);
}
/* The Duj1 estimator (already used in analyze.c). */
static double
estimate_ndistinct(double totalrows, int numrows, int d, int f1)
{
double numer,
denom,
ndistinct;
numer = (double) numrows * (double) d;
denom = (double) (numrows - f1) +
(double) f1 * (double) numrows / totalrows;
ndistinct = numer / denom;
/* Clamp to sane range in case of roundoff error */
if (ndistinct < (double) d)
ndistinct = (double) d;
if (ndistinct > totalrows)
ndistinct = totalrows;
return floor(ndistinct + 0.5);
}
/*
* n_choose_k
* computes binomial coefficients using an algorithm that is both
* efficient and prevents overflows
*/
static int
n_choose_k(int n, int k)
{
int d,
r;
Assert((k > 0) && (n >= k));
/* use symmetry of the binomial coefficients */
k = Min(k, n - k);
r = 1;
for (d = 1; d <= k; ++d)
{
r *= n--;
r /= d;
}
return r;
}
/*
* num_combinations
* number of combinations, excluding single-value combinations
*/
static int
num_combinations(int n)
{
return (1 << n) - (n + 1);
}
/*
* generator_init
* initialize the generator of combinations
*
* The generator produces combinations of K elements in the interval (0..N).
* We prebuild all the combinations in this method, which is simpler than
* generating them on the fly.
*/
static CombinationGenerator *
generator_init(int n, int k)
{
CombinationGenerator *state;
Assert((n >= k) && (k > 0));
/* allocate the generator state as a single chunk of memory */
state = (CombinationGenerator *) palloc(sizeof(CombinationGenerator));
state->ncombinations = n_choose_k(n, k);
/* pre-allocate space for all combinations */
state->combinations = (int *) palloc(sizeof(int) * k * state->ncombinations);
state->current = 0;
state->k = k;
state->n = n;
/* now actually pre-generate all the combinations of K elements */
generate_combinations(state);
/* make sure we got the expected number of combinations */
Assert(state->current == state->ncombinations);
/* reset the number, so we start with the first one */
state->current = 0;
return state;
}
/*
* generator_next
* returns the next combination from the prebuilt list
*
* Returns a combination of K array indexes (0 .. N), as specified to
* generator_init), or NULL when there are no more combination.
*/
static int *
generator_next(CombinationGenerator *state)
{
if (state->current == state->ncombinations)
return NULL;
return &state->combinations[state->k * state->current++];
}
/*
* generator_free
* free the internal state of the generator
*
* Releases the generator internal state (pre-built combinations).
*/
static void
generator_free(CombinationGenerator *state)
{
pfree(state->combinations);
pfree(state);
}
/*
* generate_combinations_recurse
* given a prefix, generate all possible combinations
*
* Given a prefix (first few elements of the combination), generate following
* elements recursively. We generate the combinations in lexicographic order,
* which eliminates permutations of the same combination.
*/
static void
generate_combinations_recurse(CombinationGenerator *state,
int index, int start, int *current)
{
/* If we haven't filled all the elements, simply recurse. */
if (index < state->k)
{
int i;
/*
* The values have to be in ascending order, so make sure we start
* with the value passed by parameter.
*/
for (i = start; i < state->n; i++)
{
current[index] = i;
generate_combinations_recurse(state, (index + 1), (i + 1), current);
}
return;
}
else
{
/* we got a valid combination, add it to the array */
memcpy(&state->combinations[(state->k * state->current)],
current, state->k * sizeof(int));
state->current++;
}
}
/*
* generate_combinations
* generate all k-combinations of N elements
*/
static void
generate_combinations(CombinationGenerator *state)
{
int *current = (int *) palloc0(sizeof(int) * state->k);
generate_combinations_recurse(state, 0, 0, current);
pfree(current);
}