postgresql/src/backend/statistics/extended_stats.c
Tomas Vondra 7300a69950 Add support for multivariate MCV lists
Introduce a third extended statistic type, supported by the CREATE
STATISTICS command - MCV lists, a generalization of the statistic
already built and used for individual columns.

Compared to the already supported types (n-distinct coefficients and
functional dependencies), MCV lists are more complex, include column
values and allow estimation of much wider range of common clauses
(equality and inequality conditions, IS NULL, IS NOT NULL etc.).
Similarly to the other types, a new pseudo-type (pg_mcv_list) is used.

Author: Tomas Vondra
Reviewed-by: Dean Rasheed, David Rowley, Mark Dilger, Alvaro Herrera
Discussion: https://postgr.es/m/dfdac334-9cf2-2597-fb27-f0fb3753f435@2ndquadrant.com
2019-03-27 18:32:18 +01:00

1133 lines
32 KiB
C

/*-------------------------------------------------------------------------
*
* extended_stats.c
* POSTGRES extended statistics
*
* Generic code supporting statistics objects created via CREATE STATISTICS.
*
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/statistics/extended_stats.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/genam.h"
#include "access/htup_details.h"
#include "access/table.h"
#include "access/tuptoaster.h"
#include "catalog/indexing.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_statistic_ext.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/optimizer.h"
#include "postmaster/autovacuum.h"
#include "statistics/extended_stats_internal.h"
#include "statistics/statistics.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/selfuncs.h"
#include "utils/syscache.h"
/*
* To avoid consuming too much memory during analysis and/or too much space
* in the resulting pg_statistic rows, we ignore varlena datums that are wider
* than WIDTH_THRESHOLD (after detoasting!). This is legitimate for MCV
* and distinct-value calculations since a wide value is unlikely to be
* duplicated at all, much less be a most-common value. For the same reason,
* ignoring wide values will not affect our estimates of histogram bin
* boundaries very much.
*/
#define WIDTH_THRESHOLD 1024
/*
* Used internally to refer to an individual statistics object, i.e.,
* a pg_statistic_ext entry.
*/
typedef struct StatExtEntry
{
Oid statOid; /* OID of pg_statistic_ext entry */
char *schema; /* statistics object's schema */
char *name; /* statistics object's name */
Bitmapset *columns; /* attribute numbers covered by the object */
List *types; /* 'char' list of enabled statistic kinds */
} StatExtEntry;
static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
int nvacatts, VacAttrStats **vacatts);
static void statext_store(Relation pg_stext, Oid relid,
MVNDistinct *ndistinct, MVDependencies *dependencies,
MCVList * mcvlist, VacAttrStats **stats);
/*
* Compute requested extended stats, using the rows sampled for the plain
* (single-column) stats.
*
* This fetches a list of stats types from pg_statistic_ext, computes the
* requested stats, and serializes them back into the catalog.
*/
void
BuildRelationExtStatistics(Relation onerel, double totalrows,
int numrows, HeapTuple *rows,
int natts, VacAttrStats **vacattrstats)
{
Relation pg_stext;
ListCell *lc;
List *stats;
MemoryContext cxt;
MemoryContext oldcxt;
cxt = AllocSetContextCreate(CurrentMemoryContext,
"BuildRelationExtStatistics",
ALLOCSET_DEFAULT_SIZES);
oldcxt = MemoryContextSwitchTo(cxt);
pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock);
stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
foreach(lc, stats)
{
StatExtEntry *stat = (StatExtEntry *) lfirst(lc);
MVNDistinct *ndistinct = NULL;
MVDependencies *dependencies = NULL;
MCVList *mcv = NULL;
VacAttrStats **stats;
ListCell *lc2;
/*
* Check if we can build these stats based on the column analyzed. If
* not, report this fact (except in autovacuum) and move on.
*/
stats = lookup_var_attr_stats(onerel, stat->columns,
natts, vacattrstats);
if (!stats)
{
if (!IsAutoVacuumWorkerProcess())
ereport(WARNING,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
errmsg("statistics object \"%s.%s\" could not be computed for relation \"%s.%s\"",
stat->schema, stat->name,
get_namespace_name(onerel->rd_rel->relnamespace),
RelationGetRelationName(onerel)),
errtable(onerel)));
continue;
}
/* check allowed number of dimensions */
Assert(bms_num_members(stat->columns) >= 2 &&
bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS);
/* compute statistic of each requested type */
foreach(lc2, stat->types)
{
char t = (char) lfirst_int(lc2);
if (t == STATS_EXT_NDISTINCT)
ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
stat->columns, stats);
else if (t == STATS_EXT_DEPENDENCIES)
dependencies = statext_dependencies_build(numrows, rows,
stat->columns, stats);
else if (t == STATS_EXT_MCV)
mcv = statext_mcv_build(numrows, rows, stat->columns, stats,
totalrows);
}
/* store the statistics in the catalog */
statext_store(pg_stext, stat->statOid, ndistinct, dependencies, mcv, stats);
}
table_close(pg_stext, RowExclusiveLock);
MemoryContextSwitchTo(oldcxt);
MemoryContextDelete(cxt);
}
/*
* statext_is_kind_built
* Is this stat kind built in the given pg_statistic_ext tuple?
*/
bool
statext_is_kind_built(HeapTuple htup, char type)
{
AttrNumber attnum;
switch (type)
{
case STATS_EXT_NDISTINCT:
attnum = Anum_pg_statistic_ext_stxndistinct;
break;
case STATS_EXT_DEPENDENCIES:
attnum = Anum_pg_statistic_ext_stxdependencies;
break;
case STATS_EXT_MCV:
attnum = Anum_pg_statistic_ext_stxmcv;
break;
default:
elog(ERROR, "unexpected statistics type requested: %d", type);
}
return !heap_attisnull(htup, attnum, NULL);
}
/*
* Return a list (of StatExtEntry) of statistics objects for the given relation.
*/
static List *
fetch_statentries_for_relation(Relation pg_statext, Oid relid)
{
SysScanDesc scan;
ScanKeyData skey;
HeapTuple htup;
List *result = NIL;
/*
* Prepare to scan pg_statistic_ext for entries having stxrelid = this
* rel.
*/
ScanKeyInit(&skey,
Anum_pg_statistic_ext_stxrelid,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(relid));
scan = systable_beginscan(pg_statext, StatisticExtRelidIndexId, true,
NULL, 1, &skey);
while (HeapTupleIsValid(htup = systable_getnext(scan)))
{
StatExtEntry *entry;
Datum datum;
bool isnull;
int i;
ArrayType *arr;
char *enabled;
Form_pg_statistic_ext staForm;
entry = palloc0(sizeof(StatExtEntry));
staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
entry->statOid = staForm->oid;
entry->schema = get_namespace_name(staForm->stxnamespace);
entry->name = pstrdup(NameStr(staForm->stxname));
for (i = 0; i < staForm->stxkeys.dim1; i++)
{
entry->columns = bms_add_member(entry->columns,
staForm->stxkeys.values[i]);
}
/* decode the stxkind char array into a list of chars */
datum = SysCacheGetAttr(STATEXTOID, htup,
Anum_pg_statistic_ext_stxkind, &isnull);
Assert(!isnull);
arr = DatumGetArrayTypeP(datum);
if (ARR_NDIM(arr) != 1 ||
ARR_HASNULL(arr) ||
ARR_ELEMTYPE(arr) != CHAROID)
elog(ERROR, "stxkind is not a 1-D char array");
enabled = (char *) ARR_DATA_PTR(arr);
for (i = 0; i < ARR_DIMS(arr)[0]; i++)
{
Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
(enabled[i] == STATS_EXT_DEPENDENCIES) ||
(enabled[i] == STATS_EXT_MCV));
entry->types = lappend_int(entry->types, (int) enabled[i]);
}
result = lappend(result, entry);
}
systable_endscan(scan);
return result;
}
/*
* Using 'vacatts' of size 'nvacatts' as input data, return a newly built
* VacAttrStats array which includes only the items corresponding to
* attributes indicated by 'stxkeys'. If we don't have all of the per column
* stats available to compute the extended stats, then we return NULL to indicate
* to the caller that the stats should not be built.
*/
static VacAttrStats **
lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
int nvacatts, VacAttrStats **vacatts)
{
int i = 0;
int x = -1;
VacAttrStats **stats;
stats = (VacAttrStats **)
palloc(bms_num_members(attrs) * sizeof(VacAttrStats *));
/* lookup VacAttrStats info for the requested columns (same attnum) */
while ((x = bms_next_member(attrs, x)) >= 0)
{
int j;
stats[i] = NULL;
for (j = 0; j < nvacatts; j++)
{
if (x == vacatts[j]->tupattnum)
{
stats[i] = vacatts[j];
break;
}
}
if (!stats[i])
{
/*
* Looks like stats were not gathered for one of the columns
* required. We'll be unable to build the extended stats without
* this column.
*/
pfree(stats);
return NULL;
}
/*
* Sanity check that the column is not dropped - stats should have
* been removed in this case.
*/
Assert(!stats[i]->attr->attisdropped);
i++;
}
return stats;
}
/*
* statext_store
* Serializes the statistics and stores them into the pg_statistic_ext tuple.
*/
static void
statext_store(Relation pg_stext, Oid statOid,
MVNDistinct *ndistinct, MVDependencies *dependencies,
MCVList * mcv, VacAttrStats **stats)
{
HeapTuple stup,
oldtup;
Datum values[Natts_pg_statistic_ext];
bool nulls[Natts_pg_statistic_ext];
bool replaces[Natts_pg_statistic_ext];
memset(nulls, true, sizeof(nulls));
memset(replaces, false, sizeof(replaces));
memset(values, 0, sizeof(values));
/*
* Construct a new pg_statistic_ext tuple, replacing the calculated stats.
*/
if (ndistinct != NULL)
{
bytea *data = statext_ndistinct_serialize(ndistinct);
nulls[Anum_pg_statistic_ext_stxndistinct - 1] = (data == NULL);
values[Anum_pg_statistic_ext_stxndistinct - 1] = PointerGetDatum(data);
}
if (dependencies != NULL)
{
bytea *data = statext_dependencies_serialize(dependencies);
nulls[Anum_pg_statistic_ext_stxdependencies - 1] = (data == NULL);
values[Anum_pg_statistic_ext_stxdependencies - 1] = PointerGetDatum(data);
}
if (mcv != NULL)
{
bytea *data = statext_mcv_serialize(mcv, stats);
nulls[Anum_pg_statistic_ext_stxmcv - 1] = (data == NULL);
values[Anum_pg_statistic_ext_stxmcv - 1] = PointerGetDatum(data);
}
/* always replace the value (either by bytea or NULL) */
replaces[Anum_pg_statistic_ext_stxndistinct - 1] = true;
replaces[Anum_pg_statistic_ext_stxdependencies - 1] = true;
replaces[Anum_pg_statistic_ext_stxmcv - 1] = true;
/* there should already be a pg_statistic_ext tuple */
oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
if (!HeapTupleIsValid(oldtup))
elog(ERROR, "cache lookup failed for statistics object %u", statOid);
/* replace it */
stup = heap_modify_tuple(oldtup,
RelationGetDescr(pg_stext),
values,
nulls,
replaces);
ReleaseSysCache(oldtup);
CatalogTupleUpdate(pg_stext, &stup->t_self, stup);
heap_freetuple(stup);
}
/* initialize multi-dimensional sort */
MultiSortSupport
multi_sort_init(int ndims)
{
MultiSortSupport mss;
Assert(ndims >= 2);
mss = (MultiSortSupport) palloc0(offsetof(MultiSortSupportData, ssup)
+ sizeof(SortSupportData) * ndims);
mss->ndims = ndims;
return mss;
}
/*
* Prepare sort support info using the given sort operator and collation
* at the position 'sortdim'
*/
void
multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
Oid oper, Oid collation)
{
SortSupport ssup = &mss->ssup[sortdim];
ssup->ssup_cxt = CurrentMemoryContext;
ssup->ssup_collation = collation;
ssup->ssup_nulls_first = false;
PrepareSortSupportFromOrderingOp(oper, ssup);
}
/* compare all the dimensions in the selected order */
int
multi_sort_compare(const void *a, const void *b, void *arg)
{
MultiSortSupport mss = (MultiSortSupport) arg;
SortItem *ia = (SortItem *) a;
SortItem *ib = (SortItem *) b;
int i;
for (i = 0; i < mss->ndims; i++)
{
int compare;
compare = ApplySortComparator(ia->values[i], ia->isnull[i],
ib->values[i], ib->isnull[i],
&mss->ssup[i]);
if (compare != 0)
return compare;
}
/* equal by default */
return 0;
}
/* compare selected dimension */
int
multi_sort_compare_dim(int dim, const SortItem *a, const SortItem *b,
MultiSortSupport mss)
{
return ApplySortComparator(a->values[dim], a->isnull[dim],
b->values[dim], b->isnull[dim],
&mss->ssup[dim]);
}
int
multi_sort_compare_dims(int start, int end,
const SortItem *a, const SortItem *b,
MultiSortSupport mss)
{
int dim;
for (dim = start; dim <= end; dim++)
{
int r = ApplySortComparator(a->values[dim], a->isnull[dim],
b->values[dim], b->isnull[dim],
&mss->ssup[dim]);
if (r != 0)
return r;
}
return 0;
}
int
compare_scalars_simple(const void *a, const void *b, void *arg)
{
return compare_datums_simple(*(Datum *) a,
*(Datum *) b,
(SortSupport) arg);
}
int
compare_datums_simple(Datum a, Datum b, SortSupport ssup)
{
return ApplySortComparator(a, false, b, false, ssup);
}
/* simple counterpart to qsort_arg */
void *
bsearch_arg(const void *key, const void *base, size_t nmemb, size_t size,
int (*compar) (const void *, const void *, void *),
void *arg)
{
size_t l,
u,
idx;
const void *p;
int comparison;
l = 0;
u = nmemb;
while (l < u)
{
idx = (l + u) / 2;
p = (void *) (((const char *) base) + (idx * size));
comparison = (*compar) (key, p, arg);
if (comparison < 0)
u = idx;
else if (comparison > 0)
l = idx + 1;
else
return (void *) p;
}
return NULL;
}
/*
* build_attnums_array
* Transforms a bitmap into an array of AttrNumber values.
*
* This is used for extended statistics only, so all the attribute must be
* user-defined. That means offsetting by FirstLowInvalidHeapAttributeNumber
* is not necessary here (and when querying the bitmap).
*/
AttrNumber *
build_attnums_array(Bitmapset *attrs, int *numattrs)
{
int i,
j;
AttrNumber *attnums;
int num = bms_num_members(attrs);
if (numattrs)
*numattrs = num;
/* build attnums from the bitmapset */
attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * num);
i = 0;
j = -1;
while ((j = bms_next_member(attrs, j)) >= 0)
{
/*
* Make sure the bitmap contains only user-defined attributes. As
* bitmaps can't contain negative values, this can be violated in
* two ways. Firstly, the bitmap might contain 0 as a member, and
* secondly the integer value might be larger than MaxAttrNumber.
*/
Assert(AttrNumberIsForUserDefinedAttr(j));
Assert(j <= MaxAttrNumber);
attnums[i++] = (AttrNumber) j;
/* protect against overflows */
Assert(i <= num);
}
return attnums;
}
/*
* build_sorted_items
* build a sorted array of SortItem with values from rows
*
* Note: All the memory is allocated in a single chunk, so that the caller
* can simply pfree the return value to release all of it.
*/
SortItem *
build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
MultiSortSupport mss, int numattrs, AttrNumber *attnums)
{
int i,
j,
len,
idx;
int nvalues = numrows * numattrs;
SortItem *items;
Datum *values;
bool *isnull;
char *ptr;
/* Compute the total amount of memory we need (both items and values). */
len = numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
/* Allocate the memory and split it into the pieces. */
ptr = palloc0(len);
/* items to sort */
items = (SortItem *) ptr;
ptr += numrows * sizeof(SortItem);
/* values and null flags */
values = (Datum *) ptr;
ptr += nvalues * sizeof(Datum);
isnull = (bool *) ptr;
ptr += nvalues * sizeof(bool);
/* make sure we consumed the whole buffer exactly */
Assert((ptr - (char *) items) == len);
/* fix the pointers to Datum and bool arrays */
idx = 0;
for (i = 0; i < numrows; i++)
{
bool toowide = false;
items[idx].values = &values[idx * numattrs];
items[idx].isnull = &isnull[idx * numattrs];
/* load the values/null flags from sample rows */
for (j = 0; j < numattrs; j++)
{
Datum value;
bool isnull;
value = heap_getattr(rows[i], attnums[j], tdesc, &isnull);
/*
* If this is a varlena value, check if it's too wide and if yes
* then skip the whole item. Otherwise detoast the value.
*
* XXX It may happen that we've already detoasted some preceding
* values for the current item. We don't bother to cleanup those
* on the assumption that those are small (below WIDTH_THRESHOLD)
* and will be discarded at the end of analyze.
*/
if ((!isnull) &&
(TupleDescAttr(tdesc, attnums[j] - 1)->attlen == -1))
{
if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
{
toowide = true;
break;
}
value = PointerGetDatum(PG_DETOAST_DATUM(value));
}
items[idx].values[j] = value;
items[idx].isnull[j] = isnull;
}
if (toowide)
continue;
idx++;
}
/* store the actual number of items (ignoring the too-wide ones) */
*nitems = idx;
/* all items were too wide */
if (idx == 0)
{
/* everything is allocated as a single chunk */
pfree(items);
return NULL;
}
/* do the sort, using the multi-sort */
qsort_arg((void *) items, idx, sizeof(SortItem),
multi_sort_compare, mss);
return items;
}
/*
* has_stats_of_kind
* Check whether the list contains statistic of a given kind
*/
bool
has_stats_of_kind(List *stats, char requiredkind)
{
ListCell *l;
foreach(l, stats)
{
StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
if (stat->kind == requiredkind)
return true;
}
return false;
}
/*
* choose_best_statistics
* Look for and return statistics with the specified 'requiredkind' which
* have keys that match at least two of the given attnums. Return NULL if
* there's no match.
*
* The current selection criteria is very simple - we choose the statistics
* object referencing the most of the requested attributes, breaking ties
* in favor of objects with fewer keys overall.
*
* XXX If multiple statistics objects tie on both criteria, then which object
* is chosen depends on the order that they appear in the stats list. Perhaps
* further tiebreakers are needed.
*/
StatisticExtInfo *
choose_best_statistics(List *stats, Bitmapset *attnums, char requiredkind)
{
ListCell *lc;
StatisticExtInfo *best_match = NULL;
int best_num_matched = 2; /* goal #1: maximize */
int best_match_keys = (STATS_MAX_DIMENSIONS + 1); /* goal #2: minimize */
foreach(lc, stats)
{
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
int num_matched;
int numkeys;
Bitmapset *matched;
/* skip statistics that are not of the correct type */
if (info->kind != requiredkind)
continue;
/* determine how many attributes of these stats can be matched to */
matched = bms_intersect(attnums, info->keys);
num_matched = bms_num_members(matched);
bms_free(matched);
/*
* save the actual number of keys in the stats so that we can choose
* the narrowest stats with the most matching keys.
*/
numkeys = bms_num_members(info->keys);
/*
* Use this object when it increases the number of matched clauses or
* when it matches the same number of attributes but these stats have
* fewer keys than any previous match.
*/
if (num_matched > best_num_matched ||
(num_matched == best_num_matched && numkeys < best_match_keys))
{
best_match = info;
best_num_matched = num_matched;
best_match_keys = numkeys;
}
}
return best_match;
}
/*
* statext_is_compatible_clause_internal
* Determines if the clause is compatible with MCV lists.
*
* Does the heavy lifting of actually inspecting the clauses for
* statext_is_compatible_clause. It needs to be split like this because
* of recursion. The attnums bitmap is an input/output parameter collecting
* attribute numbers from all compatible clauses (recursively).
*/
static bool
statext_is_compatible_clause_internal(Node *clause, Index relid, Bitmapset **attnums)
{
/* Look inside any binary-compatible relabeling (as in examine_variable) */
if (IsA(clause, RelabelType))
clause = (Node *) ((RelabelType *) clause)->arg;
/* plain Var references (boolean Vars or recursive checks) */
if (IsA(clause, Var))
{
Var *var = (Var *) clause;
/* Ensure var is from the correct relation */
if (var->varno != relid)
return false;
/* we also better ensure the Var is from the current level */
if (var->varlevelsup > 0)
return false;
/* Also skip system attributes (we don't allow stats on those). */
if (!AttrNumberIsForUserDefinedAttr(var->varattno))
return false;
*attnums = bms_add_member(*attnums, var->varattno);
return true;
}
/* (Var op Const) or (Const op Var) */
if (is_opclause(clause))
{
OpExpr *expr = (OpExpr *) clause;
Var *var;
bool varonleft = true;
bool ok;
/* Only expressions with two arguments are considered compatible. */
if (list_length(expr->args) != 2)
return false;
/* see if it actually has the right shape (one Var, one Const) */
ok = (NumRelids((Node *) expr) == 1) &&
(is_pseudo_constant_clause(lsecond(expr->args)) ||
(varonleft = false,
is_pseudo_constant_clause(linitial(expr->args))));
/* unsupported structure (two variables or so) */
if (!ok)
return false;
/*
* If it's not one of the supported operators ("=", "<", ">", etc.),
* just ignore the clause, as it's not compatible with MCV lists.
*
* This uses the function for estimating selectivity, not the operator
* directly (a bit awkward, but well ...).
*/
switch (get_oprrest(expr->opno))
{
case F_EQSEL:
case F_NEQSEL:
case F_SCALARLTSEL:
case F_SCALARLESEL:
case F_SCALARGTSEL:
case F_SCALARGESEL:
/* supported, will continue with inspection of the Var */
break;
default:
/* other estimators are considered unknown/unsupported */
return false;
}
var = (varonleft) ? linitial(expr->args) : lsecond(expr->args);
return statext_is_compatible_clause_internal((Node *) var, relid, attnums);
}
/* AND/OR/NOT clause */
if (is_andclause(clause) ||
is_orclause(clause) ||
is_notclause(clause))
{
/*
* AND/OR/NOT-clauses are supported if all sub-clauses are supported
*
* Perhaps we could improve this by handling mixed cases, when some of
* the clauses are supported and some are not. Selectivity for the
* supported subclauses would be computed using extended statistics,
* and the remaining clauses would be estimated using the traditional
* algorithm (product of selectivities).
*
* It however seems overly complex, and in a way we already do that
* because if we reject the whole clause as unsupported here, it will
* be eventually passed to clauselist_selectivity() which does exactly
* this (split into supported/unsupported clauses etc).
*/
BoolExpr *expr = (BoolExpr *) clause;
ListCell *lc;
foreach(lc, expr->args)
{
/*
* Had we found incompatible clause in the arguments, treat the
* whole clause as incompatible.
*/
if (!statext_is_compatible_clause_internal((Node *) lfirst(lc),
relid, attnums))
return false;
}
return true;
}
/* Var IS NULL */
if (IsA(clause, NullTest))
{
NullTest *nt = (NullTest *) clause;
/*
* Only simple (Var IS NULL) expressions supported for now. Maybe we
* could use examine_variable to fix this?
*/
if (!IsA(nt->arg, Var))
return false;
return statext_is_compatible_clause_internal((Node *) (nt->arg), relid, attnums);
}
return false;
}
/*
* statext_is_compatible_clause
* Determines if the clause is compatible with MCV lists.
*
* Currently, we only support three types of clauses:
*
* (a) OpExprs of the form (Var op Const), or (Const op Var), where the op
* is one of ("=", "<", ">", ">=", "<=")
*
* (b) (Var IS [NOT] NULL)
*
* (c) combinations using AND/OR/NOT
*
* In the future, the range of supported clauses may be expanded to more
* complex cases, for example (Var op Var).
*/
static bool
statext_is_compatible_clause(Node *clause, Index relid, Bitmapset **attnums)
{
RestrictInfo *rinfo = (RestrictInfo *) clause;
if (!IsA(rinfo, RestrictInfo))
return false;
/* Pseudoconstants are not really interesting here. */
if (rinfo->pseudoconstant)
return false;
/* clauses referencing multiple varnos are incompatible */
if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
return false;
return statext_is_compatible_clause_internal((Node *) rinfo->clause,
relid, attnums);
}
/*
* statext_mcv_clauselist_selectivity
* Estimate clauses using the best multi-column statistics.
*
* Selects the best extended (multi-column) statistic on a table (measured by
* the number of attributes extracted from the clauses and covered by it), and
* computes the selectivity for the supplied clauses.
*
* One of the main challenges with using MCV lists is how to extrapolate the
* estimate to the data not covered by the MCV list. To do that, we compute
* not only the "MCV selectivity" (selectivities for MCV items matching the
* supplied clauses), but also a couple of derived selectivities:
*
* - simple selectivity: Computed without extended statistic, i.e. as if the
* columns/clauses were independent
*
* - base selectivity: Similar to simple selectivity, but is computed using
* the extended statistic by adding up the base frequencies (that we compute
* and store for each MCV item) of matching MCV items.
*
* - total selectivity: Selectivity covered by the whole MCV list.
*
* - other selectivity: A selectivity estimate for data not covered by the MCV
* list (i.e. satisfying the clauses, but not common enough to make it into
* the MCV list)
*
* Note: While simple and base selectivities are defined in a quite similar
* way, the values are computed differently and are not therefore equal. The
* simple selectivity is computed as a product of per-clause estimates, while
* the base selectivity is computed by adding up base frequencies of matching
* items of the multi-column MCV list. So the values may differ for two main
* reasons - (a) the MCV list may not cover 100% of the data and (b) some of
* the MCV items did not match the estimated clauses.
*
* As both (a) and (b) reduce the base selectivity value, it generally holds
* that (simple_selectivity >= base_selectivity). If the MCV list covers all
* the data, the values may be equal.
*
* So, (simple_selectivity - base_selectivity) is an estimate for the part
* not covered by the MCV list, and (mcv_selectivity - base_selectivity) may
* be seen as a correction for the part covered by the MCV list. Those two
* statements are actually equivalent.
*
* Note: Due to rounding errors and minor differences in how the estimates
* are computed, the inequality may not always hold. Which is why we clamp
* the selectivities to prevent strange estimate (negative etc.).
*
* 'estimatedclauses' is an input/output parameter. We set bits for the
* 0-based 'clauses' indexes we estimate for and also skip clause items that
* already have a bit set.
*
* XXX If we were to use multiple statistics, this is where it would happen.
* We would simply repeat this on a loop on the "remaining" clauses, possibly
* using the already estimated clauses as conditions (and combining the values
* using conditional probability formula).
*/
static Selectivity
statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
JoinType jointype, SpecialJoinInfo *sjinfo,
RelOptInfo *rel, Bitmapset **estimatedclauses)
{
ListCell *l;
Bitmapset *clauses_attnums = NULL;
Bitmapset **list_attnums;
int listidx;
StatisticExtInfo *stat;
List *stat_clauses;
Selectivity simple_sel,
mcv_sel,
mcv_basesel,
mcv_totalsel,
other_sel,
sel;
/* check if there's any stats that might be useful for us. */
if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV))
return 1.0;
list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
list_length(clauses));
/*
* Pre-process the clauses list to extract the attnums seen in each item.
* We need to determine if there's any clauses which will be useful for
* selectivity estimations with extended stats. Along the way we'll record
* all of the attnums for each clause in a list which we'll reference later
* so we don't need to repeat the same work again. We'll also keep track of
* all attnums seen.
*
* We also skip clauses that we already estimated using different types of
* statistics (we treat them as incompatible).
*/
listidx = 0;
foreach(l, clauses)
{
Node *clause = (Node *) lfirst(l);
Bitmapset *attnums = NULL;
if (!bms_is_member(listidx, *estimatedclauses) &&
statext_is_compatible_clause(clause, rel->relid, &attnums))
{
list_attnums[listidx] = attnums;
clauses_attnums = bms_add_members(clauses_attnums, attnums);
}
else
list_attnums[listidx] = NULL;
listidx++;
}
/* We need at least two attributes for multivariate statistics. */
if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
return 1.0;
/* find the best suited statistics object for these attnums */
stat = choose_best_statistics(rel->statlist, clauses_attnums, STATS_EXT_MCV);
/* if no matching stats could be found then we've nothing to do */
if (!stat)
return 1.0;
/* Ensure choose_best_statistics produced an expected stats type. */
Assert(stat->kind == STATS_EXT_MCV);
/* now filter the clauses to be estimated using the selected MCV */
stat_clauses = NIL;
listidx = 0;
foreach(l, clauses)
{
/*
* If the clause is compatible with the selected statistics, mark it
* as estimated and add it to the list to estimate.
*/
if (list_attnums[listidx] != NULL &&
bms_is_subset(list_attnums[listidx], stat->keys))
{
stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
}
listidx++;
}
/*
* First compute "simple" selectivity, i.e. without the extended statistics,
* and essentially assuming independence of the columns/clauses. We'll then
* use the various selectivities computed from MCV list to improve it.
*/
simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid,
jointype, sjinfo, NULL);
/*
* Now compute the multi-column estimate from the MCV list, along with the
* other selectivities (base & total selectivity).
*/
mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid,
jointype, sjinfo, rel,
&mcv_basesel, &mcv_totalsel);
/* Estimated selectivity of values not covered by MCV matches */
other_sel = simple_sel - mcv_basesel;
CLAMP_PROBABILITY(other_sel);
/* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */
if (other_sel > 1.0 - mcv_totalsel)
other_sel = 1.0 - mcv_totalsel;
/* Overall selectivity is the combination of MCV and non-MCV estimates. */
sel = mcv_sel + other_sel;
CLAMP_PROBABILITY(sel);
return sel;
}
/*
* statext_clauselist_selectivity
* Estimate clauses using the best multi-column statistics.
*/
Selectivity
statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
JoinType jointype, SpecialJoinInfo *sjinfo,
RelOptInfo *rel, Bitmapset **estimatedclauses)
{
Selectivity sel;
/* First, try estimating clauses using a multivariate MCV list. */
sel = statext_mcv_clauselist_selectivity(root, clauses, varRelid, jointype,
sjinfo, rel, estimatedclauses);
/*
* Then, apply functional dependencies on the remaining clauses by
* calling dependencies_clauselist_selectivity. Pass 'estimatedclauses'
* so the function can properly skip clauses already estimated above.
*
* The reasoning for applying dependencies last is that the more complex
* stats can track more complex correlations between the attributes, and
* so may be considered more reliable.
*
* For example, MCV list can give us an exact selectivity for values in
* two columns, while functional dependencies can only provide information
* about the overall strength of the dependency.
*/
sel *= dependencies_clauselist_selectivity(root, clauses, varRelid,
jointype, sjinfo, rel,
estimatedclauses);
return sel;
}