1839 lines
52 KiB
C
1839 lines
52 KiB
C
/*-------------------------------------------------------------------------
 *
 * dependencies.c
 *	  POSTGRES functional dependencies
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/statistics/dependencies.c
 *
 *-------------------------------------------------------------------------
 */
|
|
#include "postgres.h"
|
|
|
|
#include "access/htup_details.h"
|
|
#include "access/sysattr.h"
|
|
#include "catalog/pg_operator.h"
|
|
#include "catalog/pg_statistic_ext.h"
|
|
#include "catalog/pg_statistic_ext_data.h"
|
|
#include "lib/stringinfo.h"
|
|
#include "nodes/nodeFuncs.h"
|
|
#include "nodes/nodes.h"
|
|
#include "nodes/pathnodes.h"
|
|
#include "optimizer/clauses.h"
|
|
#include "optimizer/optimizer.h"
|
|
#include "parser/parsetree.h"
|
|
#include "statistics/extended_stats_internal.h"
|
|
#include "statistics/statistics.h"
|
|
#include "utils/bytea.h"
|
|
#include "utils/fmgroids.h"
|
|
#include "utils/fmgrprotos.h"
|
|
#include "utils/lsyscache.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/selfuncs.h"
|
|
#include "utils/syscache.h"
|
|
#include "utils/typcache.h"
|
|
|
|
/* serialized header: three uint32 fields (magic, type, ndeps) */
#define SizeOfHeader		(3 * sizeof(uint32))

/*
 * one serialized dependency: the degree (double), the attribute count and
 * then the attribute numbers themselves (all AttrNumber)
 */
#define SizeOfItem(natts) \
	(sizeof(double) + (1 + (natts)) * sizeof(AttrNumber))

/* smallest serialized dependency, i.e. one with exactly two attributes */
#define MinSizeOfItem		SizeOfItem(2)

/* smallest serialized value holding ndeps dependencies of minimal size */
#define MinSizeOfItems(ndeps) \
	(SizeOfHeader + (ndeps) * MinSizeOfItem)
|
|
|
|
/*
|
|
* Internal state for DependencyGenerator of dependencies. Dependencies are similar to
|
|
* k-permutations of n elements, except that the order does not matter for the
|
|
* first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent.
|
|
*/
|
|
typedef struct DependencyGeneratorData
|
|
{
|
|
int k; /* size of the dependency */
|
|
int n; /* number of possible attributes */
|
|
int current; /* next dependency to return (index) */
|
|
AttrNumber ndependencies; /* number of dependencies generated */
|
|
AttrNumber *dependencies; /* array of pre-generated dependencies */
|
|
} DependencyGeneratorData;
|
|
|
|
typedef DependencyGeneratorData *DependencyGenerator;
|
|
|
|
static void generate_dependencies_recurse(DependencyGenerator state,
|
|
int index, AttrNumber start, AttrNumber *current);
|
|
static void generate_dependencies(DependencyGenerator state);
|
|
static DependencyGenerator DependencyGenerator_init(int n, int k);
|
|
static void DependencyGenerator_free(DependencyGenerator state);
|
|
static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
|
|
static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
|
|
static bool dependency_is_fully_matched(MVDependency *dependency,
|
|
Bitmapset *attnums);
|
|
static bool dependency_is_compatible_clause(Node *clause, Index relid,
|
|
AttrNumber *attnum);
|
|
static bool dependency_is_compatible_expression(Node *clause, Index relid,
|
|
List *statlist, Node **expr);
|
|
static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
|
|
int ndependencies, Bitmapset *attnums);
|
|
static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
|
|
int varRelid, JoinType jointype,
|
|
SpecialJoinInfo *sjinfo,
|
|
MVDependency **dependencies,
|
|
int ndependencies,
|
|
AttrNumber *list_attnums,
|
|
Bitmapset **estimatedclauses);
|
|
|
|
static void
|
|
generate_dependencies_recurse(DependencyGenerator state, int index,
|
|
AttrNumber start, AttrNumber *current)
|
|
{
|
|
/*
|
|
* The generator handles the first (k-1) elements differently from the
|
|
* last element.
|
|
*/
|
|
if (index < (state->k - 1))
|
|
{
|
|
AttrNumber i;
|
|
|
|
/*
|
|
* The first (k-1) values have to be in ascending order, which we
|
|
* generate recursively.
|
|
*/
|
|
|
|
for (i = start; i < state->n; i++)
|
|
{
|
|
current[index] = i;
|
|
generate_dependencies_recurse(state, (index + 1), (i + 1), current);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
int i;
|
|
|
|
/*
|
|
* the last element is the implied value, which does not respect the
|
|
* ascending order. We just need to check that the value is not in the
|
|
* first (k-1) elements.
|
|
*/
|
|
|
|
for (i = 0; i < state->n; i++)
|
|
{
|
|
int j;
|
|
bool match = false;
|
|
|
|
current[index] = i;
|
|
|
|
for (j = 0; j < index; j++)
|
|
{
|
|
if (current[j] == i)
|
|
{
|
|
match = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If the value is not found in the first part of the dependency,
|
|
* we're done.
|
|
*/
|
|
if (!match)
|
|
{
|
|
state->dependencies = (AttrNumber *) repalloc(state->dependencies,
|
|
state->k * (state->ndependencies + 1) * sizeof(AttrNumber));
|
|
memcpy(&state->dependencies[(state->k * state->ndependencies)],
|
|
current, state->k * sizeof(AttrNumber));
|
|
state->ndependencies++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* generate all dependencies (k-permutations of n elements) */
|
|
static void
|
|
generate_dependencies(DependencyGenerator state)
|
|
{
|
|
AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k);
|
|
|
|
generate_dependencies_recurse(state, 0, 0, current);
|
|
|
|
pfree(current);
|
|
}
|
|
|
|
/*
|
|
* initialize the DependencyGenerator of variations, and prebuild the variations
|
|
*
|
|
* This pre-builds all the variations. We could also generate them in
|
|
* DependencyGenerator_next(), but this seems simpler.
|
|
*/
|
|
static DependencyGenerator
|
|
DependencyGenerator_init(int n, int k)
|
|
{
|
|
DependencyGenerator state;
|
|
|
|
Assert((n >= k) && (k > 0));
|
|
|
|
/* allocate the DependencyGenerator state */
|
|
state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData));
|
|
state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber));
|
|
|
|
state->ndependencies = 0;
|
|
state->current = 0;
|
|
state->k = k;
|
|
state->n = n;
|
|
|
|
/* now actually pre-generate all the variations */
|
|
generate_dependencies(state);
|
|
|
|
return state;
|
|
}
|
|
|
|
/* free the DependencyGenerator state */
|
|
static void
|
|
DependencyGenerator_free(DependencyGenerator state)
|
|
{
|
|
pfree(state->dependencies);
|
|
pfree(state);
|
|
|
|
}
|
|
|
|
/* generate next combination */
|
|
static AttrNumber *
|
|
DependencyGenerator_next(DependencyGenerator state)
|
|
{
|
|
if (state->current == state->ndependencies)
|
|
return NULL;
|
|
|
|
return &state->dependencies[state->k * state->current++];
|
|
}
|
|
|
|
|
|
/*
|
|
* validates functional dependency on the data
|
|
*
|
|
* An actual work horse of detecting functional dependencies. Given a variation
|
|
* of k attributes, it checks that the first (k-1) are sufficient to determine
|
|
* the last one.
|
|
*/
|
|
static double
|
|
dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
|
|
{
|
|
int i,
|
|
nitems;
|
|
MultiSortSupport mss;
|
|
SortItem *items;
|
|
AttrNumber *attnums_dep;
|
|
|
|
/* counters valid within a group */
|
|
int group_size = 0;
|
|
int n_violations = 0;
|
|
|
|
/* total number of rows supporting (consistent with) the dependency */
|
|
int n_supporting_rows = 0;
|
|
|
|
/* Make sure we have at least two input attributes. */
|
|
Assert(k >= 2);
|
|
|
|
/* sort info for all attributes columns */
|
|
mss = multi_sort_init(k);
|
|
|
|
/*
|
|
* Translate the array of indexes to regular attnums for the dependency
|
|
* (we will need this to identify the columns in StatsBuildData).
|
|
*/
|
|
attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
|
|
for (i = 0; i < k; i++)
|
|
attnums_dep[i] = data->attnums[dependency[i]];
|
|
|
|
/*
|
|
* Verify the dependency (a,b,...)->z, using a rather simple algorithm:
|
|
*
|
|
* (a) sort the data lexicographically
|
|
*
|
|
* (b) split the data into groups by first (k-1) columns
|
|
*
|
|
* (c) for each group count different values in the last column
|
|
*
|
|
* We use the column data types' default sort operators and collations;
|
|
* perhaps at some point it'd be worth using column-specific collations?
|
|
*/
|
|
|
|
/* prepare the sort function for the dimensions */
|
|
for (i = 0; i < k; i++)
|
|
{
|
|
VacAttrStats *colstat = data->stats[dependency[i]];
|
|
TypeCacheEntry *type;
|
|
|
|
type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
|
|
if (type->lt_opr == InvalidOid) /* shouldn't happen */
|
|
elog(ERROR, "cache lookup failed for ordering operator for type %u",
|
|
colstat->attrtypid);
|
|
|
|
/* prepare the sort function for this dimension */
|
|
multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
|
|
}
|
|
|
|
/*
|
|
* build an array of SortItem(s) sorted using the multi-sort support
|
|
*
|
|
* XXX This relies on all stats entries pointing to the same tuple
|
|
* descriptor. For now that assumption holds, but it might change in the
|
|
* future for example if we support statistics on multiple tables.
|
|
*/
|
|
items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
|
|
|
|
/*
|
|
* Walk through the sorted array, split it into rows according to the
|
|
* first (k-1) columns. If there's a single value in the last column, we
|
|
* count the group as 'supporting' the functional dependency. Otherwise we
|
|
* count it as contradicting.
|
|
*/
|
|
|
|
/* start with the first row forming a group */
|
|
group_size = 1;
|
|
|
|
/* loop 1 beyond the end of the array so that we count the final group */
|
|
for (i = 1; i <= nitems; i++)
|
|
{
|
|
/*
|
|
* Check if the group ended, which may be either because we processed
|
|
* all the items (i==nitems), or because the i-th item is not equal to
|
|
* the preceding one.
|
|
*/
|
|
if (i == nitems ||
|
|
multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0)
|
|
{
|
|
/*
|
|
* If no violations were found in the group then track the rows of
|
|
* the group as supporting the functional dependency.
|
|
*/
|
|
if (n_violations == 0)
|
|
n_supporting_rows += group_size;
|
|
|
|
/* Reset counters for the new group */
|
|
n_violations = 0;
|
|
group_size = 1;
|
|
continue;
|
|
}
|
|
/* first columns match, but the last one does not (so contradicting) */
|
|
else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0)
|
|
n_violations++;
|
|
|
|
group_size++;
|
|
}
|
|
|
|
/* Compute the 'degree of validity' as (supporting/total). */
|
|
return (n_supporting_rows * 1.0 / data->numrows);
|
|
}
|
|
|
|
/*
|
|
* detects functional dependencies between groups of columns
|
|
*
|
|
* Generates all possible subsets of columns (variations) and computes
|
|
* the degree of validity for each one. For example when creating statistics
|
|
* on three columns (a,b,c) there are 9 possible dependencies
|
|
*
|
|
* two columns three columns
|
|
* ----------- -------------
|
|
* (a) -> b (a,b) -> c
|
|
* (a) -> c (a,c) -> b
|
|
* (b) -> a (b,c) -> a
|
|
* (b) -> c
|
|
* (c) -> a
|
|
* (c) -> b
|
|
*/
|
|
MVDependencies *
|
|
statext_dependencies_build(StatsBuildData *data)
|
|
{
|
|
int i,
|
|
k;
|
|
|
|
/* result */
|
|
MVDependencies *dependencies = NULL;
|
|
MemoryContext cxt;
|
|
|
|
Assert(data->nattnums >= 2);
|
|
|
|
/* tracks memory allocated by dependency_degree calls */
|
|
cxt = AllocSetContextCreate(CurrentMemoryContext,
|
|
"dependency_degree cxt",
|
|
ALLOCSET_DEFAULT_SIZES);
|
|
|
|
/*
|
|
* We'll try build functional dependencies starting from the smallest ones
|
|
* covering just 2 columns, to the largest ones, covering all columns
|
|
* included in the statistics object. We start from the smallest ones
|
|
* because we want to be able to skip already implied ones.
|
|
*/
|
|
for (k = 2; k <= data->nattnums; k++)
|
|
{
|
|
AttrNumber *dependency; /* array with k elements */
|
|
|
|
/* prepare a DependencyGenerator of variation */
|
|
DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
|
|
|
|
/* generate all possible variations of k values (out of n) */
|
|
while ((dependency = DependencyGenerator_next(DependencyGenerator)))
|
|
{
|
|
double degree;
|
|
MVDependency *d;
|
|
MemoryContext oldcxt;
|
|
|
|
/* release memory used by dependency degree calculation */
|
|
oldcxt = MemoryContextSwitchTo(cxt);
|
|
|
|
/* compute how valid the dependency seems */
|
|
degree = dependency_degree(data, k, dependency);
|
|
|
|
MemoryContextSwitchTo(oldcxt);
|
|
MemoryContextReset(cxt);
|
|
|
|
/*
|
|
* if the dependency seems entirely invalid, don't store it
|
|
*/
|
|
if (degree == 0.0)
|
|
continue;
|
|
|
|
d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
|
|
+ k * sizeof(AttrNumber));
|
|
|
|
/* copy the dependency (and keep the indexes into stxkeys) */
|
|
d->degree = degree;
|
|
d->nattributes = k;
|
|
for (i = 0; i < k; i++)
|
|
d->attributes[i] = data->attnums[dependency[i]];
|
|
|
|
/* initialize the list of dependencies */
|
|
if (dependencies == NULL)
|
|
{
|
|
dependencies
|
|
= (MVDependencies *) palloc0(sizeof(MVDependencies));
|
|
|
|
dependencies->magic = STATS_DEPS_MAGIC;
|
|
dependencies->type = STATS_DEPS_TYPE_BASIC;
|
|
dependencies->ndeps = 0;
|
|
}
|
|
|
|
dependencies->ndeps++;
|
|
dependencies = (MVDependencies *) repalloc(dependencies,
|
|
offsetof(MVDependencies, deps)
|
|
+ dependencies->ndeps * sizeof(MVDependency *));
|
|
|
|
dependencies->deps[dependencies->ndeps - 1] = d;
|
|
}
|
|
|
|
/*
|
|
* we're done with variations of k elements, so free the
|
|
* DependencyGenerator
|
|
*/
|
|
DependencyGenerator_free(DependencyGenerator);
|
|
}
|
|
|
|
MemoryContextDelete(cxt);
|
|
|
|
return dependencies;
|
|
}
|
|
|
|
|
|
/*
|
|
* Serialize list of dependencies into a bytea value.
|
|
*/
|
|
bytea *
|
|
statext_dependencies_serialize(MVDependencies *dependencies)
|
|
{
|
|
int i;
|
|
bytea *output;
|
|
char *tmp;
|
|
Size len;
|
|
|
|
/* we need to store ndeps, with a number of attributes for each one */
|
|
len = VARHDRSZ + SizeOfHeader;
|
|
|
|
/* and also include space for the actual attribute numbers and degrees */
|
|
for (i = 0; i < dependencies->ndeps; i++)
|
|
len += SizeOfItem(dependencies->deps[i]->nattributes);
|
|
|
|
output = (bytea *) palloc0(len);
|
|
SET_VARSIZE(output, len);
|
|
|
|
tmp = VARDATA(output);
|
|
|
|
/* Store the base struct values (magic, type, ndeps) */
|
|
memcpy(tmp, &dependencies->magic, sizeof(uint32));
|
|
tmp += sizeof(uint32);
|
|
memcpy(tmp, &dependencies->type, sizeof(uint32));
|
|
tmp += sizeof(uint32);
|
|
memcpy(tmp, &dependencies->ndeps, sizeof(uint32));
|
|
tmp += sizeof(uint32);
|
|
|
|
/* store number of attributes and attribute numbers for each dependency */
|
|
for (i = 0; i < dependencies->ndeps; i++)
|
|
{
|
|
MVDependency *d = dependencies->deps[i];
|
|
|
|
memcpy(tmp, &d->degree, sizeof(double));
|
|
tmp += sizeof(double);
|
|
|
|
memcpy(tmp, &d->nattributes, sizeof(AttrNumber));
|
|
tmp += sizeof(AttrNumber);
|
|
|
|
memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes);
|
|
tmp += sizeof(AttrNumber) * d->nattributes;
|
|
|
|
/* protect against overflow */
|
|
Assert(tmp <= ((char *) output + len));
|
|
}
|
|
|
|
/* make sure we've produced exactly the right amount of data */
|
|
Assert(tmp == ((char *) output + len));
|
|
|
|
return output;
|
|
}
|
|
|
|
/*
|
|
* Reads serialized dependencies into MVDependencies structure.
|
|
*/
|
|
MVDependencies *
|
|
statext_dependencies_deserialize(bytea *data)
|
|
{
|
|
int i;
|
|
Size min_expected_size;
|
|
MVDependencies *dependencies;
|
|
char *tmp;
|
|
|
|
if (data == NULL)
|
|
return NULL;
|
|
|
|
if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader)
|
|
elog(ERROR, "invalid MVDependencies size %zd (expected at least %zd)",
|
|
VARSIZE_ANY_EXHDR(data), SizeOfHeader);
|
|
|
|
/* read the MVDependencies header */
|
|
dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies));
|
|
|
|
/* initialize pointer to the data part (skip the varlena header) */
|
|
tmp = VARDATA_ANY(data);
|
|
|
|
/* read the header fields and perform basic sanity checks */
|
|
memcpy(&dependencies->magic, tmp, sizeof(uint32));
|
|
tmp += sizeof(uint32);
|
|
memcpy(&dependencies->type, tmp, sizeof(uint32));
|
|
tmp += sizeof(uint32);
|
|
memcpy(&dependencies->ndeps, tmp, sizeof(uint32));
|
|
tmp += sizeof(uint32);
|
|
|
|
if (dependencies->magic != STATS_DEPS_MAGIC)
|
|
elog(ERROR, "invalid dependency magic %d (expected %d)",
|
|
dependencies->magic, STATS_DEPS_MAGIC);
|
|
|
|
if (dependencies->type != STATS_DEPS_TYPE_BASIC)
|
|
elog(ERROR, "invalid dependency type %d (expected %d)",
|
|
dependencies->type, STATS_DEPS_TYPE_BASIC);
|
|
|
|
if (dependencies->ndeps == 0)
|
|
elog(ERROR, "invalid zero-length item array in MVDependencies");
|
|
|
|
/* what minimum bytea size do we expect for those parameters */
|
|
min_expected_size = SizeOfItem(dependencies->ndeps);
|
|
|
|
if (VARSIZE_ANY_EXHDR(data) < min_expected_size)
|
|
elog(ERROR, "invalid dependencies size %zd (expected at least %zd)",
|
|
VARSIZE_ANY_EXHDR(data), min_expected_size);
|
|
|
|
/* allocate space for the MCV items */
|
|
dependencies = repalloc(dependencies, offsetof(MVDependencies, deps)
|
|
+ (dependencies->ndeps * sizeof(MVDependency *)));
|
|
|
|
for (i = 0; i < dependencies->ndeps; i++)
|
|
{
|
|
double degree;
|
|
AttrNumber k;
|
|
MVDependency *d;
|
|
|
|
/* degree of validity */
|
|
memcpy(°ree, tmp, sizeof(double));
|
|
tmp += sizeof(double);
|
|
|
|
/* number of attributes */
|
|
memcpy(&k, tmp, sizeof(AttrNumber));
|
|
tmp += sizeof(AttrNumber);
|
|
|
|
/* is the number of attributes valid? */
|
|
Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS));
|
|
|
|
/* now that we know the number of attributes, allocate the dependency */
|
|
d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
|
|
+ (k * sizeof(AttrNumber)));
|
|
|
|
d->degree = degree;
|
|
d->nattributes = k;
|
|
|
|
/* copy attribute numbers */
|
|
memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes);
|
|
tmp += sizeof(AttrNumber) * d->nattributes;
|
|
|
|
dependencies->deps[i] = d;
|
|
|
|
/* still within the bytea */
|
|
Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
|
|
}
|
|
|
|
/* we should have consumed the whole bytea exactly */
|
|
Assert(tmp == ((char *) data + VARSIZE_ANY(data)));
|
|
|
|
return dependencies;
|
|
}
|
|
|
|
/*
|
|
* dependency_is_fully_matched
|
|
* checks that a functional dependency is fully matched given clauses on
|
|
* attributes (assuming the clauses are suitable equality clauses)
|
|
*/
|
|
static bool
|
|
dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums)
|
|
{
|
|
int j;
|
|
|
|
/*
|
|
* Check that the dependency actually is fully covered by clauses. We have
|
|
* to translate all attribute numbers, as those are referenced
|
|
*/
|
|
for (j = 0; j < dependency->nattributes; j++)
|
|
{
|
|
int attnum = dependency->attributes[j];
|
|
|
|
if (!bms_is_member(attnum, attnums))
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* statext_dependencies_load
|
|
* Load the functional dependencies for the indicated pg_statistic_ext tuple
|
|
*/
|
|
MVDependencies *
|
|
statext_dependencies_load(Oid mvoid)
|
|
{
|
|
MVDependencies *result;
|
|
bool isnull;
|
|
Datum deps;
|
|
HeapTuple htup;
|
|
|
|
htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
|
|
if (!HeapTupleIsValid(htup))
|
|
elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
|
|
|
|
deps = SysCacheGetAttr(STATEXTDATASTXOID, htup,
|
|
Anum_pg_statistic_ext_data_stxddependencies, &isnull);
|
|
if (isnull)
|
|
elog(ERROR,
|
|
"requested statistics kind \"%c\" is not yet built for statistics object %u",
|
|
STATS_EXT_DEPENDENCIES, mvoid);
|
|
|
|
result = statext_dependencies_deserialize(DatumGetByteaPP(deps));
|
|
|
|
ReleaseSysCache(htup);
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* pg_dependencies_in - input routine for type pg_dependencies.
|
|
*
|
|
* pg_dependencies is real enough to be a table column, but it has no operations
|
|
* of its own, and disallows input too
|
|
*/
|
|
Datum
|
|
pg_dependencies_in(PG_FUNCTION_ARGS)
|
|
{
|
|
/*
|
|
* pg_node_list stores the data in binary form and parsing text input is
|
|
* not needed, so disallow this.
|
|
*/
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("cannot accept a value of type %s", "pg_dependencies")));
|
|
|
|
PG_RETURN_VOID(); /* keep compiler quiet */
|
|
}
|
|
|
|
/*
|
|
* pg_dependencies - output routine for type pg_dependencies.
|
|
*/
|
|
Datum
|
|
pg_dependencies_out(PG_FUNCTION_ARGS)
|
|
{
|
|
bytea *data = PG_GETARG_BYTEA_PP(0);
|
|
MVDependencies *dependencies = statext_dependencies_deserialize(data);
|
|
int i,
|
|
j;
|
|
StringInfoData str;
|
|
|
|
initStringInfo(&str);
|
|
appendStringInfoChar(&str, '{');
|
|
|
|
for (i = 0; i < dependencies->ndeps; i++)
|
|
{
|
|
MVDependency *dependency = dependencies->deps[i];
|
|
|
|
if (i > 0)
|
|
appendStringInfoString(&str, ", ");
|
|
|
|
appendStringInfoChar(&str, '"');
|
|
for (j = 0; j < dependency->nattributes; j++)
|
|
{
|
|
if (j == dependency->nattributes - 1)
|
|
appendStringInfoString(&str, " => ");
|
|
else if (j > 0)
|
|
appendStringInfoString(&str, ", ");
|
|
|
|
appendStringInfo(&str, "%d", dependency->attributes[j]);
|
|
}
|
|
appendStringInfo(&str, "\": %f", dependency->degree);
|
|
}
|
|
|
|
appendStringInfoChar(&str, '}');
|
|
|
|
PG_RETURN_CSTRING(str.data);
|
|
}
|
|
|
|
/*
|
|
* pg_dependencies_recv - binary input routine for type pg_dependencies.
|
|
*/
|
|
Datum
|
|
pg_dependencies_recv(PG_FUNCTION_ARGS)
|
|
{
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("cannot accept a value of type %s", "pg_dependencies")));
|
|
|
|
PG_RETURN_VOID(); /* keep compiler quiet */
|
|
}
|
|
|
|
/*
|
|
* pg_dependencies_send - binary output routine for type pg_dependencies.
|
|
*
|
|
* Functional dependencies are serialized in a bytea value (although the type
|
|
* is named differently), so let's just send that.
|
|
*/
|
|
Datum
|
|
pg_dependencies_send(PG_FUNCTION_ARGS)
|
|
{
|
|
return byteasend(fcinfo);
|
|
}
|
|
|
|
/*
|
|
* dependency_is_compatible_clause
|
|
* Determines if the clause is compatible with functional dependencies
|
|
*
|
|
* Only clauses that have the form of equality to a pseudoconstant, or can be
|
|
* interpreted that way, are currently accepted. Furthermore the variable
|
|
* part of the clause must be a simple Var belonging to the specified
|
|
* relation, whose attribute number we return in *attnum on success.
|
|
*/
|
|
static bool
|
|
dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
|
|
{
|
|
Var *var;
|
|
Node *clause_expr;
|
|
|
|
if (IsA(clause, RestrictInfo))
|
|
{
|
|
RestrictInfo *rinfo = (RestrictInfo *) clause;
|
|
|
|
/* Pseudoconstants are not interesting (they couldn't contain a Var) */
|
|
if (rinfo->pseudoconstant)
|
|
return false;
|
|
|
|
/* Clauses referencing multiple, or no, varnos are incompatible */
|
|
if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
|
|
return false;
|
|
|
|
clause = (Node *) rinfo->clause;
|
|
}
|
|
|
|
if (is_opclause(clause))
|
|
{
|
|
/* If it's an opclause, check for Var = Const or Const = Var. */
|
|
OpExpr *expr = (OpExpr *) clause;
|
|
|
|
/* Only expressions with two arguments are candidates. */
|
|
if (list_length(expr->args) != 2)
|
|
return false;
|
|
|
|
/* Make sure non-selected argument is a pseudoconstant. */
|
|
if (is_pseudo_constant_clause(lsecond(expr->args)))
|
|
clause_expr = linitial(expr->args);
|
|
else if (is_pseudo_constant_clause(linitial(expr->args)))
|
|
clause_expr = lsecond(expr->args);
|
|
else
|
|
return false;
|
|
|
|
/*
|
|
* If it's not an "=" operator, just ignore the clause, as it's not
|
|
* compatible with functional dependencies.
|
|
*
|
|
* This uses the function for estimating selectivity, not the operator
|
|
* directly (a bit awkward, but well ...).
|
|
*
|
|
* XXX this is pretty dubious; probably it'd be better to check btree
|
|
* or hash opclass membership, so as not to be fooled by custom
|
|
* selectivity functions, and to be more consistent with decisions
|
|
* elsewhere in the planner.
|
|
*/
|
|
if (get_oprrest(expr->opno) != F_EQSEL)
|
|
return false;
|
|
|
|
/* OK to proceed with checking "var" */
|
|
}
|
|
else if (IsA(clause, ScalarArrayOpExpr))
|
|
{
|
|
/* If it's an scalar array operator, check for Var IN Const. */
|
|
ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
|
|
|
|
/*
|
|
* Reject ALL() variant, we only care about ANY/IN.
|
|
*
|
|
* XXX Maybe we should check if all the values are the same, and allow
|
|
* ALL in that case? Doesn't seem very practical, though.
|
|
*/
|
|
if (!expr->useOr)
|
|
return false;
|
|
|
|
/* Only expressions with two arguments are candidates. */
|
|
if (list_length(expr->args) != 2)
|
|
return false;
|
|
|
|
/*
|
|
* We know it's always (Var IN Const), so we assume the var is the
|
|
* first argument, and pseudoconstant is the second one.
|
|
*/
|
|
if (!is_pseudo_constant_clause(lsecond(expr->args)))
|
|
return false;
|
|
|
|
clause_expr = linitial(expr->args);
|
|
|
|
/*
|
|
* If it's not an "=" operator, just ignore the clause, as it's not
|
|
* compatible with functional dependencies. The operator is identified
|
|
* simply by looking at which function it uses to estimate
|
|
* selectivity. That's a bit strange, but it's what other similar
|
|
* places do.
|
|
*/
|
|
if (get_oprrest(expr->opno) != F_EQSEL)
|
|
return false;
|
|
|
|
/* OK to proceed with checking "var" */
|
|
}
|
|
else if (is_orclause(clause))
|
|
{
|
|
BoolExpr *bool_expr = (BoolExpr *) clause;
|
|
ListCell *lc;
|
|
|
|
/* start with no attribute number */
|
|
*attnum = InvalidAttrNumber;
|
|
|
|
foreach(lc, bool_expr->args)
|
|
{
|
|
AttrNumber clause_attnum;
|
|
|
|
/*
|
|
* Had we found incompatible clause in the arguments, treat the
|
|
* whole clause as incompatible.
|
|
*/
|
|
if (!dependency_is_compatible_clause((Node *) lfirst(lc),
|
|
relid, &clause_attnum))
|
|
return false;
|
|
|
|
if (*attnum == InvalidAttrNumber)
|
|
*attnum = clause_attnum;
|
|
|
|
/* ensure all the variables are the same (same attnum) */
|
|
if (*attnum != clause_attnum)
|
|
return false;
|
|
}
|
|
|
|
/* the Var is already checked by the recursive call */
|
|
return true;
|
|
}
|
|
else if (is_notclause(clause))
|
|
{
|
|
/*
|
|
* "NOT x" can be interpreted as "x = false", so get the argument and
|
|
* proceed with seeing if it's a suitable Var.
|
|
*/
|
|
clause_expr = (Node *) get_notclausearg(clause);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* A boolean expression "x" can be interpreted as "x = true", so
|
|
* proceed with seeing if it's a suitable Var.
|
|
*/
|
|
clause_expr = (Node *) clause;
|
|
}
|
|
|
|
/*
|
|
* We may ignore any RelabelType node above the operand. (There won't be
|
|
* more than one, since eval_const_expressions has been applied already.)
|
|
*/
|
|
if (IsA(clause_expr, RelabelType))
|
|
clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
|
|
|
|
/* We only support plain Vars for now */
|
|
if (!IsA(clause_expr, Var))
|
|
return false;
|
|
|
|
/* OK, we know we have a Var */
|
|
var = (Var *) clause_expr;
|
|
|
|
/* Ensure Var is from the correct relation */
|
|
if (var->varno != relid)
|
|
return false;
|
|
|
|
/* We also better ensure the Var is from the current level */
|
|
if (var->varlevelsup != 0)
|
|
return false;
|
|
|
|
/* Also ignore system attributes (we don't allow stats on those) */
|
|
if (!AttrNumberIsForUserDefinedAttr(var->varattno))
|
|
return false;
|
|
|
|
*attnum = var->varattno;
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* find_strongest_dependency
|
|
* find the strongest dependency on the attributes
|
|
*
|
|
* When applying functional dependencies, we start with the strongest
|
|
* dependencies. That is, we select the dependency that:
|
|
*
|
|
* (a) has all attributes covered by equality clauses
|
|
*
|
|
* (b) has the most attributes
|
|
*
|
|
* (c) has the highest degree of validity
|
|
*
|
|
* This guarantees that we eliminate the most redundant conditions first
|
|
* (see the comment in dependencies_clauselist_selectivity).
|
|
*/
|
|
static MVDependency *
|
|
find_strongest_dependency(MVDependencies **dependencies, int ndependencies,
|
|
Bitmapset *attnums)
|
|
{
|
|
int i,
|
|
j;
|
|
MVDependency *strongest = NULL;
|
|
|
|
/* number of attnums in clauses */
|
|
int nattnums = bms_num_members(attnums);
|
|
|
|
/*
|
|
* Iterate over the MVDependency items and find the strongest one from the
|
|
* fully-matched dependencies. We do the cheap checks first, before
|
|
* matching it against the attnums.
|
|
*/
|
|
for (i = 0; i < ndependencies; i++)
|
|
{
|
|
for (j = 0; j < dependencies[i]->ndeps; j++)
|
|
{
|
|
MVDependency *dependency = dependencies[i]->deps[j];
|
|
|
|
/*
|
|
* Skip dependencies referencing more attributes than available
|
|
* clauses, as those can't be fully matched.
|
|
*/
|
|
if (dependency->nattributes > nattnums)
|
|
continue;
|
|
|
|
if (strongest)
|
|
{
|
|
/* skip dependencies on fewer attributes than the strongest. */
|
|
if (dependency->nattributes < strongest->nattributes)
|
|
continue;
|
|
|
|
/* also skip weaker dependencies when attribute count matches */
|
|
if (strongest->nattributes == dependency->nattributes &&
|
|
strongest->degree > dependency->degree)
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* this dependency is stronger, but we must still check that it's
|
|
* fully matched to these attnums. We perform this check last as
|
|
* it's slightly more expensive than the previous checks.
|
|
*/
|
|
if (dependency_is_fully_matched(dependency, attnums))
|
|
strongest = dependency; /* save new best match */
|
|
}
|
|
}
|
|
|
|
return strongest;
|
|
}
|
|
|
|
/*
|
|
* clauselist_apply_dependencies
|
|
* Apply the specified functional dependencies to a list of clauses and
|
|
* return the estimated selectivity of the clauses that are compatible
|
|
* with any of the given dependencies.
|
|
*
|
|
* This will estimate all not-already-estimated clauses that are compatible
|
|
* with functional dependencies, and which have an attribute mentioned by any
|
|
* of the given dependencies (either as an implying or implied attribute).
|
|
*
|
|
* Given (lists of) clauses on attributes (a,b) and a functional dependency
|
|
* (a=>b), the per-column selectivities P(a) and P(b) are notionally combined
|
|
* using the formula
|
|
*
|
|
* P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
|
|
*
|
|
* where 'f' is the degree of dependency. This reflects the fact that we
|
|
* expect a fraction f of all rows to be consistent with the dependency
|
|
* (a=>b), and so have a selectivity of P(a), while the remaining rows are
|
|
* treated as independent.
|
|
*
|
|
* In practice, we use a slightly modified version of this formula, which uses
|
|
* a selectivity of Min(P(a), P(b)) for the dependent rows, since the result
|
|
* should obviously not exceed either column's individual selectivity. I.e.,
|
|
* we actually combine selectivities using the formula
|
|
*
|
|
* P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
|
|
*
|
|
* This can make quite a difference if the specific values matching the
|
|
* clauses are not consistent with the functional dependency.
|
|
*/
|
|
static Selectivity
|
|
clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
|
|
int varRelid, JoinType jointype,
|
|
SpecialJoinInfo *sjinfo,
|
|
MVDependency **dependencies, int ndependencies,
|
|
AttrNumber *list_attnums,
|
|
Bitmapset **estimatedclauses)
|
|
{
|
|
Bitmapset *attnums;
|
|
int i;
|
|
int j;
|
|
int nattrs;
|
|
Selectivity *attr_sel;
|
|
int attidx;
|
|
int listidx;
|
|
ListCell *l;
|
|
Selectivity s1;
|
|
|
|
/*
|
|
* Extract the attnums of all implying and implied attributes from all the
|
|
* given dependencies. Each of these attributes is expected to have at
|
|
* least 1 not-already-estimated compatible clause that we will estimate
|
|
* here.
|
|
*/
|
|
attnums = NULL;
|
|
for (i = 0; i < ndependencies; i++)
|
|
{
|
|
for (j = 0; j < dependencies[i]->nattributes; j++)
|
|
{
|
|
AttrNumber attnum = dependencies[i]->attributes[j];
|
|
|
|
attnums = bms_add_member(attnums, attnum);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Compute per-column selectivity estimates for each of these attributes,
|
|
* and mark all the corresponding clauses as estimated.
|
|
*/
|
|
nattrs = bms_num_members(attnums);
|
|
attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs);
|
|
|
|
attidx = 0;
|
|
i = -1;
|
|
while ((i = bms_next_member(attnums, i)) >= 0)
|
|
{
|
|
List *attr_clauses = NIL;
|
|
Selectivity simple_sel;
|
|
|
|
listidx = -1;
|
|
foreach(l, clauses)
|
|
{
|
|
Node *clause = (Node *) lfirst(l);
|
|
|
|
listidx++;
|
|
if (list_attnums[listidx] == i)
|
|
{
|
|
attr_clauses = lappend(attr_clauses, clause);
|
|
*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
|
|
}
|
|
}
|
|
|
|
simple_sel = clauselist_selectivity_ext(root, attr_clauses, varRelid,
|
|
jointype, sjinfo, false);
|
|
attr_sel[attidx++] = simple_sel;
|
|
}
|
|
|
|
/*
|
|
* Now combine these selectivities using the dependency information. For
|
|
* chains of dependencies such as a -> b -> c, the b -> c dependency will
|
|
* come before the a -> b dependency in the array, so we traverse the
|
|
* array backwards to ensure such chains are computed in the right order.
|
|
*
|
|
* As explained above, pairs of selectivities are combined using the
|
|
* formula
|
|
*
|
|
* P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b)
|
|
*
|
|
* to ensure that the combined selectivity is never greater than either
|
|
* individual selectivity.
|
|
*
|
|
* Where multiple dependencies apply (e.g., a -> b -> c), we use
|
|
* conditional probabilities to compute the overall result as follows:
|
|
*
|
|
* P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a)
|
|
*
|
|
* so we replace the selectivities of all implied attributes with
|
|
* conditional probabilities, that are conditional on all their implying
|
|
* attributes. The selectivities of all other non-implied attributes are
|
|
* left as they are.
|
|
*/
|
|
for (i = ndependencies - 1; i >= 0; i--)
|
|
{
|
|
MVDependency *dependency = dependencies[i];
|
|
AttrNumber attnum;
|
|
Selectivity s2;
|
|
double f;
|
|
|
|
/* Selectivity of all the implying attributes */
|
|
s1 = 1.0;
|
|
for (j = 0; j < dependency->nattributes - 1; j++)
|
|
{
|
|
attnum = dependency->attributes[j];
|
|
attidx = bms_member_index(attnums, attnum);
|
|
s1 *= attr_sel[attidx];
|
|
}
|
|
|
|
/* Original selectivity of the implied attribute */
|
|
attnum = dependency->attributes[j];
|
|
attidx = bms_member_index(attnums, attnum);
|
|
s2 = attr_sel[attidx];
|
|
|
|
/*
|
|
* Replace s2 with the conditional probability s2 given s1, computed
|
|
* using the formula P(b|a) = P(a,b) / P(a), which simplifies to
|
|
*
|
|
* P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b)
|
|
*
|
|
* where P(a) = s1, the selectivity of the implying attributes, and
|
|
* P(b) = s2, the selectivity of the implied attribute.
|
|
*/
|
|
f = dependency->degree;
|
|
|
|
if (s1 <= s2)
|
|
attr_sel[attidx] = f + (1 - f) * s2;
|
|
else
|
|
attr_sel[attidx] = f * s2 / s1 + (1 - f) * s2;
|
|
}
|
|
|
|
/*
|
|
* The overall selectivity of all the clauses on all these attributes is
|
|
* then the product of all the original (non-implied) probabilities and
|
|
* the new conditional (implied) probabilities.
|
|
*/
|
|
s1 = 1.0;
|
|
for (i = 0; i < nattrs; i++)
|
|
s1 *= attr_sel[i];
|
|
|
|
CLAMP_PROBABILITY(s1);
|
|
|
|
pfree(attr_sel);
|
|
bms_free(attnums);
|
|
|
|
return s1;
|
|
}
|
|
|
|
/*
|
|
* dependency_is_compatible_expression
|
|
* Determines if the expression is compatible with functional dependencies
|
|
*
|
|
* Similar to dependency_is_compatible_clause, but doesn't enforce that the
|
|
* expression is a simple Var. On success, return the matching statistics
|
|
* expression into *expr.
|
|
*/
|
|
static bool
|
|
dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
|
|
{
|
|
ListCell *lc,
|
|
*lc2;
|
|
Node *clause_expr;
|
|
|
|
if (IsA(clause, RestrictInfo))
|
|
{
|
|
RestrictInfo *rinfo = (RestrictInfo *) clause;
|
|
|
|
/* Pseudoconstants are not interesting (they couldn't contain a Var) */
|
|
if (rinfo->pseudoconstant)
|
|
return false;
|
|
|
|
/* Clauses referencing multiple, or no, varnos are incompatible */
|
|
if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
|
|
return false;
|
|
|
|
clause = (Node *) rinfo->clause;
|
|
}
|
|
|
|
if (is_opclause(clause))
|
|
{
|
|
/* If it's an opclause, check for Var = Const or Const = Var. */
|
|
OpExpr *expr = (OpExpr *) clause;
|
|
|
|
/* Only expressions with two arguments are candidates. */
|
|
if (list_length(expr->args) != 2)
|
|
return false;
|
|
|
|
/* Make sure non-selected argument is a pseudoconstant. */
|
|
if (is_pseudo_constant_clause(lsecond(expr->args)))
|
|
clause_expr = linitial(expr->args);
|
|
else if (is_pseudo_constant_clause(linitial(expr->args)))
|
|
clause_expr = lsecond(expr->args);
|
|
else
|
|
return false;
|
|
|
|
/*
|
|
* If it's not an "=" operator, just ignore the clause, as it's not
|
|
* compatible with functional dependencies.
|
|
*
|
|
* This uses the function for estimating selectivity, not the operator
|
|
* directly (a bit awkward, but well ...).
|
|
*
|
|
* XXX this is pretty dubious; probably it'd be better to check btree
|
|
* or hash opclass membership, so as not to be fooled by custom
|
|
* selectivity functions, and to be more consistent with decisions
|
|
* elsewhere in the planner.
|
|
*/
|
|
if (get_oprrest(expr->opno) != F_EQSEL)
|
|
return false;
|
|
|
|
/* OK to proceed with checking "var" */
|
|
}
|
|
else if (IsA(clause, ScalarArrayOpExpr))
|
|
{
|
|
/* If it's an scalar array operator, check for Var IN Const. */
|
|
ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
|
|
|
|
/*
|
|
* Reject ALL() variant, we only care about ANY/IN.
|
|
*
|
|
* FIXME Maybe we should check if all the values are the same, and
|
|
* allow ALL in that case? Doesn't seem very practical, though.
|
|
*/
|
|
if (!expr->useOr)
|
|
return false;
|
|
|
|
/* Only expressions with two arguments are candidates. */
|
|
if (list_length(expr->args) != 2)
|
|
return false;
|
|
|
|
/*
|
|
* We know it's always (Var IN Const), so we assume the var is the
|
|
* first argument, and pseudoconstant is the second one.
|
|
*/
|
|
if (!is_pseudo_constant_clause(lsecond(expr->args)))
|
|
return false;
|
|
|
|
clause_expr = linitial(expr->args);
|
|
|
|
/*
|
|
* If it's not an "=" operator, just ignore the clause, as it's not
|
|
* compatible with functional dependencies. The operator is identified
|
|
* simply by looking at which function it uses to estimate
|
|
* selectivity. That's a bit strange, but it's what other similar
|
|
* places do.
|
|
*/
|
|
if (get_oprrest(expr->opno) != F_EQSEL)
|
|
return false;
|
|
|
|
/* OK to proceed with checking "var" */
|
|
}
|
|
else if (is_orclause(clause))
|
|
{
|
|
BoolExpr *bool_expr = (BoolExpr *) clause;
|
|
ListCell *lc;
|
|
|
|
/* start with no expression (we'll use the first match) */
|
|
*expr = NULL;
|
|
|
|
foreach(lc, bool_expr->args)
|
|
{
|
|
Node *or_expr = NULL;
|
|
|
|
/*
|
|
* Had we found incompatible expression in the arguments, treat
|
|
* the whole expression as incompatible.
|
|
*/
|
|
if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
|
|
statlist, &or_expr))
|
|
return false;
|
|
|
|
if (*expr == NULL)
|
|
*expr = or_expr;
|
|
|
|
/* ensure all the expressions are the same */
|
|
if (!equal(or_expr, *expr))
|
|
return false;
|
|
}
|
|
|
|
/* the expression is already checked by the recursive call */
|
|
return true;
|
|
}
|
|
else if (is_notclause(clause))
|
|
{
|
|
/*
|
|
* "NOT x" can be interpreted as "x = false", so get the argument and
|
|
* proceed with seeing if it's a suitable Var.
|
|
*/
|
|
clause_expr = (Node *) get_notclausearg(clause);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* A boolean expression "x" can be interpreted as "x = true", so
|
|
* proceed with seeing if it's a suitable Var.
|
|
*/
|
|
clause_expr = (Node *) clause;
|
|
}
|
|
|
|
/*
|
|
* We may ignore any RelabelType node above the operand. (There won't be
|
|
* more than one, since eval_const_expressions has been applied already.)
|
|
*/
|
|
if (IsA(clause_expr, RelabelType))
|
|
clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
|
|
|
|
/*
|
|
* Search for a matching statistics expression.
|
|
*/
|
|
foreach(lc, statlist)
|
|
{
|
|
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
|
|
|
|
/* ignore stats without dependencies */
|
|
if (info->kind != STATS_EXT_DEPENDENCIES)
|
|
continue;
|
|
|
|
foreach(lc2, info->exprs)
|
|
{
|
|
Node *stat_expr = (Node *) lfirst(lc2);
|
|
|
|
if (equal(clause_expr, stat_expr))
|
|
{
|
|
*expr = stat_expr;
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* dependencies_clauselist_selectivity
|
|
* Return the estimated selectivity of (a subset of) the given clauses
|
|
* using functional dependency statistics, or 1.0 if no useful functional
|
|
* dependency statistic exists.
|
|
*
|
|
* 'estimatedclauses' is an input/output argument that gets a bit set
|
|
* corresponding to the (zero-based) list index of each clause that is included
|
|
* in the estimated selectivity.
|
|
*
|
|
* Given equality clauses on attributes (a,b) we find the strongest dependency
|
|
* between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
|
|
* dependency, we then combine the per-clause selectivities using the formula
|
|
*
|
|
* P(a,b) = f * P(a) + (1-f) * P(a) * P(b)
|
|
*
|
|
* where 'f' is the degree of the dependency. (Actually we use a slightly
|
|
* modified version of this formula -- see clauselist_apply_dependencies()).
|
|
*
|
|
* With clauses on more than two attributes, the dependencies are applied
|
|
* recursively, starting with the widest/strongest dependencies. For example
|
|
* P(a,b,c) is first split like this:
|
|
*
|
|
* P(a,b,c) = f * P(a,b) + (1-f) * P(a,b) * P(c)
|
|
*
|
|
* assuming (a,b=>c) is the strongest dependency.
|
|
*/
|
|
Selectivity
|
|
dependencies_clauselist_selectivity(PlannerInfo *root,
|
|
List *clauses,
|
|
int varRelid,
|
|
JoinType jointype,
|
|
SpecialJoinInfo *sjinfo,
|
|
RelOptInfo *rel,
|
|
Bitmapset **estimatedclauses)
|
|
{
|
|
Selectivity s1 = 1.0;
|
|
ListCell *l;
|
|
Bitmapset *clauses_attnums = NULL;
|
|
AttrNumber *list_attnums;
|
|
int listidx;
|
|
MVDependencies **func_dependencies;
|
|
int nfunc_dependencies;
|
|
int total_ndeps;
|
|
MVDependency **dependencies;
|
|
int ndependencies;
|
|
int i;
|
|
AttrNumber attnum_offset;
|
|
RangeTblEntry *rte = planner_rt_fetch(rel->relid, root);
|
|
|
|
/* unique expressions */
|
|
Node **unique_exprs;
|
|
int unique_exprs_cnt;
|
|
|
|
/*
|
|
* When dealing with regular inheritance trees, ignore extended stats
|
|
* (which were built without data from child rels, and thus do not
|
|
* represent them). For partitioned tables data there's no data in the
|
|
* non-leaf relations, so we build stats only for the inheritance tree.
|
|
* So for partitioned tables we do consider extended stats.
|
|
*/
|
|
if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE)
|
|
return 1.0;
|
|
|
|
/* check if there's any stats that might be useful for us. */
|
|
if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
|
|
return 1.0;
|
|
|
|
list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
|
|
list_length(clauses));
|
|
|
|
/*
|
|
* We allocate space as if every clause was a unique expression, although
|
|
* that's probably overkill. Some will be simple column references that
|
|
* we'll translate to attnums, and there might be duplicates. But it's
|
|
* easier and cheaper to just do one allocation than repalloc later.
|
|
*/
|
|
unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
|
|
unique_exprs_cnt = 0;
|
|
|
|
/*
|
|
* Pre-process the clauses list to extract the attnums seen in each item.
|
|
* We need to determine if there's any clauses which will be useful for
|
|
* dependency selectivity estimations. Along the way we'll record all of
|
|
* the attnums for each clause in a list which we'll reference later so we
|
|
* don't need to repeat the same work again. We'll also keep track of all
|
|
* attnums seen.
|
|
*
|
|
* We also skip clauses that we already estimated using different types of
|
|
* statistics (we treat them as incompatible).
|
|
*
|
|
* To handle expressions, we assign them negative attnums, as if it was a
|
|
* system attribute (this is fine, as we only allow extended stats on user
|
|
* attributes). And then we offset everything by the number of
|
|
* expressions, so that we can store the values in a bitmapset.
|
|
*/
|
|
listidx = 0;
|
|
foreach(l, clauses)
|
|
{
|
|
Node *clause = (Node *) lfirst(l);
|
|
AttrNumber attnum;
|
|
Node *expr = NULL;
|
|
|
|
/* ignore clause by default */
|
|
list_attnums[listidx] = InvalidAttrNumber;
|
|
|
|
if (!bms_is_member(listidx, *estimatedclauses))
|
|
{
|
|
/*
|
|
* If it's a simple column reference, just extract the attnum. If
|
|
* it's an expression, assign a negative attnum as if it was a
|
|
* system attribute.
|
|
*/
|
|
if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
|
|
{
|
|
list_attnums[listidx] = attnum;
|
|
}
|
|
else if (dependency_is_compatible_expression(clause, rel->relid,
|
|
rel->statlist,
|
|
&expr))
|
|
{
|
|
/* special attnum assigned to this expression */
|
|
attnum = InvalidAttrNumber;
|
|
|
|
Assert(expr != NULL);
|
|
|
|
/* If the expression is duplicate, use the same attnum. */
|
|
for (i = 0; i < unique_exprs_cnt; i++)
|
|
{
|
|
if (equal(unique_exprs[i], expr))
|
|
{
|
|
/* negative attribute number to expression */
|
|
attnum = -(i + 1);
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* not found in the list, so add it */
|
|
if (attnum == InvalidAttrNumber)
|
|
{
|
|
unique_exprs[unique_exprs_cnt++] = expr;
|
|
|
|
/* after incrementing the value, to get -1, -2, ... */
|
|
attnum = (-unique_exprs_cnt);
|
|
}
|
|
|
|
/* remember which attnum was assigned to this clause */
|
|
list_attnums[listidx] = attnum;
|
|
}
|
|
}
|
|
|
|
listidx++;
|
|
}
|
|
|
|
Assert(listidx == list_length(clauses));
|
|
|
|
/*
|
|
* How much we need to offset the attnums? If there are no expressions,
|
|
* then no offset is needed. Otherwise we need to offset enough for the
|
|
* lowest value (-unique_exprs_cnt) to become 1.
|
|
*/
|
|
if (unique_exprs_cnt > 0)
|
|
attnum_offset = (unique_exprs_cnt + 1);
|
|
else
|
|
attnum_offset = 0;
|
|
|
|
/*
|
|
* Now that we know how many expressions there are, we can offset the
|
|
* values just enough to build the bitmapset.
|
|
*/
|
|
for (i = 0; i < list_length(clauses); i++)
|
|
{
|
|
AttrNumber attnum;
|
|
|
|
/* ignore incompatible or already estimated clauses */
|
|
if (list_attnums[i] == InvalidAttrNumber)
|
|
continue;
|
|
|
|
/* make sure the attnum is in the expected range */
|
|
Assert(list_attnums[i] >= (-unique_exprs_cnt));
|
|
Assert(list_attnums[i] <= MaxHeapAttributeNumber);
|
|
|
|
/* make sure the attnum is positive (valid AttrNumber) */
|
|
attnum = list_attnums[i] + attnum_offset;
|
|
|
|
/*
|
|
* Either it's a regular attribute, or it's an expression, in which
|
|
* case we must not have seen it before (expressions are unique).
|
|
*
|
|
* XXX Check whether it's a regular attribute has to be done using the
|
|
* original attnum, while the second check has to use the value with
|
|
* an offset.
|
|
*/
|
|
Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
|
|
!bms_is_member(attnum, clauses_attnums));
|
|
|
|
/*
|
|
* Remember the offset attnum, both for attributes and expressions.
|
|
* We'll pass list_attnums to clauselist_apply_dependencies, which
|
|
* uses it to identify clauses in a bitmap. We could also pass the
|
|
* offset, but this is more convenient.
|
|
*/
|
|
list_attnums[i] = attnum;
|
|
|
|
clauses_attnums = bms_add_member(clauses_attnums, attnum);
|
|
}
|
|
|
|
/*
|
|
* If there's not at least two distinct attnums and expressions, then
|
|
* reject the whole list of clauses. We must return 1.0 so the calling
|
|
* function's selectivity is unaffected.
|
|
*/
|
|
if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
|
|
{
|
|
bms_free(clauses_attnums);
|
|
pfree(list_attnums);
|
|
return 1.0;
|
|
}
|
|
|
|
/*
|
|
* Load all functional dependencies matching at least two parameters. We
|
|
* can simply consider all dependencies at once, without having to search
|
|
* for the best statistics object.
|
|
*
|
|
* To not waste cycles and memory, we deserialize dependencies only for
|
|
* statistics that match at least two attributes. The array is allocated
|
|
* with the assumption that all objects match - we could grow the array to
|
|
* make it just the right size, but it's likely wasteful anyway thanks to
|
|
* moving the freed chunks to freelists etc.
|
|
*/
|
|
func_dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) *
|
|
list_length(rel->statlist));
|
|
nfunc_dependencies = 0;
|
|
total_ndeps = 0;
|
|
|
|
foreach(l, rel->statlist)
|
|
{
|
|
StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
|
|
int nmatched;
|
|
int nexprs;
|
|
int k;
|
|
MVDependencies *deps;
|
|
|
|
/* skip statistics that are not of the correct type */
|
|
if (stat->kind != STATS_EXT_DEPENDENCIES)
|
|
continue;
|
|
|
|
/*
|
|
* Count matching attributes - we have to undo the attnum offsets. The
|
|
* input attribute numbers are not offset (expressions are not
|
|
* included in stat->keys, so it's not necessary). But we need to
|
|
* offset it before checking against clauses_attnums.
|
|
*/
|
|
nmatched = 0;
|
|
k = -1;
|
|
while ((k = bms_next_member(stat->keys, k)) >= 0)
|
|
{
|
|
AttrNumber attnum = (AttrNumber) k;
|
|
|
|
/* skip expressions */
|
|
if (!AttrNumberIsForUserDefinedAttr(attnum))
|
|
continue;
|
|
|
|
/* apply the same offset as above */
|
|
attnum += attnum_offset;
|
|
|
|
if (bms_is_member(attnum, clauses_attnums))
|
|
nmatched++;
|
|
}
|
|
|
|
/* count matching expressions */
|
|
nexprs = 0;
|
|
for (i = 0; i < unique_exprs_cnt; i++)
|
|
{
|
|
ListCell *lc;
|
|
|
|
foreach(lc, stat->exprs)
|
|
{
|
|
Node *stat_expr = (Node *) lfirst(lc);
|
|
|
|
/* try to match it */
|
|
if (equal(stat_expr, unique_exprs[i]))
|
|
nexprs++;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Skip objects matching fewer than two attributes/expressions from
|
|
* clauses.
|
|
*/
|
|
if (nmatched + nexprs < 2)
|
|
continue;
|
|
|
|
deps = statext_dependencies_load(stat->statOid);
|
|
|
|
/*
|
|
* The expressions may be represented by different attnums in the
|
|
* stats, we need to remap them to be consistent with the clauses.
|
|
* That will make the later steps (e.g. picking the strongest item and
|
|
* so on) much simpler and cheaper, because it won't need to care
|
|
* about the offset at all.
|
|
*
|
|
* When we're at it, we can ignore dependencies that are not fully
|
|
* matched by clauses (i.e. referencing attributes or expressions that
|
|
* are not in the clauses).
|
|
*
|
|
* We have to do this for all statistics, as long as there are any
|
|
* expressions - we need to shift the attnums in all dependencies.
|
|
*
|
|
* XXX Maybe we should do this always, because it also eliminates some
|
|
* of the dependencies early. It might be cheaper than having to walk
|
|
* the longer list in find_strongest_dependency later, especially as
|
|
* we need to do that repeatedly?
|
|
*
|
|
* XXX We have to do this even when there are no expressions in
|
|
* clauses, otherwise find_strongest_dependency may fail for stats
|
|
* with expressions (due to lookup of negative value in bitmap). So we
|
|
* need to at least filter out those dependencies. Maybe we could do
|
|
* it in a cheaper way (if there are no expr clauses, we can just
|
|
* discard all negative attnums without any lookups).
|
|
*/
|
|
if (unique_exprs_cnt > 0 || stat->exprs != NIL)
|
|
{
|
|
int ndeps = 0;
|
|
|
|
for (i = 0; i < deps->ndeps; i++)
|
|
{
|
|
bool skip = false;
|
|
MVDependency *dep = deps->deps[i];
|
|
int j;
|
|
|
|
for (j = 0; j < dep->nattributes; j++)
|
|
{
|
|
int idx;
|
|
Node *expr;
|
|
int k;
|
|
AttrNumber unique_attnum = InvalidAttrNumber;
|
|
AttrNumber attnum;
|
|
|
|
/* undo the per-statistics offset */
|
|
attnum = dep->attributes[j];
|
|
|
|
/*
|
|
* For regular attributes we can simply check if it
|
|
* matches any clause. If there's no matching clause, we
|
|
* can just ignore it. We need to offset the attnum
|
|
* though.
|
|
*/
|
|
if (AttrNumberIsForUserDefinedAttr(attnum))
|
|
{
|
|
dep->attributes[j] = attnum + attnum_offset;
|
|
|
|
if (!bms_is_member(dep->attributes[j], clauses_attnums))
|
|
{
|
|
skip = true;
|
|
break;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* the attnum should be a valid system attnum (-1, -2,
|
|
* ...)
|
|
*/
|
|
Assert(AttributeNumberIsValid(attnum));
|
|
|
|
/*
|
|
* For expressions, we need to do two translations. First
|
|
* we have to translate the negative attnum to index in
|
|
* the list of expressions (in the statistics object).
|
|
* Then we need to see if there's a matching clause. The
|
|
* index of the unique expression determines the attnum
|
|
* (and we offset it).
|
|
*/
|
|
idx = -(1 + attnum);
|
|
|
|
/* Is the expression index is valid? */
|
|
Assert((idx >= 0) && (idx < list_length(stat->exprs)));
|
|
|
|
expr = (Node *) list_nth(stat->exprs, idx);
|
|
|
|
/* try to find the expression in the unique list */
|
|
for (k = 0; k < unique_exprs_cnt; k++)
|
|
{
|
|
/*
|
|
* found a matching unique expression, use the attnum
|
|
* (derived from index of the unique expression)
|
|
*/
|
|
if (equal(unique_exprs[k], expr))
|
|
{
|
|
unique_attnum = -(k + 1) + attnum_offset;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Found no matching expression, so we can simply skip
|
|
* this dependency, because there's no chance it will be
|
|
* fully covered.
|
|
*/
|
|
if (unique_attnum == InvalidAttrNumber)
|
|
{
|
|
skip = true;
|
|
break;
|
|
}
|
|
|
|
/* otherwise remap it to the new attnum */
|
|
dep->attributes[j] = unique_attnum;
|
|
}
|
|
|
|
/* if found a matching dependency, keep it */
|
|
if (!skip)
|
|
{
|
|
/* maybe we've skipped something earlier, so move it */
|
|
if (ndeps != i)
|
|
deps->deps[ndeps] = deps->deps[i];
|
|
|
|
ndeps++;
|
|
}
|
|
}
|
|
|
|
deps->ndeps = ndeps;
|
|
}
|
|
|
|
/*
|
|
* It's possible we've removed all dependencies, in which case we
|
|
* don't bother adding it to the list.
|
|
*/
|
|
if (deps->ndeps > 0)
|
|
{
|
|
func_dependencies[nfunc_dependencies] = deps;
|
|
total_ndeps += deps->ndeps;
|
|
nfunc_dependencies++;
|
|
}
|
|
}
|
|
|
|
/* if no matching stats could be found then we've nothing to do */
|
|
if (nfunc_dependencies == 0)
|
|
{
|
|
pfree(func_dependencies);
|
|
bms_free(clauses_attnums);
|
|
pfree(list_attnums);
|
|
pfree(unique_exprs);
|
|
return 1.0;
|
|
}
|
|
|
|
/*
|
|
* Work out which dependencies we can apply, starting with the
|
|
* widest/strongest ones, and proceeding to smaller/weaker ones.
|
|
*/
|
|
dependencies = (MVDependency **) palloc(sizeof(MVDependency *) *
|
|
total_ndeps);
|
|
ndependencies = 0;
|
|
|
|
while (true)
|
|
{
|
|
MVDependency *dependency;
|
|
AttrNumber attnum;
|
|
|
|
/* the widest/strongest dependency, fully matched by clauses */
|
|
dependency = find_strongest_dependency(func_dependencies,
|
|
nfunc_dependencies,
|
|
clauses_attnums);
|
|
if (!dependency)
|
|
break;
|
|
|
|
dependencies[ndependencies++] = dependency;
|
|
|
|
/* Ignore dependencies using this implied attribute in later loops */
|
|
attnum = dependency->attributes[dependency->nattributes - 1];
|
|
clauses_attnums = bms_del_member(clauses_attnums, attnum);
|
|
}
|
|
|
|
/*
|
|
* If we found applicable dependencies, use them to estimate all
|
|
* compatible clauses on attributes that they refer to.
|
|
*/
|
|
if (ndependencies != 0)
|
|
s1 = clauselist_apply_dependencies(root, clauses, varRelid, jointype,
|
|
sjinfo, dependencies, ndependencies,
|
|
list_attnums, estimatedclauses);
|
|
|
|
/* free deserialized functional dependencies (and then the array) */
|
|
for (i = 0; i < nfunc_dependencies; i++)
|
|
pfree(func_dependencies[i]);
|
|
|
|
pfree(dependencies);
|
|
pfree(func_dependencies);
|
|
bms_free(clauses_attnums);
|
|
pfree(list_attnums);
|
|
pfree(unique_exprs);
|
|
|
|
return s1;
|
|
}
|