postgresql/src/backend/utils/adt/rangetypes_selfuncs.c
Heikki Linnakangas 918eee0c49 Collect and use histograms of lower and upper bounds for range types.
This enables selectivity estimation of the <<, >>, &<, &> and && operators,
as well as the normal inequality operators: <, <=, >=, >. "range @> element"
is also supported, but the range-variant @> and <@ operators are not,
because they cannot be sensibly estimated with lower and upper bound
histograms alone. We would need to make some assumption about the lengths of
the ranges for that. Alexander's patch included a separate histogram of
lengths for that, but I left that out of the patch for simplicity. Hopefully
that will be added as a followup patch.

The fraction of empty ranges is also calculated and used in estimation.

Alexander Korotkov, heavily modified by me.
2012-08-27 15:58:46 +03:00

600 lines
16 KiB
C

/*-------------------------------------------------------------------------
*
* rangetypes_selfuncs.c
* Functions for selectivity estimation of range operators
*
* Estimates are based on histograms of lower and upper bounds, and the
* fraction of empty ranges.
*
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/utils/adt/rangetypes_selfuncs.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"

#include <math.h>

#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "utils/lsyscache.h"
#include "utils/rangetypes.h"
#include "utils/selfuncs.h"
#include "utils/typcache.h"
/*
 * Forward declarations of the file-local helpers defined below.
 *
 * calc_rangesel, calc_hist_selectivity and calc_hist_selectivity_scalar
 * compute selectivity estimates; rbound_bsearch and get_position locate a
 * bound within a histogram of range bounds.
 */
static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
RangeType *constval, Oid operator);
static double default_range_selectivity(Oid operator);
static double calc_hist_selectivity(TypeCacheEntry *typcache,
VariableStatData *vardata, RangeType *constval,
Oid operator);
static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
RangeBound *constbound,
RangeBound *hist, int hist_nvalues,
bool equal);
static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
RangeBound *hist, int hist_length, bool equal);
static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
RangeBound *hist1, RangeBound *hist2);
/*
 * Returns a default selectivity estimate for given operator, when we don't
 * have statistics or cannot use them for some reason.
 *
 * The result is a fixed constant per operator group.  Any operator OID not
 * listed below (including InvalidOid) gets a generic fallback of 0.01.
 */
static double
default_range_selectivity(Oid operator)
{
	switch (operator)
	{
		case OID_RANGE_OVERLAP_OP:
			return 0.01;

		case OID_RANGE_CONTAINS_OP:
		case OID_RANGE_CONTAINED_OP:
			return 0.005;

		case OID_RANGE_CONTAINS_ELEM_OP:

			/*
			 * "range @> elem" is more or less identical to a scalar
			 * inequality "A >= b AND A <= c".
			 */
			return DEFAULT_RANGE_INEQ_SEL;

		case OID_RANGE_LESS_OP:
		case OID_RANGE_LESS_EQUAL_OP:
		case OID_RANGE_GREATER_OP:
		case OID_RANGE_GREATER_EQUAL_OP:
		case OID_RANGE_LEFT_OP:
		case OID_RANGE_RIGHT_OP:
		case OID_RANGE_OVERLAPS_LEFT_OP:
		case OID_RANGE_OVERLAPS_RIGHT_OP:

			/*
			 * These are similar to regular scalar inequalities: each one
			 * boils down to comparing a single bound of the variable with a
			 * single bound of the constant.  That includes &< and &>, which
			 * the histogram code estimates with one bound comparison too.
			 */
			return DEFAULT_INEQ_SEL;

		default:
			/* all range operators should be handled above, but just in case */
			return 0.01;
	}
}
/*
 * rangesel -- restriction selectivity for range operators
 *
 * Called through the fmgr interface with four arguments: planner info,
 * operator OID, the operator's argument list, and varRelid.  Returns a
 * float8 selectivity clamped to a valid probability.
 *
 * A per-operator default estimate is returned whenever the clause is not of
 * the form "variable op constant" (in either order), the operator cannot be
 * commuted to put the variable on the left, or the constant's type does not
 * match what the operator expects.  A NULL constant yields 0.0.
 */
Datum
rangesel(PG_FUNCTION_ARGS)
{
	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
	Oid			operator = PG_GETARG_OID(1);
	List	   *args = (List *) PG_GETARG_POINTER(2);
	int			varRelid = PG_GETARG_INT32(3);
	VariableStatData vardata;
	Node	   *other;
	bool		varonleft;
	Selectivity selec;
	TypeCacheEntry *typcache = NULL;
	RangeType  *constrange = NULL;

	/*
	 * If expression is not (variable op something) or (something op
	 * variable), then punt and return a default estimate.
	 */
	if (!get_restriction_variable(root, args, varRelid,
								  &vardata, &other, &varonleft))
		PG_RETURN_FLOAT8(default_range_selectivity(operator));

	/*
	 * Can't do anything useful if the something is not a constant, either.
	 */
	if (!IsA(other, Const))
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(default_range_selectivity(operator));
	}

	/*
	 * All the range operators are strict, so we can cope with a NULL constant
	 * right away.
	 */
	if (((Const *) other)->constisnull)
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(0.0);
	}

	/*
	 * If var is on the right, commute the operator, so that we can assume the
	 * var is on the left in what follows.
	 */
	if (!varonleft)
	{
		/* we have other Op var, commute to make var Op other */
		Oid			commutator = get_commutator(operator);

		if (!OidIsValid(commutator))
		{
			/*
			 * Use default selectivity (should we raise an error instead?).
			 * Pass the original operator: the commutator is invalid here and
			 * would only ever select the generic fallback.
			 */
			ReleaseVariableStats(vardata);
			PG_RETURN_FLOAT8(default_range_selectivity(operator));
		}
		operator = commutator;
	}

	/*
	 * OK, there's a Var and a Const we're dealing with here.  We need the
	 * Const to be of same range type as the column, else we can't do anything
	 * useful. (Such cases will likely fail at runtime, but here we'd rather
	 * just return a default estimate.)
	 *
	 * The range type cache entry is looked up only in the branches that
	 * actually use it, i.e. once we have reason to believe the Var is the
	 * range side of the operator.  Looking it up unconditionally would apply
	 * a range-type lookup to a Var that may not be a range at all (e.g. the
	 * element side of "elem <@ range").
	 *
	 * If the operator is "range @> element", the constant should be of the
	 * element type of the range column. Convert it to a range that includes
	 * only that single point, so that we don't need special handling for
	 * that in what follows.
	 */
	if (operator == OID_RANGE_CONTAINS_ELEM_OP)
	{
		typcache = range_get_typcache(fcinfo, vardata.vartype);

		if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
		{
			RangeBound	lower,
						upper;

			lower.inclusive = true;
			lower.val = ((Const *) other)->constvalue;
			lower.infinite = false;
			lower.lower = true;
			upper.inclusive = true;
			upper.val = ((Const *) other)->constvalue;
			upper.infinite = false;
			upper.lower = false;
			constrange = range_serialize(typcache, &lower, &upper, false);
		}
	}
	else if (((Const *) other)->consttype == vardata.vartype)
	{
		/* Both sides are of the same type; treat it as the range type */
		typcache = range_get_typcache(fcinfo, vardata.vartype);
		constrange = DatumGetRangeType(((Const *) other)->constvalue);
	}

	/*
	 * If we got a valid constant on one side of the operator, proceed to
	 * estimate using statistics. Otherwise punt and return a default
	 * constant estimate.  (typcache is always set when constrange is.)
	 */
	if (constrange)
		selec = calc_rangesel(typcache, &vardata, constrange, operator);
	else
		selec = default_range_selectivity(operator);

	ReleaseVariableStats(vardata);

	CLAMP_PROBABILITY(selec);

	PG_RETURN_FLOAT8((float8) selec);
}
/*
 * calc_rangesel -- estimate selectivity of "var operator constval"
 *
 * Combines the NULL fraction and the empty-range fraction read from
 * pg_statistic (both taken as zero when no statistics tuple is available)
 * with the bound-histogram estimate from calc_hist_selectivity(), which
 * covers only the non-NULL, non-empty values.  If no histogram estimate is
 * available, the operator's default estimate is used in its place.
 *
 * Returns a selectivity clamped to a valid probability.  Raises an error for
 * an empty constant combined with an operator that cannot take one.
 */
static double
calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
			  RangeType *constval, Oid operator)
{
	double		hist_selec;
	double		selec;
	float4		empty_frac,
				null_frac;

	/*
	 * First look up the fraction of NULLs and empty ranges from pg_statistic.
	 */
	if (HeapTupleIsValid(vardata->statsTuple))
	{
		Form_pg_statistic stats;
		float4	   *numbers;
		int			nnumbers;

		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
		null_frac = stats->stanullfrac;

		/* Try to get fraction of empty ranges */
		if (get_attstatsslot(vardata->statsTuple,
							 vardata->atttype, vardata->atttypmod,
							 STATISTIC_KIND_RANGE_EMPTY_FRAC, InvalidOid,
							 NULL,
							 NULL, NULL,
							 &numbers, &nnumbers))
		{
			if (nnumbers != 1)
				elog(ERROR, "invalid empty fraction statistic");	/* shouldn't happen */
			empty_frac = numbers[0];

			/* Done with the slot's numbers array; don't leak it */
			free_attstatsslot(vardata->atttype, NULL, 0, numbers, nnumbers);
		}
		else
		{
			/* No empty fraction statistic. Assume no empty ranges. */
			empty_frac = 0.0;
		}
	}
	else
	{
		/*
		 * No stats are available. Follow through the calculations below
		 * anyway, assuming no NULLs and no empty ranges. This still allows
		 * us to give a better-than-nothing estimate based on whether the
		 * constant is an empty range or not.
		 */
		null_frac = 0.0;
		empty_frac = 0.0;
	}

	if (RangeIsEmpty(constval))
	{
		/*
		 * An empty range matches all ranges, all empty ranges, all
		 * non-empty ranges, or nothing, depending on the operator.  For the
		 * b-tree comparison operators, recall that empty ranges sort before
		 * all non-empty ranges.
		 */
		switch (operator)
		{
				/* these return false if either argument is empty */
			case OID_RANGE_OVERLAP_OP:
			case OID_RANGE_OVERLAPS_LEFT_OP:
			case OID_RANGE_OVERLAPS_RIGHT_OP:
			case OID_RANGE_LEFT_OP:
			case OID_RANGE_RIGHT_OP:
				/* and nothing is less than an empty range */
			case OID_RANGE_LESS_OP:
				selec = 0.0;
				break;

				/* only empty ranges can be contained by an empty range */
			case OID_RANGE_CONTAINED_OP:
				/* and only empty ranges are <= an empty range */
			case OID_RANGE_LESS_EQUAL_OP:
				selec = empty_frac;
				break;

				/* everything contains an empty range */
			case OID_RANGE_CONTAINS_OP:
				/* and everything is >= an empty range */
			case OID_RANGE_GREATER_EQUAL_OP:
				selec = 1.0;
				break;

				/* all non-empty ranges are > an empty range */
			case OID_RANGE_GREATER_OP:
				selec = 1.0 - empty_frac;
				break;

				/* a range built from a single element is never empty */
			case OID_RANGE_CONTAINS_ELEM_OP:
			default:
				elog(ERROR, "unexpected operator %u", operator);
				selec = 0.0;	/* keep compiler quiet */
				break;
		}
	}
	else
	{
		/*
		 * Calculate selectivity using bound histograms. If that fails for
		 * some reason, e.g no histogram in pg_statistic, use the default
		 * constant estimate for the fraction of non-empty values. This is
		 * still somewhat better than just returning the default estimate,
		 * because this still takes into account the fraction of empty and
		 * NULL tuples, if we had statistics for them.
		 */
		hist_selec = calc_hist_selectivity(typcache, vardata, constval,
										   operator);
		if (hist_selec < 0.0)
			hist_selec = default_range_selectivity(operator);

		/*
		 * Now merge the results for the empty ranges and histogram
		 * calculations, realizing that the histogram covers only the
		 * non-null, non-empty values.
		 */
		if (operator == OID_RANGE_CONTAINED_OP ||
			operator == OID_RANGE_LESS_OP ||
			operator == OID_RANGE_LESS_EQUAL_OP)
		{
			/*
			 * Every empty row matches: empty is contained by anything
			 * non-empty, and empty sorts before anything non-empty.
			 */
			selec = (1.0 - empty_frac) * hist_selec + empty_frac;
		}
		else
		{
			/* with any other operator, empty Op non-empty matches nothing */
			selec = (1.0 - empty_frac) * hist_selec;
		}
	}

	/* all range operators are strict */
	selec *= (1.0 - null_frac);

	/* result should be in range, but make sure... */
	CLAMP_PROBABILITY(selec);

	return selec;
}
/*
 * Calculate range operator selectivity using histograms of range bounds.
 *
 * This estimate is for the portion of values that are not empty and not
 * NULL.
 *
 * Returns -1.0 when no estimate can be made: there is no statistics tuple,
 * no bounds histogram, the histogram has fewer than two entries (and hence
 * no bins), or the operator is not supported by this method.  The constant
 * must not be an empty range.
 */
static double
calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
					  RangeType *constval, Oid operator)
{
	Datum	   *hist_values;
	int			nhist;
	RangeBound *hist_lower;
	RangeBound *hist_upper;
	int			i;
	RangeBound	const_lower;
	RangeBound	const_upper;
	bool		empty;
	double		hist_selec;

	/* Try to get histogram of ranges */
	if (!(HeapTupleIsValid(vardata->statsTuple) &&
		  get_attstatsslot(vardata->statsTuple,
						   vardata->atttype, vardata->atttypmod,
						   STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
						   NULL,
						   &hist_values, &nhist,
						   NULL, NULL)))
		return -1.0;

	/*
	 * Check that it's a real histogram.  With fewer than two entries there
	 * are no bins, and the per-bin arithmetic below would divide by zero.
	 */
	if (nhist < 2)
	{
		free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);
		return -1.0;
	}

	/*
	 * Convert histogram of ranges into histograms of its lower and upper
	 * bounds.
	 */
	hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
	hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
	for (i = 0; i < nhist; i++)
	{
		range_deserialize(typcache, DatumGetRangeType(hist_values[i]),
						  &hist_lower[i], &hist_upper[i], &empty);
		/* The histogram should not contain any empty ranges */
		if (empty)
			elog(ERROR, "bounds histogram contains an empty range");
	}

	/* Extract the bounds of the constant value. */
	range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
	Assert(!empty);

	/*
	 * Calculate selectivity comparing the lower or upper bound of the
	 * constant with the histogram of lower or upper bounds.
	 */
	switch (operator)
	{
		case OID_RANGE_LESS_OP:

			/*
			 * The regular b-tree comparison operators (<, <=, >, >=) compare
			 * the lower bounds first, and the upper bounds for values with
			 * equal lower bounds. Estimate that by comparing the lower bounds
			 * only. This gives a fairly accurate estimate assuming there
			 * aren't many rows with a lower bound equal to the constant's
			 * lower bound.
			 */
			hist_selec =
				calc_hist_selectivity_scalar(typcache, &const_lower,
											 hist_lower, nhist, false);
			break;

		case OID_RANGE_LESS_EQUAL_OP:
			hist_selec =
				calc_hist_selectivity_scalar(typcache, &const_lower,
											 hist_lower, nhist, true);
			break;

		case OID_RANGE_GREATER_OP:
			hist_selec =
				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
												 hist_lower, nhist, true);
			break;

		case OID_RANGE_GREATER_EQUAL_OP:
			hist_selec =
				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
												 hist_lower, nhist, false);
			break;

		case OID_RANGE_LEFT_OP:
			/* var << const when upper(var) < lower(const) */
			hist_selec =
				calc_hist_selectivity_scalar(typcache, &const_lower,
											 hist_upper, nhist, false);
			break;

		case OID_RANGE_RIGHT_OP:
			/* var >> const when lower(var) > upper(const) */
			hist_selec =
				1 - calc_hist_selectivity_scalar(typcache, &const_upper,
												 hist_lower, nhist, true);
			break;

		case OID_RANGE_OVERLAPS_RIGHT_OP:
			/* compare lower bounds */
			hist_selec =
				1 - calc_hist_selectivity_scalar(typcache, &const_lower,
												 hist_lower, nhist, false);
			break;

		case OID_RANGE_OVERLAPS_LEFT_OP:
			/* compare upper bounds */
			hist_selec =
				calc_hist_selectivity_scalar(typcache, &const_upper,
											 hist_upper, nhist, true);
			break;

		case OID_RANGE_OVERLAP_OP:
		case OID_RANGE_CONTAINS_ELEM_OP:

			/*
			 * A && B <=> NOT (A << B OR A >> B).
			 *
			 * "range @> elem" is equivalent to "range && [elem,elem]". The
			 * caller already constructed the singular range from the element
			 * constant, so just treat it the same as &&.
			 */
			hist_selec =
				calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
											 nhist, false);
			hist_selec +=
				(1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
													nhist, true));
			hist_selec = 1.0 - hist_selec;
			break;

		case OID_RANGE_CONTAINS_OP:
		case OID_RANGE_CONTAINED_OP:
			/* TODO: not implemented yet */
			hist_selec = -1.0;
			break;

		default:
			elog(ERROR, "unknown range operator %u", operator);
			hist_selec = -1.0;	/* keep compiler quiet */
			break;
	}

	/*
	 * Release working storage.  This must wait until all estimation is done,
	 * because the deserialized bounds may still refer to data held in the
	 * histogram values.
	 */
	pfree(hist_lower);
	pfree(hist_upper);
	free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);

	return hist_selec;
}
/*
 * Look up the fraction of values less than (or equal, if 'equal' argument
 * is true) a given const in a histogram of range bounds.
 *
 * 'hist' is an array of 'hist_nvalues' bounds, searched with
 * rbound_bsearch().  The result is the number of whole bins preceding the
 * constant, plus a linearly interpolated share of the bin it falls into,
 * divided by the number of bins.
 *
 * A histogram with fewer than two entries has no bins at all; in that case
 * the answer degenerates to 1.0 if any entry precedes the constant and 0.0
 * otherwise, rather than dividing by zero.
 */
static double
calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
							 RangeBound *hist, int hist_nvalues, bool equal)
{
	Selectivity selec;
	int			index;
	int			nbins = hist_nvalues - 1;

	/*
	 * Find the histogram bin the given constant falls into. Estimate
	 * selectivity as the number of preceding whole bins.
	 */
	index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);

	/* Degenerate histogram: no bins to count or interpolate within */
	if (nbins < 1)
		return (index >= 0) ? 1.0 : 0.0;

	selec = (Selectivity) (Max(index, 0)) / (Selectivity) nbins;

	/* Adjust using linear interpolation within the bin */
	if (index >= 0 && index < nbins)
		selec += get_position(typcache, constbound, &hist[index],
							  &hist[index + 1]) / (Selectivity) nbins;

	return selec;
}
/*
 * Binary search over a sorted array of range bounds.
 *
 * Returns the greatest index whose bound compares less than 'value', or
 * less than or equal to it when 'equal' is true.  Returns -1 if no element
 * of the array qualifies (which includes the case of an empty array).
 */
static int
rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
			   int hist_length, bool equal)
{
	int			lo = -1;
	int			hi = hist_length - 1;

	/*
	 * Invariant: every index <= lo qualifies, and no index > hi does.  The
	 * probe point is rounded up so that the interval always shrinks.
	 */
	while (lo < hi)
	{
		int			mid = (lo + hi + 1) / 2;
		int			cmp = range_cmp_bounds(typcache, &hist[mid], value);
		bool		qualifies = equal ? (cmp <= 0) : (cmp < 0);

		if (qualifies)
			lo = mid;
		else
			hi = mid - 1;
	}

	return lo;
}
/*
 * Get relative position of value in histogram bin in [0,1] range.
 *
 * 'hist1' and 'hist2' are the lower and upper boundaries of the bin.  When
 * both are finite and the range type has a subtype difference function, the
 * position is interpolated linearly.  Without one, or when the arithmetic
 * produces nothing usable (zero or negative bin width, or NaN), 0.5 is
 * returned.  Bins with an infinite boundary yield 0.0, 0.5 or 1.0 as
 * described in the branches below.
 */
static float8
get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
			 RangeBound *hist2)
{
	bool		has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
	float8		position;

	if (!hist1->infinite && !hist2->infinite)
	{
		float8		bin_width;

		/*
		 * Both bounds are finite. Assuming the subtype's comparison function
		 * works sanely, the value must be finite, too, because it lies
		 * somewhere between the bounds. If it doesn't, just return something.
		 */
		if (value->infinite)
			return 0.5;

		/* Can't interpolate without subdiff function */
		if (!has_subdiff)
			return 0.5;

		/* Calculate relative position using subdiff function. */
		bin_width = DatumGetFloat8(FunctionCall2Coll(
												&typcache->rng_subdiff_finfo,
													 typcache->rng_collation,
													 hist2->val,
													 hist1->val));
		if (isnan(bin_width) || bin_width <= 0.0)
			return 0.5;			/* punt for NaN or zero-width bin */

		position = DatumGetFloat8(FunctionCall2Coll(
												&typcache->rng_subdiff_finfo,
													typcache->rng_collation,
													value->val,
													hist1->val))
			/ bin_width;

		/*
		 * A NaN here (from the subdiff function itself, or from Inf/Inf)
		 * would slip through the clamping below, since comparisons with NaN
		 * are always false.  Punt instead.
		 */
		if (isnan(position))
			return 0.5;

		/* Relative position must be in [0,1] range */
		position = Max(position, 0.0);
		position = Min(position, 1.0);
		return position;
	}
	else if (hist1->infinite && !hist2->infinite)
	{
		/*
		 * Lower bin boundary is -infinite, upper is finite. If the value is
		 * -infinite, return 0.0 to indicate it's equal to the lower bound.
		 * Otherwise return 1.0 to indicate it's infinitely far from the lower
		 * bound.
		 */
		return ((value->infinite && value->lower) ? 0.0 : 1.0);
	}
	else if (!hist1->infinite && hist2->infinite)
	{
		/* same as above, but in reverse */
		return ((value->infinite && !value->lower) ? 1.0 : 0.0);
	}
	else
	{
		/*
		 * If both bin boundaries are infinite, they should be equal to each
		 * other, and the value should also be infinite and equal to both
		 * bounds. (But don't Assert that, to avoid crashing if a user creates
		 * a datatype with a broken comparison function).
		 *
		 * Assume the value to lie in the middle of the infinite bounds.
		 */
		return 0.5;
	}
}