postgresql/src/backend/utils/adt/rangetypes_selfuncs.c

600 lines
16 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* rangetypes_selfuncs.c
* Functions for selectivity estimation of range operators
*
* Estimates are based on histograms of lower and upper bounds, and the
* fraction of empty ranges.
*
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/utils/adt/rangetypes_selfuncs.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "utils/lsyscache.h"
#include "utils/rangetypes.h"
#include "utils/selfuncs.h"
#include "utils/typcache.h"
/*
 * Forward declarations of the file-local helper routines defined below.
 */
static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
RangeType *constval, Oid operator);
static double default_range_selectivity(Oid operator);
static double calc_hist_selectivity(TypeCacheEntry *typcache,
VariableStatData *vardata, RangeType *constval,
Oid operator);
static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
RangeBound *constbound,
RangeBound *hist, int hist_nvalues,
bool equal);
static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
RangeBound *hist, int hist_length, bool equal);
static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
RangeBound *hist1, RangeBound *hist2);
/*
 * Returns a default selectivity estimate for given operator, when we don't
 * have statistics or cannot use them for some reason.
 *
 * Operators that boil down to a comparison of a single bound (the b-tree
 * comparison operators, <<, >>, &< and &>) get the generic scalar
 * inequality default.  Anything not recognized falls back to 0.01.
 */
static double
default_range_selectivity(Oid operator)
{
    switch (operator)
    {
        case OID_RANGE_OVERLAP_OP:
            return 0.01;

        case OID_RANGE_CONTAINS_OP:
        case OID_RANGE_CONTAINED_OP:
            return 0.005;

        case OID_RANGE_CONTAINS_ELEM_OP:

            /*
             * "range @> elem" is more or less identical to a scalar
             * inequality "A >= b AND A <= c".
             */
            return DEFAULT_RANGE_INEQ_SEL;

        case OID_RANGE_LESS_OP:
        case OID_RANGE_LESS_EQUAL_OP:
        case OID_RANGE_GREATER_OP:
        case OID_RANGE_GREATER_EQUAL_OP:
        case OID_RANGE_LEFT_OP:
        case OID_RANGE_RIGHT_OP:
        case OID_RANGE_OVERLAPS_LEFT_OP:
        case OID_RANGE_OVERLAPS_RIGHT_OP:

            /*
             * These are similar to regular scalar inequalities.  &< and &>
             * belong here too: the histogram code in this file estimates
             * them by comparing a single pair of bounds.
             */
            return DEFAULT_INEQ_SEL;

        default:
            /* all range operators should be handled above, but just in case */
            return 0.01;
    }
}
/*
 * rangesel -- restriction selectivity for range operators
 *
 * fmgr arguments: planner root, operator OID, argument list and varRelid.
 * Returns a float8 selectivity clamped to [0, 1].
 *
 * Falls back to default_range_selectivity() whenever the clause is not of
 * the form "variable op constant" (in either order), the operator cannot be
 * commuted, or the constant's type does not match what the operator expects.
 */
Datum
rangesel(PG_FUNCTION_ARGS)
{
    PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
    Oid         operator = PG_GETARG_OID(1);
    List       *args = (List *) PG_GETARG_POINTER(2);
    int         varRelid = PG_GETARG_INT32(3);
    VariableStatData vardata;
    Node       *other;
    bool        varonleft;
    Selectivity selec;
    TypeCacheEntry *typcache;
    RangeType  *constrange = NULL;

    /*
     * If expression is not (variable op something) or (something op
     * variable), then punt and return a default estimate.
     */
    if (!get_restriction_variable(root, args, varRelid,
                                  &vardata, &other, &varonleft))
        PG_RETURN_FLOAT8(default_range_selectivity(operator));

    /*
     * Can't do anything useful if the something is not a constant, either.
     */
    if (!IsA(other, Const))
    {
        ReleaseVariableStats(vardata);
        PG_RETURN_FLOAT8(default_range_selectivity(operator));
    }

    /*
     * All the range operators are strict, so we can cope with a NULL constant
     * right away.
     */
    if (((Const *) other)->constisnull)
    {
        ReleaseVariableStats(vardata);
        PG_RETURN_FLOAT8(0.0);
    }

    /*
     * If var is on the right, commute the operator, so that we can assume the
     * var is on the left in what follows.
     */
    if (!varonleft)
    {
        /* we have other Op var, commute to make var Op other */
        Oid         commuted = get_commutator(operator);

        if (!commuted)
        {
            /*
             * No commutator: use the default selectivity (should we raise an
             * error instead?).  The lookup result is kept in a separate
             * variable so that the default is still chosen for the operator
             * we were actually asked about, not for the invalid OID.
             */
            ReleaseVariableStats(vardata);
            PG_RETURN_FLOAT8(default_range_selectivity(operator));
        }
        operator = commuted;
    }

    typcache = range_get_typcache(fcinfo, vardata.vartype);

    /*
     * OK, there's a Var and a Const we're dealing with here.  We need the
     * Const to be of same range type as the column, else we can't do anything
     * useful.  (Such cases will likely fail at runtime, but here we'd rather
     * just return a default estimate.)
     *
     * If the operator is "range @> element", the constant should be of the
     * element type of the range column.  Convert it to a range that includes
     * only that single point, so that we don't need special handling for
     * that in what follows.
     */
    if (operator == OID_RANGE_CONTAINS_ELEM_OP)
    {
        if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
        {
            RangeBound  lower,
                        upper;

            /* build the closed single-point range [elem, elem] */
            lower.inclusive = true;
            lower.val = ((Const *) other)->constvalue;
            lower.infinite = false;
            lower.lower = true;
            upper.inclusive = true;
            upper.val = ((Const *) other)->constvalue;
            upper.infinite = false;
            upper.lower = false;
            constrange = range_serialize(typcache, &lower, &upper, false);
        }
    }
    else
    {
        if (((Const *) other)->consttype == vardata.vartype)
            constrange = DatumGetRangeType(((Const *) other)->constvalue);
    }

    /*
     * If we got a valid constant on one side of the operator, proceed to
     * estimate using statistics.  Otherwise punt and return a default
     * constant estimate.
     */
    if (constrange)
        selec = calc_rangesel(typcache, &vardata, constrange, operator);
    else
        selec = default_range_selectivity(operator);

    ReleaseVariableStats(vardata);

    CLAMP_PROBABILITY(selec);

    PG_RETURN_FLOAT8((float8) selec);
}
/*
 * calc_rangesel -- estimate the selectivity of "var operator constval"
 *
 * typcache: type cache entry of the range type
 * vardata:  statistics lookup result for the variable; the stats tuple may
 *           be invalid, in which case no NULLs and no empty ranges are
 *           assumed
 * constval: the range constant on the right-hand side
 * operator: OID of the (already commuted, if necessary) range operator
 *
 * The result combines three ingredients: the NULL fraction, the fraction of
 * empty ranges, and a histogram-based estimate that covers only the
 * non-NULL, non-empty values.  The returned value is clamped to [0, 1].
 */
static double
calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
              RangeType *constval, Oid operator)
{
    double      hist_selec;
    double      selec;
    float4      empty_frac,
                null_frac;

    /*
     * First look up the fraction of NULLs and empty ranges from pg_statistic.
     */
    if (HeapTupleIsValid(vardata->statsTuple))
    {
        Form_pg_statistic stats;
        float4     *numbers;
        int         nnumbers;

        stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
        null_frac = stats->stanullfrac;

        /* Try to get fraction of empty ranges */
        if (get_attstatsslot(vardata->statsTuple,
                             vardata->atttype, vardata->atttypmod,
                             STATISTIC_KIND_RANGE_EMPTY_FRAC, InvalidOid,
                             NULL,
                             NULL, NULL,
                             &numbers, &nnumbers))
        {
            if (nnumbers != 1)
                elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
            empty_frac = numbers[0];

            /* done with the slot's numbers array; release it */
            free_attstatsslot(vardata->atttype, NULL, 0, numbers, nnumbers);
        }
        else
        {
            /* No empty fraction statistic. Assume no empty ranges. */
            empty_frac = 0.0;
        }
    }
    else
    {
        /*
         * No stats are available. Follow through the calculations below
         * anyway, assuming no NULLs and no empty ranges. This still allows
         * us to give a better-than-nothing estimate based on whether the
         * constant is an empty range or not.
         */
        null_frac = 0.0;
        empty_frac = 0.0;
    }

    if (RangeIsEmpty(constval))
    {
        /*
         * An empty range matches all ranges, all empty ranges, all non-empty
         * ranges, or nothing, depending on the operator.  For the b-tree
         * comparison operators this relies on range_cmp() ordering empty
         * ranges before all non-empty ones.
         */
        switch (operator)
        {
            case OID_RANGE_OVERLAP_OP:
            case OID_RANGE_OVERLAPS_LEFT_OP:
            case OID_RANGE_OVERLAPS_RIGHT_OP:
            case OID_RANGE_LEFT_OP:
            case OID_RANGE_RIGHT_OP:
                /* these return false if either argument is empty */
                selec = 0.0;
                break;

            case OID_RANGE_LESS_OP:
                /* nothing is less than an empty range */
                selec = 0.0;
                break;

            case OID_RANGE_CONTAINED_OP:
                /* only empty ranges can be contained by an empty range */
            case OID_RANGE_LESS_EQUAL_OP:
                /* only empty ranges are <= an empty range */
                selec = empty_frac;
                break;

            case OID_RANGE_CONTAINS_OP:
                /* everything contains an empty range */
            case OID_RANGE_GREATER_EQUAL_OP:
                /* everything is >= an empty range */
                selec = 1.0;
                break;

            case OID_RANGE_GREATER_OP:
                /* all non-empty ranges are > an empty range */
                selec = 1.0 - empty_frac;
                break;

            case OID_RANGE_CONTAINS_ELEM_OP:
                /* the caller builds a one-point range, which is never empty */
            default:
                elog(ERROR, "unexpected operator %u", operator);
                selec = 0.0;    /* keep compiler quiet */
                break;
        }
    }
    else
    {
        /*
         * Calculate selectivity using bound histograms. If that fails for
         * some reason, e.g no histogram in pg_statistic, use the default
         * constant estimate for the fraction of non-empty values. This is
         * still somewhat better than just returning the default estimate,
         * because this still takes into account the fraction of empty and
         * NULL tuples, if we had statistics for them.
         */
        hist_selec = calc_hist_selectivity(typcache, vardata, constval,
                                           operator);
        if (hist_selec < 0.0)
            hist_selec = default_range_selectivity(operator);

        /*
         * Now merge the results for the empty ranges and histogram
         * calculations, realizing that the histogram covers only the
         * non-null, non-empty values.
         */
        if (operator == OID_RANGE_CONTAINED_OP)
        {
            /* empty is contained by anything non-empty */
            selec = (1.0 - empty_frac) * hist_selec + empty_frac;
        }
        else
        {
            /* with any other operator, empty Op non-empty matches nothing */
            selec = (1.0 - empty_frac) * hist_selec;
        }
    }

    /* all range operators are strict */
    selec *= (1.0 - null_frac);

    /* result should be in range, but make sure... */
    CLAMP_PROBABILITY(selec);

    return selec;
}
/*
 * Calculate range operator selectivity using histograms of range bounds.
 *
 * This estimate is for the portion of values that are not empty and not
 * NULL.
 *
 * constval must be a non-empty range.  Returns a value below zero (-1.0)
 * when no estimate can be made: there is no usable bounds histogram, or the
 * operator (@> and <@ between ranges) is not implemented here.  The caller
 * is expected to substitute a default in that case.
 */
static double
calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
                      RangeType *constval, Oid operator)
{
    Datum      *hist_values;
    int         nhist;
    RangeBound *hist_lower;
    RangeBound *hist_upper;
    int         i;
    RangeBound  const_lower;
    RangeBound  const_upper;
    bool        empty;
    double      hist_selec;

    /* Try to get histogram of ranges */
    if (!(HeapTupleIsValid(vardata->statsTuple) &&
          get_attstatsslot(vardata->statsTuple,
                           vardata->atttype, vardata->atttypmod,
                           STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
                           NULL,
                           &hist_values, &nhist,
                           NULL, NULL)))
        return -1.0;

    /*
     * A histogram needs at least two entries to delimit one bin.  With fewer,
     * calc_hist_selectivity_scalar() would divide by (nhist - 1) == 0, so
     * report "no estimate" instead.
     */
    if (nhist < 2)
    {
        free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);
        return -1.0;
    }

    /*
     * Convert histogram of ranges into histograms of its lower and upper
     * bounds.
     */
    hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
    hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
    for (i = 0; i < nhist; i++)
    {
        range_deserialize(typcache, DatumGetRangeType(hist_values[i]),
                          &hist_lower[i], &hist_upper[i], &empty);
        /* The histogram should not contain any empty ranges */
        if (empty)
            elog(ERROR, "bounds histogram contains an empty range");
    }

    /* Extract the bounds of the constant value. */
    range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
    Assert(!empty);

    /*
     * Calculate selectivity comparing the lower or upper bound of the
     * constant with the histogram of lower or upper bounds.
     */
    switch (operator)
    {
        case OID_RANGE_LESS_OP:

            /*
             * The regular b-tree comparison operators (<, <=, >, >=) compare
             * the lower bounds first, and the upper bounds for values with
             * equal lower bounds. Estimate that by comparing the lower bounds
             * only. This gives a fairly accurate estimate assuming there
             * aren't many rows with a lower bound equal to the constant's
             * lower bound.
             */
            hist_selec =
                calc_hist_selectivity_scalar(typcache, &const_lower,
                                             hist_lower, nhist, false);
            break;

        case OID_RANGE_LESS_EQUAL_OP:
            hist_selec =
                calc_hist_selectivity_scalar(typcache, &const_lower,
                                             hist_lower, nhist, true);
            break;

        case OID_RANGE_GREATER_OP:
            /* complement of "lower(var) <= lower(const)" */
            hist_selec =
                1 - calc_hist_selectivity_scalar(typcache, &const_lower,
                                                 hist_lower, nhist, true);
            break;

        case OID_RANGE_GREATER_EQUAL_OP:
            /* complement of "lower(var) < lower(const)" */
            hist_selec =
                1 - calc_hist_selectivity_scalar(typcache, &const_lower,
                                                 hist_lower, nhist, false);
            break;

        case OID_RANGE_LEFT_OP:
            /* var << const when upper(var) < lower(const) */
            hist_selec =
                calc_hist_selectivity_scalar(typcache, &const_lower,
                                             hist_upper, nhist, false);
            break;

        case OID_RANGE_RIGHT_OP:
            /* var >> const when lower(var) > upper(const) */
            hist_selec =
                1 - calc_hist_selectivity_scalar(typcache, &const_upper,
                                                 hist_lower, nhist, true);
            break;

        case OID_RANGE_OVERLAPS_RIGHT_OP:
            /* compare lower bounds */
            hist_selec =
                1 - calc_hist_selectivity_scalar(typcache, &const_lower,
                                                 hist_lower, nhist, false);
            break;

        case OID_RANGE_OVERLAPS_LEFT_OP:
            /* compare upper bounds */
            hist_selec =
                calc_hist_selectivity_scalar(typcache, &const_upper,
                                             hist_upper, nhist, true);
            break;

        case OID_RANGE_OVERLAP_OP:
        case OID_RANGE_CONTAINS_ELEM_OP:

            /*
             * A && B <=> NOT (A << B OR A >> B).
             *
             * "range @> elem" is equivalent to "range && [elem,elem]". The
             * caller already constructed the singular range from the element
             * constant, so just treat it the same as &&.
             */
            hist_selec =
                calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
                                             nhist, false);
            hist_selec +=
                (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
                                                    nhist, true));
            hist_selec = 1.0 - hist_selec;
            break;

        case OID_RANGE_CONTAINS_OP:
        case OID_RANGE_CONTAINED_OP:
            /* TODO: not implemented yet */
            hist_selec = -1.0;
            break;

        default:
            elog(ERROR, "unknown range operator %u", operator);
            hist_selec = -1.0;  /* keep compiler quiet */
            break;
    }

    /*
     * Release the statistics slot.  This has to wait until here because the
     * deserialized bounds in hist_lower/hist_upper were extracted from the
     * histogram values and are used by every estimate above.
     */
    free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);

    return hist_selec;
}
/*
 * Estimate, from a histogram of range bounds, the fraction of bounds that
 * are less than the given constant bound (less than or equal to it, when
 * 'equal' is true).
 *
 * The histogram has hist_nvalues entries and therefore hist_nvalues - 1
 * bins.  The estimate is the number of whole bins preceding the constant,
 * plus the constant's interpolated position within its own bin, divided by
 * the number of bins.
 */
static double
calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
                             RangeBound *hist, int hist_nvalues, bool equal)
{
    Selectivity result;
    int         bin;

    /* Locate the last histogram entry that precedes the constant. */
    bin = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);

    /* Whole bins below the constant; -1 (nothing precedes) counts as zero. */
    result = (Selectivity) (Max(bin, 0)) / (Selectivity) (hist_nvalues - 1);

    /*
     * If the constant falls inside a real bin (not before the first entry
     * and not at or beyond the last one), add the share of that bin lying
     * below it.
     */
    if (bin >= 0 && bin < hist_nvalues - 1)
    {
        float8      within_bin;

        within_bin = get_position(typcache, constbound,
                                  &hist[bin], &hist[bin + 1]);
        result += within_bin / (Selectivity) (hist_nvalues - 1);
    }

    return result;
}
/*
 * Binary search on an array of range bounds.
 *
 * Returns the greatest index whose bound compares less than 'value' (or
 * less than or equal to it, when 'equal' is true).  Returns -1 if no entry
 * of the array qualifies.
 */
static int
rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
               int hist_length, bool equal)
{
    int         lo = -1;
    int         hi = hist_length - 1;

    /* Invariant: the answer lies in [lo, hi]; lo == -1 means "none so far". */
    while (lo < hi)
    {
        /* Round up so that the probe is always strictly greater than lo. */
        int         probe = (lo + hi + 1) / 2;
        int         cmp = range_cmp_bounds(typcache, &hist[probe], value);
        bool        qualifies = (cmp < 0) || (equal && cmp == 0);

        if (qualifies)
            lo = probe;
        else
            hi = probe - 1;
    }

    return lo;
}
/*
 * Get relative position of value in histogram bin in [0,1] range.
 *
 * hist1 and hist2 are the lower and upper boundary of the bin.  Whenever
 * the position cannot be computed meaningfully, 0.5 (the middle of the bin)
 * is returned.
 */
static float8
get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
             RangeBound *hist2)
{
    float8      bin_width;
    float8      position;

    /*
     * If both bin boundaries are infinite, they should be equal to each
     * other, and the value should also be infinite and equal to both bounds.
     * (But don't Assert that, to avoid crashing if a user creates a datatype
     * with a broken comparison function.)  Assume the value lies in the
     * middle of the infinite bounds.
     */
    if (hist1->infinite && hist2->infinite)
        return 0.5;

    /*
     * Lower bin boundary is -infinite, upper is finite.  A -infinite value
     * sits exactly on the lower boundary; anything else is infinitely far
     * away from it.
     */
    if (hist1->infinite)
        return (value->infinite && value->lower) ? 0.0 : 1.0;

    /* The mirror image: finite lower boundary, +infinite upper boundary. */
    if (hist2->infinite)
        return (value->infinite && !value->lower) ? 1.0 : 0.0;

    /*
     * From here on both boundaries are finite.  Assuming the subtype's
     * comparison function works sanely, the value must be finite too,
     * because it lies somewhere between the boundaries.  If it isn't, just
     * return something.
     */
    if (value->infinite)
        return 0.5;

    /* Can't interpolate without subdiff function */
    if (!OidIsValid(typcache->rng_subdiff_finfo.fn_oid))
        return 0.5;

    /* Width of the bin, according to the subtype's subdiff function. */
    bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
                                                 typcache->rng_collation,
                                                 hist2->val,
                                                 hist1->val));
    if (bin_width <= 0.0)
        return 0.5;             /* zero width bin */

    /* Distance of the value from the lower boundary, as a share of the bin. */
    position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
                                                typcache->rng_collation,
                                                value->val,
                                                hist1->val))
        / bin_width;

    /* Relative position must be in [0,1] range */
    position = Max(position, 0.0);
    position = Min(position, 1.0);

    return position;
}