Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. One can specify the multirange type name using the multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* multirangetypes_selfuncs.c
|
|
|
|
* Functions for selectivity estimation of multirange operators
|
|
|
|
*
|
|
|
|
* Estimates are based on histograms of lower and upper bounds, and the
|
|
|
|
* fraction of empty multiranges.
|
|
|
|
*
|
2023-01-02 21:00:37 +01:00
|
|
|
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. One can specify the multirange type name using the multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
|
|
|
* src/backend/utils/adt/multirangetypes_selfuncs.c
|
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
|
|
|
|
#include <math.h>
|
|
|
|
|
|
|
|
#include "access/htup_details.h"
|
|
|
|
#include "catalog/pg_operator.h"
|
|
|
|
#include "catalog/pg_statistic.h"
|
|
|
|
#include "catalog/pg_type.h"
|
|
|
|
#include "utils/float.h"
|
|
|
|
#include "utils/fmgrprotos.h"
|
|
|
|
#include "utils/lsyscache.h"
|
|
|
|
#include "utils/rangetypes.h"
|
|
|
|
#include "utils/multirangetypes.h"
|
|
|
|
#include "utils/selfuncs.h"
|
|
|
|
#include "utils/typcache.h"
|
|
|
|
|
|
|
|
static double calc_multirangesel(TypeCacheEntry *typcache,
|
|
|
|
VariableStatData *vardata,
|
|
|
|
const MultirangeType *constval, Oid operator);
|
|
|
|
static double default_multirange_selectivity(Oid operator);
|
|
|
|
static double calc_hist_selectivity(TypeCacheEntry *typcache,
|
|
|
|
VariableStatData *vardata,
|
|
|
|
const MultirangeType *constval,
|
|
|
|
Oid operator);
|
|
|
|
static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
|
|
|
|
const RangeBound *constbound,
|
|
|
|
const RangeBound *hist,
|
|
|
|
int hist_nvalues, bool equal);
|
|
|
|
static int rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value,
|
|
|
|
const RangeBound *hist, int hist_length, bool equal);
|
|
|
|
static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value,
|
|
|
|
const RangeBound *hist1, const RangeBound *hist2);
|
|
|
|
static float8 get_len_position(double value, double hist1, double hist2);
|
|
|
|
static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1,
|
|
|
|
const RangeBound *bound2);
|
|
|
|
static int length_hist_bsearch(Datum *length_hist_values,
|
|
|
|
int length_hist_nvalues, double value,
|
|
|
|
bool equal);
|
|
|
|
static double calc_length_hist_frac(Datum *length_hist_values,
|
|
|
|
int length_hist_nvalues, double length1,
|
|
|
|
double length2, bool equal);
|
|
|
|
static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
|
|
|
|
const RangeBound *lower,
|
|
|
|
RangeBound *upper,
|
|
|
|
const RangeBound *hist_lower,
|
|
|
|
int hist_nvalues,
|
|
|
|
Datum *length_hist_values,
|
|
|
|
int length_hist_nvalues);
|
|
|
|
static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
|
|
|
|
const RangeBound *lower,
|
|
|
|
const RangeBound *upper,
|
|
|
|
const RangeBound *hist_lower,
|
|
|
|
int hist_nvalues,
|
|
|
|
Datum *length_hist_values,
|
|
|
|
int length_hist_nvalues);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Returns a default selectivity estimate for given operator, when we don't
|
|
|
|
* have statistics or cannot use them for some reason.
|
|
|
|
*/
|
|
|
|
static double
|
|
|
|
default_multirange_selectivity(Oid operator)
|
|
|
|
{
|
|
|
|
switch (operator)
|
|
|
|
{
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
|
|
|
|
case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
|
|
|
|
return 0.01;
|
|
|
|
|
2020-12-29 21:35:33 +01:00
|
|
|
case OID_RANGE_CONTAINS_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_MULTIRANGE_CONTAINED_OP:
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
case OID_MULTIRANGE_CONTAINS_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_RANGE_CONTAINED_OP:
|
|
|
|
case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
|
|
|
|
return 0.005;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_CONTAINS_ELEM_OP:
|
|
|
|
case OID_MULTIRANGE_ELEM_CONTAINED_OP:
|
|
|
|
|
|
|
|
/*
|
|
|
|
* "multirange @> elem" is more or less identical to a scalar
|
|
|
|
* inequality "A >= b AND A <= c".
|
|
|
|
*/
|
|
|
|
return DEFAULT_MULTIRANGE_INEQ_SEL;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_LESS_OP:
|
|
|
|
case OID_MULTIRANGE_LESS_EQUAL_OP:
|
|
|
|
case OID_MULTIRANGE_GREATER_OP:
|
|
|
|
case OID_MULTIRANGE_GREATER_EQUAL_OP:
|
|
|
|
case OID_MULTIRANGE_LEFT_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_LEFT_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_RIGHT_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_RIGHT_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
|
|
|
|
case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
|
|
|
|
case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
|
|
|
|
/* these are similar to regular scalar inequalities */
|
|
|
|
return DEFAULT_INEQ_SEL;
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
|
|
|
/*
|
|
|
|
* all multirange operators should be handled above, but just in
|
|
|
|
* case
|
|
|
|
*/
|
|
|
|
return 0.01;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* multirangesel -- restriction selectivity for multirange operators
|
|
|
|
*/
|
|
|
|
Datum
|
|
|
|
multirangesel(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
|
|
|
|
Oid operator = PG_GETARG_OID(1);
|
|
|
|
List *args = (List *) PG_GETARG_POINTER(2);
|
|
|
|
int varRelid = PG_GETARG_INT32(3);
|
|
|
|
VariableStatData vardata;
|
|
|
|
Node *other;
|
|
|
|
bool varonleft;
|
|
|
|
Selectivity selec;
|
|
|
|
TypeCacheEntry *typcache = NULL;
|
|
|
|
MultirangeType *constmultirange = NULL;
|
|
|
|
RangeType *constrange = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If expression is not (variable op something) or (something op
|
|
|
|
* variable), then punt and return a default estimate.
|
|
|
|
*/
|
|
|
|
if (!get_restriction_variable(root, args, varRelid,
|
|
|
|
&vardata, &other, &varonleft))
|
|
|
|
PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Can't do anything useful if the something is not a constant, either.
|
|
|
|
*/
|
|
|
|
if (!IsA(other, Const))
|
|
|
|
{
|
|
|
|
ReleaseVariableStats(vardata);
|
|
|
|
PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* All the multirange operators are strict, so we can cope with a NULL
|
|
|
|
* constant right away.
|
|
|
|
*/
|
|
|
|
if (((Const *) other)->constisnull)
|
|
|
|
{
|
|
|
|
ReleaseVariableStats(vardata);
|
|
|
|
PG_RETURN_FLOAT8(0.0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If var is on the right, commute the operator, so that we can assume the
|
|
|
|
* var is on the left in what follows.
|
|
|
|
*/
|
|
|
|
if (!varonleft)
|
|
|
|
{
|
|
|
|
/* we have other Op var, commute to make var Op other */
|
|
|
|
operator = get_commutator(operator);
|
|
|
|
if (!operator)
|
|
|
|
{
|
|
|
|
/* Use default selectivity (should we raise an error instead?) */
|
|
|
|
ReleaseVariableStats(vardata);
|
|
|
|
PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* OK, there's a Var and a Const we're dealing with here. We need the
|
|
|
|
* Const to be of same multirange type as the column, else we can't do
|
|
|
|
* anything useful. (Such cases will likely fail at runtime, but here we'd
|
|
|
|
* rather just return a default estimate.)
|
|
|
|
*
|
|
|
|
* If the operator is "multirange @> element", the constant should be of
|
|
|
|
* the element type of the multirange column. Convert it to a multirange
|
|
|
|
* that includes only that single point, so that we don't need special
|
|
|
|
* handling for that in what follows.
|
|
|
|
*/
|
|
|
|
if (operator == OID_MULTIRANGE_CONTAINS_ELEM_OP)
|
|
|
|
{
|
|
|
|
typcache = multirange_get_typcache(fcinfo, vardata.vartype);
|
|
|
|
|
|
|
|
if (((Const *) other)->consttype == typcache->rngtype->rngelemtype->type_id)
|
|
|
|
{
|
|
|
|
RangeBound lower,
|
|
|
|
upper;
|
|
|
|
|
|
|
|
lower.inclusive = true;
|
|
|
|
lower.val = ((Const *) other)->constvalue;
|
|
|
|
lower.infinite = false;
|
|
|
|
lower.lower = true;
|
|
|
|
upper.inclusive = true;
|
|
|
|
upper.val = ((Const *) other)->constvalue;
|
|
|
|
upper.infinite = false;
|
|
|
|
upper.lower = false;
|
2022-12-15 18:18:36 +01:00
|
|
|
constrange = range_serialize(typcache->rngtype, &lower, &upper,
|
|
|
|
false, NULL);
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
|
|
|
|
1, &constrange);
|
|
|
|
}
|
|
|
|
}
|
2020-12-29 21:35:33 +01:00
|
|
|
else if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
|
|
|
|
operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
operator == OID_MULTIRANGE_OVERLAPS_RANGE_OP ||
|
|
|
|
operator == OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP ||
|
|
|
|
operator == OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP ||
|
|
|
|
operator == OID_MULTIRANGE_LEFT_RANGE_OP ||
|
|
|
|
operator == OID_MULTIRANGE_RIGHT_RANGE_OP)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Promote a range in "multirange OP range" just like we do an element
|
|
|
|
* in "multirange OP element".
|
|
|
|
*/
|
|
|
|
typcache = multirange_get_typcache(fcinfo, vardata.vartype);
|
|
|
|
if (((Const *) other)->consttype == typcache->rngtype->type_id)
|
|
|
|
{
|
|
|
|
constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
|
|
|
|
constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
|
|
|
|
1, &constrange);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (operator == OID_RANGE_OVERLAPS_MULTIRANGE_OP ||
|
|
|
|
operator == OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP ||
|
|
|
|
operator == OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP ||
|
|
|
|
operator == OID_RANGE_LEFT_MULTIRANGE_OP ||
|
|
|
|
operator == OID_RANGE_RIGHT_MULTIRANGE_OP ||
|
2020-12-29 21:35:33 +01:00
|
|
|
operator == OID_RANGE_CONTAINS_MULTIRANGE_OP ||
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
operator == OID_MULTIRANGE_ELEM_CONTAINED_OP ||
|
|
|
|
operator == OID_MULTIRANGE_RANGE_CONTAINED_OP)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Here, the Var is the elem/range, not the multirange. For now we
|
|
|
|
* just punt and return the default estimate. In future we could
|
|
|
|
* disassemble the multirange constant to do something more
|
|
|
|
* intelligent.
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
else if (((Const *) other)->consttype == vardata.vartype)
|
|
|
|
{
|
|
|
|
/* Both sides are the same multirange type */
|
|
|
|
typcache = multirange_get_typcache(fcinfo, vardata.vartype);
|
|
|
|
|
|
|
|
constmultirange = DatumGetMultirangeTypeP(((Const *) other)->constvalue);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we got a valid constant on one side of the operator, proceed to
|
|
|
|
* estimate using statistics. Otherwise punt and return a default constant
|
|
|
|
* estimate. Note that calc_multirangesel need not handle
|
|
|
|
* OID_MULTIRANGE_*_CONTAINED_OP.
|
|
|
|
*/
|
|
|
|
if (constmultirange)
|
|
|
|
selec = calc_multirangesel(typcache, &vardata, constmultirange, operator);
|
|
|
|
else
|
|
|
|
selec = default_multirange_selectivity(operator);
|
|
|
|
|
|
|
|
ReleaseVariableStats(vardata);
|
|
|
|
|
|
|
|
CLAMP_PROBABILITY(selec);
|
|
|
|
|
|
|
|
PG_RETURN_FLOAT8((float8) selec);
|
|
|
|
}
|
|
|
|
|
|
|
|
static double
|
|
|
|
calc_multirangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
|
|
|
|
const MultirangeType *constval, Oid operator)
|
|
|
|
{
|
|
|
|
double hist_selec;
|
|
|
|
double selec;
|
|
|
|
float4 empty_frac,
|
|
|
|
null_frac;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* First look up the fraction of NULLs and empty multiranges from
|
|
|
|
* pg_statistic.
|
|
|
|
*/
|
|
|
|
if (HeapTupleIsValid(vardata->statsTuple))
|
|
|
|
{
|
|
|
|
Form_pg_statistic stats;
|
|
|
|
AttStatsSlot sslot;
|
|
|
|
|
|
|
|
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
|
|
|
|
null_frac = stats->stanullfrac;
|
|
|
|
|
|
|
|
/* Try to get fraction of empty multiranges */
|
|
|
|
if (get_attstatsslot(&sslot, vardata->statsTuple,
|
|
|
|
STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
|
|
|
|
InvalidOid,
|
|
|
|
ATTSTATSSLOT_NUMBERS))
|
|
|
|
{
|
|
|
|
if (sslot.nnumbers != 1)
|
|
|
|
elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
|
|
|
|
empty_frac = sslot.numbers[0];
|
|
|
|
free_attstatsslot(&sslot);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* No empty fraction statistic. Assume no empty ranges. */
|
|
|
|
empty_frac = 0.0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* No stats are available. Follow through the calculations below
|
|
|
|
* anyway, assuming no NULLs and no empty multiranges. This still
|
|
|
|
* allows us to give a better-than-nothing estimate based on whether
|
|
|
|
* the constant is an empty multirange or not.
|
|
|
|
*/
|
|
|
|
null_frac = 0.0;
|
|
|
|
empty_frac = 0.0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (MultirangeIsEmpty(constval))
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* An empty multirange matches all multiranges, all empty multiranges,
|
|
|
|
* or nothing, depending on the operator
|
|
|
|
*/
|
|
|
|
switch (operator)
|
|
|
|
{
|
|
|
|
/* these return false if either argument is empty */
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
|
2021-06-29 22:18:09 +02:00
|
|
|
case OID_MULTIRANGE_LEFT_RANGE_OP:
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
|
2021-06-29 22:18:09 +02:00
|
|
|
case OID_MULTIRANGE_RIGHT_RANGE_OP:
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
|
|
|
|
/* nothing is less than an empty multirange */
|
|
|
|
case OID_MULTIRANGE_LESS_OP:
|
|
|
|
selec = 0.0;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* only empty multiranges can be contained by an empty
|
|
|
|
* multirange
|
|
|
|
*/
|
2021-06-29 22:18:09 +02:00
|
|
|
case OID_RANGE_MULTIRANGE_CONTAINED_OP:
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
|
|
|
|
/* only empty ranges are <= an empty multirange */
|
|
|
|
case OID_MULTIRANGE_LESS_EQUAL_OP:
|
|
|
|
selec = empty_frac;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* everything contains an empty multirange */
|
|
|
|
case OID_MULTIRANGE_CONTAINS_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
|
|
|
|
/* everything is >= an empty multirange */
|
|
|
|
case OID_MULTIRANGE_GREATER_EQUAL_OP:
|
|
|
|
selec = 1.0;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* all non-empty multiranges are > an empty multirange */
|
|
|
|
case OID_MULTIRANGE_GREATER_OP:
|
|
|
|
selec = 1.0 - empty_frac;
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* an element cannot be empty */
|
|
|
|
case OID_MULTIRANGE_CONTAINS_ELEM_OP:
|
2021-06-29 22:18:09 +02:00
|
|
|
|
|
|
|
/* filtered out by multirangesel() */
|
|
|
|
case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_LEFT_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_RIGHT_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_CONTAINS_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_ELEM_CONTAINED_OP:
|
|
|
|
case OID_MULTIRANGE_RANGE_CONTAINED_OP:
|
|
|
|
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
default:
|
|
|
|
elog(ERROR, "unexpected operator %u", operator);
|
|
|
|
selec = 0.0; /* keep compiler quiet */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Calculate selectivity using bound histograms. If that fails for
|
|
|
|
* some reason, e.g no histogram in pg_statistic, use the default
|
|
|
|
* constant estimate for the fraction of non-empty values. This is
|
|
|
|
* still somewhat better than just returning the default estimate,
|
|
|
|
* because this still takes into account the fraction of empty and
|
|
|
|
* NULL tuples, if we had statistics for them.
|
|
|
|
*/
|
|
|
|
hist_selec = calc_hist_selectivity(typcache, vardata, constval,
|
|
|
|
operator);
|
|
|
|
if (hist_selec < 0.0)
|
|
|
|
hist_selec = default_multirange_selectivity(operator);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now merge the results for the empty multiranges and histogram
|
|
|
|
* calculations, realizing that the histogram covers only the
|
|
|
|
* non-null, non-empty values.
|
|
|
|
*/
|
2021-06-29 22:18:09 +02:00
|
|
|
if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
|
|
|
|
{
|
|
|
|
/* empty is contained by anything non-empty */
|
|
|
|
selec = (1.0 - empty_frac) * hist_selec + empty_frac;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* with any other operator, empty Op non-empty matches nothing */
|
|
|
|
selec = (1.0 - empty_frac) * hist_selec;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* all multirange operators are strict */
|
|
|
|
selec *= (1.0 - null_frac);
|
|
|
|
|
|
|
|
/* result should be in range, but make sure... */
|
|
|
|
CLAMP_PROBABILITY(selec);
|
|
|
|
|
|
|
|
return selec;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate multirange operator selectivity using histograms of multirange bounds.
|
|
|
|
*
|
|
|
|
* This estimate is for the portion of values that are not empty and not
|
|
|
|
* NULL.
|
|
|
|
*/
|
|
|
|
static double
|
|
|
|
calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
|
|
|
|
const MultirangeType *constval, Oid operator)
|
|
|
|
{
|
|
|
|
TypeCacheEntry *rng_typcache = typcache->rngtype;
|
|
|
|
AttStatsSlot hslot;
|
|
|
|
AttStatsSlot lslot;
|
|
|
|
int nhist;
|
|
|
|
RangeBound *hist_lower;
|
|
|
|
RangeBound *hist_upper;
|
|
|
|
int i;
|
|
|
|
RangeBound const_lower;
|
|
|
|
RangeBound const_upper;
|
|
|
|
RangeBound tmp;
|
|
|
|
double hist_selec;
|
|
|
|
|
|
|
|
/* Can't use the histogram with insecure multirange support functions */
|
|
|
|
if (!statistic_proc_security_check(vardata,
|
|
|
|
rng_typcache->rng_cmp_proc_finfo.fn_oid))
|
|
|
|
return -1;
|
|
|
|
if (OidIsValid(rng_typcache->rng_subdiff_finfo.fn_oid) &&
|
|
|
|
!statistic_proc_security_check(vardata,
|
|
|
|
rng_typcache->rng_subdiff_finfo.fn_oid))
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
/* Try to get histogram of ranges */
|
|
|
|
if (!(HeapTupleIsValid(vardata->statsTuple) &&
|
|
|
|
get_attstatsslot(&hslot, vardata->statsTuple,
|
|
|
|
STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
|
|
|
|
ATTSTATSSLOT_VALUES)))
|
|
|
|
return -1.0;
|
|
|
|
|
|
|
|
/* check that it's a histogram, not just a dummy entry */
|
|
|
|
if (hslot.nvalues < 2)
|
|
|
|
{
|
|
|
|
free_attstatsslot(&hslot);
|
|
|
|
return -1.0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Convert histogram of ranges into histograms of its lower and upper
|
|
|
|
* bounds.
|
|
|
|
*/
|
|
|
|
nhist = hslot.nvalues;
|
|
|
|
hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
|
|
|
|
hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
|
|
|
|
for (i = 0; i < nhist; i++)
|
|
|
|
{
|
|
|
|
bool empty;
|
|
|
|
|
|
|
|
range_deserialize(rng_typcache, DatumGetRangeTypeP(hslot.values[i]),
|
|
|
|
&hist_lower[i], &hist_upper[i], &empty);
|
|
|
|
/* The histogram should not contain any empty ranges */
|
|
|
|
if (empty)
|
|
|
|
elog(ERROR, "bounds histogram contains an empty range");
|
|
|
|
}
|
|
|
|
|
|
|
|
/* @> and @< also need a histogram of range lengths */
|
|
|
|
if (operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
|
|
|
|
operator == OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP ||
|
|
|
|
operator == OID_MULTIRANGE_RANGE_CONTAINED_OP ||
|
|
|
|
operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
|
|
|
|
{
|
|
|
|
if (!(HeapTupleIsValid(vardata->statsTuple) &&
|
|
|
|
get_attstatsslot(&lslot, vardata->statsTuple,
|
|
|
|
STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
|
|
|
|
InvalidOid,
|
|
|
|
ATTSTATSSLOT_VALUES)))
|
|
|
|
{
|
|
|
|
free_attstatsslot(&hslot);
|
|
|
|
return -1.0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check that it's a histogram, not just a dummy entry */
|
|
|
|
if (lslot.nvalues < 2)
|
|
|
|
{
|
|
|
|
free_attstatsslot(&lslot);
|
|
|
|
free_attstatsslot(&hslot);
|
|
|
|
return -1.0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
memset(&lslot, 0, sizeof(lslot));
|
|
|
|
|
|
|
|
/* Extract the bounds of the constant value. */
|
|
|
|
Assert(constval->rangeCount > 0);
|
|
|
|
multirange_get_bounds(rng_typcache, constval, 0,
|
|
|
|
&const_lower, &tmp);
|
|
|
|
multirange_get_bounds(rng_typcache, constval, constval->rangeCount - 1,
|
|
|
|
&tmp, &const_upper);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate selectivity comparing the lower or upper bound of the
|
|
|
|
* constant with the histogram of lower or upper bounds.
|
|
|
|
*/
|
|
|
|
switch (operator)
|
|
|
|
{
|
|
|
|
case OID_MULTIRANGE_LESS_OP:
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The regular b-tree comparison operators (<, <=, >, >=) compare
|
|
|
|
* the lower bounds first, and the upper bounds for values with
|
|
|
|
* equal lower bounds. Estimate that by comparing the lower bounds
|
|
|
|
* only. This gives a fairly accurate estimate assuming there
|
|
|
|
* aren't many rows with a lower bound equal to the constant's
|
|
|
|
* lower bound.
|
|
|
|
*/
|
|
|
|
hist_selec =
|
|
|
|
calc_hist_selectivity_scalar(rng_typcache, &const_lower,
|
|
|
|
hist_lower, nhist, false);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_LESS_EQUAL_OP:
|
|
|
|
hist_selec =
|
|
|
|
calc_hist_selectivity_scalar(rng_typcache, &const_lower,
|
|
|
|
hist_lower, nhist, true);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_GREATER_OP:
|
|
|
|
hist_selec =
|
|
|
|
1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
|
|
|
|
hist_lower, nhist, false);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_GREATER_EQUAL_OP:
|
|
|
|
hist_selec =
|
|
|
|
1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
|
|
|
|
hist_lower, nhist, true);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_LEFT_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
|
|
|
|
/* var << const when upper(var) < lower(const) */
|
|
|
|
hist_selec =
|
|
|
|
calc_hist_selectivity_scalar(rng_typcache, &const_lower,
|
|
|
|
hist_upper, nhist, false);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_RIGHT_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
|
|
|
|
/* var >> const when lower(var) > upper(const) */
|
|
|
|
hist_selec =
|
|
|
|
1 - calc_hist_selectivity_scalar(rng_typcache, &const_upper,
|
|
|
|
hist_lower, nhist, true);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
|
|
|
|
/* compare lower bounds */
|
|
|
|
hist_selec =
|
|
|
|
1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
|
|
|
|
hist_lower, nhist, false);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
|
|
|
|
/* compare upper bounds */
|
|
|
|
hist_selec =
|
|
|
|
calc_hist_selectivity_scalar(rng_typcache, &const_upper,
|
|
|
|
hist_upper, nhist, true);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_CONTAINS_ELEM_OP:
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A && B <=> NOT (A << B OR A >> B).
|
|
|
|
*
|
|
|
|
* Since A << B and A >> B are mutually exclusive events we can
|
|
|
|
* sum their probabilities to find probability of (A << B OR A >>
|
|
|
|
* B).
|
|
|
|
*
|
|
|
|
* "multirange @> elem" is equivalent to "multirange &&
|
|
|
|
* {[elem,elem]}". The caller already constructed the singular
|
|
|
|
* range from the element constant, so just treat it the same as
|
|
|
|
* &&.
|
|
|
|
*/
|
|
|
|
hist_selec =
|
|
|
|
calc_hist_selectivity_scalar(rng_typcache,
|
|
|
|
&const_lower, hist_upper,
|
|
|
|
nhist, false);
|
|
|
|
hist_selec +=
|
|
|
|
(1.0 - calc_hist_selectivity_scalar(rng_typcache,
|
|
|
|
&const_upper, hist_lower,
|
|
|
|
nhist, true));
|
|
|
|
hist_selec = 1.0 - hist_selec;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_CONTAINS_RANGE_OP:
|
|
|
|
case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
|
|
|
|
hist_selec =
|
|
|
|
calc_hist_selectivity_contains(rng_typcache, &const_lower,
|
|
|
|
&const_upper, hist_lower, nhist,
|
|
|
|
lslot.values, lslot.nvalues);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
|
2020-12-30 18:19:15 +01:00
|
|
|
case OID_RANGE_MULTIRANGE_CONTAINED_OP:
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
if (const_lower.infinite)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Lower bound no longer matters. Just estimate the fraction
|
|
|
|
* with an upper bound <= const upper bound
|
|
|
|
*/
|
|
|
|
hist_selec =
|
|
|
|
calc_hist_selectivity_scalar(rng_typcache, &const_upper,
|
|
|
|
hist_upper, nhist, true);
|
|
|
|
}
|
|
|
|
else if (const_upper.infinite)
|
|
|
|
{
|
|
|
|
hist_selec =
|
|
|
|
1.0 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
|
|
|
|
hist_lower, nhist, false);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
hist_selec =
|
|
|
|
calc_hist_selectivity_contained(rng_typcache, &const_lower,
|
|
|
|
&const_upper, hist_lower, nhist,
|
|
|
|
lslot.values, lslot.nvalues);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2021-06-29 22:18:09 +02:00
|
|
|
/* filtered out by multirangesel() */
|
|
|
|
case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_LEFT_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_RIGHT_MULTIRANGE_OP:
|
|
|
|
case OID_RANGE_CONTAINS_MULTIRANGE_OP:
|
|
|
|
case OID_MULTIRANGE_ELEM_CONTAINED_OP:
|
|
|
|
case OID_MULTIRANGE_RANGE_CONTAINED_OP:
|
|
|
|
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
default:
|
|
|
|
elog(ERROR, "unknown multirange operator %u", operator);
|
|
|
|
hist_selec = -1.0; /* keep compiler quiet */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
free_attstatsslot(&lslot);
|
|
|
|
free_attstatsslot(&hslot);
|
|
|
|
|
|
|
|
return hist_selec;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Look up the fraction of values less than (or equal, if 'equal' argument
|
|
|
|
* is true) a given const in a histogram of range bounds.
|
|
|
|
*/
|
|
|
|
static double
|
|
|
|
calc_hist_selectivity_scalar(TypeCacheEntry *typcache, const RangeBound *constbound,
|
|
|
|
const RangeBound *hist, int hist_nvalues, bool equal)
|
|
|
|
{
|
|
|
|
Selectivity selec;
|
|
|
|
int index;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the histogram bin the given constant falls into. Estimate
|
|
|
|
* selectivity as the number of preceding whole bins.
|
|
|
|
*/
|
|
|
|
index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
|
|
|
|
selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
|
|
|
|
|
|
|
|
/* Adjust using linear interpolation within the bin */
|
|
|
|
if (index >= 0 && index < hist_nvalues - 1)
|
|
|
|
selec += get_position(typcache, constbound, &hist[index],
|
|
|
|
&hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
|
|
|
|
|
|
|
|
return selec;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Binary search on an array of range bounds. Returns greatest index of range
|
|
|
|
* bound in array which is less(less or equal) than given range bound. If all
|
|
|
|
* range bounds in array are greater or equal(greater) than given range bound,
|
|
|
|
* return -1. When "equal" flag is set conditions in brackets are used.
|
|
|
|
*
|
|
|
|
* This function is used in scalar operator selectivity estimation. Another
|
|
|
|
* goal of this function is to find a histogram bin where to stop
|
2021-08-06 20:55:59 +02:00
|
|
|
* interpolation of portion of bounds which are less than or equal to given bound.
|
Multirange datatypes
Multiranges are basically sorted arrays of non-overlapping ranges with
set-theoretic operations defined over them.
Since v14, each range type automatically gets a corresponding multirange
datatype. There are both manual and automatic mechanisms for naming multirange
types. Once can specify multirange type name using multirange_type_name
attribute in CREATE TYPE. Otherwise, a multirange type name is generated
automatically. If the range type name contains "range" then we change that to
"multirange". Otherwise, we add "_multirange" to the end.
Implementation of multiranges comes with a space-efficient internal
representation format, which evades extra paddings and duplicated storage of
oids. Altogether this format allows fetching a particular range by its index
in O(n).
Statistic gathering and selectivity estimation are implemented for multiranges.
For this purpose, stored multirange is approximated as union range without gaps.
This field will likely need improvements in the future.
Catversion is bumped.
Discussion: https://postgr.es/m/CALNJ-vSUpQ_Y%3DjXvTxt1VYFztaBSsWVXeF1y6gTYQ4bOiWDLgQ%40mail.gmail.com
Discussion: https://postgr.es/m/a0b8026459d1e6167933be2104a6174e7d40d0ab.camel%40j-davis.com#fe7218c83b08068bfffb0c5293eceda0
Author: Paul Jungwirth, revised by me
Reviewed-by: David Fetter, Corey Huinker, Jeff Davis, Pavel Stehule
Reviewed-by: Alvaro Herrera, Tom Lane, Isaac Morland, David G. Johnston
Reviewed-by: Zhihong Yu, Alexander Korotkov
2020-12-20 05:20:33 +01:00
|
|
|
*/
|
|
|
|
static int
|
|
|
|
rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist,
|
|
|
|
int hist_length, bool equal)
|
|
|
|
{
|
|
|
|
int lower = -1,
|
|
|
|
upper = hist_length - 1,
|
|
|
|
cmp,
|
|
|
|
middle;
|
|
|
|
|
|
|
|
while (lower < upper)
|
|
|
|
{
|
|
|
|
middle = (lower + upper + 1) / 2;
|
|
|
|
cmp = range_cmp_bounds(typcache, &hist[middle], value);
|
|
|
|
|
|
|
|
if (cmp < 0 || (equal && cmp == 0))
|
|
|
|
lower = middle;
|
|
|
|
else
|
|
|
|
upper = middle - 1;
|
|
|
|
}
|
|
|
|
return lower;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Binary search on length histogram. Returns greatest index of range length in
|
|
|
|
* histogram which is less than (less than or equal) the given length value. If
|
|
|
|
* all lengths in the histogram are greater than (greater than or equal) the
|
|
|
|
* given length, returns -1.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
|
|
|
|
double value, bool equal)
|
|
|
|
{
|
|
|
|
int lower = -1,
|
|
|
|
upper = length_hist_nvalues - 1,
|
|
|
|
middle;
|
|
|
|
|
|
|
|
while (lower < upper)
|
|
|
|
{
|
|
|
|
double middleval;
|
|
|
|
|
|
|
|
middle = (lower + upper + 1) / 2;
|
|
|
|
|
|
|
|
middleval = DatumGetFloat8(length_hist_values[middle]);
|
|
|
|
if (middleval < value || (equal && middleval <= value))
|
|
|
|
lower = middle;
|
|
|
|
else
|
|
|
|
upper = middle - 1;
|
|
|
|
}
|
|
|
|
return lower;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get relative position of value in histogram bin in [0,1] range.
|
|
|
|
*/
|
|
|
|
static float8
|
|
|
|
get_position(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist1,
|
|
|
|
const RangeBound *hist2)
|
|
|
|
{
|
|
|
|
bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
|
|
|
|
float8 position;
|
|
|
|
|
|
|
|
if (!hist1->infinite && !hist2->infinite)
|
|
|
|
{
|
|
|
|
float8 bin_width;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Both bounds are finite. Assuming the subtype's comparison function
|
|
|
|
* works sanely, the value must be finite, too, because it lies
|
|
|
|
* somewhere between the bounds. If it doesn't, arbitrarily return
|
|
|
|
* 0.5.
|
|
|
|
*/
|
|
|
|
if (value->infinite)
|
|
|
|
return 0.5;
|
|
|
|
|
|
|
|
/* Can't interpolate without subdiff function */
|
|
|
|
if (!has_subdiff)
|
|
|
|
return 0.5;
|
|
|
|
|
|
|
|
/* Calculate relative position using subdiff function. */
|
|
|
|
bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
|
|
|
|
typcache->rng_collation,
|
|
|
|
hist2->val,
|
|
|
|
hist1->val));
|
|
|
|
if (isnan(bin_width) || bin_width <= 0.0)
|
|
|
|
return 0.5; /* punt for NaN or zero-width bin */
|
|
|
|
|
|
|
|
position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
|
|
|
|
typcache->rng_collation,
|
|
|
|
value->val,
|
|
|
|
hist1->val))
|
|
|
|
/ bin_width;
|
|
|
|
|
|
|
|
if (isnan(position))
|
|
|
|
return 0.5; /* punt for NaN from subdiff, Inf/Inf, etc */
|
|
|
|
|
|
|
|
/* Relative position must be in [0,1] range */
|
|
|
|
position = Max(position, 0.0);
|
|
|
|
position = Min(position, 1.0);
|
|
|
|
return position;
|
|
|
|
}
|
|
|
|
else if (hist1->infinite && !hist2->infinite)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Lower bin boundary is -infinite, upper is finite. If the value is
|
|
|
|
* -infinite, return 0.0 to indicate it's equal to the lower bound.
|
|
|
|
* Otherwise return 1.0 to indicate it's infinitely far from the lower
|
|
|
|
* bound.
|
|
|
|
*/
|
|
|
|
return ((value->infinite && value->lower) ? 0.0 : 1.0);
|
|
|
|
}
|
|
|
|
else if (!hist1->infinite && hist2->infinite)
|
|
|
|
{
|
|
|
|
/* same as above, but in reverse */
|
|
|
|
return ((value->infinite && !value->lower) ? 1.0 : 0.0);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* If both bin boundaries are infinite, they should be equal to each
|
|
|
|
* other, and the value should also be infinite and equal to both
|
|
|
|
* bounds. (But don't Assert that, to avoid crashing if a user creates
|
|
|
|
* a datatype with a broken comparison function).
|
|
|
|
*
|
|
|
|
* Assume the value to lie in the middle of the infinite bounds.
|
|
|
|
*/
|
|
|
|
return 0.5;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Get relative position of value in a length histogram bin in [0,1] range.
 *
 * 'hist1' and 'hist2' are the lower and upper boundaries of the bin and
 * 'value' is the length being located.  The result is 0.0 when the value
 * sits at the lower boundary and 1.0 when it sits at the upper one.
 */
static double
get_len_position(double value, double hist1, double hist2)
{
	if (!isinf(hist1) && !isinf(hist2))
	{
		/*
		 * Both bounds are finite. The value should be finite too, because it
		 * lies somewhere between the bounds. If it doesn't, just return
		 * something.
		 */
		if (isinf(value))
			return 0.5;

		return 1.0 - (hist2 - value) / (hist2 - hist1);
	}
	else if (isinf(hist1) && !isinf(hist2))
	{
		/*
		 * Lower bin boundary is infinite, upper is finite. Return 1.0 to
		 * indicate the value is infinitely far from the lower bound.
		 */
		return 1.0;
	}
	else if (!isinf(hist1) && isinf(hist2))
	{
		/*
		 * Same as above, but in reverse: the lower boundary is finite and
		 * the upper one is infinite, so the value is infinitely far from the
		 * upper bound.
		 */
		return 0.0;
	}
	else
	{
		/*
		 * If both bin boundaries are infinite, they should be equal to each
		 * other, and the value should also be infinite and equal to both
		 * bounds. (But don't Assert that, to avoid crashing unnecessarily if
		 * the caller messes up)
		 *
		 * Assume the value to lie in the middle of the infinite bounds.
		 */
		return 0.5;
	}
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Measure distance between two range bounds.
|
|
|
|
*/
|
|
|
|
static float8
|
|
|
|
get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBound *bound2)
|
|
|
|
{
|
|
|
|
bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
|
|
|
|
|
|
|
|
if (!bound1->infinite && !bound2->infinite)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Neither bound is infinite, use subdiff function or return default
|
|
|
|
* value of 1.0 if no subdiff is available.
|
|
|
|
*/
|
|
|
|
if (has_subdiff)
|
|
|
|
{
|
|
|
|
float8 res;
|
|
|
|
|
|
|
|
res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
|
|
|
|
typcache->rng_collation,
|
|
|
|
bound2->val,
|
|
|
|
bound1->val));
|
|
|
|
/* Reject possible NaN result, also negative result */
|
|
|
|
if (isnan(res) || res < 0.0)
|
|
|
|
return 1.0;
|
|
|
|
else
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
return 1.0;
|
|
|
|
}
|
|
|
|
else if (bound1->infinite && bound2->infinite)
|
|
|
|
{
|
|
|
|
/* Both bounds are infinite */
|
|
|
|
if (bound1->lower == bound2->lower)
|
|
|
|
return 0.0;
|
|
|
|
else
|
|
|
|
return get_float8_infinity();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* One bound is infinite, the other is not */
|
|
|
|
return get_float8_infinity();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate the average of function P(x), in the interval [length1, length2],
|
|
|
|
* where P(x) is the fraction of tuples with length < x (or length <= x if
|
|
|
|
* 'equal' is true).
|
|
|
|
*/
|
|
|
|
static double
|
|
|
|
calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
|
|
|
|
double length1, double length2, bool equal)
|
|
|
|
{
|
|
|
|
double frac;
|
|
|
|
double A,
|
|
|
|
B,
|
|
|
|
PA,
|
|
|
|
PB;
|
|
|
|
double pos;
|
|
|
|
int i;
|
|
|
|
double area;
|
|
|
|
|
|
|
|
Assert(length2 >= length1);
|
|
|
|
|
|
|
|
if (length2 < 0.0)
|
|
|
|
return 0.0; /* shouldn't happen, but doesn't hurt to check */
|
|
|
|
|
|
|
|
/* All lengths in the table are <= infinite. */
|
|
|
|
if (isinf(length2) && equal)
|
|
|
|
return 1.0;
|
|
|
|
|
|
|
|
/*----------
|
|
|
|
* The average of a function between A and B can be calculated by the
|
|
|
|
* formula:
|
|
|
|
*
|
|
|
|
* B
|
|
|
|
* 1 /
|
|
|
|
* ------- | P(x)dx
|
|
|
|
* B - A /
|
|
|
|
* A
|
|
|
|
*
|
|
|
|
* The geometrical interpretation of the integral is the area under the
|
|
|
|
* graph of P(x). P(x) is defined by the length histogram. We calculate
|
|
|
|
* the area in a piecewise fashion, iterating through the length histogram
|
|
|
|
* bins. Each bin is a trapezoid:
|
|
|
|
*
|
|
|
|
* P(x2)
|
|
|
|
* /|
|
|
|
|
* / |
|
|
|
|
* P(x1)/ |
|
|
|
|
* | |
|
|
|
|
* | |
|
|
|
|
* ---+---+--
|
|
|
|
* x1 x2
|
|
|
|
*
|
|
|
|
* where x1 and x2 are the boundaries of the current histogram, and P(x1)
|
|
|
|
* and P(x1) are the cumulative fraction of tuples at the boundaries.
|
|
|
|
*
|
|
|
|
* The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
|
|
|
|
*
|
|
|
|
* The first bin contains the lower bound passed by the caller, so we
|
|
|
|
* use linear interpolation between the previous and next histogram bin
|
|
|
|
* boundary to calculate P(x1). Likewise for the last bin: we use linear
|
|
|
|
* interpolation to calculate P(x2). For the bins in between, x1 and x2
|
|
|
|
* lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
|
|
|
|
* P(x1) = (bin index) / (number of bins)
|
|
|
|
* P(x2) = (bin index + 1 / (number of bins)
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* First bin, the one that contains lower bound */
|
|
|
|
i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
|
|
|
|
if (i >= length_hist_nvalues - 1)
|
|
|
|
return 1.0;
|
|
|
|
|
|
|
|
if (i < 0)
|
|
|
|
{
|
|
|
|
i = 0;
|
|
|
|
pos = 0.0;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* interpolate length1's position in the bin */
|
|
|
|
pos = get_len_position(length1,
|
|
|
|
DatumGetFloat8(length_hist_values[i]),
|
|
|
|
DatumGetFloat8(length_hist_values[i + 1]));
|
|
|
|
}
|
|
|
|
PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
|
|
|
|
B = length1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In the degenerate case that length1 == length2, simply return
|
|
|
|
* P(length1). This is not merely an optimization: if length1 == length2,
|
|
|
|
* we'd divide by zero later on.
|
|
|
|
*/
|
|
|
|
if (length2 == length1)
|
|
|
|
return PB;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Loop through all the bins, until we hit the last bin, the one that
|
|
|
|
* contains the upper bound. (if lower and upper bounds are in the same
|
|
|
|
* bin, this falls out immediately)
|
|
|
|
*/
|
|
|
|
area = 0.0;
|
|
|
|
for (; i < length_hist_nvalues - 1; i++)
|
|
|
|
{
|
|
|
|
double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
|
|
|
|
|
|
|
|
/* check if we've reached the last bin */
|
|
|
|
if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* the upper bound of previous bin is the lower bound of this bin */
|
|
|
|
A = B;
|
|
|
|
PA = PB;
|
|
|
|
|
|
|
|
B = bin_upper;
|
|
|
|
PB = (double) i / (double) (length_hist_nvalues - 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add the area of this trapezoid to the total. The point of the
|
|
|
|
* if-check is to avoid NaN, in the corner case that PA == PB == 0,
|
|
|
|
* and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
|
|
|
|
* 0) is zero, regardless of the width (B - A).
|
|
|
|
*/
|
|
|
|
if (PA > 0 || PB > 0)
|
|
|
|
area += 0.5 * (PB + PA) * (B - A);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Last bin */
|
|
|
|
A = B;
|
|
|
|
PA = PB;
|
|
|
|
|
|
|
|
B = length2; /* last bin ends at the query upper bound */
|
|
|
|
if (i >= length_hist_nvalues - 1)
|
|
|
|
pos = 0.0;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
|
|
|
|
pos = 0.0;
|
|
|
|
else
|
|
|
|
pos = get_len_position(length2,
|
|
|
|
DatumGetFloat8(length_hist_values[i]),
|
|
|
|
DatumGetFloat8(length_hist_values[i + 1]));
|
|
|
|
}
|
|
|
|
PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
|
|
|
|
|
|
|
|
if (PA > 0 || PB > 0)
|
|
|
|
area += 0.5 * (PB + PA) * (B - A);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ok, we have calculated the area, ie. the integral. Divide by width to
|
|
|
|
* get the requested average.
|
|
|
|
*
|
|
|
|
* Avoid NaN arising from infinite / infinite. This happens at least if
|
|
|
|
* length2 is infinite. It's not clear what the correct value would be in
|
|
|
|
* that case, so 0.5 seems as good as any value.
|
|
|
|
*/
|
|
|
|
if (isinf(area) && isinf(length2))
|
|
|
|
frac = 0.5;
|
|
|
|
else
|
|
|
|
frac = area / (length2 - length1);
|
|
|
|
|
|
|
|
return frac;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
|
|
|
|
* of multiranges that fall within the constant lower and upper bounds. This uses
|
|
|
|
* the histograms of range lower bounds and range lengths, on the assumption
|
|
|
|
* that the range lengths are independent of the lower bounds.
|
|
|
|
*
|
|
|
|
* The caller has already checked that constant lower and upper bounds are
|
|
|
|
* finite.
|
|
|
|
*/
|
|
|
|
static double
|
|
|
|
calc_hist_selectivity_contained(TypeCacheEntry *typcache,
|
|
|
|
const RangeBound *lower, RangeBound *upper,
|
|
|
|
const RangeBound *hist_lower, int hist_nvalues,
|
|
|
|
Datum *length_hist_values, int length_hist_nvalues)
|
|
|
|
{
|
|
|
|
int i,
|
|
|
|
upper_index;
|
|
|
|
float8 prev_dist;
|
|
|
|
double bin_width;
|
|
|
|
double upper_bin_width;
|
|
|
|
double sum_frac;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Begin by finding the bin containing the upper bound, in the lower bound
|
|
|
|
* histogram. Any range with a lower bound > constant upper bound can't
|
|
|
|
* match, ie. there are no matches in bins greater than upper_index.
|
|
|
|
*/
|
|
|
|
upper->inclusive = !upper->inclusive;
|
|
|
|
upper->lower = true;
|
|
|
|
upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
|
|
|
|
false);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the upper bound value is below the histogram's lower limit, there
|
|
|
|
* are no matches.
|
|
|
|
*/
|
|
|
|
if (upper_index < 0)
|
|
|
|
return 0.0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the upper bound value is at or beyond the histogram's upper limit,
|
|
|
|
* start our loop at the last actual bin, as though the upper bound were
|
|
|
|
* within that bin; get_position will clamp its result to 1.0 anyway.
|
|
|
|
* (This corresponds to assuming that the data population above the
|
|
|
|
* histogram's upper limit is empty, exactly like what we just assumed for
|
|
|
|
* the lower limit.)
|
|
|
|
*/
|
|
|
|
upper_index = Min(upper_index, hist_nvalues - 2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate upper_bin_width, ie. the fraction of the (upper_index,
|
|
|
|
* upper_index + 1) bin which is greater than upper bound of query range
|
|
|
|
* using linear interpolation of subdiff function.
|
|
|
|
*/
|
|
|
|
upper_bin_width = get_position(typcache, upper,
|
|
|
|
&hist_lower[upper_index],
|
|
|
|
&hist_lower[upper_index + 1]);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In the loop, dist and prev_dist are the distance of the "current" bin's
|
|
|
|
* lower and upper bounds from the constant upper bound.
|
|
|
|
*
|
|
|
|
* bin_width represents the width of the current bin. Normally it is 1.0,
|
|
|
|
* meaning a full width bin, but can be less in the corner cases: start
|
|
|
|
* and end of the loop. We start with bin_width = upper_bin_width, because
|
|
|
|
* we begin at the bin containing the upper bound.
|
|
|
|
*/
|
|
|
|
prev_dist = 0.0;
|
|
|
|
bin_width = upper_bin_width;
|
|
|
|
|
|
|
|
sum_frac = 0.0;
|
|
|
|
for (i = upper_index; i >= 0; i--)
|
|
|
|
{
|
|
|
|
double dist;
|
|
|
|
double length_hist_frac;
|
|
|
|
bool final_bin = false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* dist -- distance from upper bound of query range to lower bound of
|
|
|
|
* the current bin in the lower bound histogram. Or to the lower bound
|
|
|
|
* of the constant range, if this is the final bin, containing the
|
|
|
|
* constant lower bound.
|
|
|
|
*/
|
|
|
|
if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
|
|
|
|
{
|
|
|
|
dist = get_distance(typcache, lower, upper);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Subtract from bin_width the portion of this bin that we want to
|
|
|
|
* ignore.
|
|
|
|
*/
|
|
|
|
bin_width -= get_position(typcache, lower, &hist_lower[i],
|
|
|
|
&hist_lower[i + 1]);
|
|
|
|
if (bin_width < 0.0)
|
|
|
|
bin_width = 0.0;
|
|
|
|
final_bin = true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
dist = get_distance(typcache, &hist_lower[i], upper);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Estimate the fraction of tuples in this bin that are narrow enough
|
|
|
|
* to not exceed the distance to the upper bound of the query range.
|
|
|
|
*/
|
|
|
|
length_hist_frac = calc_length_hist_frac(length_hist_values,
|
|
|
|
length_hist_nvalues,
|
|
|
|
prev_dist, dist, true);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add the fraction of tuples in this bin, with a suitable length, to
|
|
|
|
* the total.
|
|
|
|
*/
|
|
|
|
sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
|
|
|
|
|
|
|
|
if (final_bin)
|
|
|
|
break;
|
|
|
|
|
|
|
|
bin_width = 1.0;
|
|
|
|
prev_dist = dist;
|
|
|
|
}
|
|
|
|
|
|
|
|
return sum_frac;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate selectivity of "var @> const" operator, ie. estimate the fraction
|
|
|
|
* of multiranges that contain the constant lower and upper bounds. This uses
|
|
|
|
* the histograms of range lower bounds and range lengths, on the assumption
|
|
|
|
* that the range lengths are independent of the lower bounds.
|
|
|
|
*/
|
|
|
|
static double
|
|
|
|
calc_hist_selectivity_contains(TypeCacheEntry *typcache,
|
|
|
|
const RangeBound *lower, const RangeBound *upper,
|
|
|
|
const RangeBound *hist_lower, int hist_nvalues,
|
|
|
|
Datum *length_hist_values, int length_hist_nvalues)
|
|
|
|
{
|
|
|
|
int i,
|
|
|
|
lower_index;
|
|
|
|
double bin_width,
|
|
|
|
lower_bin_width;
|
|
|
|
double sum_frac;
|
|
|
|
float8 prev_dist;
|
|
|
|
|
|
|
|
/* Find the bin containing the lower bound of query range. */
|
|
|
|
lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
|
|
|
|
true);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the lower bound value is below the histogram's lower limit, there
|
|
|
|
* are no matches.
|
|
|
|
*/
|
|
|
|
if (lower_index < 0)
|
|
|
|
return 0.0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the lower bound value is at or beyond the histogram's upper limit,
|
|
|
|
* start our loop at the last actual bin, as though the upper bound were
|
|
|
|
* within that bin; get_position will clamp its result to 1.0 anyway.
|
|
|
|
* (This corresponds to assuming that the data population above the
|
|
|
|
* histogram's upper limit is empty, exactly like what we just assumed for
|
|
|
|
* the lower limit.)
|
|
|
|
*/
|
|
|
|
lower_index = Min(lower_index, hist_nvalues - 2);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate lower_bin_width, ie. the fraction of the of (lower_index,
|
|
|
|
* lower_index + 1) bin which is greater than lower bound of query range
|
|
|
|
* using linear interpolation of subdiff function.
|
|
|
|
*/
|
|
|
|
lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
|
|
|
|
&hist_lower[lower_index + 1]);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Loop through all the lower bound bins, smaller than the query lower
|
|
|
|
* bound. In the loop, dist and prev_dist are the distance of the
|
|
|
|
* "current" bin's lower and upper bounds from the constant upper bound.
|
|
|
|
* We begin from query lower bound, and walk backwards, so the first bin's
|
|
|
|
* upper bound is the query lower bound, and its distance to the query
|
|
|
|
* upper bound is the length of the query range.
|
|
|
|
*
|
|
|
|
* bin_width represents the width of the current bin. Normally it is 1.0,
|
|
|
|
* meaning a full width bin, except for the first bin, which is only
|
|
|
|
* counted up to the constant lower bound.
|
|
|
|
*/
|
|
|
|
prev_dist = get_distance(typcache, lower, upper);
|
|
|
|
sum_frac = 0.0;
|
|
|
|
bin_width = lower_bin_width;
|
|
|
|
for (i = lower_index; i >= 0; i--)
|
|
|
|
{
|
|
|
|
float8 dist;
|
|
|
|
double length_hist_frac;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* dist -- distance from upper bound of query range to current value
|
|
|
|
* of lower bound histogram or lower bound of query range (if we've
|
|
|
|
* reach it).
|
|
|
|
*/
|
|
|
|
dist = get_distance(typcache, &hist_lower[i], upper);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get average fraction of length histogram which covers intervals
|
|
|
|
* longer than (or equal to) distance to upper bound of query range.
|
|
|
|
*/
|
|
|
|
length_hist_frac =
|
|
|
|
1.0 - calc_length_hist_frac(length_hist_values,
|
|
|
|
length_hist_nvalues,
|
|
|
|
prev_dist, dist, false);
|
|
|
|
|
|
|
|
sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
|
|
|
|
|
|
|
|
bin_width = 1.0;
|
|
|
|
prev_dist = dist;
|
|
|
|
}
|
|
|
|
|
|
|
|
return sum_frac;
|
|
|
|
}
|