/*-------------------------------------------------------------------------
 *
 * multirangetypes_selfuncs.c
 *	  Functions for selectivity estimation of multirange operators
 *
 * Estimates are based on histograms of lower and upper bounds, and the
 * fraction of empty multiranges.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/multirangetypes_selfuncs.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/htup_details.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "utils/float.h"
#include "utils/fmgrprotos.h"
#include "utils/lsyscache.h"
#include "utils/multirangetypes.h"
#include "utils/rangetypes.h"
#include "utils/selfuncs.h"
#include "utils/typcache.h"

static double calc_multirangesel(TypeCacheEntry *typcache,
	VariableStatData *vardata,
	const MultirangeType *constval, Oid operator);
static double default_multirange_selectivity(Oid operator);
static double calc_hist_selectivity(TypeCacheEntry *typcache,
	VariableStatData *vardata,
	const MultirangeType *constval,
	Oid operator);
static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
	const RangeBound *constbound,
	const RangeBound *hist,
	int hist_nvalues, bool equal);
static int rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value,
	const RangeBound *hist, int hist_length, bool equal);
static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value,
	const RangeBound *hist1, const RangeBound *hist2);
static float8 get_len_position(double value, double hist1, double hist2);
static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1,
	const RangeBound *bound2);
static int length_hist_bsearch(Datum *length_hist_values,
	int length_hist_nvalues, double value,
	bool equal);
static double calc_length_hist_frac(Datum *length_hist_values,
	int length_hist_nvalues, double length1,
	double length2, bool equal);
static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
	const RangeBound *lower,
	RangeBound *upper,
	const RangeBound *hist_lower,
	int hist_nvalues,
	Datum *length_hist_values,
	int length_hist_nvalues);
static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
	const RangeBound *lower,
	const RangeBound *upper,
	const RangeBound *hist_lower,
	int hist_nvalues,
	Datum *length_hist_values,
	int length_hist_nvalues);

/*
 * Returns a default selectivity estimate for given operator, when we don't
 * have statistics or cannot use them for some reason.
 *
 * Any OID that is not listed below (including InvalidOid) gets the generic
 * fallback of 0.01.
 */
static double
default_multirange_selectivity(Oid operator)
{
	switch (operator)
	{
		case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
		case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
		case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
			return 0.01;

		case OID_RANGE_CONTAINS_MULTIRANGE_OP:
		case OID_RANGE_MULTIRANGE_CONTAINED_OP:
		case OID_MULTIRANGE_CONTAINS_RANGE_OP:
		case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
		case OID_MULTIRANGE_RANGE_CONTAINED_OP:
		case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
			return 0.005;

		case OID_MULTIRANGE_CONTAINS_ELEM_OP:
		case OID_MULTIRANGE_ELEM_CONTAINED_OP:

			/*
			 * "multirange @> elem" is more or less identical to a scalar
			 * inequality "A >= b AND A <= c".
			 */
			return DEFAULT_MULTIRANGE_INEQ_SEL;

		case OID_MULTIRANGE_LESS_OP:
		case OID_MULTIRANGE_LESS_EQUAL_OP:
		case OID_MULTIRANGE_GREATER_OP:
		case OID_MULTIRANGE_GREATER_EQUAL_OP:
		case OID_MULTIRANGE_LEFT_RANGE_OP:
		case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
		case OID_RANGE_LEFT_MULTIRANGE_OP:
		case OID_MULTIRANGE_RIGHT_RANGE_OP:
		case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
		case OID_RANGE_RIGHT_MULTIRANGE_OP:
		case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
		case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
		case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
		case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
		case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
		case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
			/* these are similar to regular scalar inequalities */
			return DEFAULT_INEQ_SEL;

		default:

			/*
			 * all multirange operators should be handled above, but just in
			 * case
			 */
			return 0.01;
	}
}

/*
 * multirangesel -- restriction selectivity for multirange operators
 *
 * Arguments (fmgr calling convention): planner root, operator OID, argument
 * list of the operator clause, and varRelid.  Returns a float8 selectivity
 * clamped to [0, 1].
 */
Datum
multirangesel(PG_FUNCTION_ARGS)
{
	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
	Oid			operator = PG_GETARG_OID(1);
	List	   *args = (List *) PG_GETARG_POINTER(2);
	int			varRelid = PG_GETARG_INT32(3);
	VariableStatData vardata;
	Node	   *other;
	bool		varonleft;
	Selectivity selec;
	TypeCacheEntry *typcache = NULL;
	MultirangeType *constmultirange = NULL;
	RangeType  *constrange = NULL;

	/*
	 * If expression is not (variable op something) or (something op
	 * variable), then punt and return a default estimate.
	 */
	if (!get_restriction_variable(root, args, varRelid,
								  &vardata, &other, &varonleft))
		PG_RETURN_FLOAT8(default_multirange_selectivity(operator));

	/*
	 * Can't do anything useful if the something is not a constant, either.
	 */
	if (!IsA(other, Const))
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
	}

	/*
	 * All the multirange operators are strict, so we can cope with a NULL
	 * constant right away.
	 */
	if (((Const *) other)->constisnull)
	{
		ReleaseVariableStats(vardata);
		PG_RETURN_FLOAT8(0.0);
	}

	/*
	 * If var is on the right, commute the operator, so that we can assume the
	 * var is on the left in what follows.
	 */
	if (!varonleft)
	{
		/* we have other Op var, commute to make var Op other */
		Oid			commutator = get_commutator(operator);

		if (!OidIsValid(commutator))
		{
			/*
			 * Use default selectivity (should we raise an error instead?).
			 *
			 * Look the default up with the operator as written, not with the
			 * invalid commutator OID; the latter would always yield the
			 * generic fallback rather than this operator's own default.
			 */
			ReleaseVariableStats(vardata);
			PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
		}
		operator = commutator;
	}

	/*
	 * OK, there's a Var and a Const we're dealing with here.  We need the
	 * Const to be of same multirange type as the column, else we can't do
	 * anything useful. (Such cases will likely fail at runtime, but here we'd
	 * rather just return a default estimate.)
	 *
	 * If the operator is "multirange @> element", the constant should be of
	 * the element type of the multirange column. Convert it to a multirange
	 * that includes only that single point, so that we don't need special
	 * handling for that in what follows.
	 */
	if (operator == OID_MULTIRANGE_CONTAINS_ELEM_OP)
	{
		typcache = multirange_get_typcache(fcinfo, vardata.vartype);

		if (((Const *) other)->consttype == typcache->rngtype->rngelemtype->type_id)
		{
			RangeBound	lower,
						upper;

			lower.inclusive = true;
			lower.val = ((Const *) other)->constvalue;
			lower.infinite = false;
			lower.lower = true;
			upper.inclusive = true;
			upper.val = ((Const *) other)->constvalue;
			upper.infinite = false;
			upper.lower = false;
			constrange = range_serialize(typcache->rngtype, &lower, &upper,
										 false, NULL);
			constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
											  1, &constrange);
		}
	}
	else if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
			 operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
			 operator == OID_MULTIRANGE_OVERLAPS_RANGE_OP ||
			 operator == OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP ||
			 operator == OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP ||
			 operator == OID_MULTIRANGE_LEFT_RANGE_OP ||
			 operator == OID_MULTIRANGE_RIGHT_RANGE_OP)
	{
		/*
		 * Promote a range in "multirange OP range" just like we do an element
		 * in "multirange OP element".
		 */
		typcache = multirange_get_typcache(fcinfo, vardata.vartype);
		if (((Const *) other)->consttype == typcache->rngtype->type_id)
		{
			constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
			constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
											  1, &constrange);
		}
	}
	else if (operator == OID_RANGE_OVERLAPS_MULTIRANGE_OP ||
			 operator == OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP ||
			 operator == OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP ||
			 operator == OID_RANGE_LEFT_MULTIRANGE_OP ||
			 operator == OID_RANGE_RIGHT_MULTIRANGE_OP ||
			 operator == OID_RANGE_CONTAINS_MULTIRANGE_OP ||
			 operator == OID_MULTIRANGE_ELEM_CONTAINED_OP ||
			 operator == OID_MULTIRANGE_RANGE_CONTAINED_OP)
	{
		/*
		 * Here, the Var is the elem/range, not the multirange.  For now we
		 * just punt and return the default estimate.  In future we could
		 * disassemble the multirange constant to do something more
		 * intelligent.
		 */
	}
	else if (((Const *) other)->consttype == vardata.vartype)
	{
		/* Both sides are the same multirange type */
		typcache = multirange_get_typcache(fcinfo, vardata.vartype);

		constmultirange = DatumGetMultirangeTypeP(((Const *) other)->constvalue);
	}

	/*
	 * If we got a valid constant on one side of the operator, proceed to
	 * estimate using statistics. Otherwise punt and return a default constant
	 * estimate.  Note that calc_multirangesel need not handle the operators
	 * whose Var is the element or range (the group punted on above), since
	 * constmultirange stays NULL for them.
	 */
	if (constmultirange)
		selec = calc_multirangesel(typcache, &vardata, constmultirange, operator);
	else
		selec = default_multirange_selectivity(operator);

	ReleaseVariableStats(vardata);

	CLAMP_PROBABILITY(selec);

	PG_RETURN_FLOAT8((float8) selec);
}

/*
 * Estimate the selectivity of "var operator constval", where var is a
 * multirange column described by vardata and constval is a non-NULL
 * multirange constant of the same type.
 *
 * Combines the NULL fraction, the fraction of empty multiranges, and the
 * histogram-based estimate for non-empty values.  The result is clamped to
 * [0, 1].
 */
static double
calc_multirangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
				   const MultirangeType *constval, Oid operator)
{
	double		hist_selec;
	double		selec;
	float4		empty_frac,
				null_frac;

	/*
	 * First look up the fraction of NULLs and empty multiranges from
	 * pg_statistic.
	 */
	if (HeapTupleIsValid(vardata->statsTuple))
	{
		Form_pg_statistic stats;
		AttStatsSlot sslot;

		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
		null_frac = stats->stanullfrac;

		/* Try to get fraction of empty multiranges */
		if (get_attstatsslot(&sslot, vardata->statsTuple,
							 STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
							 InvalidOid,
							 ATTSTATSSLOT_NUMBERS))
		{
			if (sslot.nnumbers != 1)
				elog(ERROR, "invalid empty fraction statistic");	/* shouldn't happen */
			empty_frac = sslot.numbers[0];
			free_attstatsslot(&sslot);
		}
		else
		{
			/* No empty fraction statistic. Assume no empty ranges. */
			empty_frac = 0.0;
		}
	}
	else
	{
		/*
		 * No stats are available. Follow through the calculations below
		 * anyway, assuming no NULLs and no empty multiranges. This still
		 * allows us to give a better-than-nothing estimate based on whether
		 * the constant is an empty multirange or not.
		 */
		null_frac = 0.0;
		empty_frac = 0.0;
	}

	if (MultirangeIsEmpty(constval))
	{
		/*
		 * An empty multirange matches all multiranges, all empty multiranges,
		 * or nothing, depending on the operator
		 */
		switch (operator)
		{
				/* these return false if either argument is empty */
			case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
			case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
			case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
			case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
			case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
			case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
			case OID_MULTIRANGE_LEFT_RANGE_OP:
			case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
			case OID_MULTIRANGE_RIGHT_RANGE_OP:
			case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
				/* nothing is less than an empty multirange */
			case OID_MULTIRANGE_LESS_OP:
				selec = 0.0;
				break;

				/*
				 * only empty multiranges can be contained by an empty
				 * multirange
				 */
			case OID_RANGE_MULTIRANGE_CONTAINED_OP:
			case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
				/* only empty ranges are <= an empty multirange */
			case OID_MULTIRANGE_LESS_EQUAL_OP:
				selec = empty_frac;
				break;

				/* everything contains an empty multirange */
			case OID_MULTIRANGE_CONTAINS_RANGE_OP:
			case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
				/* everything is >= an empty multirange */
			case OID_MULTIRANGE_GREATER_EQUAL_OP:
				selec = 1.0;
				break;

				/* all non-empty multiranges are > an empty multirange */
			case OID_MULTIRANGE_GREATER_OP:
				selec = 1.0 - empty_frac;
				break;

				/* an element cannot be empty */
			case OID_MULTIRANGE_CONTAINS_ELEM_OP:

				/* filtered out by multirangesel() */
			case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
			case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
			case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
			case OID_RANGE_LEFT_MULTIRANGE_OP:
			case OID_RANGE_RIGHT_MULTIRANGE_OP:
			case OID_RANGE_CONTAINS_MULTIRANGE_OP:
			case OID_MULTIRANGE_ELEM_CONTAINED_OP:
			case OID_MULTIRANGE_RANGE_CONTAINED_OP:

			default:
				elog(ERROR, "unexpected operator %u", operator);
				selec = 0.0;	/* keep compiler quiet */
				break;
		}
	}
	else
	{
		/*
		 * Calculate selectivity using bound histograms. If that fails for
		 * some reason, e.g. no histogram in pg_statistic, use the default
		 * constant estimate for the fraction of non-empty values. This is
		 * still somewhat better than just returning the default estimate,
		 * because this still takes into account the fraction of empty and
		 * NULL tuples, if we had statistics for them.
		 */
		hist_selec = calc_hist_selectivity(typcache, vardata, constval,
										   operator);
		if (hist_selec < 0.0)
			hist_selec = default_multirange_selectivity(operator);

		/*
		 * Now merge the results for the empty multiranges and histogram
		 * calculations, realizing that the histogram covers only the
		 * non-null, non-empty values.
		 */
		if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
			operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
		{
			/* empty is contained by anything non-empty */
			selec = (1.0 - empty_frac) * hist_selec + empty_frac;
		}
		else
		{
			/* with any other operator, empty Op non-empty matches nothing */
			selec = (1.0 - empty_frac) * hist_selec;
		}
	}

	/* all multirange operators are strict */
	selec *= (1.0 - null_frac);

	/* result should be in range, but make sure... */
	CLAMP_PROBABILITY(selec);

	return selec;
}

/*
 * Calculate multirange operator selectivity using histograms of multirange bounds.
 *
 * This estimate is for the portion of values that are not empty and not
 * NULL.
*/
static double
calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
					  const MultirangeType *constval, Oid operator)
{
	TypeCacheEntry *rng_typcache = typcache->rngtype;
	AttStatsSlot hslot;
	AttStatsSlot lslot;
	int			nhist;
	RangeBound *hist_lower;
	RangeBound *hist_upper;
	int			i;
	RangeBound	const_lower;
	RangeBound	const_upper;
	RangeBound	tmp;
	double		hist_selec;

	/*
	 * Returns the estimated selectivity, or a negative value if no usable
	 * histogram is available; the caller then substitutes a default.
	 */

	/* Can't use the histogram with insecure multirange support functions */
	if (!statistic_proc_security_check(vardata,
									   rng_typcache->rng_cmp_proc_finfo.fn_oid))
		return -1;
	if (OidIsValid(rng_typcache->rng_subdiff_finfo.fn_oid) &&
		!statistic_proc_security_check(vardata,
									   rng_typcache->rng_subdiff_finfo.fn_oid))
		return -1;

	/* Try to get histogram of ranges */
	if (!(HeapTupleIsValid(vardata->statsTuple) &&
		  get_attstatsslot(&hslot, vardata->statsTuple,
						   STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
						   ATTSTATSSLOT_VALUES)))
		return -1.0;

	/* check that it's a histogram, not just a dummy entry */
	if (hslot.nvalues < 2)
	{
		free_attstatsslot(&hslot);
		return -1.0;
	}

	/*
	 * Convert histogram of ranges into histograms of its lower and upper
	 * bounds.
	 */
	nhist = hslot.nvalues;
	hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
	hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
	for (i = 0; i < nhist; i++)
	{
		bool		empty;

		range_deserialize(rng_typcache, DatumGetRangeTypeP(hslot.values[i]),
						  &hist_lower[i], &hist_upper[i], &empty);
		/* The histogram should not contain any empty ranges */
		if (empty)
			elog(ERROR, "bounds histogram contains an empty range");
	}

	/*
	 * @> and <@ also need a histogram of range lengths.
	 *
	 * The list must name exactly the operators that reach
	 * calc_hist_selectivity_contains()/calc_hist_selectivity_contained() in
	 * the switch below.  In particular "multirange <@ range" is
	 * OID_RANGE_MULTIRANGE_CONTAINED_OP; without it here, lslot would be
	 * zeroed and the length histogram silently ignored for that operator.
	 */
	if (operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
		operator == OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP ||
		operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
		operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
	{
		if (!(HeapTupleIsValid(vardata->statsTuple) &&
			  get_attstatsslot(&lslot, vardata->statsTuple,
							   STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
							   InvalidOid,
							   ATTSTATSSLOT_VALUES)))
		{
			free_attstatsslot(&hslot);
			return -1.0;
		}

		/* check that it's a histogram, not just a dummy entry */
		if (lslot.nvalues < 2)
		{
			free_attstatsslot(&lslot);
			free_attstatsslot(&hslot);
			return -1.0;
		}
	}
	else
		memset(&lslot, 0, sizeof(lslot));

	/*
	 * Extract the bounds of the constant value: the lower bound of its first
	 * range and the upper bound of its last range.
	 */
	Assert(constval->rangeCount > 0);
	multirange_get_bounds(rng_typcache, constval, 0,
						  &const_lower, &tmp);
	multirange_get_bounds(rng_typcache, constval, constval->rangeCount - 1,
						  &tmp, &const_upper);

	/*
	 * Calculate selectivity comparing the lower or upper bound of the
	 * constant with the histogram of lower or upper bounds.
	 */
	switch (operator)
	{
		case OID_MULTIRANGE_LESS_OP:

			/*
			 * The regular b-tree comparison operators (<, <=, >, >=) compare
			 * the lower bounds first, and the upper bounds for values with
			 * equal lower bounds. Estimate that by comparing the lower bounds
			 * only. This gives a fairly accurate estimate assuming there
			 * aren't many rows with a lower bound equal to the constant's
			 * lower bound.
			 */
			hist_selec =
				calc_hist_selectivity_scalar(rng_typcache, &const_lower,
											 hist_lower, nhist, false);
			break;

		case OID_MULTIRANGE_LESS_EQUAL_OP:
			hist_selec =
				calc_hist_selectivity_scalar(rng_typcache, &const_lower,
											 hist_lower, nhist, true);
			break;

		case OID_MULTIRANGE_GREATER_OP:
			hist_selec =
				1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
												 hist_lower, nhist, false);
			break;

		case OID_MULTIRANGE_GREATER_EQUAL_OP:
			hist_selec =
				1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
												 hist_lower, nhist, true);
			break;

		case OID_MULTIRANGE_LEFT_RANGE_OP:
		case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
			/* var << const when upper(var) < lower(const) */
			hist_selec =
				calc_hist_selectivity_scalar(rng_typcache, &const_lower,
											 hist_upper, nhist, false);
			break;

		case OID_MULTIRANGE_RIGHT_RANGE_OP:
		case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
			/* var >> const when lower(var) > upper(const) */
			hist_selec =
				1 - calc_hist_selectivity_scalar(rng_typcache, &const_upper,
												 hist_lower, nhist, true);
			break;

		case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
		case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
			/* compare lower bounds */
			hist_selec =
				1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
												 hist_lower, nhist, false);
			break;

		case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
		case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
			/* compare upper bounds */
			hist_selec =
				calc_hist_selectivity_scalar(rng_typcache, &const_upper,
											 hist_upper, nhist, true);
			break;

		case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
		case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
		case OID_MULTIRANGE_CONTAINS_ELEM_OP:

			/*
			 * A && B <=> NOT (A << B OR A >> B).
			 *
			 * Since A << B and A >> B are mutually exclusive events we can
			 * sum their probabilities to find probability of (A << B OR A >>
			 * B).
			 *
			 * "multirange @> elem" is equivalent to "multirange &&
			 * {[elem,elem]}". The caller already constructed the singular
			 * range from the element constant, so just treat it the same as
			 * &&.
			 */
			hist_selec =
				calc_hist_selectivity_scalar(rng_typcache,
											 &const_lower, hist_upper,
											 nhist, false);
			hist_selec +=
				(1.0 - calc_hist_selectivity_scalar(rng_typcache,
													&const_upper, hist_lower,
													nhist, true));
			hist_selec = 1.0 - hist_selec;
			break;

		case OID_MULTIRANGE_CONTAINS_RANGE_OP:
		case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
			hist_selec =
				calc_hist_selectivity_contains(rng_typcache, &const_lower,
											   &const_upper, hist_lower, nhist,
											   lslot.values, lslot.nvalues);
			break;

		case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
		case OID_RANGE_MULTIRANGE_CONTAINED_OP:
			if (const_lower.infinite)
			{
				/*
				 * Lower bound no longer matters. Just estimate the fraction
				 * with an upper bound <= const upper bound
				 */
				hist_selec =
					calc_hist_selectivity_scalar(rng_typcache, &const_upper,
												 hist_upper, nhist, true);
			}
			else if (const_upper.infinite)
			{
				/* likewise, only the lower bound matters here */
				hist_selec =
					1.0 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
													   hist_lower, nhist, false);
			}
			else
			{
				hist_selec =
					calc_hist_selectivity_contained(rng_typcache, &const_lower,
													&const_upper, hist_lower, nhist,
													lslot.values, lslot.nvalues);
			}
			break;

			/* filtered out by multirangesel() */
		case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
		case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
		case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
		case OID_RANGE_LEFT_MULTIRANGE_OP:
		case OID_RANGE_RIGHT_MULTIRANGE_OP:
		case OID_RANGE_CONTAINS_MULTIRANGE_OP:
		case OID_MULTIRANGE_ELEM_CONTAINED_OP:
		case OID_MULTIRANGE_RANGE_CONTAINED_OP:

		default:
			elog(ERROR, "unknown multirange operator %u", operator);
			hist_selec = -1.0;	/* keep compiler quiet */
			break;
	}

	/* lslot was zeroed above if it was never loaded */
	free_attstatsslot(&lslot);
	free_attstatsslot(&hslot);

	return hist_selec;
}

/*
 * Look up the fraction of values less than (or equal, if 'equal' argument
 * is true) a given const in a histogram of range bounds.
*/ static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache, const RangeBound *constbound, const RangeBound *hist, int hist_nvalues, bool equal) { Selectivity selec; int index; /* * Find the histogram bin the given constant falls into. Estimate * selectivity as the number of preceding whole bins. */ index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal); selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1); /* Adjust using linear interpolation within the bin */ if (index >= 0 && index < hist_nvalues - 1) selec += get_position(typcache, constbound, &hist[index], &hist[index + 1]) / (Selectivity) (hist_nvalues - 1); return selec; } /* * Binary search on an array of range bounds. Returns greatest index of range * bound in array which is less(less or equal) than given range bound. If all * range bounds in array are greater or equal(greater) than given range bound, * return -1. When "equal" flag is set conditions in brackets are used. * * This function is used in scalar operator selectivity estimation. Another * goal of this function is to find a histogram bin where to stop * interpolation of portion of bounds which are less than or equal to given bound. */ static int rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist, int hist_length, bool equal) { int lower = -1, upper = hist_length - 1, cmp, middle; while (lower < upper) { middle = (lower + upper + 1) / 2; cmp = range_cmp_bounds(typcache, &hist[middle], value); if (cmp < 0 || (equal && cmp == 0)) lower = middle; else upper = middle - 1; } return lower; } /* * Binary search on length histogram. Returns greatest index of range length in * histogram which is less than (less than or equal) the given length value. If * all lengths in the histogram are greater than (greater than or equal) the * given length, returns -1. 
*/ static int length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues, double value, bool equal) { int lower = -1, upper = length_hist_nvalues - 1, middle; while (lower < upper) { double middleval; middle = (lower + upper + 1) / 2; middleval = DatumGetFloat8(length_hist_values[middle]); if (middleval < value || (equal && middleval <= value)) lower = middle; else upper = middle - 1; } return lower; } /* * Get relative position of value in histogram bin in [0,1] range. */ static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist1, const RangeBound *hist2) { bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid); float8 position; if (!hist1->infinite && !hist2->infinite) { float8 bin_width; /* * Both bounds are finite. Assuming the subtype's comparison function * works sanely, the value must be finite, too, because it lies * somewhere between the bounds. If it doesn't, arbitrarily return * 0.5. */ if (value->infinite) return 0.5; /* Can't interpolate without subdiff function */ if (!has_subdiff) return 0.5; /* Calculate relative position using subdiff function. */ bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo, typcache->rng_collation, hist2->val, hist1->val)); if (isnan(bin_width) || bin_width <= 0.0) return 0.5; /* punt for NaN or zero-width bin */ position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo, typcache->rng_collation, value->val, hist1->val)) / bin_width; if (isnan(position)) return 0.5; /* punt for NaN from subdiff, Inf/Inf, etc */ /* Relative position must be in [0,1] range */ position = Max(position, 0.0); position = Min(position, 1.0); return position; } else if (hist1->infinite && !hist2->infinite) { /* * Lower bin boundary is -infinite, upper is finite. If the value is * -infinite, return 0.0 to indicate it's equal to the lower bound. * Otherwise return 1.0 to indicate it's infinitely far from the lower * bound. 
*/ return ((value->infinite && value->lower) ? 0.0 : 1.0); } else if (!hist1->infinite && hist2->infinite) { /* same as above, but in reverse */ return ((value->infinite && !value->lower) ? 1.0 : 0.0); } else { /* * If both bin boundaries are infinite, they should be equal to each * other, and the value should also be infinite and equal to both * bounds. (But don't Assert that, to avoid crashing if a user creates * a datatype with a broken comparison function). * * Assume the value to lie in the middle of the infinite bounds. */ return 0.5; } } /* * Get relative position of value in a length histogram bin in [0,1] range. */ static double get_len_position(double value, double hist1, double hist2) { if (!isinf(hist1) && !isinf(hist2)) { /* * Both bounds are finite. The value should be finite too, because it * lies somewhere between the bounds. If it doesn't, just return * something. */ if (isinf(value)) return 0.5; return 1.0 - (hist2 - value) / (hist2 - hist1); } else if (isinf(hist1) && !isinf(hist2)) { /* * Lower bin boundary is -infinite, upper is finite. Return 1.0 to * indicate the value is infinitely far from the lower bound. */ return 1.0; } else if (isinf(hist1) && isinf(hist2)) { /* same as above, but in reverse */ return 0.0; } else { /* * If both bin boundaries are infinite, they should be equal to each * other, and the value should also be infinite and equal to both * bounds. (But don't Assert that, to avoid crashing unnecessarily if * the caller messes up) * * Assume the value to lie in the middle of the infinite bounds. */ return 0.5; } } /* * Measure distance between two range bounds. */ static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBound *bound2) { bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid); if (!bound1->infinite && !bound2->infinite) { /* * Neither bound is infinite, use subdiff function or return default * value of 1.0 if no subdiff is available. 
*/ if (has_subdiff) { float8 res; res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo, typcache->rng_collation, bound2->val, bound1->val)); /* Reject possible NaN result, also negative result */ if (isnan(res) || res < 0.0) return 1.0; else return res; } else return 1.0; } else if (bound1->infinite && bound2->infinite) { /* Both bounds are infinite */ if (bound1->lower == bound2->lower) return 0.0; else return get_float8_infinity(); } else { /* One bound is infinite, the other is not */ return get_float8_infinity(); } } /* * Calculate the average of function P(x), in the interval [length1, length2], * where P(x) is the fraction of tuples with length < x (or length <= x if * 'equal' is true). */ static double calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues, double length1, double length2, bool equal) { double frac; double A, B, PA, PB; double pos; int i; double area; Assert(length2 >= length1); if (length2 < 0.0) return 0.0; /* shouldn't happen, but doesn't hurt to check */ /* All lengths in the table are <= infinite. */ if (isinf(length2) && equal) return 1.0; /*---------- * The average of a function between A and B can be calculated by the * formula: * * B * 1 / * ------- | P(x)dx * B - A / * A * * The geometrical interpretation of the integral is the area under the * graph of P(x). P(x) is defined by the length histogram. We calculate * the area in a piecewise fashion, iterating through the length histogram * bins. Each bin is a trapezoid: * * P(x2) * /| * / | * P(x1)/ | * | | * | | * ---+---+-- * x1 x2 * * where x1 and x2 are the boundaries of the current histogram, and P(x1) * and P(x1) are the cumulative fraction of tuples at the boundaries. * * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1) * * The first bin contains the lower bound passed by the caller, so we * use linear interpolation between the previous and next histogram bin * boundary to calculate P(x1). 
Likewise for the last bin: we use linear * interpolation to calculate P(x2). For the bins in between, x1 and x2 * lie on histogram bin boundaries, so P(x1) and P(x2) are simply: * P(x1) = (bin index) / (number of bins) * P(x2) = (bin index + 1 / (number of bins) */ /* First bin, the one that contains lower bound */ i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal); if (i >= length_hist_nvalues - 1) return 1.0; if (i < 0) { i = 0; pos = 0.0; } else { /* interpolate length1's position in the bin */ pos = get_len_position(length1, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1])); } PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1); B = length1; /* * In the degenerate case that length1 == length2, simply return * P(length1). This is not merely an optimization: if length1 == length2, * we'd divide by zero later on. */ if (length2 == length1) return PB; /* * Loop through all the bins, until we hit the last bin, the one that * contains the upper bound. (if lower and upper bounds are in the same * bin, this falls out immediately) */ area = 0.0; for (; i < length_hist_nvalues - 1; i++) { double bin_upper = DatumGetFloat8(length_hist_values[i + 1]); /* check if we've reached the last bin */ if (!(bin_upper < length2 || (equal && bin_upper <= length2))) break; /* the upper bound of previous bin is the lower bound of this bin */ A = B; PA = PB; B = bin_upper; PB = (double) i / (double) (length_hist_nvalues - 1); /* * Add the area of this trapezoid to the total. The point of the * if-check is to avoid NaN, in the corner case that PA == PB == 0, * and B - A == Inf. The area of a zero-height trapezoid (PA == PB == * 0) is zero, regardless of the width (B - A). 
*/ if (PA > 0 || PB > 0) area += 0.5 * (PB + PA) * (B - A); } /* Last bin */ A = B; PA = PB; B = length2; /* last bin ends at the query upper bound */ if (i >= length_hist_nvalues - 1) pos = 0.0; else { if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1])) pos = 0.0; else pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1])); } PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1); if (PA > 0 || PB > 0) area += 0.5 * (PB + PA) * (B - A); /* * Ok, we have calculated the area, ie. the integral. Divide by width to * get the requested average. * * Avoid NaN arising from infinite / infinite. This happens at least if * length2 is infinite. It's not clear what the correct value would be in * that case, so 0.5 seems as good as any value. */ if (isinf(area) && isinf(length2)) frac = 0.5; else frac = area / (length2 - length1); return frac; } /* * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction * of multiranges that fall within the constant lower and upper bounds. This uses * the histograms of range lower bounds and range lengths, on the assumption * that the range lengths are independent of the lower bounds. * * The caller has already checked that constant lower and upper bounds are * finite. */ static double calc_hist_selectivity_contained(TypeCacheEntry *typcache, const RangeBound *lower, RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, Datum *length_hist_values, int length_hist_nvalues) { int i, upper_index; float8 prev_dist; double bin_width; double upper_bin_width; double sum_frac; /* * Begin by finding the bin containing the upper bound, in the lower bound * histogram. Any range with a lower bound > constant upper bound can't * match, ie. there are no matches in bins greater than upper_index. 
*/ upper->inclusive = !upper->inclusive; upper->lower = true; upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues, false); /* * If the upper bound value is below the histogram's lower limit, there * are no matches. */ if (upper_index < 0) return 0.0; /* * If the upper bound value is at or beyond the histogram's upper limit, * start our loop at the last actual bin, as though the upper bound were * within that bin; get_position will clamp its result to 1.0 anyway. * (This corresponds to assuming that the data population above the * histogram's upper limit is empty, exactly like what we just assumed for * the lower limit.) */ upper_index = Min(upper_index, hist_nvalues - 2); /* * Calculate upper_bin_width, ie. the fraction of the (upper_index, * upper_index + 1) bin which is greater than upper bound of query range * using linear interpolation of subdiff function. */ upper_bin_width = get_position(typcache, upper, &hist_lower[upper_index], &hist_lower[upper_index + 1]); /* * In the loop, dist and prev_dist are the distance of the "current" bin's * lower and upper bounds from the constant upper bound. * * bin_width represents the width of the current bin. Normally it is 1.0, * meaning a full width bin, but can be less in the corner cases: start * and end of the loop. We start with bin_width = upper_bin_width, because * we begin at the bin containing the upper bound. */ prev_dist = 0.0; bin_width = upper_bin_width; sum_frac = 0.0; for (i = upper_index; i >= 0; i--) { double dist; double length_hist_frac; bool final_bin = false; /* * dist -- distance from upper bound of query range to lower bound of * the current bin in the lower bound histogram. Or to the lower bound * of the constant range, if this is the final bin, containing the * constant lower bound. */ if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0) { dist = get_distance(typcache, lower, upper); /* * Subtract from bin_width the portion of this bin that we want to * ignore. 
*/ bin_width -= get_position(typcache, lower, &hist_lower[i], &hist_lower[i + 1]); if (bin_width < 0.0) bin_width = 0.0; final_bin = true; } else dist = get_distance(typcache, &hist_lower[i], upper); /* * Estimate the fraction of tuples in this bin that are narrow enough * to not exceed the distance to the upper bound of the query range. */ length_hist_frac = calc_length_hist_frac(length_hist_values, length_hist_nvalues, prev_dist, dist, true); /* * Add the fraction of tuples in this bin, with a suitable length, to * the total. */ sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1); if (final_bin) break; bin_width = 1.0; prev_dist = dist; } return sum_frac; } /* * Calculate selectivity of "var @> const" operator, ie. estimate the fraction * of multiranges that contain the constant lower and upper bounds. This uses * the histograms of range lower bounds and range lengths, on the assumption * that the range lengths are independent of the lower bounds. */ static double calc_hist_selectivity_contains(TypeCacheEntry *typcache, const RangeBound *lower, const RangeBound *upper, const RangeBound *hist_lower, int hist_nvalues, Datum *length_hist_values, int length_hist_nvalues) { int i, lower_index; double bin_width, lower_bin_width; double sum_frac; float8 prev_dist; /* Find the bin containing the lower bound of query range. */ lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues, true); /* * If the lower bound value is below the histogram's lower limit, there * are no matches. */ if (lower_index < 0) return 0.0; /* * If the lower bound value is at or beyond the histogram's upper limit, * start our loop at the last actual bin, as though the upper bound were * within that bin; get_position will clamp its result to 1.0 anyway. * (This corresponds to assuming that the data population above the * histogram's upper limit is empty, exactly like what we just assumed for * the lower limit.) 
*/ lower_index = Min(lower_index, hist_nvalues - 2); /* * Calculate lower_bin_width, ie. the fraction of the of (lower_index, * lower_index + 1) bin which is greater than lower bound of query range * using linear interpolation of subdiff function. */ lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index], &hist_lower[lower_index + 1]); /* * Loop through all the lower bound bins, smaller than the query lower * bound. In the loop, dist and prev_dist are the distance of the * "current" bin's lower and upper bounds from the constant upper bound. * We begin from query lower bound, and walk backwards, so the first bin's * upper bound is the query lower bound, and its distance to the query * upper bound is the length of the query range. * * bin_width represents the width of the current bin. Normally it is 1.0, * meaning a full width bin, except for the first bin, which is only * counted up to the constant lower bound. */ prev_dist = get_distance(typcache, lower, upper); sum_frac = 0.0; bin_width = lower_bin_width; for (i = lower_index; i >= 0; i--) { float8 dist; double length_hist_frac; /* * dist -- distance from upper bound of query range to current value * of lower bound histogram or lower bound of query range (if we've * reach it). */ dist = get_distance(typcache, &hist_lower[i], upper); /* * Get average fraction of length histogram which covers intervals * longer than (or equal to) distance to upper bound of query range. */ length_hist_frac = 1.0 - calc_length_hist_frac(length_hist_values, length_hist_nvalues, prev_dist, dist, false); sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1); bin_width = 1.0; prev_dist = dist; } return sum_frac; }