Fix edge-case crashes and misestimation in range containment selectivity.

When estimating the selectivity of "range_var <@ range_constant" or
"range_var @> range_constant", if the upper (or respectively lower)
bound of the range_constant was above the last bin of the range_var's
histogram, the code would access uninitialized memory and potentially
crash (though it seems the probability of a crash is quite low).
Handle the endpoint cases explicitly to fix that.

While at it, be more paranoid about the possibility of getting NaN
or other silly results from the range type's subdiff function.
And improve some comments.

Ordinarily we'd probably add a regression test case demonstrating
the bug in unpatched code.  But it's too hard to get it to crash
reliably because of the uninitialized-memory dependence, so skip that.

Per bug #16122 from Adam Scott.  It's been broken from the beginning,
apparently, so backpatch to all supported branches.

Diagnosis by Michael Paquier, patch by Andrey Borodin and Tom Lane.

Discussion: https://postgr.es/m/16122-eb35bc248c806c15@postgresql.org
This commit is contained in:
Tom Lane 2020-01-12 14:37:00 -05:00
parent 27676e22d3
commit 6bd567b658
1 changed files with 72 additions and 28 deletions

View File

@ -17,6 +17,8 @@
*/
#include "postgres.h"
#include <math.h>
#include "access/htup_details.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
@ -405,6 +407,13 @@ calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
NULL, NULL)))
return -1.0;
/* check that it's a histogram, not just a dummy entry */
if (nhist < 2)
{
free_attstatsslot(vardata->atttype, hist_values, nhist, NULL, 0);
return -1.0;
}
/*
* Convert histogram of ranges into histograms of its lower and upper
* bounds.
@ -693,7 +702,8 @@ get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
/*
* Both bounds are finite. Assuming the subtype's comparison function
* works sanely, the value must be finite, too, because it lies
* somewhere between the bounds. If it doesn't, just return something.
* somewhere between the bounds. If it doesn't, arbitrarily return
* 0.5.
*/
if (value->infinite)
return 0.5;
@ -703,21 +713,22 @@ get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
return 0.5;
/* Calculate relative position using subdiff function. */
bin_width = DatumGetFloat8(FunctionCall2Coll(
&typcache->rng_subdiff_finfo,
bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
typcache->rng_collation,
hist2->val,
hist1->val));
if (bin_width <= 0.0)
return 0.5; /* zero width bin */
if (isnan(bin_width) || bin_width <= 0.0)
return 0.5; /* punt for NaN or zero-width bin */
position = DatumGetFloat8(FunctionCall2Coll(
&typcache->rng_subdiff_finfo,
position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
typcache->rng_collation,
value->val,
hist1->val))
/ bin_width;
if (isnan(position))
return 0.5; /* punt for NaN from subdiff, Inf/Inf, etc */
/* Relative position must be in [0,1] range */
position = Max(position, 0.0);
position = Min(position, 1.0);
@ -809,15 +820,23 @@ get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
if (!bound1->infinite && !bound2->infinite)
{
/*
* No bounds are infinite, use subdiff function or return default
* Neither bound is infinite, use subdiff function or return default
* value of 1.0 if no subdiff is available.
*/
if (has_subdiff)
return
DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
typcache->rng_collation,
bound2->val,
bound1->val));
{
float8 res;
res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
typcache->rng_collation,
bound2->val,
bound1->val));
/* Reject possible NaN result, also negative result */
if (isnan(res) || res < 0.0)
return 1.0;
else
return res;
}
else
return 1.0;
}
@ -831,7 +850,7 @@ get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
}
else
{
/* One bound is infinite, another is not */
/* One bound is infinite, the other is not */
return get_float8_infinity();
}
}
@ -1027,17 +1046,31 @@ calc_hist_selectivity_contained(TypeCacheEntry *typcache,
upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
false);
/*
* If the upper bound value is below the histogram's lower limit, there
* are no matches.
*/
if (upper_index < 0)
return 0.0;
/*
* If the upper bound value is at or beyond the histogram's upper limit,
* start our loop at the last actual bin, as though the upper bound were
* within that bin; get_position will clamp its result to 1.0 anyway.
* (This corresponds to assuming that the data population above the
* histogram's upper limit is empty, exactly like what we just assumed for
* the lower limit.)
*/
upper_index = Min(upper_index, hist_nvalues - 2);
/*
* Calculate upper_bin_width, ie. the fraction of the (upper_index,
* upper_index + 1) bin which is greater than upper bound of query range
* using linear interpolation of subdiff function.
*/
if (upper_index >= 0 && upper_index < hist_nvalues - 1)
upper_bin_width = get_position(typcache, upper,
&hist_lower[upper_index],
&hist_lower[upper_index + 1]);
else
upper_bin_width = 0.0;
upper_bin_width = get_position(typcache, upper,
&hist_lower[upper_index],
&hist_lower[upper_index + 1]);
/*
* In the loop, dist and prev_dist are the distance of the "current" bin's
@ -1110,9 +1143,6 @@ calc_hist_selectivity_contained(TypeCacheEntry *typcache,
* of ranges that contain the constant lower and upper bounds. This uses
* the histograms of range lower bounds and range lengths, on the assumption
* that the range lengths are independent of the lower bounds.
*
* Note, this is "var @> const", ie. estimate the fraction of ranges that
* contain the constant lower and upper bounds.
*/
static double
calc_hist_selectivity_contains(TypeCacheEntry *typcache,
@ -1131,16 +1161,30 @@ calc_hist_selectivity_contains(TypeCacheEntry *typcache,
lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
true);
/*
* If the lower bound value is below the histogram's lower limit, there
* are no matches.
*/
if (lower_index < 0)
return 0.0;
/*
* If the lower bound value is at or beyond the histogram's upper limit,
* start our loop at the last actual bin, as though the upper bound were
* within that bin; get_position will clamp its result to 1.0 anyway.
* (This corresponds to assuming that the data population above the
* histogram's upper limit is empty, exactly like what we just assumed for
* the lower limit.)
*/
lower_index = Min(lower_index, hist_nvalues - 2);
/*
* Calculate lower_bin_width, ie. the fraction of the of (lower_index,
* lower_index + 1) bin which is greater than lower bound of query range
* using linear interpolation of subdiff function.
*/
if (lower_index >= 0 && lower_index < hist_nvalues - 1)
lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
&hist_lower[lower_index + 1]);
else
lower_bin_width = 0.0;
lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
&hist_lower[lower_index + 1]);
/*
* Loop through all the lower bound bins, smaller than the query lower