postgresql/src/backend/utils/adt/rangetypes_typanalyze.c

357 lines
10 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* rangetypes_typanalyze.c
* Functions for gathering statistics from range columns
*
* For a range type column, histograms of lower and upper bounds, and
* the fraction of NULL and empty ranges are collected.
*
* Both histograms have the same length, and they are combined into a
* single array of ranges. This has the same shape as the histogram that
* std_typanalyze would collect, but the values are different. Each range
* in the array is a valid range, even though the lower and upper bounds
* come from different tuples. In theory, the standard scalar selectivity
* functions could be used with the combined histogram.
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/utils/adt/rangetypes_typanalyze.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "catalog/pg_operator.h"
#include "commands/vacuum.h"
#include "utils/float.h"
#include "utils/fmgrprotos.h"
#include "utils/lsyscache.h"
#include "utils/rangetypes.h"
/*
 * Forward declarations for the file-local helpers defined below: the two
 * sort comparators and the compute_stats callback that range_typanalyze
 * installs into VacAttrStats.
 */
static int float8_qsort_cmp(const void *a1, const void *a2);
static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
static void compute_range_stats(VacAttrStats *stats,
AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
/*
 * range_typanalyze -- typanalyze function for range columns
 *
 * Argument 0 is a pointer to the VacAttrStats for the column.  The function
 * registers compute_range_stats as the statistics callback, hands it the
 * range type's typcache entry through extra_data, and sets the minimum
 * number of sample rows.  Always returns true.
 */
Datum
range_typanalyze(PG_FUNCTION_ARGS)
{
	VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
	Form_pg_attribute att = stats->attr;

	/*
	 * Look up the range type's cache entry first.  The column might be a
	 * domain, so resolve to the base type before the lookup.
	 */
	TypeCacheEntry *range_info = range_get_typcache(fcinfo,
													getBaseType(stats->attrtypid));

	/* A negative statistics target is replaced by the default target. */
	if (att->attstattarget < 0)
		att->attstattarget = default_statistics_target;

	/* Sample size rule is the same as in std_typanalyze. */
	stats->minrows = 300 * att->attstattarget;

	/* Wire up the callback and the data it will need. */
	stats->extra_data = range_info;
	stats->compute_stats = compute_range_stats;

	PG_RETURN_BOOL(true);
}
/*
 * qsort comparator for float8 values; used to sort range lengths into
 * ascending order.
 *
 * Returns -1 if *a1 < *a2, 0 if they compare equal, and 1 in every other
 * case.
 */
static int
float8_qsort_cmp(const void *a1, const void *a2)
{
	const float8 lhs = *(const float8 *) a1;
	const float8 rhs = *(const float8 *) a2;

	if (lhs < rhs)
		return -1;

	return (lhs == rhs) ? 0 : 1;
}
/*
 * qsort_arg comparator for RangeBound elements.
 *
 * The extra argument carries the range type's TypeCacheEntry; the actual
 * ordering is delegated to range_cmp_bounds().
 */
static int
range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
{
	return range_cmp_bounds((TypeCacheEntry *) arg,
							(RangeBound *) a1,
							(RangeBound *) a2);
}
/*
 * compute_range_stats() -- compute statistics for a range column
 *
 * stats:		per-column analyze state; extra_data must hold the range
 *				type's TypeCacheEntry (set up by range_typanalyze), and the
 *				results are written into its sta* and related fields.
 * fetchfunc:	callback used to fetch the range_no'th sample value.
 * samplerows:	number of sample rows to fetch.
 * totalrows:	not referenced by this function.
 *
 * The sample is scanned once, counting NULL and empty ranges and recording
 * the lower bound, upper bound and length of every non-empty range.  If at
 * least one non-null value was seen, the function stores the null fraction,
 * average width and distinct estimate, then emits:
 *
 *	- a bounds histogram slot (only when there are >= 2 non-empty ranges), and
 *	- a range-length histogram slot, which is always emitted and also carries
 *	  the fraction of empty ranges in stanumbers.
 *
 * If only NULLs were seen, the column is recorded as entirely null.  If no
 * rows were sampled at all, stats is left untouched.
 */
static void
compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
					int samplerows, double totalrows)
{
	TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
	bool		has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
	int			null_cnt = 0;
	int			non_null_cnt = 0;
	int			non_empty_cnt = 0;
	int			empty_cnt = 0;
	int			range_no;
	int			slot_idx;
	int			num_bins = stats->attr->attstattarget;
	int			num_hist;
	float8	   *lengths;
	RangeBound *lowers,
			   *uppers;
	double		total_width = 0;

	/* Allocate memory to hold range bounds and lengths of the sample ranges. */
	lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
	uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
	lengths = (float8 *) palloc(sizeof(float8) * samplerows);

	/* Loop over the sample ranges. */
	for (range_no = 0; range_no < samplerows; range_no++)
	{
		Datum		value;
		bool		isnull,
					empty;
		RangeType  *range;
		RangeBound	lower,
					upper;
		float8		length;

		vacuum_delay_point();

		value = fetchfunc(stats, range_no, &isnull);
		if (isnull)
		{
			/* range is null, just count that */
			null_cnt++;
			continue;
		}

		/*
		 * XXX: should we ignore wide values, like std_typanalyze does, to
		 * avoid bloating the statistics table?
		 */
		total_width += VARSIZE_ANY(DatumGetPointer(value));

		/* Get range and deserialize it for further analysis. */
		range = DatumGetRangeTypeP(value);
		range_deserialize(typcache, range, &lower, &upper, &empty);

		if (!empty)
		{
			/* Remember bounds and length for further usage in histograms */
			lowers[non_empty_cnt] = lower;
			uppers[non_empty_cnt] = upper;

			if (lower.infinite || upper.infinite)
			{
				/* Length of any kind of an infinite range is infinite */
				length = get_float8_infinity();
			}
			else if (has_subdiff)
			{
				/*
				 * For an ordinary range, use subdiff function between upper
				 * and lower bound values.
				 */
				length = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
														  typcache->rng_collation,
														  upper.val, lower.val));
			}
			else
			{
				/* Use default value of 1.0 if no subdiff is available. */
				length = 1.0;
			}
			lengths[non_empty_cnt] = length;

			non_empty_cnt++;
		}
		else
			empty_cnt++;

		non_null_cnt++;
	}

	slot_idx = 0;

	/* We can only compute real stats if we found some non-null values. */
	if (non_null_cnt > 0)
	{
		Datum	   *bound_hist_values;
		Datum	   *length_hist_values;
		int			pos,
					posfrac,
					delta,
					deltafrac,
					i;
		MemoryContext old_cxt;
		float4	   *emptyfrac;

		stats->stats_valid = true;

		/* Do the simple null-frac and width stats */
		stats->stanullfrac = (double) null_cnt / (double) samplerows;
		stats->stawidth = total_width / (double) non_null_cnt;

		/*
		 * Estimate that non-null values are unique.  A negative stadistinct
		 * is scaled by the non-null fraction so that NULLs are not counted
		 * as distinct values.
		 */
		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);

		/* Must copy the target values into anl_context */
		old_cxt = MemoryContextSwitchTo(stats->anl_context);

		/*
		 * Generate a bounds histogram slot entry if there are at least two
		 * values.
		 */
		if (non_empty_cnt >= 2)
		{
			/* Sort bound values */
			qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
					  range_bound_qsort_cmp, typcache);
			qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
					  range_bound_qsort_cmp, typcache);

			num_hist = non_empty_cnt;
			if (num_hist > num_bins)
				num_hist = num_bins + 1;

			bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));

			/*
			 * The object of this loop is to construct ranges from first and
			 * last entries in lowers[] and uppers[] along with evenly-spaced
			 * values in between.  So the i'th value is a range of lowers[(i *
			 * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
			 * (num_hist - 1)].  But computing that subscript directly risks
			 * integer overflow when the stats target is more than a couple
			 * thousand.  Instead we add (nvals - 1) / (num_hist - 1) to pos
			 * at each step, tracking the integral and fractional parts of the
			 * sum separately.
			 */
			delta = (non_empty_cnt - 1) / (num_hist - 1);
			deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
			pos = posfrac = 0;

			for (i = 0; i < num_hist; i++)
			{
				bound_hist_values[i] = PointerGetDatum(range_serialize(typcache,
																	   &lowers[pos],
																	   &uppers[pos],
																	   false));
				pos += delta;
				posfrac += deltafrac;
				if (posfrac >= (num_hist - 1))
				{
					/* fractional part exceeds 1, carry to integer part */
					pos++;
					posfrac -= (num_hist - 1);
				}
			}

			/*
			 * NOTE(review): staop/stacoll/statypid are not assigned for this
			 * slot -- presumably the caller's defaults apply; confirm.
			 */
			stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
			stats->stavalues[slot_idx] = bound_hist_values;
			stats->numvalues[slot_idx] = num_hist;
			slot_idx++;
		}

		/*
		 * Generate a length histogram slot entry if there are at least two
		 * values.
		 */
		if (non_empty_cnt >= 2)
		{
			/*
			 * Ascending sort of range lengths for further filling of
			 * histogram
			 */
			qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);

			num_hist = non_empty_cnt;
			if (num_hist > num_bins)
				num_hist = num_bins + 1;

			length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));

			/*
			 * The object of this loop is to copy the first and last lengths[]
			 * entries along with evenly-spaced values in between.  So the i'th
			 * value is lengths[(i * (nvals - 1)) / (num_hist - 1)].  But
			 * computing that subscript directly risks integer overflow when
			 * the stats target is more than a couple thousand.  Instead we
			 * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
			 * the integral and fractional parts of the sum separately.
			 */
			delta = (non_empty_cnt - 1) / (num_hist - 1);
			deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
			pos = posfrac = 0;

			for (i = 0; i < num_hist; i++)
			{
				length_hist_values[i] = Float8GetDatum(lengths[pos]);
				pos += delta;
				posfrac += deltafrac;
				if (posfrac >= (num_hist - 1))
				{
					/* fractional part exceeds 1, carry to integer part */
					pos++;
					posfrac -= (num_hist - 1);
				}
			}
		}
		else
		{
			/*
			 * Even when we don't create the histogram, store an empty array
			 * to mean "no histogram".  We can't just leave stavalues NULL,
			 * because get_attstatsslot() errors if you ask for stavalues, and
			 * it's NULL.  We'll still store the empty fraction in stanumbers.
			 */
			length_hist_values = palloc(0);
			num_hist = 0;
		}

		/* The length histogram holds plain float8 values, not ranges. */
		stats->staop[slot_idx] = Float8LessOperator;
		stats->stacoll[slot_idx] = InvalidOid;
		stats->stavalues[slot_idx] = length_hist_values;
		stats->numvalues[slot_idx] = num_hist;
		stats->statypid[slot_idx] = FLOAT8OID;
		stats->statyplen[slot_idx] = sizeof(float8);
		stats->statypbyval[slot_idx] = FLOAT8PASSBYVAL;
		stats->statypalign[slot_idx] = 'd';

		/* Store the fraction of empty ranges */
		emptyfrac = (float4 *) palloc(sizeof(float4));
		*emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
		stats->stanumbers[slot_idx] = emptyfrac;
		stats->numnumbers[slot_idx] = 1;

		stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
		slot_idx++;

		MemoryContextSwitchTo(old_cxt);
	}
	else if (null_cnt > 0)
	{
		/* We found only nulls; assume the column is entirely null */
		stats->stats_valid = true;
		stats->stanullfrac = 1.0;
		stats->stawidth = 0;	/* "unknown" */
		stats->stadistinct = 0.0;	/* "unknown" */
	}

	/*
	 * We don't need to bother cleaning up any of our temporary palloc's
	 * (lowers, uppers, lengths); they are left for the caller's memory
	 * context handling.
	 */
}