postgresql/src/backend/utils/adt/array_selfuncs.c

1194 lines
32 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* array_selfuncs.c
* Functions for selectivity estimation of array operators
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/utils/adt/array_selfuncs.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <math.h>
#include "access/htup_details.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/selfuncs.h"
#include "utils/typcache.h"
/* Default selectivity constant for the containment operators "@>" and "<@" */
#define DEFAULT_CONTAIN_SEL 0.005

/* Default selectivity constant for the overlap operator "&&" */
#define DEFAULT_OVERLAP_SEL 0.01

/*
 * Pick the default selectivity for an operator OID: the overlap constant
 * when it is OID_ARRAY_OVERLAP_OP, the containment constant for anything else.
 */
#define DEFAULT_SEL(opno) \
	((opno) != OID_ARRAY_OVERLAP_OP ? \
	 DEFAULT_CONTAIN_SEL : DEFAULT_OVERLAP_SEL)
static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
Oid elemtype, Oid operator);
static Selectivity mcelem_array_selec(ArrayType *array,
TypeCacheEntry *typentry,
Datum *mcelem, int nmcelem,
float4 *numbers, int nnumbers,
float4 *hist, int nhist,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
Oid operator);
static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
float4 *numbers, int nnumbers,
Datum *array_data, int nitems,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
Oid operator, TypeCacheEntry *typentry);
static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
float4 *numbers, int nnumbers,
Datum *array_data, int nitems,
float4 *hist, int nhist,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
Oid operator, TypeCacheEntry *typentry);
static float *calc_hist(const float4 *hist, int nhist, int n);
static float *calc_distr(const float *p, int n, int m, float rest);
static int floor_log2(uint32 n);
static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
int *index, TypeCacheEntry *typentry);
static int element_compare(const void *key1, const void *key2, void *arg);
static int float_compare_desc(const void *key1, const void *key2);
/*
* scalararraysel_containment
* Estimate selectivity of ScalarArrayOpExpr via array containment.
*
* If we have const =/<> ANY/ALL (array_var) then we can estimate the
* selectivity as though this were an array containment operator,
* array_var op ARRAY[const].
*
* scalararraysel() has already verified that the ScalarArrayOpExpr's operator
* is the array element type's default equality or inequality operator, and
* has aggressively simplified both inputs to constants.
*
* Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
*/
Selectivity
scalararraysel_containment(PlannerInfo *root,
Node *leftop, Node *rightop,
Oid elemtype, bool isEquality, bool useOr,
int varRelid)
{
Selectivity selec;
VariableStatData vardata;
Datum constval;
TypeCacheEntry *typentry;
FmgrInfo *cmpfunc;
/*
* rightop must be a variable, else punt.
*/
examine_variable(root, rightop, varRelid, &vardata);
if (!vardata.rel)
{
ReleaseVariableStats(vardata);
return -1.0;
}
/*
* leftop must be a constant, else punt.
*/
if (!IsA(leftop, Const))
{
ReleaseVariableStats(vardata);
return -1.0;
}
if (((Const *) leftop)->constisnull)
{
/* qual can't succeed if null on left */
ReleaseVariableStats(vardata);
return (Selectivity) 0.0;
}
constval = ((Const *) leftop)->constvalue;
/* Get element type's default comparison function */
typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
{
ReleaseVariableStats(vardata);
return -1.0;
}
cmpfunc = &typentry->cmp_proc_finfo;
/*
* If the operator is <>, swap ANY/ALL, then invert the result later.
*/
if (!isEquality)
useOr = !useOr;
/* Get array element stats for var, if available */
if (HeapTupleIsValid(vardata.statsTuple) &&
statistic_proc_security_check(&vardata, cmpfunc->fn_oid))
{
Form_pg_statistic stats;
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
AttStatsSlot sslot;
AttStatsSlot hslot;
stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
/* MCELEM will be an array of same type as element */
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
if (get_attstatsslot(&sslot, vardata.statsTuple,
STATISTIC_KIND_MCELEM, InvalidOid,
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
{
/* For ALL case, also get histogram of distinct-element counts */
if (useOr ||
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
!get_attstatsslot(&hslot, vardata.statsTuple,
STATISTIC_KIND_DECHIST, InvalidOid,
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
ATTSTATSSLOT_NUMBERS))
memset(&hslot, 0, sizeof(hslot));
/*
* For = ANY, estimate as var @> ARRAY[const].
*
* For = ALL, estimate as var <@ ARRAY[const].
*/
if (useOr)
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
selec = mcelem_array_contain_overlap_selec(sslot.values,
sslot.nvalues,
sslot.numbers,
sslot.nnumbers,
&constval, 1,
OID_ARRAY_CONTAINS_OP,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
typentry);
else
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
selec = mcelem_array_contained_selec(sslot.values,
sslot.nvalues,
sslot.numbers,
sslot.nnumbers,
&constval, 1,
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
hslot.numbers,
hslot.nnumbers,
OID_ARRAY_CONTAINED_OP,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
typentry);
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
free_attstatsslot(&hslot);
free_attstatsslot(&sslot);
}
else
{
/* No most-common-elements info, so do without */
if (useOr)
selec = mcelem_array_contain_overlap_selec(NULL, 0,
NULL, 0,
&constval, 1,
OID_ARRAY_CONTAINS_OP,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
typentry);
else
selec = mcelem_array_contained_selec(NULL, 0,
NULL, 0,
&constval, 1,
NULL, 0,
OID_ARRAY_CONTAINED_OP,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
typentry);
}
/*
* MCE stats count only non-null rows, so adjust for null rows.
*/
selec *= (1.0 - stats->stanullfrac);
}
else
{
/* No stats at all, so do without */
if (useOr)
selec = mcelem_array_contain_overlap_selec(NULL, 0,
NULL, 0,
&constval, 1,
OID_ARRAY_CONTAINS_OP,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
typentry);
else
selec = mcelem_array_contained_selec(NULL, 0,
NULL, 0,
&constval, 1,
NULL, 0,
OID_ARRAY_CONTAINED_OP,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
typentry);
/* we assume no nulls here, so no stanullfrac correction */
}
ReleaseVariableStats(vardata);
/*
* If the operator is <>, invert the results.
*/
if (!isEquality)
selec = 1.0 - selec;
CLAMP_PROBABILITY(selec);
return selec;
}
/*
* arraycontsel -- restriction selectivity for array @>, &&, <@ operators
*/
Datum
arraycontsel(PG_FUNCTION_ARGS)
{
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
Oid operator = PG_GETARG_OID(1);
List *args = (List *) PG_GETARG_POINTER(2);
int varRelid = PG_GETARG_INT32(3);
VariableStatData vardata;
Node *other;
bool varonleft;
Selectivity selec;
Oid element_typeid;
/*
* If expression is not (variable op something) or (something op
* variable), then punt and return a default estimate.
*/
if (!get_restriction_variable(root, args, varRelid,
&vardata, &other, &varonleft))
PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
/*
* Can't do anything useful if the something is not a constant, either.
*/
if (!IsA(other, Const))
{
ReleaseVariableStats(vardata);
PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
}
/*
* The "&&", "@>" and "<@" operators are strict, so we can cope with a
* NULL constant right away.
*/
if (((Const *) other)->constisnull)
{
ReleaseVariableStats(vardata);
PG_RETURN_FLOAT8(0.0);
}
/*
* If var is on the right, commute the operator, so that we can assume the
* var is on the left in what follows.
*/
if (!varonleft)
{
if (operator == OID_ARRAY_CONTAINS_OP)
operator = OID_ARRAY_CONTAINED_OP;
else if (operator == OID_ARRAY_CONTAINED_OP)
operator = OID_ARRAY_CONTAINS_OP;
}
/*
* OK, there's a Var and a Const we're dealing with here. We need the
* Const to be an array with same element type as column, else we can't do
* anything useful. (Such cases will likely fail at runtime, but here
* we'd rather just return a default estimate.)
*/
element_typeid = get_base_element_type(((Const *) other)->consttype);
if (element_typeid != InvalidOid &&
element_typeid == get_base_element_type(vardata.vartype))
{
selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
element_typeid, operator);
}
else
{
selec = DEFAULT_SEL(operator);
}
ReleaseVariableStats(vardata);
CLAMP_PROBABILITY(selec);
PG_RETURN_FLOAT8((float8) selec);
}
/*
 * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
 *
 * Currently only a stub: the sole input consulted is the operator OID
 * (call argument 1), and the result is that operator's default selectivity
 * as given by DEFAULT_SEL().
 */
Datum
arraycontjoinsel(PG_FUNCTION_ARGS)
{
	/* No statistics are examined yet; just hand back the default. */
	PG_RETURN_FLOAT8(DEFAULT_SEL(PG_GETARG_OID(1)));
}
/*
* Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
* or "arraycolumn <@ const" based on the statistics
*
* This function is mainly responsible for extracting the pg_statistic data
* to be used; we then pass the problem on to mcelem_array_selec().
*/
static Selectivity
calc_arraycontsel(VariableStatData *vardata, Datum constval,
Oid elemtype, Oid operator)
{
Selectivity selec;
TypeCacheEntry *typentry;
FmgrInfo *cmpfunc;
ArrayType *array;
/* Get element type's default comparison function */
typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
return DEFAULT_SEL(operator);
cmpfunc = &typentry->cmp_proc_finfo;
/*
* The caller made sure the const is an array with same element type, so
* get it now
*/
array = DatumGetArrayTypeP(constval);
if (HeapTupleIsValid(vardata->statsTuple) &&
statistic_proc_security_check(vardata, cmpfunc->fn_oid))
{
Form_pg_statistic stats;
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
AttStatsSlot sslot;
AttStatsSlot hslot;
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
/* MCELEM will be an array of same type as column */
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
if (get_attstatsslot(&sslot, vardata->statsTuple,
STATISTIC_KIND_MCELEM, InvalidOid,
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
{
/*
* For "array <@ const" case we also need histogram of distinct
* element counts.
*/
if (operator != OID_ARRAY_CONTAINED_OP ||
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
!get_attstatsslot(&hslot, vardata->statsTuple,
STATISTIC_KIND_DECHIST, InvalidOid,
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
ATTSTATSSLOT_NUMBERS))
memset(&hslot, 0, sizeof(hslot));
/* Use the most-common-elements slot for the array Var. */
selec = mcelem_array_selec(array, typentry,
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
sslot.values, sslot.nvalues,
sslot.numbers, sslot.nnumbers,
hslot.numbers, hslot.nnumbers,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
operator);
Redesign get_attstatsslot()/free_attstatsslot() for more safety and speed. The mess cleaned up in commit da0759600 is clear evidence that it's a bug hazard to expect the caller of get_attstatsslot()/free_attstatsslot() to provide the correct type OID for the array elements in the slot. Moreover, we weren't even getting any performance benefit from that, since get_attstatsslot() was extracting the real type OID from the array anyway. So we ought to get rid of that requirement; indeed, it would make more sense for get_attstatsslot() to pass back the type OID it found, in case the caller isn't sure what to expect, which is likely in binary- compatible-operator cases. Another problem with the current implementation is that if the stats array element type is pass-by-reference, we incur a palloc/memcpy/pfree cycle for each element. That seemed acceptable when the code was written because we were targeting O(10) array sizes --- but these days, stats arrays are almost always bigger than that, sometimes much bigger. We can save a significant number of cycles by doing one palloc/memcpy/pfree of the whole array. Indeed, in the now-probably-common case where the array is toasted, that happens anyway so this method is basically free. (Note: although the catcache code will inline any out-of-line toasted values, it doesn't decompress them. At the other end of the size range, it doesn't expand short-header datums either. In either case, DatumGetArrayTypeP would have to make a copy. We do end up using an extra array copy step if the element type is pass-by-value and the array length is neither small enough for a short header nor large enough to have suffered compression. But that seems like a very acceptable price for winning in pass-by-ref cases.) Hence, redesign to take these insights into account. While at it, convert to an API in which we fill a struct rather than passing a bunch of pointers to individual output arguments. 
That will make it less painful if we ever want further expansion of what get_attstatsslot can pass back. It's certainly arguable that this is new development and not something to push post-feature-freeze. However, I view it as primarily bug-proofing and therefore something that's better to have sooner not later. Since we aren't quite at beta phase yet, let's put it in. Discussion: https://postgr.es/m/16364.1494520862@sss.pgh.pa.us
2017-05-13 21:14:39 +02:00
free_attstatsslot(&hslot);
free_attstatsslot(&sslot);
}
else
{
/* No most-common-elements info, so do without */
selec = mcelem_array_selec(array, typentry,
NULL, 0, NULL, 0, NULL, 0,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
operator);
}
/*
* MCE stats count only non-null rows, so adjust for null rows.
*/
selec *= (1.0 - stats->stanullfrac);
}
else
{
/* No stats at all, so do without */
selec = mcelem_array_selec(array, typentry,
NULL, 0, NULL, 0, NULL, 0,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
operator);
/* we assume no nulls here, so no stanullfrac correction */
}
/* If constant was toasted, release the copy we made */
if (PointerGetDatum(array) != constval)
pfree(array);
return selec;
}
/*
* Array selectivity estimation based on most common elements statistics
*
* This function just deconstructs and sorts the array constant's contents,
* and then passes the problem on to mcelem_array_contain_overlap_selec or
* mcelem_array_contained_selec depending on the operator.
*/
static Selectivity
mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry,
Datum *mcelem, int nmcelem,
float4 *numbers, int nnumbers,
float4 *hist, int nhist,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
Oid operator)
{
Selectivity selec;
int num_elems;
Datum *elem_values;
bool *elem_nulls;
bool null_present;
int nonnull_nitems;
int i;
/*
* Prepare constant array data for sorting. Sorting lets us find unique
* elements and efficiently merge with the MCELEM array.
*/
deconstruct_array(array,
typentry->type_id,
typentry->typlen,
typentry->typbyval,
typentry->typalign,
&elem_values, &elem_nulls, &num_elems);
/* Collapse out any null elements */
nonnull_nitems = 0;
null_present = false;
for (i = 0; i < num_elems; i++)
{
if (elem_nulls[i])
null_present = true;
else
elem_values[nonnull_nitems++] = elem_values[i];
}
/*
* Query "column @> '{anything, null}'" matches nothing. For the other
* two operators, presence of a null in the constant can be ignored.
*/
if (null_present && operator == OID_ARRAY_CONTAINS_OP)
{
pfree(elem_values);
pfree(elem_nulls);
return (Selectivity) 0.0;
}
/* Sort extracted elements using their default comparison function. */
qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
element_compare, typentry);
/* Separate cases according to operator */
if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
numbers, nnumbers,
elem_values, nonnull_nitems,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
operator, typentry);
else if (operator == OID_ARRAY_CONTAINED_OP)
selec = mcelem_array_contained_selec(mcelem, nmcelem,
numbers, nnumbers,
elem_values, nonnull_nitems,
hist, nhist,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
operator, typentry);
else
{
elog(ERROR, "arraycontsel called for unrecognized operator %u",
operator);
selec = 0.0; /* keep compiler quiet */
}
pfree(elem_values);
pfree(elem_nulls);
return selec;
}
/*
* Estimate selectivity of "column @> const" and "column && const" based on
* most common element statistics. This estimation assumes element
* occurrences are independent.
*
* mcelem (of length nmcelem) and numbers (of length nnumbers) are from
* the array column's MCELEM statistics slot, or are NULL/0 if stats are
* not available. array_data (of length nitems) is the constant's elements.
*
* Both the mcelem and array_data arrays are assumed presorted according
* to the element type's cmpfunc. Null elements are not present.
*
* TODO: this estimate probably could be improved by using the distinct
* elements count histogram. For example, excepting the special case of
* "column @> '{}'", we can multiply the calculated selectivity by the
* fraction of nonempty arrays in the column.
*/
static Selectivity
mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
float4 *numbers, int nnumbers,
Datum *array_data, int nitems,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
Oid operator, TypeCacheEntry *typentry)
{
Selectivity selec,
elem_selec;
int mcelem_index,
i;
bool use_bsearch;
float4 minfreq;
/*
* There should be three more Numbers than Values, because the last three
* cells should hold minimal and maximal frequency among the non-null
* elements, and then the frequency of null elements. Ignore the Numbers
* if not right.
*/
if (nnumbers != nmcelem + 3)
{
numbers = NULL;
nnumbers = 0;
}
if (numbers)
{
/* Grab the lowest observed frequency */
minfreq = numbers[nmcelem];
}
else
{
/* Without statistics make some default assumptions */
2012-10-07 23:31:10 +02:00
minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
}
/* Decide whether it is faster to use binary search or not. */
if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
use_bsearch = true;
else
use_bsearch = false;
if (operator == OID_ARRAY_CONTAINS_OP)
{
/*
* Initial selectivity for "column @> const" query is 1.0, and it will
* be decreased with each element of constant array.
*/
selec = 1.0;
}
else
{
/*
* Initial selectivity for "column && const" query is 0.0, and it will
* be increased with each element of constant array.
*/
selec = 0.0;
}
/* Scan mcelem and array in parallel. */
mcelem_index = 0;
for (i = 0; i < nitems; i++)
{
bool match = false;
/* Ignore any duplicates in the array data. */
if (i > 0 &&
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
element_compare(&array_data[i - 1], &array_data[i], typentry) == 0)
continue;
/* Find the smallest MCELEM >= this array item. */
if (use_bsearch)
{
match = find_next_mcelem(mcelem, nmcelem, array_data[i],
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
&mcelem_index, typentry);
}
else
{
while (mcelem_index < nmcelem)
{
int cmp = element_compare(&mcelem[mcelem_index],
&array_data[i],
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
typentry);
if (cmp < 0)
mcelem_index++;
else
{
if (cmp == 0)
match = true; /* mcelem is found */
break;
}
}
}
if (match && numbers)
{
/* MCELEM matches the array item; use its frequency. */
elem_selec = numbers[mcelem_index];
mcelem_index++;
}
else
{
/*
* The element is not in MCELEM. Punt, but assume that the
* selectivity cannot be more than minfreq / 2.
*/
elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
}
/*
* Update overall selectivity using the current element's selectivity
* and an assumption of element occurrence independence.
*/
if (operator == OID_ARRAY_CONTAINS_OP)
selec *= elem_selec;
else
selec = selec + elem_selec - selec * elem_selec;
/* Clamp intermediate results to stay sane despite roundoff error */
CLAMP_PROBABILITY(selec);
}
return selec;
}
/*
* Estimate selectivity of "column <@ const" based on most common element
* statistics.
*
* mcelem (of length nmcelem) and numbers (of length nnumbers) are from
* the array column's MCELEM statistics slot, or are NULL/0 if stats are
* not available. array_data (of length nitems) is the constant's elements.
* hist (of length nhist) is from the array column's DECHIST statistics slot,
* or is NULL/0 if those stats are not available.
*
* Both the mcelem and array_data arrays are assumed presorted according
* to the element type's cmpfunc. Null elements are not present.
*
* Independent element occurrence would imply a particular distribution of
* distinct element counts among matching rows. Real data usually falsifies
* that assumption. For example, in a set of 11-element integer arrays having
* elements in the range [0..10], element occurrences are typically not
* independent. If they were, a sufficiently-large set would include all
* distinct element counts 0 through 11. We correct for this using the
* histogram of distinct element counts.
*
* In the "column @> const" and "column && const" cases, we usually have a
* "const" with low number of elements (otherwise we have selectivity close
* to 0 or 1 respectively). That's why the effect of dependence related
* to distinct element count distribution is negligible there. In the
* "column <@ const" case, number of elements is usually high (otherwise we
* have selectivity close to 0). That's why we should do a correction with
* the array distinct element count distribution here.
*
* Using the histogram of distinct element counts produces a different
* distribution law than independent occurrences of elements. This
* distribution law can be described as follows:
*
* P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
* (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
*
* where:
* o1, o2, ..., on - occurrences of elements 1, 2, ..., n
* (1 - occurrence, 0 - no occurrence) in row
* f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
* (scalar values in [0..1]) according to collected statistics
* m = o1 + o2 + ... + on = total number of distinct elements in row
* hist[m] - histogram data for occurrence of m elements.
* ind[m] - probability of m occurrences from n events assuming their
* probabilities to be equal to frequencies of array elements.
*
* ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
* ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + ... + on = m
*/
static Selectivity
mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
float4 *numbers, int nnumbers,
Datum *array_data, int nitems,
float4 *hist, int nhist,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
Oid operator, TypeCacheEntry *typentry)
{
int mcelem_index,
i,
unique_nitems = 0;
float selec,
minfreq,
nullelem_freq;
float *dist,
*mcelem_dist,
*hist_part;
float avg_count,
mult,
rest;
float *elem_selec;
/*
* There should be three more Numbers than Values in the MCELEM slot,
* because the last three cells should hold minimal and maximal frequency
* among the non-null elements, and then the frequency of null elements.
* Punt if not right, because we can't do much without the element freqs.
*/
if (numbers == NULL || nnumbers != nmcelem + 3)
return DEFAULT_CONTAIN_SEL;
/* Can't do much without a count histogram, either */
if (hist == NULL || nhist < 3)
return DEFAULT_CONTAIN_SEL;
/*
* Grab some of the summary statistics that compute_array_stats() stores:
* lowest frequency, frequency of null elements, and average distinct
* element count.
*/
minfreq = numbers[nmcelem];
nullelem_freq = numbers[nmcelem + 2];
avg_count = hist[nhist - 1];
/*
* "rest" will be the sum of the frequencies of all elements not
* represented in MCELEM. The average distinct element count is the sum
* of the frequencies of *all* elements. Begin with that; we will proceed
* to subtract the MCELEM frequencies.
*/
rest = avg_count;
/*
* mult is a multiplier representing estimate of probability that each
* mcelem that is not present in constant doesn't occur.
*/
mult = 1.0f;
/*
* elem_selec is array of estimated frequencies for elements in the
* constant.
*/
elem_selec = (float *) palloc(sizeof(float) * nitems);
/* Scan mcelem and array in parallel. */
mcelem_index = 0;
for (i = 0; i < nitems; i++)
{
bool match = false;
/* Ignore any duplicates in the array data. */
if (i > 0 &&
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
element_compare(&array_data[i - 1], &array_data[i], typentry) == 0)
continue;
/*
* Iterate over MCELEM until we find an entry greater than or equal to
* this element of the constant. Update "rest" and "mult" for mcelem
* entries skipped over.
*/
while (mcelem_index < nmcelem)
{
int cmp = element_compare(&mcelem[mcelem_index],
&array_data[i],
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
typentry);
if (cmp < 0)
{
mult *= (1.0f - numbers[mcelem_index]);
rest -= numbers[mcelem_index];
mcelem_index++;
}
else
{
if (cmp == 0)
Phase 2 of pgindent updates. Change pg_bsd_indent to follow upstream rules for placement of comments to the right of code, and remove pgindent hack that caused comments following #endif to not obey the general rule. Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using the published version of pg_bsd_indent, but a hacked-up version that tried to minimize the amount of movement of comments to the right of code. The situation of interest is where such a comment has to be moved to the right of its default placement at column 33 because there's code there. BSD indent has always moved right in units of tab stops in such cases --- but in the previous incarnation, indent was working in 8-space tab stops, while now it knows we use 4-space tabs. So the net result is that in about half the cases, such comments are placed one tab stop left of before. This is better all around: it leaves more room on the line for comment text, and it means that in such cases the comment uniformly starts at the next 4-space tab stop after the code, rather than sometimes one and sometimes two tabs after. Also, ensure that comments following #endif are indented the same as comments following other preprocessor commands such as #else. That inconsistency turns out to have been self-inflicted damage from a poorly-thought-through post-indent "fixup" in pgindent. This patch is much less interesting than the first round of indent changes, but also bulkier, so I thought it best to separate the effects. Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 21:18:54 +02:00
match = true; /* mcelem is found */
break;
}
}
if (match)
{
/* MCELEM matches the array item. */
elem_selec[unique_nitems] = numbers[mcelem_index];
/* "rest" is decremented for all mcelems, matched or not */
rest -= numbers[mcelem_index];
mcelem_index++;
}
else
{
/*
* The element is not in MCELEM. Punt, but assume that the
* selectivity cannot be more than minfreq / 2.
*/
elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
minfreq / 2);
}
unique_nitems++;
}
/*
* If we handled all constant elements without exhausting the MCELEM
* array, finish walking it to complete calculation of "rest" and "mult".
*/
while (mcelem_index < nmcelem)
{
mult *= (1.0f - numbers[mcelem_index]);
rest -= numbers[mcelem_index];
mcelem_index++;
}
/*
* The presence of many distinct rare elements materially decreases
* selectivity. Use the Poisson distribution to estimate the probability
* of a column value having zero occurrences of such elements. See above
* for the definition of "rest".
*/
mult *= exp(-rest);
/*----------
* Using the distinct element count histogram requires
* O(unique_nitems * (nmcelem + unique_nitems))
* operations. Beyond a certain computational cost threshold, it's
* reasonable to sacrifice accuracy for decreased planning time. We limit
* the number of operations to EFFORT * nmcelem; since nmcelem is limited
* by the column's statistics target, the work done is user-controllable.
*
* If the number of operations would be too large, we can reduce it
* without losing all accuracy by reducing unique_nitems and considering
* only the most-common elements of the constant array. To make the
* results exactly match what we would have gotten with only those
* elements to start with, we'd have to remove any discarded elements'
* frequencies from "mult", but since this is only an approximation
* anyway, we don't bother with that. Therefore it's sufficient to qsort
* elem_selec[] and take the largest elements. (They will no longer match
* up with the elements of array_data[], but we don't care.)
*----------
*/
#define EFFORT 100
if ((nmcelem + unique_nitems) > 0 &&
unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
{
/*
* Use the quadratic formula to solve for largest allowable N. We
* have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
*/
double b = (double) nmcelem;
int n;
n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
/* Sort, then take just the first n elements */
qsort(elem_selec, unique_nitems, sizeof(float),
float_compare_desc);
unique_nitems = n;
}
/*
* Calculate probabilities of each distinct element count for both mcelems
* and constant elements. At this point, assume independent element
* occurrence.
*/
dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
/* ignore hist[nhist-1], which is the average not a histogram member */
hist_part = calc_hist(hist, nhist - 1, unique_nitems);
selec = 0.0f;
for (i = 0; i <= unique_nitems; i++)
{
/*
* mult * dist[i] / mcelem_dist[i] gives us probability of qual
* matching from assumption of independent element occurrence with the
* condition that distinct element count = i.
*/
if (mcelem_dist[i] > 0)
selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
}
pfree(dist);
pfree(mcelem_dist);
pfree(hist_part);
pfree(elem_selec);
/* Take into account occurrence of NULL element. */
selec *= (1.0f - nullelem_freq);
CLAMP_PROBABILITY(selec);
return selec;
}
/*
* Calculate the first n distinct element count probabilities from a
* histogram of distinct element counts.
*
* Returns a palloc'd array of n+1 entries, with array[k] being the
* probability of element count k, k in [0..n].
*
* We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
* (nhist - 1)) probability to each value in (a,b) and an additional half of
* that to a and b themselves.
*/
static float *
calc_hist(const float4 *hist, int nhist, int n)
{
	/* Result has one slot per element count k in [0..n]. */
	float	   *result = (float *) palloc((n + 1) * sizeof(float));

	/*
	 * The nhist histogram bounds delimit nhist - 1 boxes; every box carries
	 * the same share of the total probability.
	 */
	float		box_weight = 1.0f / ((float) (nhist - 1));

	/* Width of the box lying just below the most recently seen bound. */
	float		width_before = 0;
	float		width_after;
	int			pos = 0;		/* next histogram entry not yet consumed */
	int			k;

	for (k = 0; k <= n; k++)
	{
		int			first = pos;
		int			nbounds;
		float		weight;

		/*
		 * Consume every histogram bound that is <= k.  Bounds are stored as
		 * floats, so a bound with a fractional part is attributed to the
		 * next larger integer k.
		 */
		while (pos < nhist && hist[pos] <= k)
			pos++;
		nbounds = pos - first;

		if (nbounds == 0)
		{
			/*
			 * k lies strictly inside a box (or outside the histogram): it
			 * gets that box's per-value share, or nothing when no box with
			 * positive width is open.
			 */
			result[k] = (width_before > 0) ? box_weight / width_before : 0.0f;
			continue;
		}

		/* k coincides with one or more bounds; measure the box above it. */
		width_after = (pos < nhist) ? hist[pos] - hist[pos - 1] : 0;

		/*
		 * nbounds - 1 boxes are degenerate and contain only k, each giving
		 * its whole share.  The boxes on either side of k give k half of
		 * their per-value share.
		 */
		weight = (float) (nbounds - 1);
		if (width_after > 0)
			weight += 0.5f / width_after;
		if (width_before > 0)
			weight += 0.5f / width_before;
		result[k] = box_weight * weight;

		width_before = width_after;
	}

	return result;
}
/*
* Consider n independent events with probabilities p[]. This function
* calculates, for each k in [0..m], the probability that exactly k of the
* events occur. Returns a palloc'd array of size m+1.
*
* "rest" is the sum of the probabilities of all low-probability events not
* included in p.
*
* Imagine matrix M of size (n + 1) x (m + 1). Element M[i,j] denotes the
* probability that exactly j of first i events occur. Obviously M[0,0] = 1.
* For any constant j, each increment of i increases the probability iff the
* event occurs. So, by the law of total probability:
* M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
* for i > 0, j > 0.
* M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
*/
static float *
calc_distr(const float *p, int n, int m, float rest)
{
	float	   *row,
			   *prev_row,
			   *tmp;
	int			i,
				j;

	/*
	 * Since we return only the last row of the matrix and need only the
	 * current and previous row for calculations, allocate two rows.
	 */
	row = (float *) palloc((m + 1) * sizeof(float));
	prev_row = (float *) palloc((m + 1) * sizeof(float));

	/*
	 * Start both rows out as all zeroes.  The recurrence below stores only
	 * entries 0..Min(i, m) of row i, so when n < m the entries n+1..m would
	 * otherwise never be written, yet they are read by the Poisson
	 * convolution below and are part of the returned array.  Zero is the
	 * correct value for them: more than n of n events cannot occur.
	 */
	for (j = 0; j <= m; j++)
	{
		row[j] = 0.0f;
		prev_row[j] = 0.0f;
	}

	/* M[0,0] = 1 */
	row[0] = 1.0f;
	for (i = 1; i <= n; i++)
	{
		/* probability of the i'th event; p[] is zero-based */
		float		t = p[i - 1];

		/* Swap rows */
		tmp = row;
		row = prev_row;
		prev_row = tmp;

		/* Calculate next row */
		for (j = 0; j <= i && j <= m; j++)
		{
			float		val = 0.0f;

			/* event i does not occur: j of the first i - 1 events did */
			if (j < i)
				val += prev_row[j] * (1.0f - t);
			/* event i occurs: j - 1 of the first i - 1 events did */
			if (j > 0)
				val += prev_row[j - 1] * t;
			row[j] = val;
		}
	}

	/*
	 * The presence of many distinct rare (not in "p") elements materially
	 * decreases selectivity.  Model their collective occurrence with the
	 * Poisson distribution.
	 */
	if (rest > DEFAULT_CONTAIN_SEL)
	{
		float		t;

		/* Swap rows */
		tmp = row;
		row = prev_row;
		prev_row = tmp;

		for (i = 0; i <= m; i++)
			row[i] = 0.0f;

		/* Value of Poisson distribution for 0 occurrences */
		t = exp(-rest);

		/*
		 * Calculate convolution of previously computed distribution and the
		 * Poisson distribution.
		 */
		for (i = 0; i <= m; i++)
		{
			for (j = 0; j <= m - i; j++)
				row[j + i] += prev_row[j] * t;

			/* Get Poisson distribution value for (i + 1) occurrences */
			t *= rest / (float) (i + 1);
		}
	}

	/* Only the final row is handed back; the scratch row is released. */
	pfree(prev_row);
	return row;
}
/* Fast function for floor value of 2 based logarithm calculation. */
static int
floor_log2(uint32 n)
{
	int			result = 0;
	int			shift;

	/* log2(0) is undefined; report that as -1 */
	if (n == 0)
		return -1;

	/*
	 * Binary search for the highest set bit: try to strip 16, 8, 4, 2 and
	 * finally 1 low-order bits, accumulating the width of every step that
	 * still leaves a nonzero value.
	 */
	for (shift = 16; shift > 0; shift >>= 1)
	{
		if ((n >> shift) != 0)
		{
			n >>= shift;
			result += shift;
		}
	}

	return result;
}
/*
* find_next_mcelem binary-searches a most common elements array, starting
* from *index, for the first member >= value. It saves the position of the
* match into *index and returns true if it's an exact match. (Note: we
* assume the mcelem elements are distinct so there can't be more than one
* exact match.)
*/
static bool
find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
TypeCacheEntry *typentry)
{
int l = *index,
r = nmcelem - 1,
i,
res;
while (l <= r)
{
i = (l + r) / 2;
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
res = element_compare(&mcelem[i], &value, typentry);
if (res == 0)
{
*index = i;
return true;
}
else if (res < 0)
l = i + 1;
else
r = i - 1;
}
*index = l;
return false;
}
/*
 * Comparison function for elements.
 *
 * We use the element type's default btree opclass, and its default collation
 * if the type is collation-sensitive.
 *
 * XXX consider using SortSupport infrastructure
 */
static int
element_compare(const void *key1, const void *key2, void *arg)
{
Datum d1 = *((const Datum *) key1);
Datum d2 = *((const Datum *) key2);
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
TypeCacheEntry *typentry = (TypeCacheEntry *) arg;
FmgrInfo *cmpfunc = &typentry->cmp_proc_finfo;
Datum c;
Make pg_statistic and related code account more honestly for collations. When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
2018-12-14 18:52:49 +01:00
c = FunctionCall2Coll(cmpfunc, typentry->typcollation, d1, d2);
return DatumGetInt32(c);
}
/*
* Comparison function for sorting floats into descending order.
*/
static int
float_compare_desc(const void *key1, const void *key2)
{
	const float lhs = *((const float *) key1);
	const float rhs = *((const float *) key2);

	/*
	 * Each relational test yields 0 or 1, so the difference is -1 when lhs
	 * is the larger value (it sorts first), 1 when lhs is the smaller value,
	 * and 0 when neither test holds.
	 */
	return (lhs < rhs) - (lhs > rhs);
}