Make all comparisons done for/with statistics use the default collation.

While this will give wrong answers when estimating selectivity for a
comparison operator that's using a non-default collation, the estimation
error probably won't be large; and anyway the former approach created
estimation errors of its own by trying to use a histogram that might have
been computed with some other collation.  So we'll adopt this simplified
approach for now and perhaps improve it sometime in the future.

This patch incorporates changes from Andres Freund to make sure that
selfuncs.c passes a valid collation OID to any datatype-specific function
it calls, in case that function wants collation information.  Said OID will
now always be DEFAULT_COLLATION_OID, but at least such functions will no
longer raise errors for lack of a valid collation.
This commit is contained in:
Tom Lane 2011-03-12 16:30:36 -05:00
parent 94fe9c0f4e
commit 696d1f7f06
5 changed files with 35 additions and 29 deletions

View File

@ -24,6 +24,7 @@
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_inherits_fn.h"
#include "catalog/pg_namespace.h"
#include "commands/dbcommands.h"
@ -862,13 +863,11 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr)
{
stats->attrtypid = exprType(index_expr);
stats->attrtypmod = exprTypmod(index_expr);
stats->attrcollation = exprCollation(index_expr);
}
else
{
stats->attrtypid = attr->atttypid;
stats->attrtypmod = attr->atttypmod;
stats->attrcollation = attr->attcollation;
}
typtuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(stats->attrtypid));
@ -1931,7 +1930,8 @@ compute_minimal_stats(VacAttrStatsP stats,
track_cnt = 0;
fmgr_info(mystats->eqfunc, &f_cmpeq);
fmgr_info_collation(stats->attrcollation, &f_cmpeq);
/* We always use the default collation for statistics */
fmgr_info_collation(DEFAULT_COLLATION_OID, &f_cmpeq);
for (i = 0; i < samplerows; i++)
{
@ -2253,7 +2253,8 @@ compute_scalar_stats(VacAttrStatsP stats,
SelectSortFunction(mystats->ltopr, false, &cmpFn, &cmpFlags);
fmgr_info(cmpFn, &f_cmpfn);
fmgr_info_collation(stats->attrcollation, &f_cmpfn);
/* We always use the default collation for statistics */
fmgr_info_collation(DEFAULT_COLLATION_OID, &f_cmpfn);
/* Initial scan to find sortable values */
for (i = 0; i < samplerows; i++)

View File

@ -2056,7 +2056,6 @@ cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
mergejoinscansel(root,
(Node *) rinfo->clause,
pathkey->pk_opfamily,
pathkey->pk_collation,
pathkey->pk_strategy,
pathkey->pk_nulls_first,
&leftstartsel,

View File

@ -145,7 +145,7 @@ static double eqjoinsel_inner(Oid operator,
static double eqjoinsel_semi(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2);
static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
Datum lobound, Datum hibound, Oid boundstypid, Oid boundscollid,
Datum lobound, Datum hibound, Oid boundstypid,
double *scaledlobound, double *scaledhibound);
static double convert_numeric_to_scalar(Datum value, Oid typid);
static void convert_string_to_scalar(char *value,
@ -164,10 +164,10 @@ static double convert_one_string_to_scalar(char *value,
int rangelo, int rangehi);
static double convert_one_bytea_to_scalar(unsigned char *value, int valuelen,
int rangelo, int rangehi);
static char *convert_string_datum(Datum value, Oid typid, Oid collid);
static char *convert_string_datum(Datum value, Oid typid);
static double convert_timevalue_to_scalar(Datum value, Oid typid);
static bool get_variable_range(PlannerInfo *root, VariableStatData *vardata,
Oid sortop, Oid collation, Datum *min, Datum *max);
Oid sortop, Datum *min, Datum *max);
static bool get_actual_variable_range(PlannerInfo *root,
VariableStatData *vardata,
Oid sortop,
@ -285,6 +285,7 @@ var_eq_const(VariableStatData *vardata, Oid operator,
FmgrInfo eqproc;
fmgr_info(get_opcode(operator), &eqproc);
fmgr_info_collation(DEFAULT_COLLATION_OID, &eqproc);
for (i = 0; i < nvalues; i++)
{
@ -514,7 +515,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
fmgr_info(get_opcode(operator), &opproc);
fmgr_info_collation(vardata->attcollation, &opproc);
fmgr_info_collation(DEFAULT_COLLATION_OID, &opproc);
/*
* If we have most-common-values info, add up the fractions of the MCV
@ -839,7 +840,7 @@ ineq_histogram_selectivity(PlannerInfo *root,
*/
if (convert_to_scalar(constval, consttype, &val,
values[i - 1], values[i],
vardata->vartype, vardata->attcollation,
vardata->vartype,
&low, &high))
{
if (high <= low)
@ -1700,6 +1701,7 @@ scalararraysel(PlannerInfo *root,
if (!oprsel)
return (Selectivity) 0.5;
fmgr_info(oprsel, &oprselproc);
fmgr_info_collation(DEFAULT_COLLATION_OID, &oprselproc);
/* deconstruct the expression */
Assert(list_length(clause->args) == 2);
@ -2116,6 +2118,7 @@ eqjoinsel_inner(Oid operator,
nmatches;
fmgr_info(get_opcode(operator), &eqproc);
fmgr_info_collation(DEFAULT_COLLATION_OID, &eqproc);
hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
@ -2338,6 +2341,7 @@ eqjoinsel_semi(Oid operator,
nmatches;
fmgr_info(get_opcode(operator), &eqproc);
fmgr_info_collation(DEFAULT_COLLATION_OID, &eqproc);
hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
@ -2588,7 +2592,7 @@ icnlikejoinsel(PG_FUNCTION_ARGS)
*/
void
mergejoinscansel(PlannerInfo *root, Node *clause,
Oid opfamily, Oid collation, int strategy, bool nulls_first,
Oid opfamily, int strategy, bool nulls_first,
Selectivity *leftstart, Selectivity *leftend,
Selectivity *rightstart, Selectivity *rightend)
{
@ -2757,20 +2761,20 @@ mergejoinscansel(PlannerInfo *root, Node *clause,
/* Try to get ranges of both inputs */
if (!isgt)
{
if (!get_variable_range(root, &leftvar, lstatop, collation,
if (!get_variable_range(root, &leftvar, lstatop,
&leftmin, &leftmax))
goto fail; /* no range available from stats */
if (!get_variable_range(root, &rightvar, rstatop, collation,
if (!get_variable_range(root, &rightvar, rstatop,
&rightmin, &rightmax))
goto fail; /* no range available from stats */
}
else
{
/* need to swap the max and min */
if (!get_variable_range(root, &leftvar, lstatop, collation,
if (!get_variable_range(root, &leftvar, lstatop,
&leftmax, &leftmin))
goto fail; /* no range available from stats */
if (!get_variable_range(root, &rightvar, rstatop, collation,
if (!get_variable_range(root, &rightvar, rstatop,
&rightmax, &rightmin))
goto fail; /* no range available from stats */
}
@ -3371,7 +3375,7 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
*/
static bool
convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
Datum lobound, Datum hibound, Oid boundstypid, Oid boundscollid,
Datum lobound, Datum hibound, Oid boundstypid,
double *scaledlobound, double *scaledhibound)
{
/*
@ -3424,9 +3428,9 @@ convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
case TEXTOID:
case NAMEOID:
{
char *valstr = convert_string_datum(value, valuetypid, boundscollid);
char *lostr = convert_string_datum(lobound, boundstypid, boundscollid);
char *histr = convert_string_datum(hibound, boundstypid, boundscollid);
char *valstr = convert_string_datum(value, valuetypid);
char *lostr = convert_string_datum(lobound, boundstypid);
char *histr = convert_string_datum(hibound, boundstypid);
convert_string_to_scalar(valstr, scaledvalue,
lostr, scaledlobound,
@ -3670,7 +3674,7 @@ convert_one_string_to_scalar(char *value, int rangelo, int rangehi)
* before continuing, so as to generate correct locale-specific results.
*/
static char *
convert_string_datum(Datum value, Oid typid, Oid collid)
convert_string_datum(Datum value, Oid typid)
{
char *val;
@ -3703,7 +3707,7 @@ convert_string_datum(Datum value, Oid typid, Oid collid)
return NULL;
}
if (!lc_collate_is_c(collid))
if (!lc_collate_is_c(DEFAULT_COLLATION_OID))
{
char *xfrmstr;
size_t xfrmlen;
@ -4102,7 +4106,6 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
vardata->rel = find_base_rel(root, var->varno);
vardata->atttype = var->vartype;
vardata->atttypmod = var->vartypmod;
vardata->attcollation = var->varcollid;
vardata->isunique = has_unique_index(vardata->rel, var->varattno);
rte = root->simple_rte_array[var->varno];
@ -4188,7 +4191,6 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
vardata->var = node;
vardata->atttype = exprType(node);
vardata->atttypmod = exprTypmod(node);
vardata->attcollation = exprCollation(node);
if (onerel)
{
@ -4397,7 +4399,7 @@ get_variable_numdistinct(VariableStatData *vardata)
* be "<" not ">", as only the former is likely to be found in pg_statistic.
*/
static bool
get_variable_range(PlannerInfo *root, VariableStatData *vardata, Oid sortop, Oid collation,
get_variable_range(PlannerInfo *root, VariableStatData *vardata, Oid sortop,
Datum *min, Datum *max)
{
Datum tmin = 0;
@ -4482,7 +4484,7 @@ get_variable_range(PlannerInfo *root, VariableStatData *vardata, Oid sortop, Oid
FmgrInfo opproc;
fmgr_info(get_opcode(sortop), &opproc);
fmgr_info_collation(collation, &opproc);
fmgr_info_collation(DEFAULT_COLLATION_OID, &opproc);
for (i = 0; i < nvalues; i++)
{
@ -5109,6 +5111,7 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
if (cmpopr == InvalidOid)
elog(ERROR, "no >= operator for opfamily %u", opfamily);
fmgr_info(get_opcode(cmpopr), &opproc);
fmgr_info_collation(DEFAULT_COLLATION_OID, &opproc);
prefixsel = ineq_histogram_selectivity(root, vardata, &opproc, true,
prefixcon->constvalue,
@ -5130,6 +5133,7 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
if (cmpopr == InvalidOid)
elog(ERROR, "no < operator for opfamily %u", opfamily);
fmgr_info(get_opcode(cmpopr), &opproc);
fmgr_info_collation(DEFAULT_COLLATION_OID, &opproc);
greaterstrcon = make_greater_string(prefixcon, &opproc);
if (greaterstrcon)

View File

@ -50,6 +50,10 @@
* the information to be stored in a pg_statistic row for the column. Be
* careful to allocate any pointed-to data in anl_context, which will NOT
* be CurrentMemoryContext when compute_stats is called.
*
* Note: for the moment, all comparisons done for statistical purposes
* should use the database's default collation (DEFAULT_COLLATION_OID).
* This might change in some future release.
*----------
*/
typedef struct VacAttrStats *VacAttrStatsP;
@ -66,13 +70,12 @@ typedef struct VacAttrStats
* Note: do not assume that the data being analyzed has the same datatype
* shown in attr, ie do not trust attr->atttypid, attlen, etc. This is
* because some index opclasses store a different type than the underlying
* column/expression. Instead use attrtypid, attrtypmod, attrcollation, and attrtype for
* column/expression. Instead use attrtypid, attrtypmod, and attrtype for
* information about the datatype being fed to the typanalyze function.
*/
Form_pg_attribute attr; /* copy of pg_attribute row for column */
Oid attrtypid; /* type of data being analyzed */
int32 attrtypmod; /* typmod of data being analyzed */
Oid attrcollation; /* collation of the data being analyzed */
Form_pg_type attrtype; /* copy of pg_type row for attrtypid */
MemoryContext anl_context; /* where to save long-lived data */

View File

@ -74,7 +74,6 @@ typedef struct VariableStatData
Oid vartype; /* exposed type of expression */
Oid atttype; /* type to pass to get_attstatsslot */
int32 atttypmod; /* typmod to pass to get_attstatsslot */
Oid attcollation; /* collation of the variable */
bool isunique; /* true if matched to a unique index */
} VariableStatData;
@ -179,7 +178,7 @@ extern Selectivity rowcomparesel(PlannerInfo *root,
int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo);
extern void mergejoinscansel(PlannerInfo *root, Node *clause,
Oid opfamily, Oid collation, int strategy, bool nulls_first,
Oid opfamily, int strategy, bool nulls_first,
Selectivity *leftstart, Selectivity *leftend,
Selectivity *rightstart, Selectivity *rightend);