From 512c7356b6574e7622fddb713f96dc8407960680 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 3 Jun 2017 14:36:25 -0400 Subject: [PATCH] Fix <> and pattern-NOT-match estimators to handle nulls correctly. These estimators returned 1 minus the corresponding equality/match estimate, which is incorrect: we need to subtract off the fraction of nulls in the column, since those are neither equal nor not equal to the comparison value. The error only becomes obvious if the nullfrac is large, but it could be very bad in a mostly-nulls column, as reported in bug #14676 from Marko Tiikkaja. To fix the <> case, refactor eqsel() and neqsel() to call a common support routine, which can be made to account for nullfrac correctly. The pattern-match cases were already factored that way, and it was simply an oversight that patternsel() wasn't subtracting off nullfrac. neqjoinsel() has a similar problem, but since we're elsewhere discussing changing its behavior entirely, I left it alone for now. This is a very longstanding bug, but I'm hesitant to back-patch a fix for it. Given the lack of prior complaints, such cases must not come up often, so it's probably not worth the risk of destabilizing plans in stable branches. Discussion: https://postgr.es/m/20170529153847.4275.95416@wrigleys.postgresql.org --- src/backend/utils/adt/selfuncs.c | 163 +++++++++++++++++++------------ 1 file changed, 103 insertions(+), 60 deletions(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 6e491bbc21..52d0e48995 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -154,12 +154,13 @@ get_relation_stats_hook_type get_relation_stats_hook = NULL; get_index_stats_hook_type get_index_stats_hook = NULL; +static double eqsel_internal(PG_FUNCTION_ARGS, bool negate); static double var_eq_const(VariableStatData *vardata, Oid operator, Datum constval, bool constisnull, - bool varonleft); + bool varonleft, bool negate); static double var_eq_non_const(VariableStatData *vardata, Oid operator, Node *other, - bool varonleft); + bool varonleft, bool negate); static double ineq_histogram_selectivity(PlannerInfo *root, VariableStatData *vardata, FmgrInfo *opproc, bool isgt, @@ -226,6 +227,15 @@ static List *add_predicate_to_quals(IndexOptInfo *index, List *indexQuals); */ Datum eqsel(PG_FUNCTION_ARGS) +{ + PG_RETURN_FLOAT8((float8) eqsel_internal(fcinfo, false)); +} + +/* + * Common code for eqsel() and neqsel() + */ +static double +eqsel_internal(PG_FUNCTION_ARGS, bool negate) { PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); Oid operator = PG_GETARG_OID(1); @@ -236,13 +246,27 @@ eqsel(PG_FUNCTION_ARGS) bool varonleft; double selec; + /* + * When asked about <>, we do the estimation using the corresponding = + * operator, then convert to <> via "1.0 - eq_selectivity - nullfrac". + */ + if (negate) + { + operator = get_negator(operator); + if (!OidIsValid(operator)) + { + /* Use default selectivity (should we raise an error instead?) */ + return 1.0 - DEFAULT_EQ_SEL; + } + } + /* * If expression is not variable = something or something = variable, then * punt and return a default estimate. */ if (!get_restriction_variable(root, args, varRelid, &vardata, &other, &varonleft)) - PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); + return negate ? (1.0 - DEFAULT_EQ_SEL) : DEFAULT_EQ_SEL; /* * We can do a lot better if the something is a constant. (Note: the @@ -253,14 +277,14 @@ eqsel(PG_FUNCTION_ARGS) selec = var_eq_const(&vardata, operator, ((Const *) other)->constvalue, ((Const *) other)->constisnull, - varonleft); + varonleft, negate); else selec = var_eq_non_const(&vardata, operator, other, - varonleft); + varonleft, negate); ReleaseVariableStats(vardata); - PG_RETURN_FLOAT8((float8) selec); + return selec; } /* @@ -271,19 +295,32 @@ eqsel(PG_FUNCTION_ARGS) static double var_eq_const(VariableStatData *vardata, Oid operator, Datum constval, bool constisnull, - bool varonleft) + bool varonleft, bool negate) { double selec; + double nullfrac = 0.0; bool isdefault; Oid opfuncoid; /* * If the constant is NULL, assume operator is strict and return zero, ie, - * operator will never return TRUE. + * operator will never return TRUE. (It's zero even for a negator op.) */ if (constisnull) return 0.0; + /* + * Grab the nullfrac for use below. Note we allow use of nullfrac + * regardless of security check. + */ + if (HeapTupleIsValid(vardata->statsTuple)) + { + Form_pg_statistic stats; + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + nullfrac = stats->stanullfrac; + } + /* * If we matched the var to a unique index or DISTINCT clause, assume * there is exactly one match regardless of anything else. (This is @@ -292,11 +329,12 @@ var_eq_const(VariableStatData *vardata, Oid operator, * ignoring the information.) */ if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) - return 1.0 / vardata->rel->tuples; - - if (HeapTupleIsValid(vardata->statsTuple) && - statistic_proc_security_check(vardata, - (opfuncoid = get_opcode(operator)))) + { + selec = 1.0 / vardata->rel->tuples; + } + else if (HeapTupleIsValid(vardata->statsTuple) && + statistic_proc_security_check(vardata, + (opfuncoid = get_opcode(operator)))) { Form_pg_statistic stats; AttStatsSlot sslot; @@ -363,7 +401,7 @@ var_eq_const(VariableStatData *vardata, Oid operator, for (i = 0; i < sslot.nnumbers; i++) sumcommon += sslot.numbers[i]; - selec = 1.0 - sumcommon - stats->stanullfrac; + selec = 1.0 - sumcommon - nullfrac; CLAMP_PROBABILITY(selec); /* @@ -396,6 +434,10 @@ var_eq_const(VariableStatData *vardata, Oid operator, selec = 1.0 / get_variable_numdistinct(vardata, &isdefault); } + /* now adjust if we wanted <> rather than = */ + if (negate) + selec = 1.0 - selec - nullfrac; + /* result should be in range, but make sure... */ CLAMP_PROBABILITY(selec); @@ -408,11 +450,23 @@ var_eq_const(VariableStatData *vardata, Oid operator, static double var_eq_non_const(VariableStatData *vardata, Oid operator, Node *other, - bool varonleft) + bool varonleft, bool negate) { double selec; + double nullfrac = 0.0; bool isdefault; + /* + * Grab the nullfrac for use below. + */ + if (HeapTupleIsValid(vardata->statsTuple)) + { + Form_pg_statistic stats; + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + nullfrac = stats->stanullfrac; + } + /* * If we matched the var to a unique index or DISTINCT clause, assume * there is exactly one match regardless of anything else. (This is @@ -421,9 +475,10 @@ var_eq_non_const(VariableStatData *vardata, Oid operator, * ignoring the information.) */ if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) - return 1.0 / vardata->rel->tuples; - - if (HeapTupleIsValid(vardata->statsTuple)) + { + selec = 1.0 / vardata->rel->tuples; + } + else if (HeapTupleIsValid(vardata->statsTuple)) { Form_pg_statistic stats; double ndistinct; @@ -441,7 +496,7 @@ var_eq_non_const(VariableStatData *vardata, Oid operator, * values, regardless of their frequency in the table. Is that a good * idea?) */ - selec = 1.0 - stats->stanullfrac; + selec = 1.0 - nullfrac; ndistinct = get_variable_numdistinct(vardata, &isdefault); if (ndistinct > 1) selec /= ndistinct; @@ -469,6 +524,10 @@ var_eq_non_const(VariableStatData *vardata, Oid operator, selec = 1.0 / get_variable_numdistinct(vardata, &isdefault); } + /* now adjust if we wanted <> rather than = */ + if (negate) + selec = 1.0 - selec - nullfrac; + /* result should be in range, but make sure... */ CLAMP_PROBABILITY(selec); @@ -485,33 +544,7 @@ var_eq_non_const(VariableStatData *vardata, Oid operator, Datum neqsel(PG_FUNCTION_ARGS) { - PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); - Oid operator = PG_GETARG_OID(1); - List *args = (List *) PG_GETARG_POINTER(2); - int varRelid = PG_GETARG_INT32(3); - Oid eqop; - float8 result; - - /* - * We want 1 - eqsel() where the equality operator is the one associated - * with this != operator, that is, its negator. - */ - eqop = get_negator(operator); - if (eqop) - { - result = DatumGetFloat8(DirectFunctionCall4(eqsel, - PointerGetDatum(root), - ObjectIdGetDatum(eqop), - PointerGetDatum(args), - Int32GetDatum(varRelid))); - } - else - { - /* Use default selectivity (should we raise an error instead?) */ - result = DEFAULT_EQ_SEL; - } - result = 1.0 - result; - PG_RETURN_FLOAT8(result); + PG_RETURN_FLOAT8((float8) eqsel_internal(fcinfo, true)); } /* @@ -1114,6 +1147,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) Const *patt; Const *prefix = NULL; Selectivity rest_selec = 0; + double nullfrac = 0.0; double result; /* @@ -1202,6 +1236,17 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) return result; } + /* + * Grab the nullfrac for use below. + */ + if (HeapTupleIsValid(vardata.statsTuple)) + { + Form_pg_statistic stats; + + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + nullfrac = stats->stanullfrac; + } + /* * Pull out any fixed prefix implied by the pattern, and estimate the * fractional selectivity of the remainder of the pattern. Unlike many of @@ -1252,7 +1297,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) if (eqopr == InvalidOid) elog(ERROR, "no = operator for opfamily %u", opfamily); result = var_eq_const(&vardata, eqopr, prefix->constvalue, - false, true); + false, true, false); } else { @@ -1275,8 +1320,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) Selectivity selec; int hist_size; FmgrInfo opproc; - double nullfrac, - mcv_selec, + double mcv_selec, sumcommon; /* Try to use the histogram entries to get selectivity */ @@ -1328,11 +1372,6 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true, &sumcommon); - if (HeapTupleIsValid(vardata.statsTuple)) - nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac; - else - nullfrac = 0.0; - /* * Now merge the results from the MCV and histogram calculations, * realizing that the histogram covers only the non-null values that @@ -1340,12 +1379,16 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) */ selec *= 1.0 - nullfrac - sumcommon; selec += mcv_selec; - - /* result should be in range, but make sure... */ - CLAMP_PROBABILITY(selec); result = selec; } + /* now adjust if we wanted not-match rather than match */ + if (negate) + result = 1.0 - result - nullfrac; + + /* result should be in range, but make sure... */ + CLAMP_PROBABILITY(result); + if (prefix) { pfree(DatumGetPointer(prefix->constvalue)); @@ -1354,7 +1397,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate) ReleaseVariableStats(vardata); - return negate ? (1.0 - result) : result; + return result; } /* @@ -1451,7 +1494,7 @@ boolvarsel(PlannerInfo *root, Node *arg, int varRelid) * compute the selectivity as if that is what we have. */ selec = var_eq_const(&vardata, BooleanEqualOperator, - BoolGetDatum(true), false, true); + BoolGetDatum(true), false, true, false); } else if (is_funcclause(arg)) { @@ -5788,7 +5831,7 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata, if (cmpopr == InvalidOid) elog(ERROR, "no = operator for opfamily %u", opfamily); eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue, - false, true); + false, true, false); prefixsel = Max(prefixsel, eq_sel);