Improve patternsel() by applying the operator itself to each value

listed in the column's most-common-values statistics entry. This gives us an exact selectivity result for the portion of the column population represented by the MCV list, which can be a big leg up in accuracy if that's a large fraction of the population. The heuristics involving pattern contents and prefix are applied only to the part of the population not included in the MCV list.
2006-01-10 17:35:52 +00:00 · 2006-01-10 17:35:52 +00:00 · ce8fd39e15
parent ad24b8e6ee
commit ce8fd39e15
1 changed files with 180 additions and 92 deletions
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@ -15,7 +15,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.194 2005/11/25 19:47:49 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.195 2006/01/10 17:35:52 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -131,6 +131,11 @@ typedef struct
 	} while(0)


+static double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
+							  Datum constval, double *sumcommonp);
+static double ineq_histogram_selectivity(VariableStatData *vardata,
+										 FmgrInfo *opproc, bool isgt,
+										 Datum constval, Oid consttype);
 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 				  Datum lobound, Datum hibound, Oid boundstypid,
 				  double *scaledlobound, double *scaledhibound);
@ -164,8 +169,8 @@ static void examine_variable(PlannerInfo *root, Node *node, int varRelid,
 static double get_variable_numdistinct(VariableStatData *vardata);
 static bool get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
 					 Oid sortop, Datum *max);
-static Selectivity prefix_selectivity(PlannerInfo *root, Node *variable,
-				   Oid opclass, Const *prefix);
+static Selectivity prefix_selectivity(VariableStatData *vardata,
+									  Oid opclass, Const *prefixcon);
 static Selectivity pattern_selectivity(Const *patt, Pattern_Type ptype);
 static Datum string_to_datum(const char *str, Oid datatype);
 static Const *string_to_const(const char *str, Oid datatype);
@ -426,15 +431,10 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 {
 	Form_pg_statistic stats;
 	FmgrInfo	opproc;
-	Datum	   *values;
-	int			nvalues;
-	float4	   *numbers;
-	int			nnumbers;
 	double		mcv_selec,
 				hist_selec,
 				sumcommon;
 	double		selec;
-	int			i;

 	if (!HeapTupleIsValid(vardata->statsTuple))
 	{
@ -451,10 +451,76 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 	 * to the result selectivity.  Also add up the total fraction represented
 	 * by MCV entries.
 	 */
+	mcv_selec = mcv_selectivity(vardata, &opproc, constval,
+								&sumcommon);
+
+	/*
+	 * If there is a histogram, determine which bin the constant falls in, and
+	 * compute the resulting contribution to selectivity.
+	 */
+	hist_selec = ineq_histogram_selectivity(vardata, &opproc, isgt,
+											constval, consttype);
+
+	/*
+	 * Now merge the results from the MCV and histogram calculations,
+	 * realizing that the histogram covers only the non-null values that are
+	 * not listed in MCV.
+	 */
+	selec = 1.0 - stats->stanullfrac - sumcommon;
+
+	if (hist_selec > 0.0)
+		selec *= hist_selec;
+	else
+	{
+		/*
+		 * If no histogram but there are values not accounted for by MCV,
+		 * arbitrarily assume half of them will match.
+		 */
+		selec *= 0.5;
+	}
+
+	selec += mcv_selec;
+
+	/* result should be in range, but make sure... */
+	CLAMP_PROBABILITY(selec);
+
+	return selec;
+}
+
+/*
+ *	mcv_selectivity				- Examine the MCV list for scalarineqsel
+ *
+ * Determine the fraction of the variable's MCV population that satisfies
+ * the predicate (VAR OP CONST), as well as the fraction of the total column
+ * population represented by the MCV list.  This code will work for any
+ * boolean-returning predicate operator.
+ *
+ * The function result is the MCV selectivity, and the fraction of the
+ * total population is returned into *sumcommonp.  Zeroes are returned
+ * if there is no MCV list.
+ */
+static double
+mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc, Datum constval,
+				double *sumcommonp)
+{
+	double		mcv_selec,
+				sumcommon;
+	Datum	   *values;
+	int			nvalues;
+	float4	   *numbers;
+	int			nnumbers;
+	int			i;
+
 	mcv_selec = 0.0;
 	sumcommon = 0.0;

-	if (get_attstatsslot(vardata->statsTuple,
+	/*
+	 * If we have most-common-values info, add up the fractions of the MCV
+	 * entries that satisfy MCV OP CONST.  Also add up the total fraction
+	 * represented by MCV entries.
+	 */
+	if (HeapTupleIsValid(vardata->statsTuple) &&
+		get_attstatsslot(vardata->statsTuple,
 						 vardata->atttype, vardata->atttypmod,
 						 STATISTIC_KIND_MCV, InvalidOid,
 						 &values, &nvalues,
@ -462,7 +528,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 	{
 		for (i = 0; i < nvalues; i++)
 		{
-			if (DatumGetBool(FunctionCall2(&opproc,
+			if (DatumGetBool(FunctionCall2(opproc,
 										   values[i],
 										   constval)))
 				mcv_selec += numbers[i];
@ -472,10 +538,36 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 						  numbers, nnumbers);
 	}

+	*sumcommonp = sumcommon;
+	return mcv_selec;
+}
+
+/*
+ *	ineq_histogram_selectivity	- Examine the histogram for scalarineqsel
+ *
+ * Determine the fraction of the variable's histogram population that
+ * satisfies the inequality condition, ie, VAR < CONST or VAR > CONST.
+ *
+ * Returns zero if there is no histogram (valid results will always be
+ * greater than zero).
+ *
+ * Note that the result disregards both the most-common-values (if any) and
+ * null entries.  The caller is expected to combine this result with
+ * statistics for those portions of the column population.
+ */
+static double
+ineq_histogram_selectivity(VariableStatData *vardata,
+						   FmgrInfo *opproc, bool isgt,
+						   Datum constval, Oid consttype)
+{
+	double		hist_selec;
+	Datum	   *values;
+	int			nvalues;
+	int			i;
+
+	hist_selec = 0.0;
+
 	/*
-	 * If there is a histogram, determine which bin the constant falls in, and
-	 * compute the resulting contribution to selectivity.
-	 *
 	 * Someday, VACUUM might store more than one histogram per rel/att,
 	 * corresponding to more than one possible sort ordering defined for the
 	 * column type.  However, to make that work we will need to figure out
@ -485,9 +577,8 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 	 * appears in pg_statistic is sorted the same way our operator sorts, or
 	 * the reverse way if isgt is TRUE.
 	 */
-	hist_selec = 0.0;
-
-	if (get_attstatsslot(vardata->statsTuple,
+	if (HeapTupleIsValid(vardata->statsTuple) &&
+		get_attstatsslot(vardata->statsTuple,
 						 vardata->atttype, vardata->atttypmod,
 						 STATISTIC_KIND_HISTOGRAM, InvalidOid,
 						 &values, &nvalues,
@ -498,7 +589,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 			double		histfrac;
 			bool		ltcmp;

-			ltcmp = DatumGetBool(FunctionCall2(&opproc,
+			ltcmp = DatumGetBool(FunctionCall2(opproc,
 											   values[0],
 											   constval));
 			if (isgt)
@ -517,7 +608,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 				 */
 				for (i = 1; i < nvalues; i++)
 				{
-					ltcmp = DatumGetBool(FunctionCall2(&opproc,
+					ltcmp = DatumGetBool(FunctionCall2(opproc,
 													   values[i],
 													   constval));
 					if (isgt)
@ -618,30 +709,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 		free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
 	}

-	/*
-	 * Now merge the results from the MCV and histogram calculations,
-	 * realizing that the histogram covers only the non-null values that are
-	 * not listed in MCV.
-	 */
-	selec = 1.0 - stats->stanullfrac - sumcommon;
-
-	if (hist_selec > 0.0)
-		selec *= hist_selec;
-	else
-	{
-		/*
-		 * If no histogram but there are values not accounted for by MCV,
-		 * arbitrarily assume half of them will match.
-		 */
-		selec *= 0.5;
-	}
-
-	selec += mcv_selec;
-
-	/* result should be in range, but make sure... */
-	CLAMP_PROBABILITY(selec);
-
-	return selec;
+	return hist_selec;
 }

 /*
@ -801,10 +869,7 @@ static double
 patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 {
 	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
-
-#ifdef NOT_USED
 	Oid			operator = PG_GETARG_OID(1);
-#endif
 	List	   *args = (List *) PG_GETARG_POINTER(2);
 	int			varRelid = PG_GETARG_INT32(3);
 	VariableStatData vardata;
@ -948,18 +1013,53 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 	{
 		/*
 		 * Not exact-match pattern.  We estimate selectivity of the fixed
-		 * prefix and remainder of pattern separately, then combine the two.
+		 * prefix and remainder of pattern separately, then combine the two
+		 * to get an estimate of the selectivity for the part of the column
+		 * population represented by the histogram.  We then add up data for
+		 * any most-common-values values; these are not in the histogram
+		 * population, and we can get exact answers for them by applying
+		 * the pattern operator, so there's no reason to approximate.
+		 * (If the MCVs cover a significant part of the total population,
+		 * this gives us a big leg up in accuracy.)
 		 */
 		Selectivity prefixsel;
 		Selectivity restsel;
 		Selectivity selec;
+		FmgrInfo	opproc;
+		double		nullfrac,
+					mcv_selec,
+					sumcommon;
+
+		if (HeapTupleIsValid(vardata.statsTuple))
+			nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
+		else
+			nullfrac = 0.0;

 		if (pstatus == Pattern_Prefix_Partial)
-			prefixsel = prefix_selectivity(root, variable, opclass, prefix);
+			prefixsel = prefix_selectivity(&vardata, opclass, prefix);
 		else
 			prefixsel = 1.0;
 		restsel = pattern_selectivity(rest, ptype);
 		selec = prefixsel * restsel;
+
+		/*
+		 * If we have most-common-values info, add up the fractions of the MCV
+		 * entries that satisfy MCV OP PATTERN.  These fractions contribute
+		 * directly to the result selectivity.  Also add up the total fraction
+		 * represented by MCV entries.
+		 */
+		fmgr_info(get_opcode(operator), &opproc);
+		mcv_selec = mcv_selectivity(&vardata, &opproc, constval,
+									&sumcommon);
+
+		/*
+		 * Now merge the results from the MCV and histogram calculations,
+		 * realizing that the histogram covers only the non-null values that
+		 * are not listed in MCV.
+		 */
+		selec *= 1.0 - nullfrac - sumcommon;
+		selec += mcv_selec;
+
 		/* result should be in range, but make sure... */
 		CLAMP_PROBABILITY(selec);
 		result = selec;
@ -2427,7 +2527,7 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
 /*
 * convert_to_scalar
 *	  Convert non-NULL values of the indicated types to the comparison
- *	  scale needed by scalarltsel()/scalargtsel().
+ *	  scale needed by scalarineqsel().
 *	  Returns "true" if successful.
 *
 * XXX this routine is a hack: ideally we should look up the conversion
@ -3841,6 +3941,10 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
 * A fixed prefix "foo" is estimated as the selectivity of the expression
 * "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
 *
+ * The selectivity estimate is with respect to the portion of the column
+ * population represented by the histogram --- the caller must fold this
+ * together with info about MCVs and NULLs.
+ *
 * We use the >= and < operators from the specified btree opclass to do the
 * estimation.	The given variable and Const must be of the associated
 * datatype.
@ -3851,25 +3955,28 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype,
 * more useful to use the upper-bound code than not.
 */
 static Selectivity
-prefix_selectivity(PlannerInfo *root, Node *variable,
-				   Oid opclass, Const *prefixcon)
+prefix_selectivity(VariableStatData *vardata, Oid opclass, Const *prefixcon)
 {
 	Selectivity prefixsel;
 	Oid			cmpopr;
-	List	   *cmpargs;
+	FmgrInfo	opproc;
 	Const	   *greaterstrcon;

 	cmpopr = get_opclass_member(opclass, InvalidOid,
 								BTGreaterEqualStrategyNumber);
 	if (cmpopr == InvalidOid)
 		elog(ERROR, "no >= operator for opclass %u", opclass);
-	cmpargs = list_make2(variable, prefixcon);
-	/* Assume scalargtsel is appropriate for all supported types */
-	prefixsel = DatumGetFloat8(DirectFunctionCall4(scalargtsel,
-												   PointerGetDatum(root),
-												   ObjectIdGetDatum(cmpopr),
-												   PointerGetDatum(cmpargs),
-												   Int32GetDatum(0)));
+	fmgr_info(get_opcode(cmpopr), &opproc);
+
+	prefixsel = ineq_histogram_selectivity(vardata, &opproc, true,
+										   prefixcon->constvalue,
+										   prefixcon->consttype);
+
+	if (prefixsel <= 0.0)
+	{
+		/* No histogram is present ... return a suitable default estimate */
+		return 0.005;
+	}

 	/*-------
 	 * If we can create a string larger than the prefix, say
@ -3885,49 +3992,30 @@ prefix_selectivity(PlannerInfo *root, Node *variable,
 									BTLessStrategyNumber);
 		if (cmpopr == InvalidOid)
 			elog(ERROR, "no < operator for opclass %u", opclass);
-		cmpargs = list_make2(variable, greaterstrcon);
-		/* Assume scalarltsel is appropriate for all supported types */
-		topsel = DatumGetFloat8(DirectFunctionCall4(scalarltsel,
-													PointerGetDatum(root),
-													ObjectIdGetDatum(cmpopr),
-													PointerGetDatum(cmpargs),
-													Int32GetDatum(0)));
+		fmgr_info(get_opcode(cmpopr), &opproc);
+
+		topsel = ineq_histogram_selectivity(vardata, &opproc, false,
+											greaterstrcon->constvalue,
+											greaterstrcon->consttype);
+
+		/* ineq_histogram_selectivity worked before, it shouldn't fail now */
+		Assert(topsel > 0.0);

 		/*
 		 * Merge the two selectivities in the same way as for a range query
-		 * (see clauselist_selectivity()).
+		 * (see clauselist_selectivity()).  Note that we don't need to worry
+		 * about double-exclusion of nulls, since ineq_histogram_selectivity
+		 * doesn't count those anyway.
 		 */
 		prefixsel = topsel + prefixsel - 1.0;

-		/* Adjust for double-exclusion of NULLs */
-		prefixsel += nulltestsel(root, IS_NULL, variable, 0);
-
 		/*
-		 * A zero or slightly negative prefixsel should be converted into a
-		 * small positive value; we probably are dealing with a very tight
-		 * range and got a bogus result due to roundoff errors. However, if
-		 * prefixsel is very negative, then we probably have default
-		 * selectivity estimates on one or both sides of the range.  In that
-		 * case, insert a not-so-wildly-optimistic default estimate.
+		 * A zero or negative prefixsel should be converted into a small
+		 * positive value; we probably are dealing with a very tight range
+		 * and got a bogus result due to roundoff errors.
 		 */
 		if (prefixsel <= 0.0)
-		{
-			if (prefixsel < -0.01)
-			{
-				/*
-				 * No data available --- use a default estimate that is small,
-				 * but not real small.
-				 */
-				prefixsel = 0.005;
-			}
-			else
-			{
-				/*
-				 * It's just roundoff error; use a small positive value
-				 */
-				prefixsel = 1.0e-10;
-			}
-		}
+			prefixsel = 1.0e-10;
 	}

 	return prefixsel;