Modify prefix_selectivity() so that it will never estimate the selectivity

of the generated range condition var >= 'foo' AND var < 'fop' as being less than what eqsel() would estimate for var = 'foo'. This is intuitively reasonable and it gets rid of the need for some entirely ad-hoc coding we formerly used to reject bogus estimates. The basic problem here is that if the prefix is more than a few characters long, the two boundary values are too close together to be distinguishable by comparison to the column histogram, resulting in a selectivity estimate of zero, which is often not very sane. Change motivated by an example from Peter Eisentraut. Arguably this is a bug fix, but I'll refrain from back-patching it for the moment.

Modify prefix_selectivity() so that it will never estimate the selectivity
of the generated range condition var >= 'foo' AND var < 'fop' as being less than what eqsel() would estimate for var = 'foo'. This is intuitively reasonable and it gets rid of the need for some entirely ad-hoc coding we formerly used to reject bogus estimates. The basic problem here is that if the prefix is more than a few characters long, the two boundary values are too close together to be distinguishable by comparison to the column histogram, resulting in a selectivity estimate of zero, which is often not very sane. Change motivated by an example from Peter Eisentraut. Arguably this is a bug fix, but I'll refrain from back-patching it for the moment.
422495d0 · Tom Lane · 6f10eb21 · 422495d0
Commit 422495d0 authored Mar 08, 2008 by Tom Lane
Hide whitespace changes
Inline Side-by-side

Showing with 215 additions and 150 deletions

src/backend/utils/adt/selfuncs.c src/backend/utils/adt/selfuncs.c +215 -150

No files found.
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.243 2008/01/01 19:45:52 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.244 2008/03/08 22:41:38 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -103,6 +103,12 @@
 #include "utils/syscache.h"
+static double var_eq_const(VariableStatData *vardata, Oid operator,
+			 Datum constval, bool constisnull,
+			 bool varonleft);
+static double var_eq_non_const(VariableStatData *vardata, Oid operator,
+				 Node *other,
+				 bool varonleft);
 static double ineq_histogram_selectivity(VariableStatData *vardata,
 						   FmgrInfo *opproc, bool isgt,
 						   Datum constval, Oid consttype);
@@ -156,10 +162,6 @@ eqsel(PG_FUNCTION_ARGS)
 	VariableStatData vardata;
 	Node	   *other;
 	bool		varonleft;
-	Datum	   *values;
-	int			nvalues;
-	float4	   *numbers;
-	int			nnumbers;
 	double		selec;
 	/*
@@ -171,149 +173,139 @@ eqsel(PG_FUNCTION_ARGS)
 		PG_RETURN_FLOAT8(DEFAULT_EQ_SEL);
 	/*
-	 * If the something is a NULL constant, assume operator is strict and
+	 * We can do a lot better if the something is a constant.  (Note: the
+	 * Const might result from estimation rather than being a simple constant
+	 * in the query.)
+	 */
+	if (IsA(other, Const))
+		selec = var_eq_const(&vardata, operator,
+							 ((Const *) other)->constvalue,
+							 ((Const *) other)->constisnull,
+							 varonleft);
+	else
+		selec = var_eq_non_const(&vardata, operator, other,
+								 varonleft);
+	ReleaseVariableStats(vardata);
+	PG_RETURN_FLOAT8((float8) selec);
+}
+/*
+ * var_eq_const --- eqsel for var = const case
+ *
+ * This is split out so that some other estimation functions can use it.
+ */
+static double
+var_eq_const(VariableStatData *vardata, Oid operator,
+			 Datum constval, bool constisnull,
+			 bool varonleft)
+{
+	double		selec;
+	/*
+	 * If the constant is NULL, assume operator is strict and
 	 * return zero, ie, operator will never return TRUE.
 	 */
-	if (IsA(other, Const) &&
+	if (constisnull)
-		((Const *) other)->constisnull)
+		return 0.0;
-	{
-		ReleaseVariableStats(vardata);
-		PG_RETURN_FLOAT8(0.0);
-	}
-	if (HeapTupleIsValid(vardata.statsTuple))
+	if (HeapTupleIsValid(vardata->statsTuple))
 	{
 		Form_pg_statistic stats;
+		Datum	   *values;
+		int			nvalues;
+		float4	   *numbers;
+		int			nnumbers;
+		bool		match = false;
+		int			i;
-		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
+		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
-		if (IsA(other, Const))
+		/*
+		 * Is the constant "=" to any of the column's most common values?
+		 * (Although the given operator may not really be "=", we will
+		 * assume that seeing whether it returns TRUE is an appropriate
+		 * test.  If you don't like this, maybe you shouldn't be using
+		 * eqsel for your operator...)
+		 */
+		if (get_attstatsslot(vardata->statsTuple,
+							 vardata->atttype, vardata->atttypmod,
+							 STATISTIC_KIND_MCV, InvalidOid,
+							 &values, &nvalues,
+							 &numbers, &nnumbers))
 		{
-			/* Variable is being compared to a known non-null constant */
+			FmgrInfo	eqproc;
-			Datum		constval = ((Const *) other)->constvalue;
-			bool		match = false;
-			int			i;
-			/*
+			fmgr_info(get_opcode(operator), &eqproc);
-			 * Is the constant "=" to any of the column's most common values?
-			 * (Although the given operator may not really be "=", we will
-			 * assume that seeing whether it returns TRUE is an appropriate
-			 * test.  If you don't like this, maybe you shouldn't be using
-			 * eqsel for your operator...)
-			 */
-			if (get_attstatsslot(vardata.statsTuple,
-								 vardata.atttype, vardata.atttypmod,
-								 STATISTIC_KIND_MCV, InvalidOid,
-								 &values, &nvalues,
-								 &numbers, &nnumbers))
-			{
-				FmgrInfo	eqproc;
-				fmgr_info(get_opcode(operator), &eqproc);
-				for (i = 0; i < nvalues; i++)
-				{
-					/* be careful to apply operator right way 'round */
-					if (varonleft)
-						match = DatumGetBool(FunctionCall2(&eqproc,
-														   values[i],
-														   constval));
-					else
-						match = DatumGetBool(FunctionCall2(&eqproc,
-														   constval,
-														   values[i]));
-					if (match)
-						break;
-				}
-			}
-			else
-			{
-				/* no most-common-value info available */
-				values = NULL;
-				numbers = NULL;
-				i = nvalues = nnumbers = 0;
-			}
-			if (match)
+			for (i = 0; i < nvalues; i++)
 			{
-				/*
+				/* be careful to apply operator right way 'round */
-				 * Constant is "=" to this common value.  We know selectivity
+				if (varonleft)
-				 * exactly (or as exactly as ANALYZE could calculate it,
+					match = DatumGetBool(FunctionCall2(&eqproc,
-				 * anyway).
+													   values[i],
-				 */
+													   constval));
-				selec = numbers[i];
+				else
-			}
+					match = DatumGetBool(FunctionCall2(&eqproc,
-			else
+													   constval,
-			{
+													   values[i]));
-				/*
+				if (match)
-				 * Comparison is against a constant that is neither NULL nor
+					break;
-				 * any of the common values.  Its selectivity cannot be more
-				 * than this:
-				 */
-				double		sumcommon = 0.0;
-				double		otherdistinct;
-				for (i = 0; i < nnumbers; i++)
-					sumcommon += numbers[i];
-				selec = 1.0 - sumcommon - stats->stanullfrac;
-				CLAMP_PROBABILITY(selec);
-				/*
-				 * and in fact it's probably a good deal less. We approximate
-				 * that all the not-common values share this remaining
-				 * fraction equally, so we divide by the number of other
-				 * distinct values.
-				 */
-				otherdistinct = get_variable_numdistinct(&vardata)
-					- nnumbers;
-				if (otherdistinct > 1)
-					selec /= otherdistinct;
-				/*
-				 * Another cross-check: selectivity shouldn't be estimated as
-				 * more than the least common "most common value".
-				 */
-				if (nnumbers > 0 && selec > numbers[nnumbers - 1])
-					selec = numbers[nnumbers - 1];
 			}
+		}
+		else
+		{
+			/* no most-common-value info available */
+			values = NULL;
+			numbers = NULL;
+			i = nvalues = nnumbers = 0;
+		}
-			free_attstatsslot(vardata.atttype, values, nvalues,
+		if (match)
-							  numbers, nnumbers);
+		{
+			/*
+			 * Constant is "=" to this common value.  We know selectivity
+			 * exactly (or as exactly as ANALYZE could calculate it,
+			 * anyway).
+			 */
+			selec = numbers[i];
 		}
 		else
 		{
-			double		ndistinct;
+			/*
+			 * Comparison is against a constant that is neither NULL nor
+			 * any of the common values.  Its selectivity cannot be more
+			 * than this:
+			 */
+			double		sumcommon = 0.0;
+			double		otherdistinct;
+			for (i = 0; i < nnumbers; i++)
+				sumcommon += numbers[i];
+			selec = 1.0 - sumcommon - stats->stanullfrac;
+			CLAMP_PROBABILITY(selec);
 			/*
-			 * Search is for a value that we do not know a priori, but we will
+			 * and in fact it's probably a good deal less. We approximate
-			 * assume it is not NULL.  Estimate the selectivity as non-null
+			 * that all the not-common values share this remaining
-			 * fraction divided by number of distinct values, so that we get a
+			 * fraction equally, so we divide by the number of other
-			 * result averaged over all possible values whether common or
+			 * distinct values.
-			 * uncommon.  (Essentially, we are assuming that the not-yet-known
-			 * comparison value is equally likely to be any of the possible
-			 * values, regardless of their frequency in the table.	Is that a
-			 * good idea?)
 			 */
-			selec = 1.0 - stats->stanullfrac;
+			otherdistinct = get_variable_numdistinct(vardata) - nnumbers;
-			ndistinct = get_variable_numdistinct(&vardata);
+			if (otherdistinct > 1)
-			if (ndistinct > 1)
+				selec /= otherdistinct;
-				selec /= ndistinct;
 			/*
-			 * Cross-check: selectivity should never be estimated as more than
+			 * Another cross-check: selectivity shouldn't be estimated as
-			 * the most common value's.
+			 * more than the least common "most common value".
 			 */
-			if (get_attstatsslot(vardata.statsTuple,
+			if (nnumbers > 0 && selec > numbers[nnumbers - 1])
-								 vardata.atttype, vardata.atttypmod,
+				selec = numbers[nnumbers - 1];
-								 STATISTIC_KIND_MCV, InvalidOid,
-								 NULL, NULL,
-								 &numbers, &nnumbers))
-			{
-				if (nnumbers > 0 && selec > numbers[0])
-					selec = numbers[0];
-				free_attstatsslot(vardata.atttype, NULL, 0, numbers, nnumbers);
-			}
 		}
+		free_attstatsslot(vardata->atttype, values, nvalues,
+						  numbers, nnumbers);
 	}
 	else
 	{
@@ -322,15 +314,78 @@ eqsel(PG_FUNCTION_ARGS)
 		 * of distinct values and assuming they are equally common. (The guess
 		 * is unlikely to be very good, but we do know a few special cases.)
 		 */
-		selec = 1.0 / get_variable_numdistinct(&vardata);
+		selec = 1.0 / get_variable_numdistinct(vardata);
 	}
-	ReleaseVariableStats(vardata);
+	/* result should be in range, but make sure... */
+	CLAMP_PROBABILITY(selec);
+	return selec;
+}
+/*
+ * var_eq_non_const --- eqsel for var = something-other-than-const case
+ */
+static double
+var_eq_non_const(VariableStatData *vardata, Oid operator,
+				 Node *other,
+				 bool varonleft)
+{
+	double		selec;
+	if (HeapTupleIsValid(vardata->statsTuple))
+	{
+		Form_pg_statistic stats;
+		double		ndistinct;
+		float4	   *numbers;
+		int			nnumbers;
+		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+		/*
+		 * Search is for a value that we do not know a priori, but we will
+		 * assume it is not NULL.  Estimate the selectivity as non-null
+		 * fraction divided by number of distinct values, so that we get a
+		 * result averaged over all possible values whether common or
+		 * uncommon.  (Essentially, we are assuming that the not-yet-known
+		 * comparison value is equally likely to be any of the possible
+		 * values, regardless of their frequency in the table.	Is that a
+		 * good idea?)
+		 */
+		selec = 1.0 - stats->stanullfrac;
+		ndistinct = get_variable_numdistinct(vardata);
+		if (ndistinct > 1)
+			selec /= ndistinct;
+		/*
+		 * Cross-check: selectivity should never be estimated as more than
+		 * the most common value's.
+		 */
+		if (get_attstatsslot(vardata->statsTuple,
+							 vardata->atttype, vardata->atttypmod,
+							 STATISTIC_KIND_MCV, InvalidOid,
+							 NULL, NULL,
+							 &numbers, &nnumbers))
+		{
+			if (nnumbers > 0 && selec > numbers[0])
+				selec = numbers[0];
+			free_attstatsslot(vardata->atttype, NULL, 0, numbers, nnumbers);
+		}
+	}
+	else
+	{
+		/*
+		 * No ANALYZE stats available, so make a guess using estimated number
+		 * of distinct values and assuming they are equally common. (The guess
+		 * is unlikely to be very good, but we do know a few special cases.)
+		 */
+		selec = 1.0 / get_variable_numdistinct(vardata);
+	}
 	/* result should be in range, but make sure... */
 	CLAMP_PROBABILITY(selec);
-	PG_RETURN_FLOAT8((float8) selec);
+	return selec;
 }
 /*
@@ -1047,16 +1102,11 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
 		 */
 		Oid			eqopr = get_opfamily_member(opfamily, vartype, vartype,
 												BTEqualStrategyNumber);
-		List	   *eqargs;
 		if (eqopr == InvalidOid)
 			elog(ERROR, "no = operator for opfamily %u", opfamily);
-		eqargs = list_make2(variable, prefix);
+		result = var_eq_const(&vardata, eqopr, prefix->constvalue,
-		result = DatumGetFloat8(DirectFunctionCall4(eqsel,
+							  false, true);
-													PointerGetDatum(root),
-													ObjectIdGetDatum(eqopr),
-													PointerGetDatum(eqargs),
-													Int32GetDatum(varRelid)));
 	}
 	else
 	{
@@ -4430,6 +4480,7 @@ prefix_selectivity(VariableStatData *vardata,
 	Oid			cmpopr;
 	FmgrInfo	opproc;
 	Const	   *greaterstrcon;
+	Selectivity	eq_sel;
 	cmpopr = get_opfamily_member(opfamily, vartype, vartype,
 								 BTGreaterEqualStrategyNumber);
@@ -4444,7 +4495,7 @@ prefix_selectivity(VariableStatData *vardata,
 	if (prefixsel <= 0.0)
 	{
 		/* No histogram is present ... return a suitable default estimate */
-		return 0.005;
+		return DEFAULT_MATCH_SEL;
 	}
 	/*-------
@@ -4452,17 +4503,17 @@ prefix_selectivity(VariableStatData *vardata,
 	 *	"x < greaterstr".
 	 *-------
 	 */
-	cmpopr = get_opfamily_member(opfamily, vartype, vartype,
-								 BTLessStrategyNumber);
-	if (cmpopr == InvalidOid)
-		elog(ERROR, "no < operator for opfamily %u", opfamily);
-	fmgr_info(get_opcode(cmpopr), &opproc);
 	greaterstrcon = make_greater_string(prefixcon, &opproc);
 	if (greaterstrcon)
 	{
 		Selectivity topsel;
+		cmpopr = get_opfamily_member(opfamily, vartype, vartype,
+									 BTLessStrategyNumber);
+		if (cmpopr == InvalidOid)
+			elog(ERROR, "no < operator for opfamily %u", opfamily);
+		fmgr_info(get_opcode(cmpopr), &opproc);
 		topsel = ineq_histogram_selectivity(vardata, &opproc, false,
 											greaterstrcon->constvalue,
 											greaterstrcon->consttype);
@@ -4477,16 +4528,30 @@ prefix_selectivity(VariableStatData *vardata,
 		 * doesn't count those anyway.
 		 */
 		prefixsel = topsel + prefixsel - 1.0;
-		/*
-		 * A zero or negative prefixsel should be converted into a small
-		 * positive value; we probably are dealing with a very tight range and
-		 * got a bogus result due to roundoff errors.
-		 */
-		if (prefixsel <= 0.0)
-			prefixsel = 1.0e-10;
 	}
+	/*
+	 * If the prefix is long then the two bounding values might be too
+	 * close together for the histogram to distinguish them usefully,
+	 * resulting in a zero estimate (plus or minus roundoff error).
+	 * To avoid returning a ridiculously small estimate, compute the
+	 * estimated selectivity for "variable = 'foo'", and clamp to that.
+	 * (Obviously, the resultant estimate should be at least that.)
+	 *
+	 * We apply this even if we couldn't make a greater string.  That case
+	 * suggests that the prefix is near the maximum possible, and thus
+	 * probably off the end of the histogram, and thus we probably got a
+	 * very small estimate from the >= condition; so we still need to clamp.
+	 */
+	cmpopr = get_opfamily_member(opfamily, vartype, vartype,
+								 BTEqualStrategyNumber);
+	if (cmpopr == InvalidOid)
+		elog(ERROR, "no = operator for opfamily %u", opfamily);
+	eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue,
+						  false, true);
+	prefixsel = Max(prefixsel, eq_sel);
 	return prefixsel;
 }