Change patternsel (LIKE/regex selectivity estimation) so that if there
is a large enough histogram, it will use the number of matches in the histogram to derive a selectivity estimate, rather than the admittedly pretty bogus heuristics involving examining the pattern contents. I set 'large enough' at 100, but perhaps we should change that later. Also apply the same technique in contrib/ltree's <@ and @> estimator. Per discussion with Stefan Kaltenbrunner and Matteo Beccati.

Change patternsel (LIKE/regex selectivity estimation) so that if there
is a large enough histogram, it will use the number of matches in the histogram to derive a selectivity estimate, rather than the admittedly pretty bogus heuristics involving examining the pattern contents. I set 'large enough' at 100, but perhaps we should change that later. Also apply the same technique in contrib/ltree's <@ and @> estimator. Per discussion with Stefan Kaltenbrunner and Matteo Beccati.
bfd1ffa9 · Tom Lane · 06b33f0e · bfd1ffa9 · bfd1ffa9 · bfd1ffa9
Commit bfd1ffa9 authored Sep 20, 2006 by Tom Lane
Showing with 242 additions and 113 deletions

contrib/ltree/ltree_op.c contrib/ltree/ltree_op.c +36 -4

src/backend/utils/adt/selfuncs.c src/backend/utils/adt/selfuncs.c +202 -108

src/include/utils/selfuncs.h src/include/utils/selfuncs.h +4 -1

No files found.
--- a/contrib/ltree/ltree_op.c
+++ b/contrib/ltree/ltree_op.c
 /*
 * op function for ltree
 * Teodor Sigaev <teodor@stack.net>
- * $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.12 2006/05/30 22:12:13 tgl Exp $
+ * $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.13 2006/09/20 19:50:21 tgl Exp $
 */
 #include "ltree.h"
 #include <ctype.h>
+#include "catalog/pg_statistic.h"
 #include "utils/lsyscache.h"
 #include "utils/selfuncs.h"
 #include "utils/syscache.h"
@@ -606,6 +607,7 @@ ltreeparentsel(PG_FUNCTION_ARGS)
 		FmgrInfo	contproc;
 		double		mcvsum;
 		double		mcvsel;
+		double		nullfrac;
 		fmgr_info(get_opcode(operator), &contproc);
@@ -616,10 +618,40 @@ ltreeparentsel(PG_FUNCTION_ARGS)
 								 &mcvsum);
 		/*
-		 * We have the exact selectivity for values appearing in the MCV list;
+		 * If the histogram is large enough, see what fraction of it the
-		 * use the default selectivity for the rest of the population.
+		 * constant is "<@" to, and assume that's representative of the
+		 * non-MCV population.  Otherwise use the default selectivity for
+		 * the non-MCV population.
 		 */
-		selec = mcvsel + DEFAULT_PARENT_SEL * (1.0 - mcvsum);
+		selec = histogram_selectivity(&vardata, &contproc,
+									  constval, varonleft,
+									  100, 1);
+		if (selec < 0)
+		{
+			/* Nope, fall back on default */
+			selec = DEFAULT_PARENT_SEL;
+		}
+		else
+		{
+			/* Yes, but don't believe extremely small or large estimates. */
+			if (selec < 0.0001)
+				selec = 0.0001;
+			else if (selec > 0.9999)
+				selec = 0.9999;
+		}
+		if (HeapTupleIsValid(vardata.statsTuple))
+			nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
+		else
+			nullfrac = 0.0;
+		/*
+		 * Now merge the results from the MCV and histogram calculations,
+		 * realizing that the histogram covers only the non-null values that
+		 * are not listed in MCV.
+		 */
+		selec *= 1.0 - nullfrac - mcvsum;
+		selec += mcvsel;
 	}
 	else
 		selec = DEFAULT_PARENT_SEL;

--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.212 2006/09/19 22:49:53 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.213 2006/09/20 19:50:21 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -235,7 +235,7 @@ eqsel(PG_FUNCTION_ARGS)
 			{
 				/*
 				 * Constant is "=" to this common value.  We know selectivity
-				 * exactly (or as exactly as VACUUM could calculate it,
+				 * exactly (or as exactly as ANALYZE could calculate it,
 				 * anyway).
 				 */
 				selec = numbers[i];
@@ -315,7 +315,7 @@ eqsel(PG_FUNCTION_ARGS)
 	else
 	{
 		/*
-		 * No VACUUM ANALYZE stats available, so make a guess using estimated
+		 * No ANALYZE stats available, so make a guess using estimated
 		 * number of distinct values and assuming they are equally common.
 		 * (The guess is unlikely to be very good, but we do know a few
 		 * special cases.)
@@ -446,7 +446,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 }
 /*
- *	mcv_selectivity				- Examine the MCV list for scalarineqsel
+ *	mcv_selectivity			- Examine the MCV list for selectivity estimates
 *
 * Determine the fraction of the variable's MCV population that satisfies
 * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.  Also
@@ -500,6 +500,80 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 	return mcv_selec;
 }
+/*
+ *	histogram_selectivity	- Examine the histogram for selectivity estimates
+ *
+ * Determine the fraction of the variable's histogram entries that satisfy
+ * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.
+ *
+ * This code will work for any boolean-returning predicate operator, whether
+ * or not it has anything to do with the histogram sort operator.  We are
+ * essentially using the histogram just as a representative sample.  However,
+ * small histograms are unlikely to be all that representative, so the caller
+ * should specify a minimum histogram size to use, and fall back on some
+ * other approach if this routine fails.
+ *
+ * The caller also specifies n_skip, which causes us to ignore the first and
+ * last n_skip histogram elements, on the grounds that they are outliers and
+ * hence not very representative.  If in doubt, min_hist_size = 100 and
+ * n_skip = 1 are reasonable values.
+ *
+ * The function result is the selectivity, or -1 if there is no histogram
+ * or it's smaller than min_hist_size.
+ *
+ * Note that the result disregards both the most-common-values (if any) and
+ * null entries.  The caller is expected to combine this result with
+ * statistics for those portions of the column population.  It may also be
+ * prudent to clamp the result range, ie, disbelieve exact 0 or 1 outputs.
+ */
+double
+histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
+					  Datum constval, bool varonleft,
+					  int min_hist_size, int n_skip)
+{
+	double		result;			/* match fraction, or -1 when the histogram is unusable */
+	Datum	   *values;			/* histogram entries, filled in by get_attstatsslot */
+	int			nvalues;		/* number of entries in values[] */
+	/* check sanity of parameters */
+	Assert(n_skip >= 0);
+	Assert(min_hist_size > 2 * n_skip);		/* keeps the divisor below strictly positive */
+	if (HeapTupleIsValid(vardata->statsTuple) &&	/* no stats tuple means no histogram */
+		get_attstatsslot(vardata->statsTuple,
+						 vardata->atttype, vardata->atttypmod,
+						 STATISTIC_KIND_HISTOGRAM, InvalidOid,
+						 &values, &nvalues,
+						 NULL, NULL))
+	{
+		if (nvalues >= min_hist_size)		/* caller-chosen minimum sample size */
+		{
+			int			nmatch = 0;		/* entries for which the operator returned true */
+			int			i;
+			for (i = n_skip; i < nvalues - n_skip; i++)		/* ignore n_skip entries at each end */
+			{
+				if (varonleft ?
+					DatumGetBool(FunctionCall2(opproc,		/* evaluates (VAR OP CONST) */
+											   values[i],
+											   constval)) :
+					DatumGetBool(FunctionCall2(opproc,		/* evaluates (CONST OP VAR) */
+											   constval,
+											   values[i])))
+					nmatch++;
+			}
+			result = ((double) nmatch) / ((double) (nvalues - 2 * n_skip));	/* matches / entries examined */
+		}
+		else
+			result = -1;		/* histogram smaller than min_hist_size */
+		free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);	/* paired with the successful get_attstatsslot above */
+	}
+	else
+		result = -1;			/* no stats tuple or no histogram slot */
+	return result;
+}
 /*
 *	ineq_histogram_selectivity	- Examine the histogram for scalarineqsel
 *
@@ -521,12 +595,11 @@ ineq_histogram_selectivity(VariableStatData *vardata,
 	double		hist_selec;
 	Datum	   *values;
 	int			nvalues;
-	int			i;
 	hist_selec = 0.0;
 	/*
-	 * Someday, VACUUM might store more than one histogram per rel/att,
+	 * Someday, ANALYZE might store more than one histogram per rel/att,
 	 * corresponding to more than one possible sort ordering defined for the
 	 * column type.  However, to make that work we will need to figure out
 	 * which staop to search for --- it's not necessarily the one we have at
@@ -544,43 +617,46 @@ ineq_histogram_selectivity(VariableStatData *vardata,
 	{
 		if (nvalues > 1)
 		{
+			/*
+			 * Use binary search to find proper location, ie, the first
+			 * slot at which the comparison fails.  (If the given operator
+			 * isn't actually sort-compatible with the histogram, you'll
+			 * get garbage results ... but probably not any more garbage-y
+			 * than you would from the old linear search.)
+			 */
 			double	histfrac;
+			int		lobound = 0;		/* first possible slot to search */
+			int		hibound = nvalues;	/* last+1 slot to search */
+			while (lobound < hibound)
+			{
+				int		probe = (lobound + hibound) / 2;
 				bool	ltcmp;
 				ltcmp = DatumGetBool(FunctionCall2(opproc,
-											   values[0],
+												   values[probe],
 												   constval));
 				if (isgt)
 					ltcmp = !ltcmp;
-			if (!ltcmp)
+				if (ltcmp)
+					lobound = probe + 1;
+				else
+					hibound = probe;
+			}
+			if (lobound <= 0)
 			{
 				/* Constant is below lower histogram boundary. */
 				histfrac = 0.0;
 			}
-			else
+			else if (lobound >= nvalues)
-			{
-				/*
-				 * Scan to find proper location.  This could be made faster by
-				 * using a binary-search method, but it's probably not worth
-				 * the trouble for typical histogram sizes.
-				 */
-				for (i = 1; i < nvalues; i++)
-				{
-					ltcmp = DatumGetBool(FunctionCall2(opproc,
-													   values[i],
-													   constval));
-					if (isgt)
-						ltcmp = !ltcmp;
-					if (!ltcmp)
-						break;
-				}
-				if (i >= nvalues)
 			{
 				/* Constant is above upper histogram boundary. */
 				histfrac = 1.0;
 			}
 			else
 			{
+				int			i = lobound;
 				double		val,
 							high,
 							low;
@@ -643,7 +719,6 @@ ineq_histogram_selectivity(VariableStatData *vardata,
 				histfrac = (double) (i - 1) + binfrac;
 				histfrac /= (double) (nvalues - 1);
 			}
-			}
 			/*
 			 * Now histfrac = fraction of histogram entries below the
@@ -970,28 +1045,34 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 	else
 	{
 		/*
-		 * Not exact-match pattern.  We estimate selectivity of the fixed
+		 * Not exact-match pattern.  If we have a sufficiently large
-		 * prefix and remainder of pattern separately, then combine the two
+		 * histogram, estimate selectivity for the histogram part of the
-		 * to get an estimate of the selectivity for the part of the column
+		 * population by counting matches in the histogram.  If not, estimate
-		 * population represented by the histogram.  We then add up data for
+		 * selectivity of the fixed prefix and remainder of pattern
-		 * any most-common-values values; these are not in the histogram
+		 * separately, then combine the two to get an estimate of the
-		 * population, and we can get exact answers for them by applying
+		 * selectivity for the part of the column population represented by
-		 * the pattern operator, so there's no reason to approximate.
+		 * the histogram.  We then add up data for any most-common-values
-		 * (If the MCVs cover a significant part of the total population,
+		 * values; these are not in the histogram population, and we can get
-		 * this gives us a big leg up in accuracy.)
+		 * exact answers for them by applying the pattern operator, so there's
+		 * no reason to approximate.  (If the MCVs cover a significant part of
+		 * the total population, this gives us a big leg up in accuracy.)
 		 */
-		Selectivity prefixsel;
-		Selectivity restsel;
 		Selectivity selec;
 		FmgrInfo	opproc;
 		double		nullfrac,
 					mcv_selec,
 					sumcommon;
-		if (HeapTupleIsValid(vardata.statsTuple))
+		/* Try to use the histogram entries to get selectivity */
-			nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
+		fmgr_info(get_opcode(operator), &opproc);
-		else
-			nullfrac = 0.0;
+		selec = histogram_selectivity(&vardata, &opproc, constval, true,
+									  100, 1);
+		if (selec < 0)
+		{
+			/* Nope, so fake it with the heuristic method */
+			Selectivity prefixsel;
+			Selectivity restsel;
 			if (pstatus == Pattern_Prefix_Partial)
 				prefixsel = prefix_selectivity(&vardata, opclass, prefix);
@@ -999,6 +1080,15 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 				prefixsel = 1.0;
 			restsel = pattern_selectivity(rest, ptype);
 			selec = prefixsel * restsel;
+		}
+		else
+		{
+			/* Yes, but don't believe extremely small or large estimates. */
+			if (selec < 0.0001)
+				selec = 0.0001;
+			else if (selec > 0.9999)
+				selec = 0.9999;
+		}
 		/*
 		 * If we have most-common-values info, add up the fractions of the MCV
@@ -1006,10 +1096,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 		 * directly to the result selectivity.  Also add up the total fraction
 		 * represented by MCV entries.
 		 */
-		fmgr_info(get_opcode(operator), &opproc);
 		mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
 									&sumcommon);
+		if (HeapTupleIsValid(vardata.statsTuple))
+			nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
+		else
+			nullfrac = 0.0;
 		/*
 		 * Now merge the results from the MCV and histogram calculations,
 		 * realizing that the histogram covers only the non-null values that
@@ -1332,7 +1426,7 @@ nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
 	else
 	{
 		/*
-		 * No VACUUM ANALYZE stats available, so make a guess
+		 * No ANALYZE stats available, so make a guess
 		 */
 		switch (nulltesttype)
 		{

--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -8,7 +8,7 @@
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.34 2006/07/01 22:07:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.35 2006/09/20 19:50:21 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -110,6 +110,9 @@ extern double get_variable_numdistinct(VariableStatData *vardata);
 extern double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 							  Datum constval, bool varonleft,
 							  double *sumcommonp);
+extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
+									Datum constval, bool varonleft,
+									int min_hist_size, int n_skip);
 extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
 					 Pattern_Type ptype,