Collect and use histograms of lower and upper bounds for range types.

This enables selectivity estimation of the <<, >>, &<, &> and && operators, as well as the normal inequality operators: <, <=, >=, >. "range @> element" is also supported, but the range-variant @> and <@ operators are not, because they cannot be sensibly estimated with lower and upper bound histograms alone. We would need to make some assumption about the lengths of the ranges for that. Alexander's patch included a separate histogram of lengths for that, but I left that out of the patch for simplicity. Hopefully that will be added as a followup patch. The fraction of empty ranges is also calculated and used in estimation. Alexander Korotkov, heavily modified by me.

Collect and use histograms of lower and upper bounds for range types.
This enables selectivity estimation of the <<, >>, &<, &> and && operators, as well as the normal inequality operators: <, <=, >=, >. "range @> element" is also supported, but the range-variant @> and <@ operators are not, because they cannot be sensibly estimated with lower and upper bound histograms alone. We would need to make some assumption about the lengths of the ranges for that. Alexander's patch included a separate histogram of lengths for that, but I left that out of the patch for simplicity. Hopefully that will be added as a followup patch. The fraction of empty ranges is also calculated and used in estimation. Alexander Korotkov, heavily modified by me.
918eee0c · Heikki Linnakangas · 6bb0b08f · 918eee0c · 918eee0c · 918eee0c
Commit 918eee0c authored Aug 27, 2012 by Heikki Linnakangas
9 changed files
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -30,7 +30,8 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \
 	tsginidx.o tsgistidx.o tsquery.o tsquery_cleanup.o tsquery_gist.o \
 	tsquery_op.o tsquery_rewrite.o tsquery_util.o tsrank.o \
 	tsvector.o tsvector_op.o tsvector_parser.o \
-	txid.o uuid.o windowfuncs.o xml.o rangetypes_spgist.o
+	txid.o uuid.o windowfuncs.o xml.o rangetypes_spgist.o \
+	rangetypes_typanalyze.o rangetypes_selfuncs.o
 like.o: like.c like_match.c

--- a/src/backend/utils/adt/rangetypes.c
+++ b/src/backend/utils/adt/rangetypes.c
@@ -1228,23 +1228,6 @@ hash_range(PG_FUNCTION_ARGS)
 	PG_RETURN_INT32(result);
 }
-/* ANALYZE support */
-/* typanalyze function for range datatypes */
-Datum
-range_typanalyze(PG_FUNCTION_ARGS)
-{
-	/*
-	 * For the moment, just punt and don't analyze range columns.  If we get
-	 * close to release without having a better answer, we could consider
-	 * letting std_typanalyze do what it can ... but those stats are probably
-	 * next door to useless for most activity with range columns, so it's not
-	 * clear it's worth gathering them.
-	 */
-	PG_RETURN_BOOL(false);
-}
 /*
 *----------------------------------------------------------
 * CANONICAL FUNCTIONS

--- a/src/backend/utils/adt/rangetypes_selfuncs.c
+++ b/src/backend/utils/adt/rangetypes_selfuncs.c
--- a/src/backend/utils/adt/rangetypes_typanalyze.c
+++ b/src/backend/utils/adt/rangetypes_typanalyze.c
+/*-------------------------------------------------------------------------
+ *
+ * rangetypes_typanalyze.c
+ *	  Functions for gathering statistics from range columns
+ *
+ * For a range type column, histograms of lower and upper bounds, and
+ * the fraction of NULL and empty ranges are collected.
+ *
+ * Both histograms have the same length, and they are combined into a
+ * single array of ranges. This has the same shape as the histogram that
+ * std_typanalyze would collect, but the values are different. Each range
+ * in the array is a valid range, even though the lower and upper bounds
+ * come from different tuples. In theory, the standard scalar selectivity
+ * functions could be used with the combined histogram.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/rangetypes_typanalyze.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "catalog/pg_operator.h"
+#include "commands/vacuum.h"
+#include "utils/builtins.h"
+#include "utils/rangetypes.h"
+static void compute_range_stats(VacAttrStats *stats,
+		   AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
+/*
+ * range_typanalyze -- typanalyze function for range columns
+ *
+ * Looks up the type cache entry for the column's type, installs
+ * compute_range_stats as the stats-computing callback with that entry as
+ * its extra data, and sets the minimum sample size.  Always returns true.
+ */
+Datum
+range_typanalyze(PG_FUNCTION_ARGS)
+{
+	VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
+	Form_pg_attribute attr = stats->attr;
+	TypeCacheEntry *typcache;
+	/* Fetch the type cache entry for the column's range type */
+	typcache = range_get_typcache(fcinfo, stats->attrtypid);
+	/* A negative statistics target is replaced by the default target */
+	if (attr->attstattarget < 0)
+		attr->attstattarget = default_statistics_target;
+	/* The typcache reaches compute_range_stats through extra_data */
+	stats->extra_data = typcache;
+	stats->compute_stats = compute_range_stats;
+	/* same as in std_typanalyze */
+	stats->minrows = 300 * attr->attstattarget;
+	PG_RETURN_BOOL(true);
+}
+/*
+ * qsort_arg comparator for arrays of RangeBound.
+ *
+ * 'arg' carries the TypeCacheEntry to compare with; the ordering itself is
+ * delegated to range_cmp_bounds.
+ */
+static int
+range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
+{
+	return range_cmp_bounds((TypeCacheEntry *) arg,
+							(RangeBound *) a1,
+							(RangeBound *) a2);
+}
+/*
+ * compute_range_stats() -- compute statistics for a range column
+ *
+ * Fetches each of the 'samplerows' sample values through 'fetchfunc' and
+ * fills in 'stats' with:
+ *	- the null fraction and average width, with stadistinct set to -1.0
+ *	- a STATISTIC_KIND_BOUNDS_HISTOGRAM slot built from the sorted lower and
+ *	  upper bounds of the non-empty ranges (only when at least two histogram
+ *	  entries can be produced)
+ *	- a STATISTIC_KIND_RANGE_EMPTY_FRAC slot holding the fraction of non-null
+ *	  values that are empty ranges
+ *
+ * If the sample contains only nulls, the column is assumed to be entirely
+ * null.  If the sample contains no rows at all, 'stats' is left untouched.
+ * 'totalrows' is not used.
+ *
+ * stats->extra_data must hold the range type's TypeCacheEntry, as set up by
+ * range_typanalyze.
+ */
+static void
+compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
+					int samplerows, double totalrows)
+{
+	TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
+	int			null_cnt = 0;
+	int			non_null_cnt = 0;
+	int			non_empty_cnt = 0;
+	int			empty_cnt = 0;
+	int			range_no;
+	int			slot_idx;
+	int			num_bins = stats->attr->attstattarget;
+	int			num_hist;
+	RangeBound *lowers, *uppers;
+	double		total_width = 0;
+	/* Allocate memory for arrays of range bounds. */
+	lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
+	uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
+	/* Loop over the sample ranges. */
+	for (range_no = 0; range_no < samplerows; range_no++)
+	{
+		Datum		value;
+		bool		isnull,
+					empty;
+		RangeType  *range;
+		RangeBound	lower,
+					upper;
+		vacuum_delay_point();
+		value = fetchfunc(stats, range_no, &isnull);
+		if (isnull)
+		{
+			/* range is null, just count that */
+			null_cnt++;
+			continue;
+		}
+		/*
+		 * XXX: should we ignore wide values, like std_typanalyze does, to
+		 * avoid bloating the statistics table?
+		 */
+		total_width += VARSIZE_ANY(DatumGetPointer(value));
+		/* Get range and deserialize it for further analysis. */
+		range = DatumGetRangeType(value);
+		range_deserialize(typcache, range, &lower, &upper, &empty);
+		if (!empty)
+		{
+			/* Fill bound values for further usage in histograms */
+			lowers[non_empty_cnt] = lower;
+			uppers[non_empty_cnt] = upper;
+			non_empty_cnt++;
+		}
+		else
+			empty_cnt++;
+		non_null_cnt++;
+	}
+	slot_idx = 0;
+	/* We can only compute real stats if we found some non-null values. */
+	if (non_null_cnt > 0)
+	{
+		Datum	   *bound_hist_values;
+		int			pos,
+					posfrac,
+					delta,
+					deltafrac,
+					i;
+		MemoryContext old_cxt;
+		float4	   *emptyfrac;
+		stats->stats_valid = true;
+		/* Do the simple null-frac and width stats */
+		stats->stanullfrac = (double) null_cnt / (double) samplerows;
+		stats->stawidth = total_width / (double) non_null_cnt;
+		stats->stadistinct = -1.0;
+		/* Must copy the target values into anl_context */
+		old_cxt = MemoryContextSwitchTo(stats->anl_context);
+		/*
+		 * Emit one histogram entry per non-empty range, but never more than
+		 * num_bins + 1 entries (i.e. num_bins bins).
+		 */
+		num_hist = non_empty_cnt;
+		if (num_hist > num_bins)
+			num_hist = num_bins + 1;
+		/*
+		 * A histogram needs at least two entries to delimit a bin.  With
+		 * fewer (a single non-empty range in the sample, or a zero stats
+		 * target), num_hist - 1 would be zero and the delta computation below
+		 * would divide by zero, so skip the histogram slot in that case.
+		 */
+		if (num_hist >= 2)
+		{
+			/* Sort bound values */
+			qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
+					  range_bound_qsort_cmp, typcache);
+			qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
+					  range_bound_qsort_cmp, typcache);
+			bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
+			/*
+			 * The object of this loop is to construct ranges from first and
+			 * last entries in lowers[] and uppers[] along with evenly-spaced
+			 * values in between. So the i'th value is a range of
+			 * lowers[(i * (nvals - 1)) / (num_hist - 1)] and
+			 * uppers[(i * (nvals - 1)) / (num_hist - 1)]. But computing that
+			 * subscript directly risks integer overflow when the stats target
+			 * is more than a couple thousand.  Instead we add
+			 * (nvals - 1) / (num_hist - 1) to pos at each step, tracking the
+			 * integral and fractional parts of the sum separately.
+			 */
+			delta = (non_empty_cnt - 1) / (num_hist - 1);
+			deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
+			pos = posfrac = 0;
+			for (i = 0; i < num_hist; i++)
+			{
+				bound_hist_values[i] = PointerGetDatum(range_serialize(
+								typcache, &lowers[pos], &uppers[pos], false));
+				pos += delta;
+				posfrac += deltafrac;
+				if (posfrac >= (num_hist - 1))
+				{
+					/* fractional part exceeds 1, carry to integer part */
+					pos++;
+					posfrac -= (num_hist - 1);
+				}
+			}
+			stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
+			stats->stavalues[slot_idx] = bound_hist_values;
+			stats->numvalues[slot_idx] = num_hist;
+			slot_idx++;
+		}
+		/* Store the fraction of empty ranges */
+		emptyfrac = (float4 *) palloc(sizeof(float4));
+		*emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
+		stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_EMPTY_FRAC;
+		stats->stanumbers[slot_idx] = emptyfrac;
+		stats->numnumbers[slot_idx] = 1;
+		slot_idx++;
+		MemoryContextSwitchTo(old_cxt);
+	}
+	else if (null_cnt > 0)
+	{
+		/* We found only nulls; assume the column is entirely null */
+		stats->stats_valid = true;
+		stats->stanullfrac = 1.0;
+		stats->stawidth = 0;		/* "unknown" */
+		stats->stadistinct = 0.0;	/* "unknown" */
+	}
+	/*
+	 * We don't need to bother cleaning up any of our temporary palloc's.
+	 */
+}
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
 */
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	201208161
+#define CATALOG_VERSION_NO	201208271
 #endif
--- a/src/include/catalog/pg_operator.h
+++ b/src/include/catalog/pg_operator.h
@@ -1676,32 +1676,45 @@ DATA(insert OID = 3882 (  "="	   PGNSP PGUID b t t 3831 3831 16 3882 3883 range_
 DESCR("equal");
 DATA(insert OID = 3883 (  "<>"	   PGNSP PGUID b f f 3831 3831 16 3883 3882 range_ne neqsel neqjoinsel ));
 DESCR("not equal");
-DATA(insert OID = 3884 (  "<"	   PGNSP PGUID b f f 3831 3831 16 3887 3886 range_lt scalarltsel scalarltjoinsel ));
+DATA(insert OID = 3884 (  "<"	   PGNSP PGUID b f f 3831 3831 16 3887 3886 range_lt rangesel scalarltjoinsel ));
 DESCR("less than");
-DATA(insert OID = 3885 (  "<="	   PGNSP PGUID b f f 3831 3831 16 3886 3887 range_le scalarltsel scalarltjoinsel ));
+#define OID_RANGE_LESS_OP 3884
+DATA(insert OID = 3885 (  "<="	   PGNSP PGUID b f f 3831 3831 16 3886 3887 range_le rangesel scalarltjoinsel ));
 DESCR("less than or equal");
-DATA(insert OID = 3886 (  ">="	   PGNSP PGUID b f f 3831 3831 16 3885 3884 range_ge scalargtsel scalargtjoinsel ));
+#define OID_RANGE_LESS_EQUAL_OP 3885
+DATA(insert OID = 3886 (  ">="	   PGNSP PGUID b f f 3831 3831 16 3885 3884 range_ge rangesel scalargtjoinsel ));
 DESCR("greater than or equal");
-DATA(insert OID = 3887 (  ">"	   PGNSP PGUID b f f 3831 3831 16 3884 3885 range_gt scalargtsel scalargtjoinsel ));
+#define OID_RANGE_GREATER_EQUAL_OP 3886
+DATA(insert OID = 3887 (  ">"	   PGNSP PGUID b f f 3831 3831 16 3884 3885 range_gt rangesel scalargtjoinsel ));
 DESCR("greater than");
-DATA(insert OID = 3888 (  "&&"	   PGNSP PGUID b f f 3831 3831 16 3888 0 range_overlaps areasel areajoinsel ));
+#define OID_RANGE_GREATER_OP 3887
+DATA(insert OID = 3888 (  "&&"	   PGNSP PGUID b f f 3831 3831 16 3888 0 range_overlaps rangesel areajoinsel ));
 DESCR("overlaps");
-DATA(insert OID = 3889 (  "@>"	   PGNSP PGUID b f f 3831 2283 16 3891 0 range_contains_elem contsel contjoinsel ));
+#define OID_RANGE_OVERLAP_OP 3888
+DATA(insert OID = 3889 (  "@>"	   PGNSP PGUID b f f 3831 2283 16 3891 0 range_contains_elem rangesel contjoinsel ));
 DESCR("contains");
-DATA(insert OID = 3890 (  "@>"	   PGNSP PGUID b f f 3831 3831 16 3892 0 range_contains contsel contjoinsel ));
+#define OID_RANGE_CONTAINS_ELEM_OP 3889
+DATA(insert OID = 3890 (  "@>"	   PGNSP PGUID b f f 3831 3831 16 3892 0 range_contains rangesel contjoinsel ));
 DESCR("contains");
-DATA(insert OID = 3891 (  "<@"	   PGNSP PGUID b f f 2283 3831 16 3889 0 elem_contained_by_range contsel contjoinsel ));
+#define OID_RANGE_CONTAINS_OP 3890
+DATA(insert OID = 3891 (  "<@"	   PGNSP PGUID b f f 2283 3831 16 3889 0 elem_contained_by_range rangesel contjoinsel ));
 DESCR("is contained by");
-DATA(insert OID = 3892 (  "<@"	   PGNSP PGUID b f f 3831 3831 16 3890 0 range_contained_by contsel contjoinsel ));
+#define OID_RANGE_ELEM_CONTAINED_OP 3891
+DATA(insert OID = 3892 (  "<@"	   PGNSP PGUID b f f 3831 3831 16 3890 0 range_contained_by rangesel contjoinsel ));
 DESCR("is contained by");
-DATA(insert OID = 3893 (  "<<"	   PGNSP PGUID b f f 3831 3831 16 3894 0 range_before scalarltsel scalarltjoinsel ));
+#define OID_RANGE_CONTAINED_OP 3892
+DATA(insert OID = 3893 (  "<<"	   PGNSP PGUID b f f 3831 3831 16 3894 0 range_before rangesel scalarltjoinsel ));
 DESCR("is left of");
-DATA(insert OID = 3894 (  ">>"	   PGNSP PGUID b f f 3831 3831 16 3893 0 range_after scalargtsel scalargtjoinsel ));
+#define OID_RANGE_LEFT_OP 3893
+DATA(insert OID = 3894 (  ">>"	   PGNSP PGUID b f f 3831 3831 16 3893 0 range_after rangesel scalargtjoinsel ));
 DESCR("is right of");
-DATA(insert OID = 3895 (  "&<"	   PGNSP PGUID b f f 3831 3831 16 0 0 range_overleft scalarltsel scalarltjoinsel ));
+#define OID_RANGE_RIGHT_OP 3894
+DATA(insert OID = 3895 (  "&<"	   PGNSP PGUID b f f 3831 3831 16 0 0 range_overleft rangesel scalarltjoinsel ));
 DESCR("overlaps or is left of");
-DATA(insert OID = 3896 (  "&>"	   PGNSP PGUID b f f 3831 3831 16 0 0 range_overright scalargtsel scalargtjoinsel ));
+#define OID_RANGE_OVERLAPS_LEFT_OP 3895
+DATA(insert OID = 3896 (  "&>"	   PGNSP PGUID b f f 3831 3831 16 0 0 range_overright rangesel scalargtjoinsel ));
 DESCR("overlaps or is right of");
+#define OID_RANGE_OVERLAPS_RIGHT_OP 3896
 DATA(insert OID = 3897 (  "-|-"    PGNSP PGUID b f f 3831 3831 16 3897 0 range_adjacent contsel contjoinsel ));
 DESCR("is adjacent to");
 DATA(insert OID = 3898 (  "+"	   PGNSP PGUID b f f 3831 3831 3831 3898 0 range_union - - ));

--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4544,6 +4544,8 @@ DATA(insert OID = 3902 (  hash_range			PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0
 DESCR("hash a range");
 DATA(insert OID = 3916 (  range_typanalyze		PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ range_typanalyze _null_ _null_ _null_ ));
 DESCR("range typanalyze");
+DATA(insert OID = 3169 (  rangesel				PGNSP PGUID 12 1 0 0 0 f f f f t f s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ rangesel _null_ _null_ _null_ ));
+DESCR("restriction selectivity for range operators");
 DATA(insert OID = 3914 (  int4range_canonical		   PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 3904 "3904" _null_ _null_ _null_ _null_ int4range_canonical _null_ _null_ _null_ ));
 DESCR("convert an int4 range to canonical form");

--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -268,4 +268,22 @@ typedef FormData_pg_statistic *Form_pg_statistic;
 */
 #define STATISTIC_KIND_DECHIST	5
+/*
+ * An "empty frac" slot describes the fraction of empty ranges in a range-type
+ * column.  stavalues is not used and should be NULL.  stanumbers contains a
+ * single entry, the fraction of empty ranges (0.0 to 1.0).
+ */
+#define STATISTIC_KIND_RANGE_EMPTY_FRAC  6
+/*
+ * A "bounds histogram" slot is similar to STATISTIC_KIND_HISTOGRAM, but for
+ * a range-type column.  stavalues contains M (>=2) range values that divide
+ * the column data values into M-1 bins of approximately equal population.
+ * Unlike a regular scalar histogram, this is actually two histograms combined
+ * into a single array, with the lower bounds of each value forming a
+ * histogram of lower bounds, and the upper bounds a histogram of upper
+ * bounds.  Only non-NULL, non-empty ranges are included.
+ */
+#define STATISTIC_KIND_BOUNDS_HISTOGRAM  7
 #endif   /* PG_STATISTIC_H */
--- a/src/include/utils/rangetypes.h
+++ b/src/include/utils/rangetypes.h
@@ -170,6 +170,7 @@ extern Datum hash_range(PG_FUNCTION_ARGS);
 /* ANALYZE support */
 extern Datum range_typanalyze(PG_FUNCTION_ARGS);
+extern Datum rangesel(PG_FUNCTION_ARGS);
 /* Canonical functions */
 extern Datum int4range_canonical(PG_FUNCTION_ARGS);