Commit 59d0bf9d authored by Heikki Linnakangas's avatar Heikki Linnakangas

Add cost estimation of range @> and <@ operators.

The estimates are based on the existing lower bound histogram, and a new
histogram of range lengths.

Bump catversion, because the range length histogram now needs to be present
in statistic slot kind 6, or you get an error on @> and <@ queries. (A
re-ANALYZE would be enough to fix that, though.)

Alexander Korotkov, with some refactoring by me.
parent 788bce13
This diff is collapsed.
......@@ -29,6 +29,8 @@
#include "utils/builtins.h"
#include "utils/rangetypes.h"
static int float8_qsort_cmp(const void *a1, const void *a2);
static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
static void compute_range_stats(VacAttrStats *stats,
AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
......@@ -56,6 +58,23 @@ range_typanalyze(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(true);
}
/*
* Comparison function for sorting float8s, used for range lengths.
*/
static int
float8_qsort_cmp(const void *a1, const void *a2)
{
const float8 *f1 = (const float8 *) a1;
const float8 *f2 = (const float8 *) a2;
if (*f1 < *f2)
return -1;
else if (*f1 == *f2)
return 0;
else
return 1;
}
/*
* Comparison function for sorting RangeBounds.
*/
......@@ -77,6 +96,7 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
int samplerows, double totalrows)
{
TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
int null_cnt = 0;
int non_null_cnt = 0;
int non_empty_cnt = 0;
......@@ -85,12 +105,14 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
int slot_idx;
int num_bins = stats->attr->attstattarget;
int num_hist;
float8 *lengths;
RangeBound *lowers, *uppers;
double total_width = 0;
/* Allocate memory for arrays of range bounds. */
/* Allocate memory to hold range bounds and lengths of the sample ranges. */
lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
lengths = (float8 *) palloc(sizeof(float8) * samplerows);
/* Loop over the sample ranges. */
for (range_no = 0; range_no < samplerows; range_no++)
......@@ -101,6 +123,7 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
RangeType *range;
RangeBound lower,
upper;
float8 length;
vacuum_delay_point();
......@@ -124,9 +147,33 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
if (!empty)
{
/* Fill bound values for further usage in histograms */
/* Remember bounds and length for further usage in histograms */
lowers[non_empty_cnt] = lower;
uppers[non_empty_cnt] = upper;
if (lower.infinite || upper.infinite)
{
/* Length of any kind of an infinite range is infinite */
length = get_float8_infinity();
}
else if (has_subdiff)
{
/*
* For an ordinary range, use subdiff function between upper
* and lower bound values.
*/
length = DatumGetFloat8(FunctionCall2Coll(
&typcache->rng_subdiff_finfo,
typcache->rng_collation,
upper.val, lower.val));
}
else
{
/* Use default value of 1.0 if no subdiff is available. */
length = 1.0;
}
lengths[non_empty_cnt] = length;
non_empty_cnt++;
}
else
......@@ -141,6 +188,7 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
if (non_null_cnt > 0)
{
Datum *bound_hist_values;
Datum *length_hist_values;
int pos,
posfrac,
delta,
......@@ -159,7 +207,8 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
old_cxt = MemoryContextSwitchTo(stats->anl_context);
/*
* Generate a histogram slot entry if there are at least two values.
* Generate a bounds histogram slot entry if there are at least two
* values.
*/
if (non_empty_cnt >= 2)
{
......@@ -210,12 +259,80 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
slot_idx++;
}
/*
* Generate a length histogram slot entry if there are at least two
* values.
*/
if (non_empty_cnt >= 2)
{
/*
* Ascending sort of range lengths for further filling of
* histogram
*/
qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
num_hist = non_empty_cnt;
if (num_hist > num_bins)
num_hist = num_bins + 1;
length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
/*
* The object of this loop is to copy the first and last lengths[]
* entries along with evenly-spaced values in between. So the i'th
* value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
* computing that subscript directly risks integer overflow when the
* stats target is more than a couple thousand. Instead we add
* (nvals - 1) / (num_hist - 1) to pos at each step, tracking the
* integral and fractional parts of the sum separately.
*/
delta = (non_empty_cnt - 1) / (num_hist - 1);
deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
pos = posfrac = 0;
for (i = 0; i < num_hist; i++)
{
length_hist_values[i] = Float8GetDatum(lengths[pos]);
pos += delta;
posfrac += deltafrac;
if (posfrac >= (num_hist - 1))
{
/* fractional part exceeds 1, carry to integer part */
pos++;
posfrac -= (num_hist - 1);
}
}
}
else
{
/*
* Even when we don't create the histogram, store an empty array
* to mean "no histogram". We can't just leave stavalues NULL,
* because get_attstatsslot() errors if you ask for stavalues, and
* it's NULL. We'll still store the empty fraction in stanumbers.
*/
length_hist_values = palloc(0);
num_hist = 0;
}
stats->staop[slot_idx] = Float8LessOperator;
stats->stavalues[slot_idx] = length_hist_values;
stats->numvalues[slot_idx] = num_hist;
stats->statypid[slot_idx] = FLOAT8OID;
stats->statyplen[slot_idx] = sizeof(float8);
#ifdef USE_FLOAT8_BYVAL
stats->statypbyval[slot_idx] = true;
#else
stats->statypbyval[slot_idx] = false;
#endif
stats->statypalign[slot_idx] = 'd';
/* Store the fraction of empty ranges */
emptyfrac = (float4 *) palloc(sizeof(float4));
*emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_EMPTY_FRAC;
stats->stanumbers[slot_idx] = emptyfrac;
stats->numnumbers[slot_idx] = 1;
stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
slot_idx++;
MemoryContextSwitchTo(old_cxt);
......
......@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201303101
#define CATALOG_VERSION_NO 201303141
#endif
......@@ -527,6 +527,7 @@ DATA(insert OID = 671 ( "<>" PGNSP PGUID b f f 701 701 16 671 670 float8ne
DESCR("not equal");
DATA(insert OID = 672 ( "<" PGNSP PGUID b f f 701 701 16 674 675 float8lt scalarltsel scalarltjoinsel ));
DESCR("less than");
#define Float8LessOperator 672
DATA(insert OID = 673 ( "<=" PGNSP PGUID b f f 701 701 16 675 674 float8le scalarltsel scalarltjoinsel ));
DESCR("less than or equal");
DATA(insert OID = 674 ( ">" PGNSP PGUID b f f 701 701 16 672 673 float8gt scalargtsel scalargtjoinsel ));
......
......@@ -269,11 +269,15 @@ typedef FormData_pg_statistic *Form_pg_statistic;
#define STATISTIC_KIND_DECHIST 5
/*
* An "empty frac" slot describes the fraction of empty ranges in a range-type
* column. stavalues is not used and should be NULL. stanumbers contains a
* single entry, the fraction of empty ranges (0.0 to 1.0).
* A "length histogram" slot describes the distribution of range lengths in
* rows of a range-type column. stanumbers contains a single entry, the
* fraction of empty ranges. stavalues is a histogram of non-empty lengths, in
* a format similar to STATISTIC_KIND_HISTOGRAM: it contains M (>=2) range
* values that divide the column data values into M-1 bins of approximately
* equal population. The lengths are stored as float8s, as measured by the
* range type's subdiff function. Only non-null rows are considered.
*/
#define STATISTIC_KIND_RANGE_EMPTY_FRAC 6
#define STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM 6
/*
* A "bounds histogram" slot is similar to STATISTIC_KIND_HISTOGRAM, but for
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment