Commit abd94bca authored by Robert Haas

Use abbreviated keys for faster sorting of numeric datums.

Andrew Gierth, reviewed by Peter Geoghegan, with further tweaks by me.
parent 00ee6c76
...@@ -29,13 +29,16 @@ ...@@ -29,13 +29,16 @@
#include "access/hash.h" #include "access/hash.h"
#include "catalog/pg_type.h" #include "catalog/pg_type.h"
#include "funcapi.h" #include "funcapi.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h" #include "libpq/pqformat.h"
#include "miscadmin.h" #include "miscadmin.h"
#include "nodes/nodeFuncs.h" #include "nodes/nodeFuncs.h"
#include "utils/array.h" #include "utils/array.h"
#include "utils/builtins.h" #include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/int8.h" #include "utils/int8.h"
#include "utils/numeric.h" #include "utils/numeric.h"
#include "utils/sortsupport.h"
/* ---------- /* ----------
* Uncomment the following to enable compilation of dump_numeric() * Uncomment the following to enable compilation of dump_numeric()
...@@ -57,6 +60,12 @@ ...@@ -57,6 +60,12 @@
* are easy. Also, it's actually more efficient if NBASE is rather less than * are easy. Also, it's actually more efficient if NBASE is rather less than
* sqrt(INT_MAX), so that there is "headroom" for mul_var and div_var_fast to * sqrt(INT_MAX), so that there is "headroom" for mul_var and div_var_fast to
* postpone processing carries. * postpone processing carries.
*
* Values of NBASE other than 10000 are considered of historical interest only
* and are no longer supported in any sense; no mechanism exists for the client
* to discover the base, so every client supporting binary mode expects the
* base-10000 format. If you plan to change this, also note the numeric
* abbreviation code, which assumes NBASE=10000.
* ---------- * ----------
*/ */
...@@ -274,6 +283,29 @@ typedef struct ...@@ -274,6 +283,29 @@ typedef struct
} generate_series_numeric_fctx; } generate_series_numeric_fctx;
/* ----------
* Sort support.
* ----------
*/
/*
 * Working state for an abbreviated-key numeric sort.  One instance is
 * allocated by numeric_sortsupport() and hung off ssup->ssup_extra.
 */
typedef struct
{
	/*
	 * Scratch area into which numeric_abbrev_convert() copies a short-header
	 * varlena, rewriting it with a full-size header; allocated once and
	 * reused for every such input.
	 */
	void *buf; /* buffer for short varlenas */
	/* Incremented once per call to numeric_abbrev_convert(). */
	int64 input_count; /* number of non-null values seen */
	/*
	 * While true, every abbreviation produced is fed to abbr_card; cleared by
	 * numeric_abbrev_abort() once the estimate exceeds 100k.
	 */
	bool estimating; /* true if estimating cardinality */
	/* HyperLogLog state tracking distinct abbreviated values. */
	hyperLogLogState abbr_card; /* cardinality estimator */
} NumericSortSupport;
/*
 * The abbreviated key fills an entire Datum, so its width follows
 * SIZEOF_DATUM: a 64-bit signed value when Datums are 8 bytes wide, and a
 * 32-bit signed value otherwise.
 *
 * The selection must test the *value* of SIZEOF_DATUM with #if.  An #ifdef
 * only tests whether the symbol is defined, so it would pick the 64-bit
 * accessors whenever SIZEOF_DATUM is defined at all, disagreeing with the
 * converter that is chosen by "#if NUMERIC_ABBREV_BITS == ...".
 *
 * NUMERIC_ABBREV_NAN is the most negative value of the chosen width.
 */
#define NUMERIC_ABBREV_BITS (SIZEOF_DATUM * BITS_PER_BYTE)

#if SIZEOF_DATUM == 8
#define DatumGetNumericAbbrev(d) ((int64) (d))
#define NUMERIC_ABBREV_NAN Int64GetDatum(PG_INT64_MIN)
#else
#define DatumGetNumericAbbrev(d) ((int32) (d))
#define NUMERIC_ABBREV_NAN Int32GetDatum(PG_INT32_MIN)
#endif
/* ---------- /* ----------
* Some preinitialized constants * Some preinitialized constants
* ---------- * ----------
...@@ -409,6 +441,13 @@ static void int128_to_numericvar(int128 val, NumericVar *var); ...@@ -409,6 +441,13 @@ static void int128_to_numericvar(int128 val, NumericVar *var);
static double numeric_to_double_no_overflow(Numeric num); static double numeric_to_double_no_overflow(Numeric num);
static double numericvar_to_double_no_overflow(NumericVar *var); static double numericvar_to_double_no_overflow(NumericVar *var);
static Datum numeric_abbrev_convert(Datum original_datum, SortSupport ssup);
static bool numeric_abbrev_abort(int memtupcount, SortSupport ssup);
static int numeric_fast_cmp(Datum x, Datum y, SortSupport ssup);
static int numeric_cmp_abbrev(Datum x, Datum y, SortSupport ssup);
static Datum numeric_abbrev_convert_var(NumericVar *var, NumericSortSupport *nss);
static int cmp_numerics(Numeric num1, Numeric num2); static int cmp_numerics(Numeric num1, Numeric num2);
static int cmp_var(NumericVar *var1, NumericVar *var2); static int cmp_var(NumericVar *var1, NumericVar *var2);
static int cmp_var_common(const NumericDigit *var1digits, int var1ndigits, static int cmp_var_common(const NumericDigit *var1digits, int var1ndigits,
...@@ -1507,9 +1546,428 @@ compute_bucket(Numeric operand, Numeric bound1, Numeric bound2, ...@@ -1507,9 +1546,428 @@ compute_bucket(Numeric operand, Numeric bound1, Numeric bound2,
* Note: btree indexes need these routines not to leak memory; therefore, * Note: btree indexes need these routines not to leak memory; therefore,
* be careful to free working copies of toasted datums. Most places don't * be careful to free working copies of toasted datums. Most places don't
* need to be so careful. * need to be so careful.
*
* Sort support:
*
* We implement the sortsupport strategy routine in order to get the benefit of
* abbreviation. The ordinary numeric comparison can be quite slow as a result
* of palloc/pfree cycles (due to detoasting packed values for alignment);
* while this could be worked on itself, the abbreviation strategy gives more
* speedup in many common cases.
*
* Two different representations are used for the abbreviated form, one in
* int32 and one in int64, whichever fits into a by-value Datum. In both cases
* the representation is negated relative to the original value, because we use
* the largest negative value for NaN, which sorts higher than other values. We
* convert the absolute value of the numeric to a 31-bit or 63-bit positive
* value, and then negate it if the original number was positive.
*
* We abort the abbreviation process if the abbreviation cardinality is below
* 0.01% of the row count (1 per 10k non-null rows). The actual break-even
* point is somewhat below that, perhaps 1 per 30k (at 1 per 100k there's a
* very small penalty), but we don't want to build up too many abbreviated
* values before first testing for abort, so we take the slightly pessimistic
* number. We make no attempt to estimate the cardinality of the real values,
* since it plays no part in the cost model here (if the abbreviation is equal,
* the cost of comparing equal and unequal underlying values is comparable).
* We discontinue even checking for abort (saving us the hashing overhead) if
* the estimated cardinality gets to 100k; that would be enough to support many
* billions of rows while doing no worse than breaking even.
*
* ---------------------------------------------------------------------- * ----------------------------------------------------------------------
*/ */
/*
 * Sort support strategy routine for numeric.
 *
 * Argument 0 is the SortSupport object to fill in.  The full comparator is
 * always installed.  When the caller asks for abbreviation, the per-sort
 * state (including the reusable short-varlena buffer and the cardinality
 * estimator) is allocated in ssup->ssup_cxt, and the abbreviated comparator,
 * converter and abort callbacks are installed, with the full comparator kept
 * as the fallback.
 */
Datum
numeric_sortsupport(PG_FUNCTION_ARGS)
{
	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
	NumericSortSupport *state;
	MemoryContext callercxt;

	ssup->comparator = numeric_fast_cmp;

	/* Nothing more to set up unless abbreviated keys were requested. */
	if (!ssup->abbreviate)
		PG_RETURN_VOID();

	/* Everything allocated below must live as long as the sort does. */
	callercxt = MemoryContextSwitchTo(ssup->ssup_cxt);

	state = palloc(sizeof(NumericSortSupport));

	/*
	 * Alongside the support struct, get a buffer big enough to hold any
	 * short (packed) varlena once it has been given a full-size header.
	 */
	state->buf = palloc(VARATT_SHORT_MAX + VARHDRSZ + 1);

	state->input_count = 0;
	state->estimating = true;
	initHyperLogLog(&state->abbr_card, 10);

	ssup->ssup_extra = state;

	/* Demote the full comparator to fallback; abbreviations go first. */
	ssup->abbrev_full_comparator = ssup->comparator;
	ssup->comparator = numeric_cmp_abbrev;
	ssup->abbrev_converter = numeric_abbrev_convert;
	ssup->abbrev_abort = numeric_abbrev_abort;

	MemoryContextSwitchTo(callercxt);

	PG_RETURN_VOID();
}
/*
 * Produce the abbreviated key for one numeric datum.
 *
 * Takes care of detoasting and of NaN, which maps to NUMERIC_ABBREV_NAN.
 * Every call bumps the input counter in the sort state.  This must not leak
 * memory: any copy made by detoasting is freed before returning.
 */
static Datum
numeric_abbrev_convert(Datum original_datum, SortSupport ssup)
{
	NumericSortSupport *state = ssup->ssup_extra;
	void *packed = PG_DETOAST_DATUM_PACKED(original_datum);
	Numeric num = (Numeric) packed;
	Datum abbrev;

	state->input_count++;

	/*
	 * A packed (short-header) datum is copied into the persistent scratch
	 * buffer under a full-size header, which avoids a palloc/pfree cycle
	 * per value.  The buffer is large enough for any short datum.
	 */
	if (VARATT_IS_SHORT(packed))
	{
		void *scratch = state->buf;
		Size payload = VARSIZE_SHORT(packed) - VARHDRSZ_SHORT;

		Assert(payload <= VARATT_SHORT_MAX - VARHDRSZ_SHORT);

		SET_VARSIZE(scratch, VARHDRSZ + payload);
		memcpy(VARDATA(scratch), VARDATA_SHORT(packed), payload);

		num = (Numeric) scratch;
	}

	if (!NUMERIC_IS_NAN(num))
	{
		NumericVar var;

		init_var_from_num(num, &var);
		abbrev = numeric_abbrev_convert_var(&var, state);
	}
	else
		abbrev = NUMERIC_ABBREV_NAN;

	/* a fresh copy should exist only for external/compressed toasts */
	if (DatumGetPointer(original_datum) != (Pointer) packed)
		pfree(packed);

	return abbrev;
}
/*
 * Decide whether abbreviation should be abandoned.
 *
 * Returns true to abort, false to carry on.  Only the cardinality of the
 * abbreviated keys is considered.  The cardinality of the underlying values
 * is irrelevant: unlike text, there is no cheap equality test, so equal
 * abbreviations cost a full comparison whether or not the real values are
 * equal too.
 */
static bool
numeric_abbrev_abort(int memtupcount, SortSupport ssup)
{
	NumericSortSupport *nss = ssup->ssup_extra;
	double abbr_card;
	double threshold;

	/* Estimation already switched off: nothing left to decide. */
	if (!nss->estimating)
		return false;

	/* Too little data seen so far to judge. */
	if (memtupcount < 10000 || nss->input_count < 10000)
		return false;

	abbr_card = estimateHyperLogLog(&nss->abbr_card);

	/*
	 * Beyond 100k distinct abbreviations we would likely break even even
	 * when sorting many billions of rows, and throwing away that many
	 * abbreviations would probably cost more than it saves.  Stop counting
	 * altogether from here on.
	 */
	if (abbr_card > 100000.0)
	{
#ifdef TRACE_SORT
		if (trace_sort)
			elog(LOG,
				 "numeric_abbrev: estimation ends at cardinality %f"
				 " after " INT64_FORMAT " values (%d rows)",
				 abbr_card, nss->input_count, memtupcount);
#endif
		nss->estimating = false;
		return false;
	}

	/*
	 * The minimum acceptable cardinality is 1 per ~10k non-null inputs.
	 * (Break-even lies somewhere between 1 per 100k rows, where
	 * abbreviation costs very slightly, and 1 per 10k, where it wins
	 * measurably.)  The pessimistic 10k figure plus a 0.5 row fudge factor
	 * lets us bail out early on truly pathological input that produced
	 * exactly one abbreviated value in the first 10k (non-null) rows.
	 */
	threshold = nss->input_count / 10000.0 + 0.5;

	if (abbr_card < threshold)
	{
#ifdef TRACE_SORT
		if (trace_sort)
			elog(LOG,
				 "numeric_abbrev: aborting abbreviation at cardinality %f"
				 " below threshold %f after " INT64_FORMAT " values (%d rows)",
				 abbr_card, threshold,
				 nss->input_count, memtupcount);
#endif
		return true;
	}

#ifdef TRACE_SORT
	if (trace_sort)
		elog(LOG,
			 "numeric_abbrev: cardinality %f"
			 " after " INT64_FORMAT " values (%d rows)",
			 abbr_card, nss->input_count, memtupcount);
#endif

	return false;
}
/*
 * Full numeric comparison callable directly by sortsupport, bypassing fmgr.
 *
 * Skipping the fmgr call saves little next to the cost of the comparison
 * itself, but the sort support API requires such a routine whenever
 * abbreviation is in use.
 *
 * Keeping persistent buffers for aligning short-varlena inputs would save
 * two palloc/pfree cycles per call; so far that has not been judged worth
 * the effort.
 */
static int
numeric_fast_cmp(Datum x, Datum y, SortSupport ssup)
{
	Numeric left = DatumGetNumeric(x);
	Numeric right = DatumGetNumeric(y);
	int cmp = cmp_numerics(left, right);

	/* Release whatever copies detoasting had to make. */
	if (DatumGetPointer(x) != (Pointer) left)
		pfree(left);
	if (DatumGetPointer(y) != (Pointer) right)
		pfree(right);

	return cmp;
}
/*
 * Comparator for abbreviated keys.
 *
 * Equal abbreviations say nothing about the true values, but unequal
 * abbreviations must order the same way the true values do.
 */
static int
numeric_cmp_abbrev(Datum x, Datum y, SortSupport ssup)
{
	if (DatumGetNumericAbbrev(x) == DatumGetNumericAbbrev(y))
		return 0;

	/*
	 * NOTE WELL: the sense is deliberately reversed.  Abbreviations are
	 * stored negated relative to the original value (to accommodate NaN),
	 * so the smaller abbreviation belongs to the larger numeric.
	 */
	return (DatumGetNumericAbbrev(x) < DatumGetNumericAbbrev(y)) ? 1 : -1;
}
/*
* Abbreviate a NumericVar according to the available bit size.
*
* The 31-bit value is constructed as:
*
* 0 + 7bits digit weight + 24 bits digit value
*
* where the digit weight is in single decimal digits, not digit words, and
* stored in excess-44 representation[1]. The 24-bit digit value is the 7 most
* significant decimal digits of the value converted to binary. Values whose
* weights would fall outside the representable range are rounded off to zero
* (which is also used to represent actual zeros) or to 0x7FFFFFFF (which
* otherwise cannot occur). Abbreviation therefore fails to gain any advantage
* where values are outside the range 10^-44 to 10^83, which is not considered
* to be a serious limitation, or when values are of the same magnitude and
* equal in the first 7 decimal digits, which is considered to be an
* unavoidable limitation given the available bits. (Stealing three more bits
* to compare another digit would narrow the range of representable weights by
* a factor of 8, which starts to look like a real limiting factor.)
*
* (The value 44 for the excess is essentially arbitrary)
*
* The 63-bit value is constructed as:
*
* 0 + 7bits weight + 4 x 14-bit packed digit words
*
* The weight in this case is again stored in excess-44, but this time it is
* the original weight in digit words (i.e. powers of 10000). The first four
* digit words of the value (if present; trailing zeros are assumed as needed)
* are packed into 14 bits each to form the rest of the value. Again,
* out-of-range values are rounded off to 0 or 0x7FFFFFFFFFFFFFFF. The
* representable range in this case is 10^-176 to 10^332, which is considered
* to be good enough for all practical purposes, and comparison of 4 words
* means that at least 13 decimal digits are compared, which is considered to
* be a reasonable compromise between effectiveness and efficiency in computing
* the abbreviation.
*
* (The value 44 for the excess is even more arbitrary here, it was chosen just
* to match the value used in the 31-bit case)
*
* [1] - Excess-k representation means that the value is offset by adding 'k'
* and then treated as unsigned, so the smallest representable value is stored
* with all bits zero. This allows simple comparisons to work on the composite
* value.
*/
#if NUMERIC_ABBREV_BITS == 64
/*
 * 64-bit flavor: 7 bits of excess-44 word weight above up to four 14-bit
 * digit words, negated for positive inputs.  While estimation is active the
 * result is also hashed into the cardinality estimator.
 */
static Datum
numeric_abbrev_convert_var(NumericVar *var, NumericSortSupport *nss)
{
	int nwords = var->ndigits;
	int wt = var->weight;
	int64 abbrev;

	if (nwords == 0 || wt < -44)
		abbrev = 0;				/* zero, or too small to represent */
	else if (wt > 83)
		abbrev = PG_INT64_MAX;	/* too large to represent */
	else
	{
		/* Weight goes on top; the leading digit word is always present. */
		abbrev = ((int64) (wt + 44) << 56);
		abbrev |= ((int64) var->digits[0]) << 42;

		/* Further words are packed only if they exist (else left zero). */
		if (nwords > 1)
			abbrev |= ((int64) var->digits[1]) << 28;
		if (nwords > 2)
			abbrev |= ((int64) var->digits[2]) << 14;
		if (nwords > 3)
			abbrev |= ((int64) var->digits[3]);
	}

	/* the abbrev is negated relative to the original */
	if (var->sign == NUMERIC_POS)
		abbrev = -abbrev;

	if (nss->estimating)
	{
		/* Fold the two halves together so all 64 bits reach the hash. */
		uint32 low_half = (uint32) abbrev;
		uint32 high_half = (uint32) ((uint64) abbrev >> 32);
		uint32 folded = low_half ^ high_half;

		addHyperLogLog(&nss->abbr_card, DatumGetUInt32(hash_uint32(folded)));
	}

	return Int64GetDatum(abbrev);
}
#endif /* NUMERIC_ABBREV_BITS == 64 */
#if NUMERIC_ABBREV_BITS == 32
/*
 * 32-bit flavor: 7 bits of decimal-digit weight above a 24-bit field holding
 * the seven leading decimal digits, negated for positive inputs.  While
 * estimation is active the result is also hashed into the cardinality
 * estimator.
 */
static Datum
numeric_abbrev_convert_var(NumericVar *var, NumericSortSupport *nss)
{
	int nwords = var->ndigits;
	int wt = var->weight;
	int32 abbrev;

	if (nwords == 0 || wt < -11)
		abbrev = 0;				/* zero, or too small to represent */
	else if (wt > 20)
		abbrev = PG_INT32_MAX;	/* too large to represent */
	else
	{
		int32 lead = var->digits[0];
		NumericDigit second = (nwords > 1) ? var->digits[1] : 0;
		NumericDigit third = (nwords > 2) ? var->digits[2] : 0;
		int32 mantissa;
		int dweight = (wt + 11) * 4;

		/*
		 * The leading word carries 1 to 4 nonzero decimal digits.  Top it
		 * up from the following words to 7 digits in all, the most that
		 * fits in 24 bits, and adjust the decimal weight to match.
		 */
		if (lead > 999)
		{
			/* 4 digits in hand, take 3 more */
			mantissa = (lead * 1000) + (second / 10);
			dweight += 3;
		}
		else if (lead > 99)
		{
			/* 3 digits in hand, take 4 more */
			mantissa = (lead * 10000) + second;
			dweight += 2;
		}
		else if (lead > 9)
		{
			/* 2 digits in hand, take 5 more */
			mantissa = (lead * 100000) + (second * 10) + (third / 1000);
			dweight += 1;
		}
		else
		{
			/* 1 digit in hand, take 6 more */
			mantissa = (lead * 1000000) + (second * 100) + (third / 100);
		}

		abbrev = mantissa | (dweight << 24);
	}

	/* the abbrev is negated relative to the original */
	if (var->sign == NUMERIC_POS)
		abbrev = -abbrev;

	if (nss->estimating)
	{
		uint32 hashable = (uint32) abbrev;

		addHyperLogLog(&nss->abbr_card, DatumGetUInt32(hash_uint32(hashable)));
	}

	return Int32GetDatum(abbrev);
}
#endif /* NUMERIC_ABBREV_BITS == 32 */
/*
* Ordinary (non-sortsupport) comparisons follow.
*/
Datum Datum
numeric_cmp(PG_FUNCTION_ARGS) numeric_cmp(PG_FUNCTION_ARGS)
......
...@@ -112,9 +112,6 @@ extern char *temp_tablespaces; ...@@ -112,9 +112,6 @@ extern char *temp_tablespaces;
extern bool ignore_checksum_failure; extern bool ignore_checksum_failure;
extern bool synchronize_seqscans; extern bool synchronize_seqscans;
#ifdef TRACE_SORT
extern bool trace_sort;
#endif
#ifdef TRACE_SYNCSCAN #ifdef TRACE_SYNCSCAN
extern bool trace_syncscan; extern bool trace_syncscan;
#endif #endif
......
...@@ -53,6 +53,6 @@ ...@@ -53,6 +53,6 @@
*/ */
/* yyyymmddN */ /* yyyymmddN */
#define CATALOG_VERSION_NO 201503302 #define CATALOG_VERSION_NO 201504021
#endif #endif
...@@ -118,6 +118,7 @@ DATA(insert ( 1984 829 829 1 836 )); ...@@ -118,6 +118,7 @@ DATA(insert ( 1984 829 829 1 836 ));
DATA(insert ( 1986 19 19 1 359 )); DATA(insert ( 1986 19 19 1 359 ));
DATA(insert ( 1986 19 19 2 3135 )); DATA(insert ( 1986 19 19 2 3135 ));
DATA(insert ( 1988 1700 1700 1 1769 )); DATA(insert ( 1988 1700 1700 1 1769 ));
DATA(insert ( 1988 1700 1700 2 3283 ));
DATA(insert ( 1989 26 26 1 356 )); DATA(insert ( 1989 26 26 1 356 ));
DATA(insert ( 1989 26 26 2 3134 )); DATA(insert ( 1989 26 26 2 3134 ));
DATA(insert ( 1991 30 30 1 404 )); DATA(insert ( 1991 30 30 1 404 ));
......
...@@ -2370,6 +2370,8 @@ DATA(insert OID = 1767 ( numeric_larger PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 ...@@ -2370,6 +2370,8 @@ DATA(insert OID = 1767 ( numeric_larger PGNSP PGUID 12 1 0 0 0 f f f f t f i 2
DESCR("larger of two"); DESCR("larger of two");
DATA(insert OID = 1769 ( numeric_cmp PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "1700 1700" _null_ _null_ _null_ _null_ numeric_cmp _null_ _null_ _null_ )); DATA(insert OID = 1769 ( numeric_cmp PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 23 "1700 1700" _null_ _null_ _null_ _null_ numeric_cmp _null_ _null_ _null_ ));
DESCR("less-equal-greater"); DESCR("less-equal-greater");
DATA(insert OID = 3283 ( numeric_sortsupport PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2278 "2281" _null_ _null_ _null_ _null_ numeric_sortsupport _null_ _null_ _null_ ));
DESCR("sort support");
DATA(insert OID = 1771 ( numeric_uminus PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 1700 "1700" _null_ _null_ _null_ _null_ numeric_uminus _null_ _null_ _null_ )); DATA(insert OID = 1771 ( numeric_uminus PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 1700 "1700" _null_ _null_ _null_ _null_ numeric_uminus _null_ _null_ _null_ ));
DATA(insert OID = 1779 ( int8 PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 20 "1700" _null_ _null_ _null_ _null_ numeric_int8 _null_ _null_ _null_ )); DATA(insert OID = 1779 ( int8 PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 20 "1700" _null_ _null_ _null_ _null_ numeric_int8 _null_ _null_ _null_ ));
DESCR("convert numeric to int8"); DESCR("convert numeric to int8");
......
...@@ -978,6 +978,7 @@ extern Datum numeric_round(PG_FUNCTION_ARGS); ...@@ -978,6 +978,7 @@ extern Datum numeric_round(PG_FUNCTION_ARGS);
extern Datum numeric_trunc(PG_FUNCTION_ARGS); extern Datum numeric_trunc(PG_FUNCTION_ARGS);
extern Datum numeric_ceil(PG_FUNCTION_ARGS); extern Datum numeric_ceil(PG_FUNCTION_ARGS);
extern Datum numeric_floor(PG_FUNCTION_ARGS); extern Datum numeric_floor(PG_FUNCTION_ARGS);
extern Datum numeric_sortsupport(PG_FUNCTION_ARGS);
extern Datum numeric_cmp(PG_FUNCTION_ARGS); extern Datum numeric_cmp(PG_FUNCTION_ARGS);
extern Datum numeric_eq(PG_FUNCTION_ARGS); extern Datum numeric_eq(PG_FUNCTION_ARGS);
extern Datum numeric_ne(PG_FUNCTION_ARGS); extern Datum numeric_ne(PG_FUNCTION_ARGS);
......
...@@ -258,6 +258,10 @@ extern int tcp_keepalives_idle; ...@@ -258,6 +258,10 @@ extern int tcp_keepalives_idle;
extern int tcp_keepalives_interval; extern int tcp_keepalives_interval;
extern int tcp_keepalives_count; extern int tcp_keepalives_count;
#ifdef TRACE_SORT
extern bool trace_sort;
#endif
/* /*
* Functions exported by guc.c * Functions exported by guc.c
*/ */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment