Commit 4ea51cdf authored by Robert Haas's avatar Robert Haas

Use abbreviated keys for faster sorting of text datums.

This commit extends the SortSupport infrastructure to allow operator
classes the option to provide abbreviated representations of Datums;
in the case of text, we abbreviate by taking the first few characters
of the strxfrm() blob.  If the abbreviated comparison is insufficent
to resolve the comparison, we fall back on the normal comparator.
This can be much faster than the old way of doing sorting if the
first few bytes of the string are usually sufficient to resolve the
comparison.

There is the potential for a performance regression if all of the
strings to be sorted are identical for the first 8+ characters and
differ only in later positions; therefore, the SortSupport machinery
now provides an infrastructure to abort the use of abbreviation if
it appears that abbreviation is producing comparatively few distinct
keys.  HyperLogLog, a streaming cardinality estimator, is included in
this commit and used to make that determination for text.

Peter Geoghegan, reviewed by me.
parent 1605291b
......@@ -717,6 +717,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
sortKey->ssup_nulls_first =
(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
sortKey->ssup_attno = scanKey->sk_attno;
/* Abbreviation is not supported here */
sortKey->abbreviate = false;
AssertState(sortKey->ssup_attno != 0);
......
......@@ -2301,6 +2301,12 @@ compute_scalar_stats(VacAttrStatsP stats,
/* We always use the default collation for statistics */
ssup.ssup_collation = DEFAULT_COLLATION_OID;
ssup.ssup_nulls_first = false;
/*
* For now, don't perform abbreviated key conversion, because full values
* are required for MCV slot generation. Supporting that optimization
* would necessitate teaching compare_scalars() to call a tie-breaker.
*/
ssup.abbreviate = false;
PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup);
......
......@@ -363,6 +363,10 @@ initialize_aggregates(AggState *aggstate,
* We use a plain Datum sorter when there's a single input column;
* otherwise sort the full tuple. (See comments for
* process_ordered_aggregate_single.)
*
* In the future, we should consider forcing the
* tuplesort_begin_heap() case when the abbreviated key
* optimization can thereby be used, even when numInputs is 1.
*/
peraggstate->sortstate =
(peraggstate->numInputs == 1) ?
......
......@@ -137,6 +137,15 @@ ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags)
sortKey->ssup_nulls_first = node->nullsFirst[i];
sortKey->ssup_attno = node->sortColIdx[i];
/*
* It isn't feasible to perform abbreviated key conversion, since
* tuples are pulled into mergestate's binary heap as needed. It would
* likely be counter-productive to convert tuples into an abbreviated
* representation as they're pulled up, so opt out of that additional
* optimization entirely.
*/
sortKey->abbreviate = false;
PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey);
}
......
......@@ -229,6 +229,14 @@ MJExamineQuals(List *mergeclauses,
elog(ERROR, "cannot merge using non-equality operator %u",
qual->opno);
/*
* sortsupport routine must know if abbreviation optimization is
* applicable in principle. It is never applicable for merge joins
* because there is no convenient opportunity to convert to alternative
* representation.
*/
clause->ssup.abbreviate = false;
/* And get the matching support or comparison function */
Assert(clause->ssup.comparator == NULL);
sortfunc = get_opfamily_proc(opfamily,
......
......@@ -12,6 +12,6 @@ subdir = src/backend/lib
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
OBJS = ilist.o binaryheap.o pairingheap.o rbtree.o stringinfo.o
OBJS = ilist.o binaryheap.o hyperloglog.o pairingheap.o rbtree.o stringinfo.o
include $(top_srcdir)/src/backend/common.mk
/*-------------------------------------------------------------------------
*
* hyperloglog.c
* HyperLogLog cardinality estimator
*
* Portions Copyright (c) 2014, PostgreSQL Global Development Group
*
* Based on Hideaki Ohno's C++ implementation. This is probably not ideally
* suited to estimating the cardinality of very large sets; in particular, we
* have not attempted to further optimize the implementation as described in
* the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
* Engineering of a State of The Art Cardinality Estimation Algorithm".
*
* A sparse representation of HyperLogLog state is used, with fixed space
* overhead.
*
* The copyright terms of Ohno's original version (the MIT license) follow.
*
* IDENTIFICATION
* src/backend/lib/hyperloglog.c
*
*-------------------------------------------------------------------------
*/
/*
* Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the 'Software'), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "postgres.h"
#include <math.h>
#include "lib/hyperloglog.h"
#define POW_2_32 (4294967296.0)
#define NEG_POW_2_32 (-4294967296.0)
static inline uint8 rho(uint32 x, uint8 b);
/*
* Initialize HyperLogLog track state
*
* bwidth is bit width (so register size will be 2 to the power of bwidth).
* Must be between 4 and 16 inclusive.
*/
void
initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
{
double alpha;
if (bwidth < 4 || bwidth > 16)
elog(ERROR, "bit width must be between 4 and 16 inclusive");
cState->registerWidth = bwidth;
cState->nRegisters = 1 << bwidth;
cState->arrSize = sizeof(uint8) * cState->nRegisters + 1;
/*
* Initialize hashes array to zero, not negative infinity, per discussion
* of the coupon collector problem in the HyperLogLog paper
*/
cState->hashesArr = palloc0(cState->arrSize);
/*
* "alpha" is a value that for each possible number of registers (m) is
* used to correct a systematic multiplicative bias present in m ^ 2 Z (Z
* is "the indicator function" through which we finally compute E,
* estimated cardinality).
*/
switch (cState->nRegisters)
{
case 16:
alpha = 0.673;
break;
case 32:
alpha = 0.697;
break;
case 64:
alpha = 0.709;
break;
default:
alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters);
}
/*
* Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog
* estimate E
*/
cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters;
}
/*
* Adds element to the estimator, from caller-supplied hash.
*
* It is critical that the hash value passed be an actual hash value, typically
* generated using hash_any(). The algorithm relies on a specific bit-pattern
* observable in conjunction with stochastic averaging. There must be a
* uniform distribution of bits in hash values for each distinct original value
* observed.
*/
void
addHyperLogLog(hyperLogLogState *cState, uint32 hash)
{
uint8 count;
uint32 index;
/* Use the first "k" (registerWidth) bits as a zero based index */
index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
count = rho(hash << cState->registerWidth,
BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
cState->hashesArr[index] = Max(count, cState->hashesArr[index]);
}
/*
* Estimates cardinality, based on elements added so far
*/
double
estimateHyperLogLog(hyperLogLogState *cState)
{
double result;
double sum = 0.0;
int i;
for (i = 0; i < cState->nRegisters; i++)
{
sum += 1.0 / pow(2.0, cState->hashesArr[i]);
}
/* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
result = cState->alphaMM / sum;
if (result <= (5.0 / 2.0) * cState->nRegisters)
{
/* Small range correction */
int zero_count = 0;
for (i = 0; i < cState->nRegisters; i++)
{
if (cState->hashesArr[i] == 0)
zero_count++;
}
if (zero_count != 0)
result = cState->nRegisters * log((double) cState->nRegisters /
zero_count);
}
else if (result > (1.0 / 30.0) * POW_2_32)
{
/* Large range correction */
result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
}
return result;
}
/*
* Merges the estimate from one HyperLogLog state to another, returning the
* estimate of their union.
*
* The number of registers in each must match.
*/
void
mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState)
{
int r;
if (cState->nRegisters != oState->nRegisters)
elog(ERROR, "number of registers mismatch: %zu != %zu",
cState->nRegisters, oState->nRegisters);
for (r = 0; r < cState->nRegisters; ++r)
{
cState->hashesArr[r] = Max(cState->hashesArr[r], oState->hashesArr[r]);
}
}
/*
* Worker for addHyperLogLog().
*
* Calculates the position of the first set bit in first b bits of x argument
* starting from the first, reading from most significant to least significant
* bits.
*
* Example (when considering fist 10 bits of x):
*
* rho(x = 0b1000000000) returns 1
* rho(x = 0b0010000000) returns 3
* rho(x = 0b0000000000) returns b + 1
*
* "The binary address determined by the first b bits of x"
*
* Return value "j" used to index bit pattern to watch.
*/
static inline uint8
rho(uint32 x, uint8 b)
{
uint8 j = 1;
while (j <= b && !(x & 0x80000000))
{
j++;
x <<= 1;
}
return j;
}
......@@ -266,7 +266,13 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples)
osastate->qstate = qstate;
osastate->gcontext = gcontext;
/* Initialize tuplesort object */
/*
* Initialize tuplesort object.
*
* In the future, we should consider forcing the tuplesort_begin_heap()
* case when the abbreviated key optimization can thereby be used, even
* when !use_tuples.
*/
if (use_tuples)
osastate->sortstate = tuplesort_begin_heap(qstate->tupdesc,
qstate->numSortCols,
......
This diff is collapsed.
This diff is collapsed.
/*
* hyperloglog.h
*
* A simple HyperLogLog cardinality estimator implementation
*
* Portions Copyright (c) 2014, PostgreSQL Global Development Group
*
* Based on Hideaki Ohno's C++ implementation. The copyright terms of Ohno's
* original version (the MIT license) follow.
*
* src/include/lib/hyperloglog.h
*/
/*
* Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the 'Software'), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef HYPERLOGLOG_H
#define HYPERLOGLOG_H
/*
* HyperLogLog is an approximate technique for computing the number of distinct
* entries in a set. Importantly, it does this by using a fixed amount of
* memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal
* cardinality estimation algorithm" for more.
*
* hyperLogLogState
*
* registerWidth register width, in bits ("k")
* nRegisters number of registers
* alphaMM alpha * m ^ 2 (see initHyperLogLog())
* hashesArr array of hashes
* arrSize size of hashesArr
*/
typedef struct hyperLogLogState
{
uint8 registerWidth;
Size nRegisters;
double alphaMM;
uint8 *hashesArr;
Size arrSize;
} hyperLogLogState;
extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth);
extern void addHyperLogLog(hyperLogLogState *cState, uint32 hash);
extern double estimateHyperLogLog(hyperLogLogState *cState);
extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState);
#endif /* HYPERLOGLOG_H */
......@@ -214,13 +214,13 @@
#endif
/*
* Assumed cache line size. This doesn't affect correctness, but can be
* used for low-level optimizations. Currently, this is only used to pad
* some data structures in xlog.c, to ensure that highly-contended fields
* are on different cache lines. Too small a value can hurt performance due
* to false sharing, while the only downside of too large a value is a few
* bytes of wasted memory. The default is 128, which should be large enough
* for all supported platforms.
* Assumed cache line size. This doesn't affect correctness, but can be used
* for low-level optimizations. Currently, this is used to pad some data
* structures in xlog.c, to ensure that highly-contended fields are on
* different cache lines. Too small a value can hurt performance due to false
* sharing, while the only downside of too large a value is a few bytes of
* wasted memory. The default is 128, which should be large enough for all
* supported platforms.
*/
#define PG_CACHE_LINE_SIZE 128
......
......@@ -21,7 +21,12 @@
* required to provide all of them. The BTSORTSUPPORT function should
* simply not set any function pointers for mechanisms it doesn't support.
* Opclasses that provide BTSORTSUPPORT and don't provide a comparator
* function will have a shim set up by sort support automatically.
* function will have a shim set up by sort support automatically. However,
* opclasses that support the optional additional abbreviated key capability
* must always provide an authoritative comparator used to tie-break
* inconclusive abbreviated comparisons and also used when aborting
* abbreviation. Furthermore, a converter and abort/costing function must be
* provided.
*
* All sort support functions will be passed the address of the
* SortSupportData struct when called, so they can use it to store
......@@ -93,12 +98,96 @@ typedef struct SortSupportData
* than, equal to, or greater than y. Note that x and y are guaranteed
* not null, and there is no way to return null either. Do not return
* INT_MIN, as callers are allowed to negate the result before using it.
*
* This may be either the authoritative comparator, or the abbreviated
* comparator. Core code may switch this over the initial preference of an
* opclass support function despite originally indicating abbreviation was
* applicable, by assigning the authoritative comparator back.
*/
int (*comparator) (Datum x, Datum y, SortSupport ssup);
/*
* Additional sort-acceleration functions might be added here later.
* "Abbreviated key" infrastructure follows.
*
* All callbacks must be set by sortsupport opclasses that make use of this
* optional additional infrastructure (unless for whatever reasons the
* opclass doesn't proceed with abbreviation, in which case
* abbrev_converter must not be set).
*
* This allows opclass authors to supply a conversion routine, used to
* create an alternative representation of the underlying type (an
* "abbreviated key"). Typically, this representation is an ad-hoc,
* pass-by-value Datum format that only the opclass has knowledge of. An
* alternative comparator, used only with this alternative representation
* must also be provided (which is assigned to "comparator"). This
* representation is a simple approximation of the original Datum. It must
* be possible to compare datums of this representation with each other
* using the supplied alternative comparator, and have any non-zero return
* value be a reliable proxy for what a proper comparison would indicate.
* Returning zero from the alternative comparator does not indicate
* equality, as with a conventional support routine 1, though -- it
* indicates that it wasn't possible to determine how the two abbreviated
* values compared. A proper comparison, using "abbrev_full_comparator"/
* ApplySortAbbrevFullComparator() is therefore required. In many cases
* this results in most or all comparisons only using the cheap alternative
* comparison func, which is typically implemented as code that compiles to
* just a few CPU instructions. CPU cache miss penalties are expensive; to
* get good overall performance, sort infrastructure must heavily weigh
* cache performance.
*
* Opclass authors must consider the final cardinality of abbreviated keys
* when devising an encoding scheme. It's possible for a strategy to work
* better than an alternative strategy with one usage pattern, while the
* reverse might be true for another usage pattern. All of these factors
* must be considered.
*/
/*
* "abbreviate" concerns whether or not the abbreviated key optimization is
* applicable in principle (that is, the sortsupport routine needs to know
* if its dealing with a key where an abbreviated representation can
* usefully be packed together. Conventionally, this is the leading
* attribute key). Note, however, that in order to determine that
* abbreviation is not in play, the core code always checks whether or not
* the opclass has set abbrev_converter. This is a one way, one time
* message to the opclass.
*/
bool abbreviate;
/*
* Converter to abbreviated format, from original representation. Core
* code uses this callback to convert from a pass-by-reference "original"
* Datum to a pass-by-value abbreviated key Datum. Note that original is
* guaranteed NOT NULL, because it doesn't make sense to factor NULLness
* into ad-hoc cost model.
*
* abbrev_converter is tested to see if abbreviation is in play. Core code
* may set it to NULL to indicate abbreviation should not be used (which is
* something sortsupport routines need not concern themselves with).
* However, sortsupport routines must not set it when it is immediately
* established that abbreviation should not proceed (for abbreviation
* calls, or platform-specific impediments to using abbreviation).
*/
Datum (*abbrev_converter) (Datum original, SortSupport ssup);
/*
* abbrev_abort callback allows clients to verify that the current strategy
* is working out, using a sortsupport routine defined ad-hoc cost model.
* If there is a lot of duplicate abbreviated keys in practice, it's useful
* to be able to abandon the strategy before paying too high a cost in
* conversion (perhaps certain opclass-specific adaptations are useful
* too).
*/
bool (*abbrev_abort) (int memtupcount, SortSupport ssup);
/*
* Full, authoritative comparator for key that an abbreviated
* representation was generated for, used when an abbreviated comparison
* was inconclusive (by calling ApplySortComparatorFull()), or used to
* replace "comparator" when core system ultimately decides against
* abbreviation.
*/
int (*abbrev_full_comparator) (Datum x, Datum y, SortSupport ssup);
} SortSupportData;
......@@ -110,6 +199,9 @@ typedef struct SortSupportData
extern int ApplySortComparator(Datum datum1, bool isNull1,
Datum datum2, bool isNull2,
SortSupport ssup);
extern int ApplySortAbbrevFullComparator(Datum datum1, bool isNull1,
Datum datum2, bool isNull2,
SortSupport ssup);
#endif /* !PG_USE_INLINE */
#if defined(PG_USE_INLINE) || defined(SORTSUPPORT_INCLUDE_DEFINITIONS)
/*
......@@ -148,6 +240,44 @@ ApplySortComparator(Datum datum1, bool isNull1,
return compare;
}
/*
* Apply a sort comparator function and return a 3-way comparison using full,
* authoritative comparator. This takes care of handling reverse-sort and
* NULLs-ordering properly.
*/
STATIC_IF_INLINE int
ApplySortAbbrevFullComparator(Datum datum1, bool isNull1,
Datum datum2, bool isNull2,
SortSupport ssup)
{
int compare;
if (isNull1)
{
if (isNull2)
compare = 0; /* NULL "=" NULL */
else if (ssup->ssup_nulls_first)
compare = -1; /* NULL "<" NOT_NULL */
else
compare = 1; /* NULL ">" NOT_NULL */
}
else if (isNull2)
{
if (ssup->ssup_nulls_first)
compare = 1; /* NOT_NULL ">" NULL */
else
compare = -1; /* NOT_NULL "<" NULL */
}
else
{
compare = (*ssup->abbrev_full_comparator) (datum1, datum2, ssup);
if (ssup->ssup_reverse)
compare = -compare;
}
return compare;
}
#endif /*-- PG_USE_INLINE || SORTSUPPORT_INCLUDE_DEFINITIONS */
/* Other functions in utils/sort/sortsupport.c */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment