Commit f85155e1 authored by Robert Haas's avatar Robert Haas

Change the way we decide whether to give up on abbreviated text keys.

Be more aggressive about aborting early on if it looks like it's not
helping, but be less aggressive about aborting later on, since it's
more expensive at that point, and also since we're currently aborting
in some cases where abbreviation can still deliver a substantial win.

Peter Geoghegan. Extensive testing by Tomas Vondra.
parent 9b8d4782
...@@ -66,6 +66,7 @@ typedef struct ...@@ -66,6 +66,7 @@ typedef struct
bool collate_c; bool collate_c;
hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
hyperLogLogState full_card; /* Full key cardinality state */ hyperLogLogState full_card; /* Full key cardinality state */
double prop_card; /* Required cardinality proportion */
#ifdef HAVE_LOCALE_T #ifdef HAVE_LOCALE_T
pg_locale_t locale; pg_locale_t locale;
#endif #endif
...@@ -1845,6 +1846,7 @@ btsortsupport_worker(SortSupport ssup, Oid collid) ...@@ -1845,6 +1846,7 @@ btsortsupport_worker(SortSupport ssup, Oid collid)
*/ */
if (abbreviate) if (abbreviate)
{ {
tss->prop_card = 0.20;
initHyperLogLog(&tss->abbr_card, 10); initHyperLogLog(&tss->abbr_card, 10);
initHyperLogLog(&tss->full_card, 10); initHyperLogLog(&tss->full_card, 10);
ssup->abbrev_full_comparator = ssup->comparator; ssup->abbrev_full_comparator = ssup->comparator;
...@@ -2125,7 +2127,7 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) ...@@ -2125,7 +2127,7 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup)
Assert(ssup->abbreviate); Assert(ssup->abbreviate);
/* Have a little patience */ /* Have a little patience */
if (memtupcount < 20) if (memtupcount < 100)
return false; return false;
abbrev_distinct = estimateHyperLogLog(&tss->abbr_card); abbrev_distinct = estimateHyperLogLog(&tss->abbr_card);
...@@ -2151,8 +2153,9 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) ...@@ -2151,8 +2153,9 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup)
{ {
double norm_abbrev_card = abbrev_distinct / (double) memtupcount; double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)", elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card); memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
tss->prop_card);
} }
#endif #endif
...@@ -2172,8 +2175,38 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) ...@@ -2172,8 +2175,38 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup)
* abbreviated comparison with a cheap memcmp()-based authoritative * abbreviated comparison with a cheap memcmp()-based authoritative
* resolution are equivalent. * resolution are equivalent.
*/ */
if (abbrev_distinct > key_distinct * 0.05) if (abbrev_distinct > key_distinct * tss->prop_card)
{
/*
* When we have exceeded 10,000 tuples, decay required cardinality
* aggressively for next call.
*
* This is useful because the number of comparisons required on average
* increases at a linearithmic rate, and at roughly 10,000 tuples that
* factor will start to dominate over the linear costs of string
* transformation (this is a conservative estimate). The decay rate is
* chosen to be a little less aggressive than halving -- which (since
* we're called at points at which memtupcount has doubled) would never
* see the cost model actually abort past the first call following a
* decay. This decay rate is mostly a precaution against a sudden,
* violent swing in how well abbreviated cardinality tracks full key
* cardinality. The decay also serves to prevent a marginal case from
* being aborted too late, when too much has already been invested in
* string transformation.
*
* It's possible for sets of several million distinct strings with mere
* tens of thousands of distinct abbreviated keys to still benefit very
* significantly. This will generally occur provided each abbreviated
* key is a proxy for a roughly uniform number of the set's full keys.
* If it isn't so, we hope to catch that early and abort. If it isn't
* caught early, by the time the problem is apparent it's probably not
* worth aborting.
*/
if (memtupcount > 10000)
tss->prop_card *= 0.65;
return false; return false;
}
/* /*
* Abort abbreviation strategy. * Abort abbreviation strategy.
...@@ -2187,8 +2220,8 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup) ...@@ -2187,8 +2220,8 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup)
* lose but much to gain, which our strategy reflects. * lose but much to gain, which our strategy reflects.
*/ */
#ifdef DEBUG_ABBREV_KEYS #ifdef DEBUG_ABBREV_KEYS
elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f", elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f, prop_card: %f",
memtupcount, abbrev_distinct, key_distinct); memtupcount, abbrev_distinct, key_distinct, tss->prop_card);
/* Actually abort only when debugging is disabled */ /* Actually abort only when debugging is disabled */
return false; return false;
#endif #endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment