Commit 1542e16f authored by Peter Geoghegan

Consider outliers in split interval calculation.

Commit 0d861bbb, which introduced deduplication to nbtree, added some
logic to take large posting list tuples into account when choosing a
split point.  We subtract firstright posting list overhead from the
projected new high key size when calculating leftfree/rightfree values
for an affected candidate split point.  Posting list tuples aren't
special to nbtsplitloc.c, but taking them into account like this makes a
huge difference in practice.  Posting list tuples are frequently tuple
size outliers.

However, commit 0d861bbb missed a closely related issue: split interval
itself is calculated based on the assumption that tuples on the page
being split are roughly equisized.  That assumption was acceptable back
when commit fab25024 taught the logic for choosing a split point about
suffix truncation, but it's pretty questionable now that very large
tuple sizes are common.  This oversight led to unbalanced page splits in
low cardinality multi-column indexes when deduplication was used: page
splits that don't give sufficient weight to how unbalanced the split is
when the interval happens to include some large posting list tuples (and
when most other tuples on the page are not so large).

Nail this down by calculating an initial split interval in a way that's
attuned to the actual cost that we want to keep under control (not a
fuzzy proxy for the cost): apply a leftfree + rightfree evenness test to
each candidate split point that actually gets included in the split
interval (for the default strategy).  This replaces logic that used a
percentage of all legal split points for the page as the basis of the
initial split interval.

Discussion: https://postgr.es/m/CAH2-WznJt5aT2uUB2Bs+JBLdwe0XTX67+xeLFcaNvCKxO=QBVQ@mail.gmail.com
parent 1c455078
...@@ -17,10 +17,6 @@ ...@@ -17,10 +17,6 @@
#include "access/nbtree.h" #include "access/nbtree.h"
#include "storage/lmgr.h" #include "storage/lmgr.h"
/* limits on split interval (default strategy only) */
#define MAX_LEAF_INTERVAL 9
#define MAX_INTERNAL_INTERVAL 18
typedef enum typedef enum
{ {
/* strategy for searching through materialized list of split points */ /* strategy for searching through materialized list of split points */
...@@ -76,6 +72,7 @@ static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, ...@@ -76,6 +72,7 @@ static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid); static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
bool *newitemonleft, FindSplitStrat strategy); bool *newitemonleft, FindSplitStrat strategy);
static int _bt_defaultinterval(FindSplitData *state);
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
SplitPoint *rightpage, FindSplitStrat *strategy); SplitPoint *rightpage, FindSplitStrat *strategy);
static void _bt_interval_edges(FindSplitData *state, static void _bt_interval_edges(FindSplitData *state,
...@@ -279,7 +276,7 @@ _bt_findsplitloc(Relation rel, ...@@ -279,7 +276,7 @@ _bt_findsplitloc(Relation rel,
* left side of the split, in order to maximize the number of trailing * left side of the split, in order to maximize the number of trailing
* attributes that can be truncated away. Only candidate split points * attributes that can be truncated away. Only candidate split points
* that imply an acceptable balance of free space on each side are * that imply an acceptable balance of free space on each side are
* considered. * considered. See _bt_defaultinterval().
*/ */
if (!state.is_leaf) if (!state.is_leaf)
{ {
...@@ -338,19 +335,6 @@ _bt_findsplitloc(Relation rel, ...@@ -338,19 +335,6 @@ _bt_findsplitloc(Relation rel,
fillfactormult = 0.50; fillfactormult = 0.50;
} }
/*
* Set an initial limit on the split interval/number of candidate split
* points as appropriate. The "Prefix B-Trees" paper refers to this as
* sigma l for leaf splits and sigma b for internal ("branch") splits.
* It's hard to provide a theoretical justification for the initial size
* of the split interval, though it's clear that a small split interval
* makes suffix truncation much more effective without noticeably
* affecting space utilization over time.
*/
state.interval = Min(Max(1, state.nsplits * 0.05),
state.is_leaf ? MAX_LEAF_INTERVAL :
MAX_INTERNAL_INTERVAL);
/* /*
* Save leftmost and rightmost splits for page before original ordinal * Save leftmost and rightmost splits for page before original ordinal
* sort order is lost by delta/fillfactormult sort * sort order is lost by delta/fillfactormult sort
...@@ -361,6 +345,9 @@ _bt_findsplitloc(Relation rel, ...@@ -361,6 +345,9 @@ _bt_findsplitloc(Relation rel,
/* Give split points a fillfactormult-wise delta, and sort on deltas */ /* Give split points a fillfactormult-wise delta, and sort on deltas */
_bt_deltasortsplits(&state, fillfactormult, usemult); _bt_deltasortsplits(&state, fillfactormult, usemult);
/* Determine split interval for default strategy */
state.interval = _bt_defaultinterval(&state);
/* /*
* Determine if default strategy/split interval will produce a * Determine if default strategy/split interval will produce a
* sufficiently distinguishing split, or if we should change strategies. * sufficiently distinguishing split, or if we should change strategies.
...@@ -850,11 +837,13 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, ...@@ -850,11 +837,13 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
*/ */
if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost && if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
!final->newitemonleft && final->firstrightoff >= state->newitemoff && !final->newitemonleft && final->firstrightoff >= state->newitemoff &&
final->firstrightoff < state->newitemoff + MAX_LEAF_INTERVAL) final->firstrightoff < state->newitemoff + 9)
{ {
/* /*
* Avoid the problem by performing a 50:50 split when the new item is * Avoid the problem by performing a 50:50 split when the new item is
* just to the right of the would-be "many duplicates" split point. * just to the right of the would-be "many duplicates" split point.
* (Note that the test used for an insert that is "just to the right"
* of the split point is conservative.)
*/ */
final = &state->splits[0]; final = &state->splits[0];
} }
...@@ -863,6 +852,79 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, ...@@ -863,6 +852,79 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
return final->firstrightoff; return final->firstrightoff;
} }
/*
 * Fractions of state->olddataitemstotal that _bt_defaultinterval() uses as
 * its leftfree/rightfree tolerance: the first applies when splitting a leaf
 * page, the second when splitting an internal page.
 */
#define LEAF_SPLIT_DISTANCE 0.050
#define INTERNAL_SPLIT_DISTANCE 0.075
/*
* Return a split interval to use for the default strategy. This is a limit
* on the number of candidate split points to give further consideration to.
* Only a fraction of all candidate split points (those located at the start
* of the now-sorted splits array) fall within the split interval. Split
* interval is applied within _bt_bestsplitloc().
*
* Split interval represents an acceptable range of split points -- those that
* have leftfree and rightfree values that are acceptably balanced. The final
* split point chosen is the split point with the lowest "penalty" among split
* points in this split interval (unless we change our entire strategy, in
* which case the interval also changes -- see _bt_strategy()).
*
* The "Prefix B-Trees" paper calls split interval sigma l for leaf splits,
* and sigma b for internal ("branch") splits. It's hard to provide a
* theoretical justification for the size of the split interval, though it's
* clear that a small split interval can make tuples on level L+1 much smaller
* on average, without noticeably affecting space utilization on level L.
* (Note that the way that we calculate split interval might need to change if
* suffix truncation is taught to truncate tuples "within" the last
* attribute/datum for data types like text, which is more or less how it is
* assumed to work in the paper.)
*/
static int
_bt_defaultinterval(FindSplitData *state)
{
	SplitPoint *spaceoptimal;
	int			tolerance,
				lowleftfree,
				lowrightfree,
				highleftfree,
				highrightfree;

	/*
	 * Determine leftfree and rightfree values that are higher and lower than
	 * we're willing to tolerate.  Note that the final split interval will be
	 * about 10% of nsplits in the common case where all non-pivot tuples
	 * (data items) from a leaf page are uniformly sized.  We're a bit more
	 * aggressive when splitting internal pages.
	 *
	 * The tolerance and the bounds derived from it are deliberately kept in
	 * plain int.  "free space +/- tolerance" is evaluated in int anyway, and
	 * narrowing the result to a 16-bit variable could wrap around when the
	 * free space value is already close to the 16-bit limit (presumably only
	 * possible with large block sizes).  A wrapped upper bound would make
	 * every later split point look out of range and collapse the interval to
	 * a single split point.
	 */
	if (state->is_leaf)
		tolerance = (int) (state->olddataitemstotal * LEAF_SPLIT_DISTANCE);
	else
		tolerance = (int) (state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE);

	/* First candidate split point is the most evenly balanced */
	spaceoptimal = state->splits;
	lowleftfree = (int) spaceoptimal->leftfree - tolerance;
	lowrightfree = (int) spaceoptimal->rightfree - tolerance;
	highleftfree = (int) spaceoptimal->leftfree + tolerance;
	highrightfree = (int) spaceoptimal->rightfree + tolerance;

	/*
	 * Iterate through split points, starting from the split immediately after
	 * 'spaceoptimal'.  Find the first split point that divides free space so
	 * unevenly that including it in the split interval would be unacceptable.
	 * Its array offset is the number of acceptable split points that precede
	 * it, which is exactly the interval we return.
	 */
	for (int i = 1; i < state->nsplits; i++)
	{
		SplitPoint *split = state->splits + i;

		/* Cannot use curdelta here, since its value is often weighted */
		if (split->leftfree < lowleftfree || split->rightfree < lowrightfree ||
			split->leftfree > highleftfree || split->rightfree > highrightfree)
			return i;
	}

	/* Every candidate split point is acceptably balanced */
	return state->nsplits;
}
/* /*
* Subroutine to decide whether split should use default strategy/initial * Subroutine to decide whether split should use default strategy/initial
* split interval, or whether it should finish splitting the page using * split interval, or whether it should finish splitting the page using
...@@ -1097,7 +1159,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) ...@@ -1097,7 +1159,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split)
} }
/* /*
* Subroutine to get a lastleft IndexTuple for a split point from page * Subroutine to get a lastleft IndexTuple for a split point
*/ */
static inline IndexTuple static inline IndexTuple
_bt_split_lastleft(FindSplitData *state, SplitPoint *split) _bt_split_lastleft(FindSplitData *state, SplitPoint *split)
...@@ -1113,7 +1175,7 @@ _bt_split_lastleft(FindSplitData *state, SplitPoint *split) ...@@ -1113,7 +1175,7 @@ _bt_split_lastleft(FindSplitData *state, SplitPoint *split)
} }
/* /*
* Subroutine to get a firstright IndexTuple for a split point from page * Subroutine to get a firstright IndexTuple for a split point
*/ */
static inline IndexTuple static inline IndexTuple
_bt_split_firstright(FindSplitData *state, SplitPoint *split) _bt_split_firstright(FindSplitData *state, SplitPoint *split)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment