Commit 80a5cf64, authored by Tom Lane

Improve contrib/pg_trgm's heuristics for regexp index searches.

When extracting trigrams from a regular expression for search of a GIN or
GIST trigram index, it's useful to penalize (preferentially discard)
trigrams that contain whitespace, since those are typically far more common
in the index than trigrams not containing whitespace.  Of course, this
should only be a preference not a hard rule, since we might otherwise end
up with no trigrams to search for.  The previous coding tended to produce
fairly inefficient trigram search sets for anchored regexp patterns, as
reported by Erik Rijkers.  This patch penalizes whitespace-containing
trigrams, and also reduces the target number of extracted trigrams, since
experience suggests that the original coding tended to select too many
trigrams to search for.

Alexander Korotkov, reviewed by Tom Lane
parent 5d8117e1
...@@ -122,9 +122,23 @@ ...@@ -122,9 +122,23 @@
* thousands of trigrams would be slow, and would likely produce so many * thousands of trigrams would be slow, and would likely produce so many
* false positives that we would have to traverse a large fraction of the * false positives that we would have to traverse a large fraction of the
* index, the graph is simplified further in a lossy fashion by removing * index, the graph is simplified further in a lossy fashion by removing
* color trigrams until the number of trigrams after expansion is below * color trigrams. When a color trigram is removed, the states connected by
* the MAX_TRGM_COUNT threshold. When a color trigram is removed, the states * any arcs labelled with that trigram are merged.
* connected by any arcs labelled with that trigram are merged. *
* Trigrams do not all have equivalent value for searching: some of them are
* more frequent and some of them are less frequent. Ideally, we would like
* to know the distribution of trigrams, but we don't. But because of padding
* we know for sure that the empty character is more frequent than others,
* so we can penalize trigrams according to presence of whitespace. The
* penalty assigned to each color trigram is the number of simple trigrams
* it would produce, times the penalties[] multiplier associated with its
* whitespace content. (The penalties[] constants were calculated by analysis
* of some real-life text.) We eliminate color trigrams starting with the
* highest-penalty one, until we get to a total penalty of no more than
* WISH_TRGM_PENALTY. However, we cannot remove a color trigram if that would
* lead to merging the initial and final states, so we may not be able to
* reach WISH_TRGM_PENALTY. It's still okay so long as we have no more than
* MAX_TRGM_COUNT simple trigrams in total; otherwise we fail.
* *
* 4) Pack the graph into a compact representation * 4) Pack the graph into a compact representation
* ----------------------------------------------- * -----------------------------------------------
...@@ -199,13 +213,30 @@ ...@@ -199,13 +213,30 @@
* MAX_EXPANDED_STATES - How many states we allow in expanded graph * MAX_EXPANDED_STATES - How many states we allow in expanded graph
* MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph
* MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted
* WISH_TRGM_PENALTY - Maximum desired sum of color trigram penalties
* COLOR_COUNT_LIMIT - Maximum number of characters per color * COLOR_COUNT_LIMIT - Maximum number of characters per color
*/ */
#define MAX_EXPANDED_STATES 128 #define MAX_EXPANDED_STATES 128
#define MAX_EXPANDED_ARCS 1024 #define MAX_EXPANDED_ARCS 1024
#define MAX_TRGM_COUNT 256 #define MAX_TRGM_COUNT 256
#define WISH_TRGM_PENALTY 16
#define COLOR_COUNT_LIMIT 256 #define COLOR_COUNT_LIMIT 256
/*
 * Penalty multipliers for trigram counts depending on whitespace contents.
 * Numbers based on analysis of real-life texts.
 *
 * The array is indexed by a 3-bit mask describing which positions of the
 * color trigram are blank: the first position is the most significant bit,
 * the last position the least significant, and a set bit means "blank"
 * (this matches the typeIndex computation in selectColorTrigrams).  In the
 * pattern comments below, 'a' stands for a non-blank position and ' ' for a
 * blank one.  Entries marked "impossible" carry a zero multiplier.
 *
 * The table is private to this file, so it is declared static to avoid
 * exporting a generically named global symbol from the module.
 */
static const float4 penalties[8] = {
	1.0,						/* "aaa" */
	3.5,						/* "aa " */
	0.0,						/* "a a" (impossible) */
	0.0,						/* "a  " (impossible) */
	4.2,						/* " aa" */
	2.1,						/* " a " */
	25.0,						/* "  a" */
	0.0							/* "   " (impossible) */
};
/* Struct representing a single pg_wchar, converted back to multibyte form */ /* Struct representing a single pg_wchar, converted back to multibyte form */
typedef struct typedef struct
{ {
...@@ -339,6 +370,7 @@ typedef struct ...@@ -339,6 +370,7 @@ typedef struct
ColorTrgm ctrgm; ColorTrgm ctrgm;
int number; int number;
int count; int count;
float4 penalty;
bool expanded; bool expanded;
List *arcs; List *arcs;
} ColorTrgmInfo; } ColorTrgmInfo;
...@@ -459,7 +491,7 @@ static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext); ...@@ -459,7 +491,7 @@ static TRGM *expandColorTrigrams(TrgmNFA *trgmNFA, MemoryContext rcontext);
static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]); static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]);
static void mergeStates(TrgmState *state1, TrgmState *state2); static void mergeStates(TrgmState *state1, TrgmState *state2);
static int colorTrgmInfoCmp(const void *p1, const void *p2); static int colorTrgmInfoCmp(const void *p1, const void *p2);
static int colorTrgmInfoCountCmp(const void *p1, const void *p2); static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2);
static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext); static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext);
static int packArcInfoCmp(const void *a1, const void *a2); static int packArcInfoCmp(const void *a1, const void *a2);
...@@ -1424,6 +1456,7 @@ selectColorTrigrams(TrgmNFA *trgmNFA) ...@@ -1424,6 +1456,7 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
TrgmState *state; TrgmState *state;
ColorTrgmInfo *colorTrgms; ColorTrgmInfo *colorTrgms;
int64 totalTrgmCount; int64 totalTrgmCount;
float4 totalTrgmPenalty;
int number; int number;
/* Collect color trigrams from all arcs */ /* Collect color trigrams from all arcs */
...@@ -1482,53 +1515,67 @@ selectColorTrigrams(TrgmNFA *trgmNFA) ...@@ -1482,53 +1515,67 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
} }
/* /*
* Count number of simple trigrams generated by each color trigram. * Count number of simple trigrams generated by each color trigram, and
* also compute a penalty value, which is the number of simple trigrams
* times a multiplier that depends on its whitespace content.
* *
* Note: per-color-trigram counts cannot overflow an int so long as * Note: per-color-trigram counts cannot overflow an int so long as
* COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about
* 1290. However, the grand total totalTrgmCount might conceivably * 1290. However, the grand total totalTrgmCount might conceivably
* overflow an int, so we use int64 for that within this routine. * overflow an int, so we use int64 for that within this routine. Also,
* penalties are calculated in float4 arithmetic to avoid any overflow
* worries.
*/ */
totalTrgmCount = 0; totalTrgmCount = 0;
totalTrgmPenalty = 0.0f;
for (i = 0; i < trgmNFA->colorTrgmsCount; i++) for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
{ {
ColorTrgmInfo *trgmInfo = &colorTrgms[i]; ColorTrgmInfo *trgmInfo = &colorTrgms[i];
int j, int j,
count = 1; count = 1,
typeIndex = 0;
for (j = 0; j < 3; j++) for (j = 0; j < 3; j++)
{ {
TrgmColor c = trgmInfo->ctrgm.colors[j]; TrgmColor c = trgmInfo->ctrgm.colors[j];
if (c != COLOR_BLANK) typeIndex *= 2;
if (c == COLOR_BLANK)
typeIndex++;
else
count *= trgmNFA->colorInfo[c].wordCharsCount; count *= trgmNFA->colorInfo[c].wordCharsCount;
} }
trgmInfo->count = count; trgmInfo->count = count;
totalTrgmCount += count; totalTrgmCount += count;
trgmInfo->penalty = penalties[typeIndex] * (float4) count;
totalTrgmPenalty += trgmInfo->penalty;
} }
/* Sort color trigrams in descending order of simple trigram counts */ /* Sort color trigrams in descending order of their penalties */
qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo),
colorTrgmInfoCountCmp); colorTrgmInfoPenaltyCmp);
/* /*
* Remove color trigrams from the graph so long as total number of simple * Remove color trigrams from the graph so long as total penalty of color
* trigrams exceeds MAX_TRGM_COUNT. We prefer to remove color trigrams * trigrams exceeds WISH_TRGM_PENALTY. (If we fail to get down to
* with the most associated simple trigrams, since those are the most * WISH_TRGM_PENALTY, it's OK so long as total count is no more than
* promising for reducing the total number of simple trigrams. When * MAX_TRGM_COUNT.) We prefer to remove color trigrams with higher
* removing a color trigram we have to merge states connected by arcs * penalty, since those are the most promising for reducing the total
* labeled with that trigram. It's necessary to not merge initial and * penalty. When removing a color trigram we have to merge states
* final states, because our graph becomes useless if that happens; so we * connected by arcs labeled with that trigram. It's necessary to not
* cannot always remove the trigram we'd prefer to. * merge initial and final states, because our graph becomes useless if
*/ * that happens; so we cannot always remove the trigram we'd prefer to.
for (i = 0; */
(i < trgmNFA->colorTrgmsCount) && (totalTrgmCount > MAX_TRGM_COUNT); for (i = 0; i < trgmNFA->colorTrgmsCount; i++)
i++)
{ {
ColorTrgmInfo *trgmInfo = &colorTrgms[i]; ColorTrgmInfo *trgmInfo = &colorTrgms[i];
bool canRemove = true; bool canRemove = true;
ListCell *cell; ListCell *cell;
/* Done if we've reached the target */
if (totalTrgmPenalty <= WISH_TRGM_PENALTY)
break;
/* /*
* Does any arc of this color trigram connect initial and final * Does any arc of this color trigram connect initial and final
* states? If so we can't remove it. * states? If so we can't remove it.
...@@ -1570,9 +1617,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA) ...@@ -1570,9 +1617,10 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
mergeStates(source, target); mergeStates(source, target);
} }
/* Mark trigram unexpanded, and update totalTrgmCount */ /* Mark trigram unexpanded, and update totals */
trgmInfo->expanded = false; trgmInfo->expanded = false;
totalTrgmCount -= trgmInfo->count; totalTrgmCount -= trgmInfo->count;
totalTrgmPenalty -= trgmInfo->penalty;
} }
/* Did we succeed in fitting into MAX_TRGM_COUNT? */ /* Did we succeed in fitting into MAX_TRGM_COUNT? */
...@@ -1746,17 +1794,17 @@ colorTrgmInfoCmp(const void *p1, const void *p2) ...@@ -1746,17 +1794,17 @@ colorTrgmInfoCmp(const void *p1, const void *p2)
/* /*
* Compare function for sorting color trigrams in descending order of * Compare function for sorting color trigrams in descending order of
* their simple trigrams counts. * their penalty fields.
*/ */
static int static int
colorTrgmInfoCountCmp(const void *p1, const void *p2) colorTrgmInfoPenaltyCmp(const void *p1, const void *p2)
{ {
const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1; float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty;
const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2; float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty;
if (c1->count < c2->count) if (penalty1 < penalty2)
return 1; return 1;
else if (c1->count == c2->count) else if (penalty1 == penalty2)
return 0; return 0;
else else
return -1; return -1;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment