Commit 97532f7c authored by Tom Lane

Add some knowledge about prefix matches to tsmatchsel(). It's not terribly
bright, but it beats assuming that a prefix match behaves identically to an
exact match, which is what the code was doing before :-(.  Noted while
experimenting with Artur Dobrowski's example.
parent d4fe61b0
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -257,25 +257,23 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, ...@@ -257,25 +257,23 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
* *
* 1 - select(oper) in NOT nodes * 1 - select(oper) in NOT nodes
* *
* freq[val] in VAL nodes, if the value is in MCELEM * histogram-based estimation in prefix VAL nodes
*
* freq[val] in exact VAL nodes, if the value is in MCELEM
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not * min(freq[MCELEM]) / 2 in VAL nodes, if it is not
* *
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
* binary search for determining freq[MCELEM]. * binary search for determining freq[MCELEM].
* *
* If we don't have stats for the tsvector, we still use this logic, * If we don't have stats for the tsvector, we still use this logic,
* except we always use DEFAULT_TS_MATCH_SEL for VAL nodes. This case * except we use default estimates for VAL nodes. This case is signaled
* is signaled by lookup == NULL. * by lookup == NULL.
*/ */
static Selectivity static Selectivity
tsquery_opr_selec(QueryItem *item, char *operand, tsquery_opr_selec(QueryItem *item, char *operand,
TextFreq *lookup, int length, float4 minfreq) TextFreq *lookup, int length, float4 minfreq)
{ {
LexemeKey key; Selectivity selec;
TextFreq *searchres;
Selectivity selec,
s1,
s2;
/* since this function recurses, it could be driven to stack overflow */ /* since this function recurses, it could be driven to stack overflow */
check_stack_depth(); check_stack_depth();
...@@ -283,10 +281,7 @@ tsquery_opr_selec(QueryItem *item, char *operand, ...@@ -283,10 +281,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
if (item->type == QI_VAL) if (item->type == QI_VAL)
{ {
QueryOperand *oper = (QueryOperand *) item; QueryOperand *oper = (QueryOperand *) item;
LexemeKey key;
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
if (lookup == NULL)
return (Selectivity) DEFAULT_TS_MATCH_SEL;
/* /*
* Prepare the key for bsearch(). * Prepare the key for bsearch().
...@@ -294,6 +289,59 @@ tsquery_opr_selec(QueryItem *item, char *operand, ...@@ -294,6 +289,59 @@ tsquery_opr_selec(QueryItem *item, char *operand,
key.lexeme = operand + oper->distance; key.lexeme = operand + oper->distance;
key.length = oper->length; key.length = oper->length;
if (oper->prefix)
{
/* Prefix match, ie the query item is lexeme:* */
Selectivity matched,
allmcvs;
int i;
/*
* Our strategy is to scan through the MCV list and add up the
* frequencies of the ones that match the prefix, thereby
* assuming that the MCVs are representative of the whole lexeme
* population in this respect. Compare histogram_selectivity().
*
* This is only a good plan if we have a pretty fair number of
* MCVs available; we set the threshold at 100. If no stats or
* insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
*/
if (lookup == NULL || length < 100)
return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
matched = allmcvs = 0;
for (i = 0; i < length; i++)
{
TextFreq *t = lookup + i;
int tlen = VARSIZE_ANY_EXHDR(t->element);
if (tlen >= key.length &&
strncmp(key.lexeme, VARDATA_ANY(t->element),
key.length) == 0)
matched += t->frequency;
allmcvs += t->frequency;
}
if (allmcvs > 0) /* paranoia about zero divide */
selec = matched / allmcvs;
else
selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
/*
* In any case, never believe that a prefix match has selectivity
* less than DEFAULT_TS_MATCH_SEL.
*/
selec = Max(DEFAULT_TS_MATCH_SEL, selec);
}
else
{
/* Regular exact lexeme match */
TextFreq *searchres;
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
if (lookup == NULL)
return (Selectivity) DEFAULT_TS_MATCH_SEL;
searchres = (TextFreq *) bsearch(&key, lookup, length, searchres = (TextFreq *) bsearch(&key, lookup, length,
sizeof(TextFreq), sizeof(TextFreq),
compare_lexeme_textfreq); compare_lexeme_textfreq);
...@@ -301,10 +349,10 @@ tsquery_opr_selec(QueryItem *item, char *operand, ...@@ -301,10 +349,10 @@ tsquery_opr_selec(QueryItem *item, char *operand,
if (searchres) if (searchres)
{ {
/* /*
* The element is in MCELEM. Return precise selectivity (or at * The element is in MCELEM. Return precise selectivity (or
* least as precise as ANALYZE could find out). * at least as precise as ANALYZE could find out).
*/ */
return (Selectivity) searchres->frequency; selec = searchres->frequency;
} }
else else
{ {
...@@ -312,11 +360,16 @@ tsquery_opr_selec(QueryItem *item, char *operand, ...@@ -312,11 +360,16 @@ tsquery_opr_selec(QueryItem *item, char *operand,
* The element is not in MCELEM. Punt, but assume that the * The element is not in MCELEM. Punt, but assume that the
* selectivity cannot be more than minfreq / 2. * selectivity cannot be more than minfreq / 2.
*/ */
return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2); selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
} }
} }
}
else
{
/* Current TSQuery node is an operator */ /* Current TSQuery node is an operator */
Selectivity s1,
s2;
switch (item->qoperator.oper) switch (item->qoperator.oper)
{ {
case OP_NOT: case OP_NOT:
...@@ -345,6 +398,7 @@ tsquery_opr_selec(QueryItem *item, char *operand, ...@@ -345,6 +398,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
selec = 0; /* keep compiler quiet */ selec = 0; /* keep compiler quiet */
break; break;
} }
}
/* Clamp intermediate results to stay sane despite roundoff error */ /* Clamp intermediate results to stay sane despite roundoff error */
CLAMP_PROBABILITY(selec); CLAMP_PROBABILITY(selec);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment