Commit 97532f7c authored by Tom Lane

Add some knowledge about prefix matches to tsmatchsel(). It's not terribly
bright, but it beats assuming that a prefix match behaves identically to an
exact match, which is what the code was doing before :-(.  Noted while
experimenting with Artur Dobrowski's example.
parent d4fe61b0
......@@ -7,7 +7,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -257,25 +257,23 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
*
* 1 - select(oper) in NOT nodes
*
* freq[val] in VAL nodes, if the value is in MCELEM
* histogram-based estimation in prefix VAL nodes
*
* freq[val] in exact VAL nodes, if the value is in MCELEM
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
*
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
* binary search for determining freq[MCELEM].
*
* If we don't have stats for the tsvector, we still use this logic,
* except we always use DEFAULT_TS_MATCH_SEL for VAL nodes. This case
* is signaled by lookup == NULL.
* except we use default estimates for VAL nodes. This case is signaled
* by lookup == NULL.
*/
static Selectivity
tsquery_opr_selec(QueryItem *item, char *operand,
TextFreq *lookup, int length, float4 minfreq)
{
LexemeKey key;
TextFreq *searchres;
Selectivity selec,
s1,
s2;
Selectivity selec;
/* since this function recurses, it could be driven to stack overflow */
check_stack_depth();
......@@ -283,10 +281,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
if (item->type == QI_VAL)
{
QueryOperand *oper = (QueryOperand *) item;
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
if (lookup == NULL)
return (Selectivity) DEFAULT_TS_MATCH_SEL;
LexemeKey key;
/*
* Prepare the key for bsearch().
......@@ -294,56 +289,115 @@ tsquery_opr_selec(QueryItem *item, char *operand,
key.lexeme = operand + oper->distance;
key.length = oper->length;
searchres = (TextFreq *) bsearch(&key, lookup, length,
sizeof(TextFreq),
compare_lexeme_textfreq);
if (searchres)
if (oper->prefix)
{
/* Prefix match, ie the query item is lexeme:* */
Selectivity matched,
allmcvs;
int i;
/*
* Our strategy is to scan through the MCV list and add up the
* frequencies of the ones that match the prefix, thereby
* assuming that the MCVs are representative of the whole lexeme
* population in this respect. Compare histogram_selectivity().
*
* This is only a good plan if we have a pretty fair number of
* MCVs available; we set the threshold at 100. If no stats or
* insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
*/
if (lookup == NULL || length < 100)
return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
matched = allmcvs = 0;
for (i = 0; i < length; i++)
{
TextFreq *t = lookup + i;
int tlen = VARSIZE_ANY_EXHDR(t->element);
if (tlen >= key.length &&
strncmp(key.lexeme, VARDATA_ANY(t->element),
key.length) == 0)
matched += t->frequency;
allmcvs += t->frequency;
}
if (allmcvs > 0) /* paranoia about zero divide */
selec = matched / allmcvs;
else
selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
/*
* The element is in MCELEM. Return precise selectivity (or at
* least as precise as ANALYZE could find out).
* In any case, never believe that a prefix match has selectivity
* less than DEFAULT_TS_MATCH_SEL.
*/
return (Selectivity) searchres->frequency;
selec = Max(DEFAULT_TS_MATCH_SEL, selec);
}
else
{
/*
* The element is not in MCELEM. Punt, but assume that the
* selectivity cannot be more than minfreq / 2.
*/
return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
/* Regular exact lexeme match */
TextFreq *searchres;
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
if (lookup == NULL)
return (Selectivity) DEFAULT_TS_MATCH_SEL;
searchres = (TextFreq *) bsearch(&key, lookup, length,
sizeof(TextFreq),
compare_lexeme_textfreq);
if (searchres)
{
/*
* The element is in MCELEM. Return precise selectivity (or
* at least as precise as ANALYZE could find out).
*/
selec = searchres->frequency;
}
else
{
/*
* The element is not in MCELEM. Punt, but assume that the
* selectivity cannot be more than minfreq / 2.
*/
selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
}
}
}
/* Current TSQuery node is an operator */
switch (item->qoperator.oper)
else
{
case OP_NOT:
selec = 1.0 - tsquery_opr_selec(item + 1, operand,
lookup, length, minfreq);
break;
case OP_AND:
s1 = tsquery_opr_selec(item + 1, operand,
lookup, length, minfreq);
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
lookup, length, minfreq);
selec = s1 * s2;
break;
case OP_OR:
s1 = tsquery_opr_selec(item + 1, operand,
lookup, length, minfreq);
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
lookup, length, minfreq);
selec = s1 + s2 - s1 * s2;
break;
default:
elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
selec = 0; /* keep compiler quiet */
break;
/* Current TSQuery node is an operator */
Selectivity s1,
s2;
switch (item->qoperator.oper)
{
case OP_NOT:
selec = 1.0 - tsquery_opr_selec(item + 1, operand,
lookup, length, minfreq);
break;
case OP_AND:
s1 = tsquery_opr_selec(item + 1, operand,
lookup, length, minfreq);
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
lookup, length, minfreq);
selec = s1 * s2;
break;
case OP_OR:
s1 = tsquery_opr_selec(item + 1, operand,
lookup, length, minfreq);
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
lookup, length, minfreq);
selec = s1 + s2 - s1 * s2;
break;
default:
elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
selec = 0; /* keep compiler quiet */
break;
}
}
/* Clamp intermediate results to stay sane despite roundoff error */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment