Commit d009992b authored by Bruce Momjian

Have text search thesaurus files use "?" for stop words.

Throw an error for actual stop words, rather than a warning.  This fixes
problems with cache reloading causing warning messages.

Re-enable stop words in regression tests; they were disabled by Tom.

Document "?" as API change.
parent 82748bc2
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.30 2007/11/05 15:55:53 mha Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.31 2007/11/10 15:39:34 momjian Exp $ -->
<chapter id="textsearch"> <chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title> <title id="textsearch-title">Full Text Search</title>
...@@ -2258,20 +2258,17 @@ more sample word(s) : more indexed word(s) ...@@ -2258,20 +2258,17 @@ more sample word(s) : more indexed word(s)
</para> </para>
<para> <para>
Stop words recognized by the subdictionary are replaced by a <quote>stop Specific stop words recognized by the subdictionary cannot be
word placeholder</quote> to record their position. To illustrate this, specified; instead use <literal>?</> to mark the location where any
consider these phrases: stop word can appear. For example, assuming that <literal>a</> and
<literal>the</> are stop words according to the subdictionary:
<programlisting> <programlisting>
a one the two : swsw ? one ? two : swsw
the one a two : swsw2
</programlisting> </programlisting>
Assuming that <literal>a</> and <literal>the</> are stop words according matches <literal>a one the two</> and <literal>the one a two</>;
to the subdictionary, these two phrases are identical to the thesaurus: both would be replaced by <literal>swsw</>.
they both look like <replaceable>stopword</> <literal>one</>
<replaceable>stopword</> <literal>two</>. Input matching this pattern
will be replaced by <literal>swsw2</>, according to the tie-breaking rule.
</para> </para>
<para> <para>
...@@ -3576,6 +3573,12 @@ Parser: "pg_catalog.default" ...@@ -3576,6 +3573,12 @@ Parser: "pg_catalog.default"
</para> </para>
</listitem> </listitem>
<listitem>
<para>
Thesaurus files now use <literal>?</> for stop words.
</para>
</listitem>
<listitem> <listitem>
<para> <para>
What else? What else?
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.5 2007/11/09 01:32:22 momjian Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.6 2007/11/10 15:39:34 momjian Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -412,6 +412,10 @@ compileTheLexeme(DictThesaurus * d) ...@@ -412,6 +412,10 @@ compileTheLexeme(DictThesaurus * d)
{ {
TSLexeme *ptr; TSLexeme *ptr;
if (strcmp(d->wrds[i].lexeme, "?") == 0) /* Is stop word marker? */
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
else
{
ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize), ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
PointerGetDatum(d->subdict->dictData), PointerGetDatum(d->subdict->dictData),
PointerGetDatum(d->wrds[i].lexeme), PointerGetDatum(d->wrds[i].lexeme),
...@@ -422,12 +426,8 @@ compileTheLexeme(DictThesaurus * d) ...@@ -422,12 +426,8 @@ compileTheLexeme(DictThesaurus * d)
elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)", elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1); d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
else if (!(ptr->lexeme)) else if (!(ptr->lexeme))
{ elog(ERROR, "thesaurus word-sample \"%s\" is recognized as stop-word, use \"?\" for stop words instead (rule %d)",
elog(NOTICE, "thesaurus word-sample \"%s\" is recognized as stop-word, assign any stop-word (rule %d)",
d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1); d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
}
else else
{ {
while (ptr->lexeme) while (ptr->lexeme)
...@@ -455,6 +455,7 @@ compileTheLexeme(DictThesaurus * d) ...@@ -455,6 +455,7 @@ compileTheLexeme(DictThesaurus * d)
ptr = remptr; ptr = remptr;
} }
} }
}
pfree(d->wrds[i].lexeme); pfree(d->wrds[i].lexeme);
pfree(d->wrds[i].entries); pfree(d->wrds[i].entries);
......
...@@ -14,4 +14,5 @@ two : *2 ...@@ -14,4 +14,5 @@ two : *2
supernovae stars : *sn supernovae stars : *sn
supernovae : *sn supernovae : *sn
booking tickets : order invitation cards booking tickets : order invitation cards
# booking the tickets : order invitation Cards booking ? tickets : order invitation Cards
...@@ -312,7 +312,7 @@ SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usuall ...@@ -312,7 +312,7 @@ SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usuall
SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets'); SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
to_tsvector to_tsvector
--------------------------------------------------------------------- -------------------------------------------------------
'book':8 'card':3 'like':6 'look':5 'invit':2 'order':1 'ticket':10 'card':3,10 'like':6 'look':5 'invit':2,9 'order':1,8
(1 row) (1 row)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment