Commit ca450a07 authored by Tom Lane's avatar Tom Lane

Add an Accept parameter to "simple" dictionaries. The default of true
gives the old behavior; selecting false allows the dictionary to be used
as a filter ahead of other dictionaries, because it will pass on rather
than accept words that aren't in its stopword list.
Jan Urbanski
parent a44c81d1
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.32 2007/11/14 03:26:24 tgl Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ -->
<chapter id="textsearch"> <chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title> <title id="textsearch-title">Full Text Search</title>
...@@ -2093,9 +2093,11 @@ SELECT ts_rank_cd (to_tsvector('english','list stop words'), to_tsquery('list &a ...@@ -2093,9 +2093,11 @@ SELECT ts_rank_cd (to_tsvector('english','list stop words'), to_tsquery('list &a
<para> <para>
The <literal>simple</> dictionary template operates by converting the The <literal>simple</> dictionary template operates by converting the
input token to lower case and checking it against a file of stop words. input token to lower case and checking it against a file of stop words.
If it is found in the file then <literal>NULL</> is returned, causing If it is found in the file then an empty array is returned, causing
the token to be discarded. If not, the lower-cased form of the word the token to be discarded. If not, the lower-cased form of the word
is returned as the normalized lexeme. is returned as the normalized lexeme. Alternatively, the dictionary
can be configured to report non-stop-words as unrecognized, allowing
them to be passed on to the next dictionary in the list.
</para> </para>
<para> <para>
...@@ -2138,6 +2140,35 @@ SELECT ts_lexize('public.simple_dict','The'); ...@@ -2138,6 +2140,35 @@ SELECT ts_lexize('public.simple_dict','The');
</programlisting> </programlisting>
</para> </para>
<para>
We can also choose to return <literal>NULL</>, instead of the lower-cased
word, if it is not found in the stop words file. This behavior is
selected by setting the dictionary's <literal>Accept</> parameter to
<literal>false</>. Continuing the example:
<programlisting>
ALTER TEXT SEARCH DICTIONARY public.simple_dict ( Accept = false );

SELECT ts_lexize('public.simple_dict','YeS');
 ts_lexize
-----------


SELECT ts_lexize('public.simple_dict','The');
 ts_lexize
-----------
 {}
</programlisting>
</para>
<para>
With the default setting of <literal>Accept</> = <literal>true</>,
it is only useful to place a <literal>simple</> dictionary at the end
of a list of dictionaries, since it will never pass on any token to
a following dictionary. Conversely, <literal>Accept</> = <literal>false</>
is only useful when there is at least one following dictionary.
</para>
<caution> <caution>
<para> <para>
Most types of dictionaries rely on configuration files, such as files of Most types of dictionaries rely on configuration files, such as files of
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.4 2007/11/14 18:36:37 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
typedef struct typedef struct
{ {
StopList stoplist; StopList stoplist;
bool accept;
} DictSimple; } DictSimple;
...@@ -31,9 +32,12 @@ dsimple_init(PG_FUNCTION_ARGS) ...@@ -31,9 +32,12 @@ dsimple_init(PG_FUNCTION_ARGS)
{ {
List *dictoptions = (List *) PG_GETARG_POINTER(0); List *dictoptions = (List *) PG_GETARG_POINTER(0);
DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple)); DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
bool stoploaded = false; bool stoploaded = false,
acceptloaded = false;
ListCell *l; ListCell *l;
d->accept = true; /* default */
foreach(l, dictoptions) foreach(l, dictoptions)
{ {
DefElem *defel = (DefElem *) lfirst(l); DefElem *defel = (DefElem *) lfirst(l);
...@@ -47,6 +51,15 @@ dsimple_init(PG_FUNCTION_ARGS) ...@@ -47,6 +51,15 @@ dsimple_init(PG_FUNCTION_ARGS)
readstoplist(defGetString(defel), &d->stoplist, lowerstr); readstoplist(defGetString(defel), &d->stoplist, lowerstr);
stoploaded = true; stoploaded = true;
} }
else if (pg_strcasecmp("Accept", defel->defname) == 0)
{
if (acceptloaded)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple Accept parameters")));
d->accept = defGetBoolean(defel);
acceptloaded = true;
}
else else
{ {
ereport(ERROR, ereport(ERROR,
...@@ -66,14 +79,28 @@ dsimple_lexize(PG_FUNCTION_ARGS) ...@@ -66,14 +79,28 @@ dsimple_lexize(PG_FUNCTION_ARGS)
char *in = (char *) PG_GETARG_POINTER(1); char *in = (char *) PG_GETARG_POINTER(1);
int32 len = PG_GETARG_INT32(2); int32 len = PG_GETARG_INT32(2);
char *txt; char *txt;
TSLexeme *res = palloc0(sizeof(TSLexeme) * 2); TSLexeme *res;
txt = lowerstr_with_len(in, len); txt = lowerstr_with_len(in, len);
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
{
/* reject as stopword */
pfree(txt); pfree(txt);
else res = palloc0(sizeof(TSLexeme) * 2);
PG_RETURN_POINTER(res);
}
else if (d->accept)
{
/* accept */
res = palloc0(sizeof(TSLexeme) * 2);
res[0].lexeme = txt; res[0].lexeme = txt;
PG_RETURN_POINTER(res);
PG_RETURN_POINTER(res); }
else
{
/* report as unrecognized */
pfree(txt);
PG_RETURN_POINTER(NULL);
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment