Add an Accept parameter to "simple" dictionaries. The default of true gives the old behavior; selecting false allows the dictionary to be used as a filter ahead of other dictionaries, because it will pass on rather than accept words that aren't in its stopword list.

Jan Urbanski

Add an Accept parameter to "simple" dictionaries. The default of true gives the old behavior; selecting false allows the dictionary to be used as a filter ahead of other dictionaries, because it will pass on rather than accept words that aren't in its stopword list.

Jan Urbanski
ca450a07 · Tom Lane · a44c81d1 · ca450a07 · ca450a07
Commit ca450a07 authored Nov 14, 2007 by Tom Lane
Hide whitespace changes
Inline Side-by-side

Showing with 67 additions and 9 deletions

doc/src/sgml/textsearch.sgml doc/src/sgml/textsearch.sgml +34 -3

src/backend/tsearch/dict_simple.c src/backend/tsearch/dict_simple.c +33 -6

No files found.
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.32 2007/11/14 03:26:24 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ -->
 <chapter id="textsearch">
 <title id="textsearch-title">Full Text Search</title>
@@ -2093,9 +2093,11 @@ SELECT ts_rank_cd (to_tsvector('english','list stop words'), to_tsquery('list &a
   <para>
    The <literal>simple</> dictionary template operates by converting the
    input token to lower case and checking it against a file of stop words.
-    If it is found in the file then <literal>NULL</> is returned, causing
+    If it is found in the file then an empty array is returned, causing
    the token to be discarded.  If not, the lower-cased form of the word
-    is returned as the normalized lexeme.
+    is returned as the normalized lexeme.  Alternatively, the dictionary
+    can be configured to report non-stop-words as unrecognized, allowing
+    them to be passed on to the next dictionary in the list.
   </para>
   <para>
@@ -2138,6 +2140,35 @@ SELECT ts_lexize('public.simple_dict','The');
 </programlisting>
   </para>
+   <para>
+    We can also choose to return <literal>NULL</>, instead of the lower-cased
+    word, if it is not found in the stop words file.  This behavior is
+    selected by setting the dictionary's <literal>Accept</> parameter to
+    <literal>false</>.  Continuing the example:
+<programlisting>
+ALTER TEXT SEARCH DICTIONARY public.simple_dict ( Accept = false );
+SELECT ts_lexize('public.simple_dict','YeS');
+ ts_lexize
+-----------
+
+
+SELECT ts_lexize('public.simple_dict','The');
+ ts_lexize
+-----------
+ {}
+</programlisting>
+   </para>
+   <para>
+    With the default setting of <literal>Accept</> = <literal>true</>,
+    it is only useful to place a <literal>simple</> dictionary at the end
+    of a list of dictionaries, since it will never pass on any token to
+    a following dictionary.  Conversely, <literal>Accept</> = <literal>false</>
+    is only useful when there is at least one following dictionary.
+   </para>
   <caution>
    <para>
     Most types of dictionaries rely on configuration files, such as files of

--- a/src/backend/tsearch/dict_simple.c
+++ b/src/backend/tsearch/dict_simple.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.4 2007/11/14 18:36:37 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -23,6 +23,7 @@
 typedef struct
 {
 	StopList	stoplist;
+	bool		accept;
 } DictSimple;
@@ -31,9 +32,12 @@ dsimple_init(PG_FUNCTION_ARGS)
 {
 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
 	DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple));
-	bool		stoploaded = false;
+	bool		stoploaded = false,
+				acceptloaded = false;
 	ListCell   *l;
+	d->accept = true;			/* default */
 	foreach(l, dictoptions)
 	{
 		DefElem    *defel = (DefElem *) lfirst(l);
@@ -47,6 +51,15 @@ dsimple_init(PG_FUNCTION_ARGS)
 			readstoplist(defGetString(defel), &d->stoplist, lowerstr);
 			stoploaded = true;
 		}
+		else if (pg_strcasecmp("Accept", defel->defname) == 0)
+		{
+			if (acceptloaded)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("multiple Accept parameters")));
+			d->accept = defGetBoolean(defel);
+			acceptloaded = true;
+		}
 		else
 		{
 			ereport(ERROR,
@@ -66,14 +79,28 @@ dsimple_lexize(PG_FUNCTION_ARGS)
 	char	   *in = (char *) PG_GETARG_POINTER(1);
 	int32	   len = PG_GETARG_INT32(2);
 	char	   *txt;
-	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
+	TSLexeme   *res;
 	txt = lowerstr_with_len(in, len);
 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+	{
+		/* reject as stopword */
 		pfree(txt);
-	else
+		res = palloc0(sizeof(TSLexeme) * 2);
+		PG_RETURN_POINTER(res);
+	}
+	else if (d->accept)
+	{
+		/* accept */
+		res = palloc0(sizeof(TSLexeme) * 2);
 		res[0].lexeme = txt;
+		PG_RETURN_POINTER(res);
-	PG_RETURN_POINTER(res);
+	}
+	else
+	{
+		/* report as unrecognized */
+		pfree(txt);
+		PG_RETURN_POINTER(NULL);
+	}
 }