Rename and slightly redefine the default text search parser's "word" categories, as per discussion. asciiword (formerly lword) is still ASCII-letters-only, and numword (formerly word) is still the most general mixed-alpha-and-digits case. But word (formerly nlword) is now any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as before. This is no worse than before for parsing mixed Russian/English text, which seems to have been the design center for the original coding; and it should simplify matters for parsing most European languages. In particular it will not be necessary for any language to accept strings containing digits as being regular "words". The hyphenated-word categories are adjusted similarly.

Rename and slightly redefine the default text search parser's "word"
categories, as per discussion. asciiword (formerly lword) is still ASCII-letters-only, and numword (formerly word) is still the most general mixed-alpha-and-digits case. But word (formerly nlword) is now any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as before. This is no worse than before for parsing mixed Russian/English text, which seems to have been the design center for the original coding; and it should simplify matters for parsing most European languages. In particular it will not be necessary for any language to accept strings containing digits as being regular "words". The hyphenated-word categories are adjusted similarly.
dbaec70c · Tom Lane · 344d0cae · dbaec70c · dbaec70c · dbaec70c
Commit dbaec70c authored Oct 23, 2007 by Tom Lane
10 changed files
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
-<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.403 2007/10/22 20:13:37 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.404 2007/10/23 20:46:11 tgl Exp $ -->
 <chapter id="functions">
  <title>Functions and Operators</title>
@@ -7861,7 +7861,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
        <entry><type>setof record</type></entry>
        <entry>test a configuration</entry>
        <entry><literal>ts_debug('english', 'The Brightest supernovaes')</literal></entry>
-        <entry><literal>(lword,"Latin word",The,{english_stem},english_stem,{}) ...</literal></entry>
+        <entry><literal>(asciiword,"Word, all ASCII",The,{english_stem},english_stem,{}) ...</literal></entry>
       </row>
       <row>
        <entry><literal><function>ts_lexize</function>(<replaceable class="PARAMETER">dict</replaceable> <type>regdictionary</>, <replaceable class="PARAMETER">token</replaceable> <type>text</>)</literal></entry>
@@ -7889,14 +7889,14 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
        <entry><type>setof record</type></entry>
        <entry>get token types defined by parser</entry>
        <entry><literal>ts_token_type('default')</literal></entry>
-        <entry><literal>(1,lword,"Latin word") ...</literal></entry>
+        <entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
       </row>
       <row>
        <entry><literal><function>ts_token_type</function>(<replaceable class="PARAMETER">parser_oid</> <type>oid</>, OUT <replaceable class="PARAMETER">tokid</> <type>integer</>, OUT <replaceable class="PARAMETER">alias</> <type>text</>, OUT <replaceable class="PARAMETER">description</> <type>text</>)</literal></entry>
        <entry><type>setof record</type></entry>
        <entry>get token types defined by parser</entry>
        <entry><literal>ts_token_type(3722)</literal></entry>
-        <entry><literal>(1,lword,"Latin word") ...</literal></entry>
+        <entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
       </row>
       <row>
        <entry><literal><function>ts_stat</function>(<replaceable class="PARAMETER">sqlquery</replaceable> <type>text</>, <optional> <replaceable class="PARAMETER">weights</replaceable> <type>text</>, </optional> OUT <replaceable class="PARAMETER">word</replaceable> <type>text</>, OUT <replaceable class="PARAMETER">ndoc</replaceable> <type>integer</>, OUT <replaceable class="PARAMETER">nentry</replaceable> <type>integer</>)</literal></entry>

--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
--- a/src/backend/snowball/Makefile
+++ b/src/backend/snowball/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for src/backend/snowball
 #
-# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.3 2007/08/27 10:29:49 mha Exp $
+# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.4 2007/10/23 20:46:12 tgl Exp $
 #
 #-------------------------------------------------------------------------
@@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \
 	stem_UTF_8_swedish.o \
 	stem_UTF_8_turkish.o
-# second column is name of latin dictionary, if different
+# first column is language name and also name of dictionary for not-all-ASCII
-# Note order dependency: use of some other language as latin dictionary
+# words, second is name of dictionary for all-ASCII words
+# Note order dependency: use of some other language as ASCII dictionary
 # must come after creation of that language
 LANGUAGES=  \
 	danish		danish 		\
@@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes)
 	while [ "$$#" -gt 0 ] ; \
 	do \
 		lang=$$1; shift; \
-		nonlatdictname=$$lang; \
+		nonascdictname=$$lang; \
-		latdictname=$$1; shift; \
+		ascdictname=$$1; shift; \
 		if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \
 			stop=", StopWords=$${lang}" ; \
 		else \
@@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes)
 			sed -e "s#_LANGNAME_#$$lang#g" | \
 			sed -e "s#_DICTNAME_#$${lang}_stem#g" | \
 			sed -e "s#_CFGNAME_#$$lang#g" | \
-			sed -e "s#_LATDICTNAME_#$${latdictname}_stem#g" | \
+			sed -e "s#_ASCDICTNAME_#$${ascdictname}_stem#g" | \
-			sed -e "s#_NONLATDICTNAME_#$${nonlatdictname}_stem#g" | \
+			sed -e "s#_NONASCDICTNAME_#$${nonascdictname}_stem#g" | \
 			sed -e "s#_STOPWORDS_#$$stop#g" ; \
 	done >> $@
 else

--- a/src/backend/snowball/snowball.sql.in
+++ b/src/backend/snowball/snowball.sql.in
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.4 2007/09/03 02:30:43 tgl Exp $$
+-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.5 2007/10/23 20:46:12 tgl Exp $$
 -- text search configuration for _LANGNAME_ language
 CREATE TEXT SEARCH DICTIONARY _DICTNAME_
@@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_
 COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language';
 ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
-	FOR email, url, host, sfloat, version, uri, file, float, int, uint
+	FOR email, url, host, sfloat, version, uri, file, float, int, uint,
+            numword, hword_numpart, numhword
 	WITH simple;
 ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
-    FOR lhword, lpart_hword, lword
+    FOR asciiword, hword_asciipart, asciihword
-	WITH _LATDICTNAME_;
+	WITH _ASCDICTNAME_;
 ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
-    FOR hword, nlhword, nlpart_hword, nlword, word, part_hword
+    FOR word, hword_part, hword
-	WITH _NONLATDICTNAME_;
+	WITH _NONASCDICTNAME_;
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -37,7 +37,7 @@
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.435 2007/10/22 20:13:37 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.436 2007/10/23 20:46:12 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -53,6 +53,6 @@
 */
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	200710221
+#define CATALOG_VERSION_NO	200710231
 #endif
--- a/src/test/regress/expected/tsdicts.out
+++ b/src/test/regress/expected/tsdicts.out
@@ -209,8 +209,8 @@ SELECT ts_lexize('synonym', 'Gogle');
 (1 row)
 -- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize
+-- More tests in configuration checks because ts_lexize()
-- can not give more tat one word as it may wish thesaurus.
+-- cannot pass more than one word to thesaurus.
 CREATE TEXT SEARCH DICTIONARY thesaurus (
                        Template=thesaurus,
 						DictFile=thesaurus_sample, 
@@ -227,7 +227,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
 						COPY=english
 );
 ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
-	hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word 
+	word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
 	WITH ispell, english_stem;
 SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
                                            to_tsvector                                             
@@ -276,7 +276,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
 						COPY=english
 );
 ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR 
-	lword, lpart_hword, lhword 
+	asciiword, hword_asciipart, asciihword 
 	WITH synonym, english_stem;
 SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
                    to_tsvector                    
@@ -296,7 +296,7 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
 						COPY=synonym_tst
 );
 ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR 
-	lword, lpart_hword, lhword 
+	asciiword, hword_asciipart, asciihword 
 	WITH synonym, thesaurus, english_stem;
 SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
           to_tsvector            

--- a/src/test/regress/expected/tsearch.out
+++ b/src/test/regress/expected/tsearch.out
@@ -208,31 +208,31 @@ SELECT ts_lexize('english_stem', 'identity');
 (1 row)
 SELECT * FROM ts_token_type('default');
- tokid |    alias     |            description            
+ tokid |      alias      |               description                
-------+--------------+-----------------------------------
+-------+-----------------+------------------------------------------
-     1 | lword        | Latin word
+     1 | asciiword       | Word, all ASCII
-     2 | nlword       | Non-latin word
+     2 | word            | Word, all letters
-     3 | word         | Word
+     3 | numword         | Word, letters and digits
-     4 | email        | Email
+     4 | email           | Email address
-     5 | url          | URL
+     5 | url             | URL
-     6 | host         | Host
+     6 | host            | Host
-     7 | sfloat       | Scientific notation
+     7 | sfloat          | Scientific notation
-     8 | version      | VERSION
+     8 | version         | Version number
-     9 | part_hword   | Part of hyphenated word
+     9 | hword_numpart   | Hyphenated word part, letters and digits
-    10 | nlpart_hword | Non-latin part of hyphenated word
+    10 | hword_part      | Hyphenated word part, all letters
-    11 | lpart_hword  | Latin part of hyphenated word
+    11 | hword_asciipart | Hyphenated word part, all ASCII
-    12 | blank        | Space symbols
+    12 | blank           | Space symbols
-    13 | tag          | HTML Tag
+    13 | tag             | HTML tag
-    14 | protocol     | Protocol head
+    14 | protocol        | Protocol head
-    15 | hword        | Hyphenated word
+    15 | numhword        | Hyphenated word, letters and digits
-    16 | lhword       | Latin hyphenated word
+    16 | asciihword      | Hyphenated word, all ASCII
-    17 | nlhword      | Non-latin hyphenated word
+    17 | hword           | Hyphenated word, all letters
-    18 | uri          | URI
+    18 | uri             | URI
-    19 | file         | File or path name
+    19 | file            | File or path name
-    20 | float        | Decimal notation
+    20 | float           | Decimal notation
-    21 | int          | Signed integer
+    21 | int             | Signed integer
-    22 | uint         | Unsigned integer
+    22 | uint            | Unsigned integer
-    23 | entity       | HTML Entity
+    23 | entity          | HTML entity
 (23 rows)
 SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">

--- a/src/test/regress/sql/tsdicts.sql
+++ b/src/test/regress/sql/tsdicts.sql
@@ -58,8 +58,8 @@ SELECT ts_lexize('synonym', 'PoStGrEs');
 SELECT ts_lexize('synonym', 'Gogle');
 -- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize
+-- More tests in configuration checks because ts_lexize()
-- can not give more tat one word as it may wish thesaurus.
+-- cannot pass more than one word to thesaurus.
 CREATE TEXT SEARCH DICTIONARY thesaurus (
                        Template=thesaurus,
 						DictFile=thesaurus_sample, 
@@ -74,7 +74,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
 );
 ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
-	hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word 
+	word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
 	WITH ispell, english_stem;
 SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
@@ -99,7 +99,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
 );
 ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR 
-	lword, lpart_hword, lhword 
+	asciiword, hword_asciipart, asciihword 
 	WITH synonym, english_stem;
 SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
@@ -112,10 +112,9 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
 );
 ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR 
-	lword, lpart_hword, lhword 
+	asciiword, hword_asciipart, asciihword 
 	WITH synonym, thesaurus, english_stem;
 SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
 SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)');
 SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
--- a/src/tools/msvc/Install.pm
+++ b/src/tools/msvc/Install.pm
@@ -3,7 +3,7 @@ package Install;
 #
 # Package that provides 'make install' functionality for msvc builds
 #
-# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.24 2007/10/16 16:00:00 tgl Exp $
+# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.25 2007/10/23 20:46:12 tgl Exp $
 #
 use strict;
 use warnings;
@@ -258,7 +258,7 @@ sub GenerateTsearchFiles
    while ($#pieces > 0)
    {
        my $lang = shift @pieces || last;
-        my $latlang = shift @pieces || last;
+        my $asclang = shift @pieces || last;
        my $txt = $tmpl;
        my $stop = '';
@@ -269,8 +269,8 @@ sub GenerateTsearchFiles
        $txt =~ s#_LANGNAME_#${lang}#gs;
        $txt =~ s#_DICTNAME_#${lang}_stem#gs;
        $txt =~ s#_CFGNAME_#${lang}#gs;
-        $txt =~ s#_LATDICTNAME_#${latlang}_stem#gs;
+        $txt =~ s#_ASCDICTNAME_#${asclang}_stem#gs;
-        $txt =~ s#_NONLATDICTNAME_#${lang}_stem#gs;
+        $txt =~ s#_NONASCDICTNAME_#${lang}_stem#gs;
        $txt =~ s#_STOPWORDS_#$stop#gs;
        print $F $txt;
        print ".";