Commit dbaec70c authored by Tom Lane

Rename and slightly redefine the default text search parser's "word"
categories, as per discussion.  asciiword (formerly lword) is still
ASCII-letters-only, and numword (formerly word) is still the most general
mixed-alpha-and-digits case.  But word (formerly nlword) is now
any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as
before.  This is no worse than before for parsing mixed Russian/English text,
which seems to have been the design center for the original coding; and it
should simplify matters for parsing most European languages.  In particular
it will not be necessary for any language to accept strings containing digits
as being regular "words".  The hyphenated-word categories are adjusted
similarly.
parent 344d0cae
<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.403 2007/10/22 20:13:37 tgl Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.404 2007/10/23 20:46:11 tgl Exp $ -->
<chapter id="functions"> <chapter id="functions">
<title>Functions and Operators</title> <title>Functions and Operators</title>
...@@ -7861,7 +7861,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple ...@@ -7861,7 +7861,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<entry><type>setof record</type></entry> <entry><type>setof record</type></entry>
<entry>test a configuration</entry> <entry>test a configuration</entry>
<entry><literal>ts_debug('english', 'The Brightest supernovaes')</literal></entry> <entry><literal>ts_debug('english', 'The Brightest supernovaes')</literal></entry>
<entry><literal>(lword,"Latin word",The,{english_stem},english_stem,{}) ...</literal></entry> <entry><literal>(asciiword,"Word, all ASCII",The,{english_stem},english_stem,{}) ...</literal></entry>
</row> </row>
<row> <row>
<entry><literal><function>ts_lexize</function>(<replaceable class="PARAMETER">dict</replaceable> <type>regdictionary</>, <replaceable class="PARAMETER">token</replaceable> <type>text</>)</literal></entry> <entry><literal><function>ts_lexize</function>(<replaceable class="PARAMETER">dict</replaceable> <type>regdictionary</>, <replaceable class="PARAMETER">token</replaceable> <type>text</>)</literal></entry>
...@@ -7889,14 +7889,14 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple ...@@ -7889,14 +7889,14 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<entry><type>setof record</type></entry> <entry><type>setof record</type></entry>
<entry>get token types defined by parser</entry> <entry>get token types defined by parser</entry>
<entry><literal>ts_token_type('default')</literal></entry> <entry><literal>ts_token_type('default')</literal></entry>
<entry><literal>(1,lword,"Latin word") ...</literal></entry> <entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
</row> </row>
<row> <row>
<entry><literal><function>ts_token_type</function>(<replaceable class="PARAMETER">parser_oid</> <type>oid</>, OUT <replaceable class="PARAMETER">tokid</> <type>integer</>, OUT <replaceable class="PARAMETER">alias</> <type>text</>, OUT <replaceable class="PARAMETER">description</> <type>text</>)</literal></entry> <entry><literal><function>ts_token_type</function>(<replaceable class="PARAMETER">parser_oid</> <type>oid</>, OUT <replaceable class="PARAMETER">tokid</> <type>integer</>, OUT <replaceable class="PARAMETER">alias</> <type>text</>, OUT <replaceable class="PARAMETER">description</> <type>text</>)</literal></entry>
<entry><type>setof record</type></entry> <entry><type>setof record</type></entry>
<entry>get token types defined by parser</entry> <entry>get token types defined by parser</entry>
<entry><literal>ts_token_type(3722)</literal></entry> <entry><literal>ts_token_type(3722)</literal></entry>
<entry><literal>(1,lword,"Latin word") ...</literal></entry> <entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
</row> </row>
<row> <row>
<entry><literal><function>ts_stat</function>(<replaceable class="PARAMETER">sqlquery</replaceable> <type>text</>, <optional> <replaceable class="PARAMETER">weights</replaceable> <type>text</>, </optional> OUT <replaceable class="PARAMETER">word</replaceable> <type>text</>, OUT <replaceable class="PARAMETER">ndoc</replaceable> <type>integer</>, OUT <replaceable class="PARAMETER">nentry</replaceable> <type>integer</>)</literal></entry> <entry><literal><function>ts_stat</function>(<replaceable class="PARAMETER">sqlquery</replaceable> <type>text</>, <optional> <replaceable class="PARAMETER">weights</replaceable> <type>text</>, </optional> OUT <replaceable class="PARAMETER">word</replaceable> <type>text</>, OUT <replaceable class="PARAMETER">ndoc</replaceable> <type>integer</>, OUT <replaceable class="PARAMETER">nentry</replaceable> <type>integer</>)</literal></entry>
......
This diff is collapsed.
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# #
# Makefile for src/backend/snowball # Makefile for src/backend/snowball
# #
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.3 2007/08/27 10:29:49 mha Exp $ # $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.4 2007/10/23 20:46:12 tgl Exp $
# #
#------------------------------------------------------------------------- #-------------------------------------------------------------------------
...@@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \ ...@@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \
stem_UTF_8_swedish.o \ stem_UTF_8_swedish.o \
stem_UTF_8_turkish.o stem_UTF_8_turkish.o
# second column is name of latin dictionary, if different # first column is language name and also name of dictionary for not-all-ASCII
# Note order dependency: use of some other language as latin dictionary # words, second is name of dictionary for all-ASCII words
# Note order dependency: use of some other language as ASCII dictionary
# must come after creation of that language # must come after creation of that language
LANGUAGES= \ LANGUAGES= \
danish danish \ danish danish \
...@@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes) ...@@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes)
while [ "$$#" -gt 0 ] ; \ while [ "$$#" -gt 0 ] ; \
do \ do \
lang=$$1; shift; \ lang=$$1; shift; \
nonlatdictname=$$lang; \ nonascdictname=$$lang; \
latdictname=$$1; shift; \ ascdictname=$$1; shift; \
if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \ if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \
stop=", StopWords=$${lang}" ; \ stop=", StopWords=$${lang}" ; \
else \ else \
...@@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes) ...@@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes)
sed -e "s#_LANGNAME_#$$lang#g" | \ sed -e "s#_LANGNAME_#$$lang#g" | \
sed -e "s#_DICTNAME_#$${lang}_stem#g" | \ sed -e "s#_DICTNAME_#$${lang}_stem#g" | \
sed -e "s#_CFGNAME_#$$lang#g" | \ sed -e "s#_CFGNAME_#$$lang#g" | \
sed -e "s#_LATDICTNAME_#$${latdictname}_stem#g" | \ sed -e "s#_ASCDICTNAME_#$${ascdictname}_stem#g" | \
sed -e "s#_NONLATDICTNAME_#$${nonlatdictname}_stem#g" | \ sed -e "s#_NONASCDICTNAME_#$${nonascdictname}_stem#g" | \
sed -e "s#_STOPWORDS_#$$stop#g" ; \ sed -e "s#_STOPWORDS_#$$stop#g" ; \
done >> $@ done >> $@
else else
......
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.4 2007/09/03 02:30:43 tgl Exp $$ -- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.5 2007/10/23 20:46:12 tgl Exp $$
-- text search configuration for _LANGNAME_ language -- text search configuration for _LANGNAME_ language
CREATE TEXT SEARCH DICTIONARY _DICTNAME_ CREATE TEXT SEARCH DICTIONARY _DICTNAME_
...@@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_ ...@@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_
COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language'; COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language';
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR email, url, host, sfloat, version, uri, file, float, int, uint FOR email, url, host, sfloat, version, uri, file, float, int, uint,
numword, hword_numpart, numhword
WITH simple; WITH simple;
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR lhword, lpart_hword, lword FOR asciiword, hword_asciipart, asciihword
WITH _LATDICTNAME_; WITH _ASCDICTNAME_;
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR hword, nlhword, nlpart_hword, nlword, word, part_hword FOR word, hword_part, hword
WITH _NONLATDICTNAME_; WITH _NONASCDICTNAME_;
This diff is collapsed.
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.435 2007/10/22 20:13:37 tgl Exp $ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.436 2007/10/23 20:46:12 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -53,6 +53,6 @@ ...@@ -53,6 +53,6 @@
*/ */
/* yyyymmddN */ /* yyyymmddN */
#define CATALOG_VERSION_NO 200710221 #define CATALOG_VERSION_NO 200710231
#endif #endif
...@@ -209,8 +209,8 @@ SELECT ts_lexize('synonym', 'Gogle'); ...@@ -209,8 +209,8 @@ SELECT ts_lexize('synonym', 'Gogle');
(1 row) (1 row)
-- Create and simple test thesaurus dictionary -- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize -- More tests in configuration checks because ts_lexize()
-- can not give more tat one word as it may wish thesaurus. -- cannot pass more than one word to thesaurus.
CREATE TEXT SEARCH DICTIONARY thesaurus ( CREATE TEXT SEARCH DICTIONARY thesaurus (
Template=thesaurus, Template=thesaurus,
DictFile=thesaurus_sample, DictFile=thesaurus_sample,
...@@ -227,7 +227,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst ( ...@@ -227,7 +227,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
COPY=english COPY=english
); );
ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
WITH ispell, english_stem; WITH ispell, english_stem;
SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
to_tsvector to_tsvector
...@@ -276,7 +276,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst ( ...@@ -276,7 +276,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
COPY=english COPY=english
); );
ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
lword, lpart_hword, lhword asciiword, hword_asciipart, asciihword
WITH synonym, english_stem; WITH synonym, english_stem;
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre'); SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
to_tsvector to_tsvector
...@@ -296,7 +296,7 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst ( ...@@ -296,7 +296,7 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
COPY=synonym_tst COPY=synonym_tst
); );
ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR
lword, lpart_hword, lhword asciiword, hword_asciipart, asciihword
WITH synonym, thesaurus, english_stem; WITH synonym, thesaurus, english_stem;
SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one'); SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
to_tsvector to_tsvector
......
...@@ -209,30 +209,30 @@ SELECT ts_lexize('english_stem', 'identity'); ...@@ -209,30 +209,30 @@ SELECT ts_lexize('english_stem', 'identity');
SELECT * FROM ts_token_type('default'); SELECT * FROM ts_token_type('default');
tokid | alias | description tokid | alias | description
-------+--------------+----------------------------------- -------+-----------------+------------------------------------------
1 | lword | Latin word 1 | asciiword | Word, all ASCII
2 | nlword | Non-latin word 2 | word | Word, all letters
3 | word | Word 3 | numword | Word, letters and digits
4 | email | Email 4 | email | Email address
5 | url | URL 5 | url | URL
6 | host | Host 6 | host | Host
7 | sfloat | Scientific notation 7 | sfloat | Scientific notation
8 | version | VERSION 8 | version | Version number
9 | part_hword | Part of hyphenated word 9 | hword_numpart | Hyphenated word part, letters and digits
10 | nlpart_hword | Non-latin part of hyphenated word 10 | hword_part | Hyphenated word part, all letters
11 | lpart_hword | Latin part of hyphenated word 11 | hword_asciipart | Hyphenated word part, all ASCII
12 | blank | Space symbols 12 | blank | Space symbols
13 | tag | HTML Tag 13 | tag | HTML tag
14 | protocol | Protocol head 14 | protocol | Protocol head
15 | hword | Hyphenated word 15 | numhword | Hyphenated word, letters and digits
16 | lhword | Latin hyphenated word 16 | asciihword | Hyphenated word, all ASCII
17 | nlhword | Non-latin hyphenated word 17 | hword | Hyphenated word, all letters
18 | uri | URI 18 | uri | URI
19 | file | File or path name 19 | file | File or path name
20 | float | Decimal notation 20 | float | Decimal notation
21 | int | Signed integer 21 | int | Signed integer
22 | uint | Unsigned integer 22 | uint | Unsigned integer
23 | entity | HTML Entity 23 | entity | HTML entity
(23 rows) (23 rows)
SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
......
...@@ -58,8 +58,8 @@ SELECT ts_lexize('synonym', 'PoStGrEs'); ...@@ -58,8 +58,8 @@ SELECT ts_lexize('synonym', 'PoStGrEs');
SELECT ts_lexize('synonym', 'Gogle'); SELECT ts_lexize('synonym', 'Gogle');
-- Create and simple test thesaurus dictionary -- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize -- More tests in configuration checks because ts_lexize()
-- can not give more tat one word as it may wish thesaurus. -- cannot pass more than one word to thesaurus.
CREATE TEXT SEARCH DICTIONARY thesaurus ( CREATE TEXT SEARCH DICTIONARY thesaurus (
Template=thesaurus, Template=thesaurus,
DictFile=thesaurus_sample, DictFile=thesaurus_sample,
...@@ -74,7 +74,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst ( ...@@ -74,7 +74,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
); );
ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
WITH ispell, english_stem; WITH ispell, english_stem;
SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
...@@ -99,7 +99,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst ( ...@@ -99,7 +99,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
); );
ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
lword, lpart_hword, lhword asciiword, hword_asciipart, asciihword
WITH synonym, english_stem; WITH synonym, english_stem;
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre'); SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
...@@ -112,10 +112,9 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst ( ...@@ -112,10 +112,9 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
); );
ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR
lword, lpart_hword, lhword asciiword, hword_asciipart, asciihword
WITH synonym, thesaurus, english_stem; WITH synonym, thesaurus, english_stem;
SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one'); SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)'); SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)');
SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets'); SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
...@@ -3,7 +3,7 @@ package Install; ...@@ -3,7 +3,7 @@ package Install;
# #
# Package that provides 'make install' functionality for msvc builds # Package that provides 'make install' functionality for msvc builds
# #
# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.24 2007/10/16 16:00:00 tgl Exp $ # $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.25 2007/10/23 20:46:12 tgl Exp $
# #
use strict; use strict;
use warnings; use warnings;
...@@ -258,7 +258,7 @@ sub GenerateTsearchFiles ...@@ -258,7 +258,7 @@ sub GenerateTsearchFiles
while ($#pieces > 0) while ($#pieces > 0)
{ {
my $lang = shift @pieces || last; my $lang = shift @pieces || last;
my $latlang = shift @pieces || last; my $asclang = shift @pieces || last;
my $txt = $tmpl; my $txt = $tmpl;
my $stop = ''; my $stop = '';
...@@ -269,8 +269,8 @@ sub GenerateTsearchFiles ...@@ -269,8 +269,8 @@ sub GenerateTsearchFiles
$txt =~ s#_LANGNAME_#${lang}#gs; $txt =~ s#_LANGNAME_#${lang}#gs;
$txt =~ s#_DICTNAME_#${lang}_stem#gs; $txt =~ s#_DICTNAME_#${lang}_stem#gs;
$txt =~ s#_CFGNAME_#${lang}#gs; $txt =~ s#_CFGNAME_#${lang}#gs;
$txt =~ s#_LATDICTNAME_#${latlang}_stem#gs; $txt =~ s#_ASCDICTNAME_#${asclang}_stem#gs;
$txt =~ s#_NONLATDICTNAME_#${lang}_stem#gs; $txt =~ s#_NONASCDICTNAME_#${lang}_stem#gs;
$txt =~ s#_STOPWORDS_#$stop#gs; $txt =~ s#_STOPWORDS_#$stop#gs;
print $F $txt; print $F $txt;
print "."; print ".";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment