Commit dbaec70c authored by Tom Lane

Rename and slightly redefine the default text search parser's "word"
categories, as per discussion.  asciiword (formerly lword) is still
ASCII-letters-only, and numword (formerly word) is still the most general
mixed-alpha-and-digits case.  But word (formerly nlword) is now
any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as
before.  This is no worse than before for parsing mixed Russian/English text,
which seems to have been the design center for the original coding; and it
should simplify matters for parsing most European languages.  In particular
it will not be necessary for any language to accept strings containing digits
as being regular "words".  The hyphenated-word categories are adjusted
similarly.
parent 344d0cae
<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.403 2007/10/22 20:13:37 tgl Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.404 2007/10/23 20:46:11 tgl Exp $ -->
<chapter id="functions">
<title>Functions and Operators</title>
......@@ -7861,7 +7861,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<entry><type>setof record</type></entry>
<entry>test a configuration</entry>
<entry><literal>ts_debug('english', 'The Brightest supernovaes')</literal></entry>
<entry><literal>(lword,"Latin word",The,{english_stem},english_stem,{}) ...</literal></entry>
<entry><literal>(asciiword,"Word, all ASCII",The,{english_stem},english_stem,{}) ...</literal></entry>
</row>
<row>
<entry><literal><function>ts_lexize</function>(<replaceable class="PARAMETER">dict</replaceable> <type>regdictionary</>, <replaceable class="PARAMETER">token</replaceable> <type>text</>)</literal></entry>
......@@ -7889,14 +7889,14 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<entry><type>setof record</type></entry>
<entry>get token types defined by parser</entry>
<entry><literal>ts_token_type('default')</literal></entry>
<entry><literal>(1,lword,"Latin word") ...</literal></entry>
<entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
</row>
<row>
<entry><literal><function>ts_token_type</function>(<replaceable class="PARAMETER">parser_oid</> <type>oid</>, OUT <replaceable class="PARAMETER">tokid</> <type>integer</>, OUT <replaceable class="PARAMETER">alias</> <type>text</>, OUT <replaceable class="PARAMETER">description</> <type>text</>)</literal></entry>
<entry><type>setof record</type></entry>
<entry>get token types defined by parser</entry>
<entry><literal>ts_token_type(3722)</literal></entry>
<entry><literal>(1,lword,"Latin word") ...</literal></entry>
<entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
</row>
<row>
<entry><literal><function>ts_stat</function>(<replaceable class="PARAMETER">sqlquery</replaceable> <type>text</>, <optional> <replaceable class="PARAMETER">weights</replaceable> <type>text</>, </optional> OUT <replaceable class="PARAMETER">word</replaceable> <type>text</>, OUT <replaceable class="PARAMETER">ndoc</replaceable> <type>integer</>, OUT <replaceable class="PARAMETER">nentry</replaceable> <type>integer</>)</literal></entry>
......
This diff is collapsed.
......@@ -2,7 +2,7 @@
#
# Makefile for src/backend/snowball
#
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.3 2007/08/27 10:29:49 mha Exp $
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.4 2007/10/23 20:46:12 tgl Exp $
#
#-------------------------------------------------------------------------
......@@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \
stem_UTF_8_swedish.o \
stem_UTF_8_turkish.o
# second column is name of latin dictionary, if different
# Note order dependency: use of some other language as latin dictionary
# first column is language name and also name of dictionary for not-all-ASCII
# words, second is name of dictionary for all-ASCII words
# Note order dependency: use of some other language as ASCII dictionary
# must come after creation of that language
LANGUAGES= \
danish danish \
......@@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes)
while [ "$$#" -gt 0 ] ; \
do \
lang=$$1; shift; \
nonlatdictname=$$lang; \
latdictname=$$1; shift; \
nonascdictname=$$lang; \
ascdictname=$$1; shift; \
if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \
stop=", StopWords=$${lang}" ; \
else \
......@@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes)
sed -e "s#_LANGNAME_#$$lang#g" | \
sed -e "s#_DICTNAME_#$${lang}_stem#g" | \
sed -e "s#_CFGNAME_#$$lang#g" | \
sed -e "s#_LATDICTNAME_#$${latdictname}_stem#g" | \
sed -e "s#_NONLATDICTNAME_#$${nonlatdictname}_stem#g" | \
sed -e "s#_ASCDICTNAME_#$${ascdictname}_stem#g" | \
sed -e "s#_NONASCDICTNAME_#$${nonascdictname}_stem#g" | \
sed -e "s#_STOPWORDS_#$$stop#g" ; \
done >> $@
else
......
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.4 2007/09/03 02:30:43 tgl Exp $$
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.5 2007/10/23 20:46:12 tgl Exp $$
-- text search configuration for _LANGNAME_ language
CREATE TEXT SEARCH DICTIONARY _DICTNAME_
......@@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_
COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language';
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR email, url, host, sfloat, version, uri, file, float, int, uint
FOR email, url, host, sfloat, version, uri, file, float, int, uint,
numword, hword_numpart, numhword
WITH simple;
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR lhword, lpart_hword, lword
WITH _LATDICTNAME_;
FOR asciiword, hword_asciipart, asciihword
WITH _ASCDICTNAME_;
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR hword, nlhword, nlpart_hword, nlword, word, part_hword
WITH _NONLATDICTNAME_;
FOR word, hword_part, hword
WITH _NONASCDICTNAME_;
This diff is collapsed.
......@@ -37,7 +37,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.435 2007/10/22 20:13:37 tgl Exp $
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.436 2007/10/23 20:46:12 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 200710221
#define CATALOG_VERSION_NO 200710231
#endif
......@@ -209,8 +209,8 @@ SELECT ts_lexize('synonym', 'Gogle');
(1 row)
-- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize
-- can not give more tat one word as it may wish thesaurus.
-- More tests in configuration checks because ts_lexize()
-- cannot pass more than one word to thesaurus.
CREATE TEXT SEARCH DICTIONARY thesaurus (
Template=thesaurus,
DictFile=thesaurus_sample,
......@@ -227,7 +227,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
COPY=english
);
ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word
word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
WITH ispell, english_stem;
SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
to_tsvector
......@@ -276,7 +276,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
COPY=english
);
ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, english_stem;
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
to_tsvector
......@@ -296,7 +296,7 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
COPY=synonym_tst
);
ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, thesaurus, english_stem;
SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
to_tsvector
......
......@@ -208,31 +208,31 @@ SELECT ts_lexize('english_stem', 'identity');
(1 row)
SELECT * FROM ts_token_type('default');
tokid | alias | description
-------+--------------+-----------------------------------
1 | lword | Latin word
2 | nlword | Non-latin word
3 | word | Word
4 | email | Email
5 | url | URL
6 | host | Host
7 | sfloat | Scientific notation
8 | version | VERSION
9 | part_hword | Part of hyphenated word
10 | nlpart_hword | Non-latin part of hyphenated word
11 | lpart_hword | Latin part of hyphenated word
12 | blank | Space symbols
13 | tag | HTML Tag
14 | protocol | Protocol head
15 | hword | Hyphenated word
16 | lhword | Latin hyphenated word
17 | nlhword | Non-latin hyphenated word
18 | uri | URI
19 | file | File or path name
20 | float | Decimal notation
21 | int | Signed integer
22 | uint | Unsigned integer
23 | entity | HTML Entity
tokid | alias | description
-------+-----------------+------------------------------------------
1 | asciiword | Word, all ASCII
2 | word | Word, all letters
3 | numword | Word, letters and digits
4 | email | Email address
5 | url | URL
6 | host | Host
7 | sfloat | Scientific notation
8 | version | Version number
9 | hword_numpart | Hyphenated word part, letters and digits
10 | hword_part | Hyphenated word part, all letters
11 | hword_asciipart | Hyphenated word part, all ASCII
12 | blank | Space symbols
13 | tag | HTML tag
14 | protocol | Protocol head
15 | numhword | Hyphenated word, letters and digits
16 | asciihword | Hyphenated word, all ASCII
17 | hword | Hyphenated word, all letters
18 | uri | URI
19 | file | File or path name
20 | float | Decimal notation
21 | int | Signed integer
22 | uint | Unsigned integer
23 | entity | HTML entity
(23 rows)
SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
......
......@@ -58,8 +58,8 @@ SELECT ts_lexize('synonym', 'PoStGrEs');
SELECT ts_lexize('synonym', 'Gogle');
-- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize
-- can not give more tat one word as it may wish thesaurus.
-- More tests in configuration checks because ts_lexize()
-- cannot pass more than one word to thesaurus.
CREATE TEXT SEARCH DICTIONARY thesaurus (
Template=thesaurus,
DictFile=thesaurus_sample,
......@@ -74,7 +74,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
);
ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word
word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
WITH ispell, english_stem;
SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
......@@ -99,7 +99,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
);
ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, english_stem;
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
......@@ -112,10 +112,9 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
);
ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, thesaurus, english_stem;
SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)');
SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
......@@ -3,7 +3,7 @@ package Install;
#
# Package that provides 'make install' functionality for msvc builds
#
# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.24 2007/10/16 16:00:00 tgl Exp $
# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.25 2007/10/23 20:46:12 tgl Exp $
#
use strict;
use warnings;
......@@ -258,7 +258,7 @@ sub GenerateTsearchFiles
while ($#pieces > 0)
{
my $lang = shift @pieces || last;
my $latlang = shift @pieces || last;
my $asclang = shift @pieces || last;
my $txt = $tmpl;
my $stop = '';
......@@ -269,8 +269,8 @@ sub GenerateTsearchFiles
$txt =~ s#_LANGNAME_#${lang}#gs;
$txt =~ s#_DICTNAME_#${lang}_stem#gs;
$txt =~ s#_CFGNAME_#${lang}#gs;
$txt =~ s#_LATDICTNAME_#${latlang}_stem#gs;
$txt =~ s#_NONLATDICTNAME_#${lang}_stem#gs;
$txt =~ s#_ASCDICTNAME_#${asclang}_stem#gs;
$txt =~ s#_NONASCDICTNAME_#${lang}_stem#gs;
$txt =~ s#_STOPWORDS_#$stop#gs;
print $F $txt;
print ".";
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment