Commit dbaec70c authored by Tom Lane

Rename and slightly redefine the default text search parser's "word"

categories, as per discussion.  asciiword (formerly lword) is still
ASCII-letters-only, and numword (formerly word) is still the most general
mixed-alpha-and-digits case.  But word (formerly nlword) is now
any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as
before.  This is no worse than before for parsing mixed Russian/English text,
which seems to have been the design center for the original coding; and it
should simplify matters for parsing most European languages.  In particular
it will not be necessary for any language to accept strings containing digits
as being regular "words".  The hyphenated-word categories are adjusted
similarly.
parent 344d0cae
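A quick way to see the three plain-word categories under their new names, assuming a server built with this patch (the aliases and descriptions simply restate what the documentation diff below records):

SELECT tokid, alias, description
  FROM ts_token_type('default')
 WHERE tokid <= 3;

 tokid |   alias   |        description
-------+-----------+---------------------------
     1 | asciiword | Word, all ASCII
     2 | word      | Word, all letters
     3 | numword   | Word, letters and digits
(3 rows)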
<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.403 2007/10/22 20:13:37 tgl Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.404 2007/10/23 20:46:11 tgl Exp $ -->
<chapter id="functions"> <chapter id="functions">
<title>Functions and Operators</title> <title>Functions and Operators</title>
...@@ -7861,7 +7861,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple ...@@ -7861,7 +7861,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<entry><type>setof record</type></entry> <entry><type>setof record</type></entry>
<entry>test a configuration</entry> <entry>test a configuration</entry>
<entry><literal>ts_debug('english', 'The Brightest supernovaes')</literal></entry> <entry><literal>ts_debug('english', 'The Brightest supernovaes')</literal></entry>
<entry><literal>(lword,"Latin word",The,{english_stem},english_stem,{}) ...</literal></entry> <entry><literal>(asciiword,"Word, all ASCII",The,{english_stem},english_stem,{}) ...</literal></entry>
</row> </row>
<row> <row>
<entry><literal><function>ts_lexize</function>(<replaceable class="PARAMETER">dict</replaceable> <type>regdictionary</>, <replaceable class="PARAMETER">token</replaceable> <type>text</>)</literal></entry> <entry><literal><function>ts_lexize</function>(<replaceable class="PARAMETER">dict</replaceable> <type>regdictionary</>, <replaceable class="PARAMETER">token</replaceable> <type>text</>)</literal></entry>
...@@ -7889,14 +7889,14 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple ...@@ -7889,14 +7889,14 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<entry><type>setof record</type></entry> <entry><type>setof record</type></entry>
<entry>get token types defined by parser</entry> <entry>get token types defined by parser</entry>
<entry><literal>ts_token_type('default')</literal></entry> <entry><literal>ts_token_type('default')</literal></entry>
<entry><literal>(1,lword,"Latin word") ...</literal></entry> <entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
</row> </row>
<row> <row>
<entry><literal><function>ts_token_type</function>(<replaceable class="PARAMETER">parser_oid</> <type>oid</>, OUT <replaceable class="PARAMETER">tokid</> <type>integer</>, OUT <replaceable class="PARAMETER">alias</> <type>text</>, OUT <replaceable class="PARAMETER">description</> <type>text</>)</literal></entry> <entry><literal><function>ts_token_type</function>(<replaceable class="PARAMETER">parser_oid</> <type>oid</>, OUT <replaceable class="PARAMETER">tokid</> <type>integer</>, OUT <replaceable class="PARAMETER">alias</> <type>text</>, OUT <replaceable class="PARAMETER">description</> <type>text</>)</literal></entry>
<entry><type>setof record</type></entry> <entry><type>setof record</type></entry>
<entry>get token types defined by parser</entry> <entry>get token types defined by parser</entry>
<entry><literal>ts_token_type(3722)</literal></entry> <entry><literal>ts_token_type(3722)</literal></entry>
<entry><literal>(1,lword,"Latin word") ...</literal></entry> <entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
</row> </row>
<row> <row>
<entry><literal><function>ts_stat</function>(<replaceable class="PARAMETER">sqlquery</replaceable> <type>text</>, <optional> <replaceable class="PARAMETER">weights</replaceable> <type>text</>, </optional> OUT <replaceable class="PARAMETER">word</replaceable> <type>text</>, OUT <replaceable class="PARAMETER">ndoc</replaceable> <type>integer</>, OUT <replaceable class="PARAMETER">nentry</replaceable> <type>integer</>)</literal></entry> <entry><literal><function>ts_stat</function>(<replaceable class="PARAMETER">sqlquery</replaceable> <type>text</>, <optional> <replaceable class="PARAMETER">weights</replaceable> <type>text</>, </optional> OUT <replaceable class="PARAMETER">word</replaceable> <type>text</>, OUT <replaceable class="PARAMETER">ndoc</replaceable> <type>integer</>, OUT <replaceable class="PARAMETER">nentry</replaceable> <type>integer</>)</literal></entry>
......
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.23 2007/10/22 20:13:37 tgl Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.24 2007/10/23 20:46:12 tgl Exp $ -->
<chapter id="textsearch"> <chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title> <title id="textsearch-title">Full Text Search</title>
...@@ -1775,119 +1775,120 @@ LIMIT 10; ...@@ -1775,119 +1775,120 @@ LIMIT 10;
</thead> </thead>
<tbody> <tbody>
<row> <row>
<entry>lword</entry> <entry><literal>asciiword</></entry>
<entry>Latin word (only ASCII letters)</entry> <entry>Word, all ASCII letters</entry>
<entry><literal>foo</literal></entry> <entry><literal>foo</literal></entry>
</row> </row>
<row> <row>
<entry>nlword</entry> <entry><literal>word</></entry>
<entry>Non-latin word (only non-ASCII letters)</entry> <entry>Word, all letters</entry>
<entry><literal></literal></entry> <entry><literal>f&oslash;&oslash;</literal></entry>
</row> </row>
<row> <row>
<entry>word</entry> <entry><literal>numword</></entry>
<entry>Word (other cases)</entry> <entry>Word, letters and digits</entry>
<entry><literal>beta1</literal></entry> <entry><literal>beta1</literal></entry>
</row> </row>
<row> <row>
<entry>lhword</entry> <entry><literal>asciihword</></entry>
<entry>Latin hyphenated word</entry> <entry>Hyphenated word, all ASCII</entry>
<entry><literal>foo-bar</literal></entry> <entry><literal>foo-bar</literal></entry>
</row> </row>
<row> <row>
<entry>nlhword</entry> <entry><literal>hword</></entry>
<entry>Non-latin hyphenated word</entry> <entry>Hyphenated word, all letters</entry>
<entry><literal></literal></entry> <entry><literal>f&oslash;&oslash;-bar</literal></entry>
</row> </row>
<row> <row>
<entry>hword</entry> <entry><literal>numhword</></entry>
<entry>Hyphenated word</entry> <entry>Hyphenated word, letters and digits</entry>
<entry><literal>foo-beta1</literal></entry> <entry><literal>foo-beta1</literal></entry>
</row> </row>
<row> <row>
<entry>lpart_hword</entry> <entry><literal>hword_asciipart</></entry>
<entry>Latin part of hyphenated word</entry> <entry>Hyphenated word part, all ASCII</entry>
<entry><literal>foo</literal> or <literal>bar</literal> in the context <entry><literal>foo</literal> or <literal>bar</literal> in the context
<literal>foo-bar</></entry> <literal>foo-bar</literal></entry>
</row> </row>
<row> <row>
<entry>nlpart_hword</entry> <entry><literal>hword_part</></entry>
<entry>Non-latin part of hyphenated word</entry> <entry>Hyphenated word part, all letters</entry>
<entry><literal></literal></entry> <entry><literal>f&oslash;&oslash;</literal> in the context
<literal>f&oslash;&oslash;-bar</literal></entry>
</row> </row>
<row> <row>
<entry>part_hword</entry> <entry><literal>hword_numpart</></entry>
<entry>Part of hyphenated word</entry> <entry>Hyphenated word part, letters and digits</entry>
<entry><literal>beta1</literal> in the context <entry><literal>beta1</literal> in the context
<literal>foo-beta1</></entry> <literal>foo-beta1</literal></entry>
</row> </row>
<row> <row>
<entry>email</entry> <entry><literal>email</></entry>
<entry>Email address</entry> <entry>Email address</entry>
<entry><literal>foo@bar.com</literal></entry> <entry><literal>foo@bar.com</literal></entry>
</row> </row>
<row> <row>
<entry>protocol</entry> <entry><literal>protocol</></entry>
<entry>Protocol head</entry> <entry>Protocol head</entry>
<entry><literal>http://</literal></entry> <entry><literal>http://</literal></entry>
</row> </row>
<row> <row>
<entry>url</entry> <entry><literal>url</></entry>
<entry>URL</entry> <entry>URL</entry>
<entry><literal>foo.com/stuff/index.html</literal></entry> <entry><literal>foo.com/stuff/index.html</literal></entry>
</row> </row>
<row> <row>
<entry>host</entry> <entry><literal>host</></entry>
<entry>Host</entry> <entry>Host</entry>
<entry><literal>foo.com</literal></entry> <entry><literal>foo.com</literal></entry>
</row> </row>
<row> <row>
<entry>uri</entry> <entry><literal>uri</></entry>
<entry>URI</entry> <entry>URI</entry>
<entry><literal>/stuff/index.html</literal>, in the context of a URL</entry> <entry><literal>/stuff/index.html</literal>, in the context of a URL</entry>
</row> </row>
<row> <row>
<entry>file</entry> <entry><literal>file</></entry>
<entry>File or path name</entry> <entry>File or path name</entry>
<entry><literal>/usr/local/foo.txt</literal>, if not within a URL</entry> <entry><literal>/usr/local/foo.txt</literal>, if not within a URL</entry>
</row> </row>
<row> <row>
<entry>sfloat</entry> <entry><literal>sfloat</></entry>
<entry>Scientific notation</entry> <entry>Scientific notation</entry>
<entry><literal>-1.234e56</literal></entry> <entry><literal>-1.234e56</literal></entry>
</row> </row>
<row> <row>
<entry>float</entry> <entry><literal>float</></entry>
<entry>Decimal notation</entry> <entry>Decimal notation</entry>
<entry><literal>-1.234</literal></entry> <entry><literal>-1.234</literal></entry>
</row> </row>
<row> <row>
<entry>int</entry> <entry><literal>int</></entry>
<entry>Signed integer</entry> <entry>Signed integer</entry>
<entry><literal>-1234</literal></entry> <entry><literal>-1234</literal></entry>
</row> </row>
<row> <row>
<entry>uint</entry> <entry><literal>uint</></entry>
<entry>Unsigned integer</entry> <entry>Unsigned integer</entry>
<entry><literal>1234</literal></entry> <entry><literal>1234</literal></entry>
</row> </row>
<row> <row>
<entry>version</entry> <entry><literal>version</></entry>
<entry>Version number</entry> <entry>Version number</entry>
<entry><literal>8.3.0</literal></entry> <entry><literal>8.3.0</literal></entry>
</row> </row>
<row> <row>
<entry>tag</entry> <entry><literal>tag</></entry>
<entry>HTML Tag</entry> <entry>HTML tag</entry>
<entry><literal>&lt;A HREF="dictionaries.html"&gt;</literal></entry> <entry><literal>&lt;A HREF="dictionaries.html"&gt;</literal></entry>
</row> </row>
<row> <row>
<entry>entity</entry> <entry><literal>entity</></entry>
<entry>HTML Entity</entry> <entry>HTML entity</entry>
<entry><literal>&amp;amp;</literal></entry> <entry><literal>&amp;amp;</literal></entry>
</row> </row>
<row> <row>
<entry>blank</entry> <entry><literal>blank</></entry>
<entry>Space symbols</entry> <entry>Space symbols</entry>
<entry>(any whitespace or punctuation not otherwise recognized)</entry> <entry>(any whitespace or punctuation not otherwise recognized)</entry>
</row> </row>
...@@ -1895,6 +1896,17 @@ LIMIT 10; ...@@ -1895,6 +1896,17 @@ LIMIT 10;
</tgroup> </tgroup>
</table> </table>
<note>
<para>
The parser's notion of a <quote>letter</> is determined by the server's
locale setting, specifically <varname>lc_ctype</>. Words containing
only the basic ASCII letters are reported as a separate token type,
since it is sometimes useful to distinguish them. In most European
languages, token types <literal>word</> and <literal>asciiword</>
should always be treated alike.
</para>
</note>
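To illustrate the distinction drawn in this note, a minimal sketch assuming a UTF-8 database whose lc_ctype classifies &oslash; as a letter (the sample tokens are the same ones used in the table above):

SELECT alias, token FROM ts_debug('simple', 'føø foo beta1');

   alias   | token
-----------+-------
 word      | føø
 blank     |
 asciiword | foo
 blank     |
 numword   | beta1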
<para> <para>
It is possible for the parser to produce overlapping tokens from the same It is possible for the parser to produce overlapping tokens from the same
piece of text. As an example, a hyphenated word will be reported both piece of text. As an example, a hyphenated word will be reported both
...@@ -1902,14 +1914,14 @@ LIMIT 10; ...@@ -1902,14 +1914,14 @@ LIMIT 10;
<programlisting> <programlisting>
SELECT alias, description, token FROM ts_debug('foo-bar-beta1'); SELECT alias, description, token FROM ts_debug('foo-bar-beta1');
alias | description | token alias | description | token
-------------+-------------------------------+--------------- -----------------+------------------------------------------+---------------
hword | Hyphenated word | foo-bar-beta1 numhword | Hyphenated word, letters and digits | foo-bar-beta1
lpart_hword | Latin part of hyphenated word | foo hword_asciipart | Hyphenated word part, all ASCII | foo
blank | Space symbols | - blank | Space symbols | -
lpart_hword | Latin part of hyphenated word | bar hword_asciipart | Hyphenated word part, all ASCII | bar
blank | Space symbols | - blank | Space symbols | -
part_hword | Part of hyphenated word | beta1 hword_numpart | Hyphenated word part, letters and digits | beta1
</programlisting> </programlisting>
This behavior is desirable since it allows searches to work for both This behavior is desirable since it allows searches to work for both
...@@ -2045,13 +2057,13 @@ SELECT alias, description, token FROM ts_debug('http://foo.com/stuff/index.html' ...@@ -2045,13 +2057,13 @@ SELECT alias, description, token FROM ts_debug('http://foo.com/stuff/index.html'
a <application>Snowball</> stemmer or <literal>simple</>, which a <application>Snowball</> stemmer or <literal>simple</>, which
recognizes everything. For example, for an astronomy-specific search recognizes everything. For example, for an astronomy-specific search
(<literal>astro_en</literal> configuration) one could bind token type (<literal>astro_en</literal> configuration) one could bind token type
<type>lword</type> (Latin word) to a synonym dictionary of astronomical <type>asciiword</type> (ASCII word) to a synonym dictionary of astronomical
terms, a general English dictionary and a <application>Snowball</> English terms, a general English dictionary and a <application>Snowball</> English
stemmer: stemmer:
<programlisting> <programlisting>
ALTER TEXT SEARCH CONFIGURATION astro_en ALTER TEXT SEARCH CONFIGURATION astro_en
ADD MAPPING FOR lword WITH astrosyn, english_ispell, english_stem; ADD MAPPING FOR asciiword WITH astrosyn, english_ispell, english_stem;
</programlisting> </programlisting>
</para> </para>
...@@ -2187,9 +2199,9 @@ SELECT ts_lexize('public.simple_dict','The'); ...@@ -2187,9 +2199,9 @@ SELECT ts_lexize('public.simple_dict','The');
<programlisting> <programlisting>
SELECT * FROM ts_debug('english', 'Paris'); SELECT * FROM ts_debug('english', 'Paris');
alias | description | token | dictionaries | dictionary | lexemes alias | description | token | dictionaries | dictionary | lexemes
-------+-------------+-------+----------------+--------------+--------- -----------+-----------------+-------+----------------+--------------+---------
lword | Latin word | Paris | {english_stem} | english_stem | {pari} asciiword | Word, all ASCII | Paris | {english_stem} | english_stem | {pari}
CREATE TEXT SEARCH DICTIONARY my_synonym ( CREATE TEXT SEARCH DICTIONARY my_synonym (
TEMPLATE = synonym, TEMPLATE = synonym,
...@@ -2197,12 +2209,12 @@ CREATE TEXT SEARCH DICTIONARY my_synonym ( ...@@ -2197,12 +2209,12 @@ CREATE TEXT SEARCH DICTIONARY my_synonym (
); );
ALTER TEXT SEARCH CONFIGURATION english ALTER TEXT SEARCH CONFIGURATION english
ALTER MAPPING FOR lword WITH my_synonym, english_stem; ALTER MAPPING FOR asciiword WITH my_synonym, english_stem;
SELECT * FROM ts_debug('english', 'Paris'); SELECT * FROM ts_debug('english', 'Paris');
alias | description | token | dictionaries | dictionary | lexemes alias | description | token | dictionaries | dictionary | lexemes
-------+-------------+-------+---------------------------+------------+--------- -----------+-----------------+-------+---------------------------+------------+---------
lword | Latin word | Paris | {my_synonym,english_stem} | my_synonym | {paris} asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris}
</programlisting> </programlisting>
</para> </para>
...@@ -2293,7 +2305,7 @@ the one a two : swsw2 ...@@ -2293,7 +2305,7 @@ the one a two : swsw2
uses these assignments to check if it should handle the next word or stop uses these assignments to check if it should handle the next word or stop
accumulation. The thesaurus dictionary must be configured accumulation. The thesaurus dictionary must be configured
carefully. For example, if the thesaurus dictionary is assigned to handle carefully. For example, if the thesaurus dictionary is assigned to handle
only the <literal>lword</literal> token, then a thesaurus dictionary only the <literal>asciiword</literal> token, then a thesaurus dictionary
definition like <literal>one 7</> will not work since token type definition like <literal>one 7</> will not work since token type
<literal>uint</literal> is not assigned to the thesaurus dictionary. <literal>uint</literal> is not assigned to the thesaurus dictionary.
</para> </para>
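A minimal sketch of the remedy, reusing the russian configuration and the thesaurus_simple dictionary from the example that follows: if the thesaurus is expected to match a rule such as one 7, the uint token type must be routed to it as well, for example:

ALTER TEXT SEARCH CONFIGURATION russian
    ADD MAPPING FOR asciiword, uint WITH thesaurus_simple;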
...@@ -2353,7 +2365,7 @@ CREATE TEXT SEARCH DICTIONARY thesaurus_simple ( ...@@ -2353,7 +2365,7 @@ CREATE TEXT SEARCH DICTIONARY thesaurus_simple (
<programlisting> <programlisting>
ALTER TEXT SEARCH CONFIGURATION russian ALTER TEXT SEARCH CONFIGURATION russian
ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_simple; ADD MAPPING FOR asciiword, asciihword, hword_asciipart WITH thesaurus_simple;
</programlisting> </programlisting>
</para> </para>
...@@ -2382,7 +2394,7 @@ CREATE TEXT SEARCH DICTIONARY thesaurus_astro ( ...@@ -2382,7 +2394,7 @@ CREATE TEXT SEARCH DICTIONARY thesaurus_astro (
); );
ALTER TEXT SEARCH CONFIGURATION russian ALTER TEXT SEARCH CONFIGURATION russian
ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_astro, english_stem; ADD MAPPING FOR asciiword, asciihword, hword_asciipart WITH thesaurus_astro, english_stem;
</programlisting> </programlisting>
Now we can see how it works. Now we can see how it works.
...@@ -2633,12 +2645,13 @@ CREATE TEXT SEARCH DICTIONARY english_ispell ( ...@@ -2633,12 +2645,13 @@ CREATE TEXT SEARCH DICTIONARY english_ispell (
); );
</programlisting> </programlisting>
Now we can set up the mappings for Latin words for configuration Now we can set up the mappings for words in configuration
<literal>pg</>: <literal>pg</>:
<programlisting> <programlisting>
ALTER TEXT SEARCH CONFIGURATION pg ALTER TEXT SEARCH CONFIGURATION pg
ALTER MAPPING FOR lword, lhword, lpart_hword ALTER MAPPING FOR asciiword, asciihword, hword_asciipart,
word, hword, hword_part
WITH pg_dict, english_ispell, english_stem; WITH pg_dict, english_ispell, english_stem;
</programlisting> </programlisting>
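With those mappings in place, ASCII and non-ASCII words should run through the same dictionary list; a hypothetical spot check (the token føø is purely illustrative and assumes a locale that treats ø as a letter):

SELECT alias, dictionaries FROM ts_debug('pg', 'PostgreSQL føø');

   alias   |             dictionaries
-----------+---------------------------------------
 asciiword | {pg_dict,english_ispell,english_stem}
 blank     | {}
 word      | {pg_dict,english_ispell,english_stem}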
...@@ -2778,32 +2791,32 @@ SHOW default_text_search_config; ...@@ -2778,32 +2791,32 @@ SHOW default_text_search_config;
<programlisting> <programlisting>
SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats'); SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats');
alias | description | token | dictionaries | dictionary | lexemes alias | description | token | dictionaries | dictionary | lexemes
-------+---------------+-------+----------------+--------------+--------- -----------+-----------------+-------+----------------+--------------+---------
lword | Latin word | a | {english_stem} | english_stem | {} asciiword | Word, all ASCII | a | {english_stem} | english_stem | {}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | fat | {english_stem} | english_stem | {fat} asciiword | Word, all ASCII | fat | {english_stem} | english_stem | {fat}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | cat | {english_stem} | english_stem | {cat} asciiword | Word, all ASCII | cat | {english_stem} | english_stem | {cat}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | sat | {english_stem} | english_stem | {sat} asciiword | Word, all ASCII | sat | {english_stem} | english_stem | {sat}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | on | {english_stem} | english_stem | {} asciiword | Word, all ASCII | on | {english_stem} | english_stem | {}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | a | {english_stem} | english_stem | {} asciiword | Word, all ASCII | a | {english_stem} | english_stem | {}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | mat | {english_stem} | english_stem | {mat} asciiword | Word, all ASCII | mat | {english_stem} | english_stem | {mat}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
blank | Space symbols | - | {} | | blank | Space symbols | - | {} | |
lword | Latin word | it | {english_stem} | english_stem | {} asciiword | Word, all ASCII | it | {english_stem} | english_stem | {}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | ate | {english_stem} | english_stem | {ate} asciiword | Word, all ASCII | ate | {english_stem} | english_stem | {ate}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | a | {english_stem} | english_stem | {} asciiword | Word, all ASCII | a | {english_stem} | english_stem | {}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | fat | {english_stem} | english_stem | {fat} asciiword | Word, all ASCII | fat | {english_stem} | english_stem | {fat}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | rats | {english_stem} | english_stem | {rat} asciiword | Word, all ASCII | rats | {english_stem} | english_stem | {rat}
</programlisting> </programlisting>
</para> </para>
...@@ -2824,23 +2837,23 @@ CREATE TEXT SEARCH DICTIONARY english_ispell ( ...@@ -2824,23 +2837,23 @@ CREATE TEXT SEARCH DICTIONARY english_ispell (
); );
ALTER TEXT SEARCH CONFIGURATION public.english ALTER TEXT SEARCH CONFIGURATION public.english
ALTER MAPPING FOR lword WITH english_ispell, english_stem; ALTER MAPPING FOR asciiword WITH english_ispell, english_stem;
</programlisting> </programlisting>
<programlisting> <programlisting>
SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); SELECT * FROM ts_debug('public.english','The Brightest supernovaes');
alias | description | token | dictionaries | dictionary | lexemes alias | description | token | dictionaries | dictionary | lexemes
-------+---------------+-------------+-------------------------------+----------------+------------- -----------+-----------------+-------------+-------------------------------+----------------+-------------
lword | Latin word | The | {english_ispell,english_stem} | english_ispell | {} asciiword | Word, all ASCII | The | {english_ispell,english_stem} | english_ispell | {}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | Brightest | {english_ispell,english_stem} | english_ispell | {bright} asciiword | Word, all ASCII | Brightest | {english_ispell,english_stem} | english_ispell | {bright}
blank | Space symbols | | {} | | blank | Space symbols | | {} | |
lword | Latin word | supernovaes | {english_ispell,english_stem} | english_stem | {supernova} asciiword | Word, all ASCII | supernovaes | {english_ispell,english_stem} | english_stem | {supernova}
</programlisting> </programlisting>
<para> <para>
In this example, the word <literal>Brightest</> was recognized by the In this example, the word <literal>Brightest</> was recognized by the
parser as a <literal>Latin word</literal> (alias <literal>lword</literal>). parser as an <literal>ASCII word</literal> (alias <literal>asciiword</literal>).
For this token type the dictionary list is For this token type the dictionary list is
<literal>english_ispell</> and <literal>english_ispell</> and
<literal>english_stem</literal>. The word was recognized by <literal>english_stem</literal>. The word was recognized by
...@@ -2868,13 +2881,13 @@ SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); ...@@ -2868,13 +2881,13 @@ SELECT * FROM ts_debug('public.english','The Brightest supernovaes');
<programlisting> <programlisting>
SELECT alias, token, dictionary, lexemes SELECT alias, token, dictionary, lexemes
FROM ts_debug('public.english','The Brightest supernovaes'); FROM ts_debug('public.english','The Brightest supernovaes');
alias | token | dictionary | lexemes alias | token | dictionary | lexemes
-------+-------------+----------------+------------- -----------+-------------+----------------+-------------
lword | The | english_ispell | {} asciiword | The | english_ispell | {}
blank | | | blank | | |
lword | Brightest | english_ispell | {bright} asciiword | Brightest | english_ispell | {bright}
blank | | | blank | | |
lword | supernovaes | english_stem | {supernova} asciiword | supernovaes | english_stem | {supernova}
</programlisting> </programlisting>
</para> </para>
...@@ -2935,31 +2948,31 @@ SELECT * FROM ts_parse('default', '123 - a number'); ...@@ -2935,31 +2948,31 @@ SELECT * FROM ts_parse('default', '123 - a number');
<programlisting> <programlisting>
SELECT * FROM ts_token_type('default'); SELECT * FROM ts_token_type('default');
tokid | alias | description tokid | alias | description
-------+--------------+----------------------------------- -------+-----------------+------------------------------------------
1 | lword | Latin word 1 | asciiword | Word, all ASCII
2 | nlword | Non-latin word 2 | word | Word, all letters
3 | word | Word 3 | numword | Word, letters and digits
4 | email | Email 4 | email | Email address
5 | url | URL 5 | url | URL
6 | host | Host 6 | host | Host
7 | sfloat | Scientific notation 7 | sfloat | Scientific notation
8 | version | VERSION 8 | version | Version number
9 | part_hword | Part of hyphenated word 9 | hword_numpart | Hyphenated word part, letters and digits
10 | nlpart_hword | Non-latin part of hyphenated word 10 | hword_part | Hyphenated word part, all letters
11 | lpart_hword | Latin part of hyphenated word 11 | hword_asciipart | Hyphenated word part, all ASCII
12 | blank | Space symbols 12 | blank | Space symbols
13 | tag | HTML Tag 13 | tag | HTML tag
14 | protocol | Protocol head 14 | protocol | Protocol head
15 | hword | Hyphenated word 15 | numhword | Hyphenated word, letters and digits
16 | lhword | Latin hyphenated word 16 | asciihword | Hyphenated word, all ASCII
17 | nlhword | Non-latin hyphenated word 17 | hword | Hyphenated word, all letters
18 | uri | URI 18 | uri | URI
19 | file | File or path name 19 | file | File or path name
20 | float | Decimal notation 20 | float | Decimal notation
21 | int | Signed integer 21 | int | Signed integer
22 | uint | Unsigned integer 22 | uint | Unsigned integer
23 | entity | HTML Entity 23 | entity | HTML entity
</programlisting> </programlisting>
</para> </para>
...@@ -3304,27 +3317,27 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae'); ...@@ -3304,27 +3317,27 @@ EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae');
=&gt; \dF+ russian =&gt; \dF+ russian
Text search configuration "pg_catalog.russian" Text search configuration "pg_catalog.russian"
Parser: "pg_catalog.default" Parser: "pg_catalog.default"
Token | Dictionaries Token | Dictionaries
--------------+-------------- -----------------+--------------
email | simple asciihword | english_stem
file | simple asciiword | english_stem
float | simple email | simple
host | simple file | simple
hword | russian_stem float | simple
int | simple host | simple
lhword | english_stem hword | russian_stem
lpart_hword | english_stem hword_asciipart | english_stem
lword | english_stem hword_numpart | simple
nlhword | russian_stem hword_part | russian_stem
nlpart_hword | russian_stem int | simple
nlword | russian_stem numhword | simple
part_hword | russian_stem numword | simple
sfloat | simple sfloat | simple
uint | simple uint | simple
uri | simple uri | simple
url | simple url | simple
version | simple version | simple
word | russian_stem word | russian_stem
</programlisting> </programlisting>
</para> </para>
</listitem> </listitem>
...@@ -3389,32 +3402,32 @@ Parser: "pg_catalog.default" ...@@ -3389,32 +3402,32 @@ Parser: "pg_catalog.default"
Get headline | prsd_headline | Get headline | prsd_headline |
Get token types | prsd_lextype | Get token types | prsd_lextype |
Token types for parser "pg_catalog.default" Token types for parser "pg_catalog.default"
Token name | Description Token name | Description
--------------+----------------------------------- -----------------+------------------------------------------
blank | Space symbols asciihword | Hyphenated word, all ASCII
email | Email asciiword | Word, all ASCII
entity | HTML Entity blank | Space symbols
file | File or path name email | Email address
float | Decimal notation entity | HTML entity
host | Host file | File or path name
hword | Hyphenated word float | Decimal notation
int | Signed integer host | Host
lhword | Latin hyphenated word hword | Hyphenated word, all letters
lpart_hword | Latin part of hyphenated word hword_asciipart | Hyphenated word part, all ASCII
lword | Latin word hword_numpart | Hyphenated word part, letters and digits
nlhword | Non-latin hyphenated word hword_part | Hyphenated word part, all letters
nlpart_hword | Non-latin part of hyphenated word int | Signed integer
nlword | Non-latin word numhword | Hyphenated word, letters and digits
part_hword | Part of hyphenated word numword | Word, letters and digits
protocol | Protocol head protocol | Protocol head
sfloat | Scientific notation sfloat | Scientific notation
tag | HTML Tag tag | HTML tag
uint | Unsigned integer uint | Unsigned integer
uri | URI uri | URI
url | URL url | URL
version | VERSION version | Version number
word | Word word | Word, all letters
(23 rows) (23 rows)
</programlisting> </programlisting>
</para> </para>
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# #
# Makefile for src/backend/snowball # Makefile for src/backend/snowball
# #
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.3 2007/08/27 10:29:49 mha Exp $ # $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.4 2007/10/23 20:46:12 tgl Exp $
# #
#------------------------------------------------------------------------- #-------------------------------------------------------------------------
...@@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \ ...@@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \
stem_UTF_8_swedish.o \ stem_UTF_8_swedish.o \
stem_UTF_8_turkish.o stem_UTF_8_turkish.o
# second column is name of latin dictionary, if different # first column is language name and also name of dictionary for not-all-ASCII
# Note order dependency: use of some other language as latin dictionary # words, second is name of dictionary for all-ASCII words
# Note order dependency: use of some other language as ASCII dictionary
# must come after creation of that language # must come after creation of that language
LANGUAGES= \ LANGUAGES= \
danish danish \ danish danish \
...@@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes) ...@@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes)
while [ "$$#" -gt 0 ] ; \ while [ "$$#" -gt 0 ] ; \
do \ do \
lang=$$1; shift; \ lang=$$1; shift; \
nonlatdictname=$$lang; \ nonascdictname=$$lang; \
latdictname=$$1; shift; \ ascdictname=$$1; shift; \
if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \ if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \
stop=", StopWords=$${lang}" ; \ stop=", StopWords=$${lang}" ; \
else \ else \
...@@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes) ...@@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes)
sed -e "s#_LANGNAME_#$$lang#g" | \ sed -e "s#_LANGNAME_#$$lang#g" | \
sed -e "s#_DICTNAME_#$${lang}_stem#g" | \ sed -e "s#_DICTNAME_#$${lang}_stem#g" | \
sed -e "s#_CFGNAME_#$$lang#g" | \ sed -e "s#_CFGNAME_#$$lang#g" | \
sed -e "s#_LATDICTNAME_#$${latdictname}_stem#g" | \ sed -e "s#_ASCDICTNAME_#$${ascdictname}_stem#g" | \
sed -e "s#_NONLATDICTNAME_#$${nonlatdictname}_stem#g" | \ sed -e "s#_NONASCDICTNAME_#$${nonascdictname}_stem#g" | \
sed -e "s#_STOPWORDS_#$$stop#g" ; \ sed -e "s#_STOPWORDS_#$$stop#g" ; \
done >> $@ done >> $@
else else
......
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.4 2007/09/03 02:30:43 tgl Exp $$ -- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.5 2007/10/23 20:46:12 tgl Exp $$
-- text search configuration for _LANGNAME_ language -- text search configuration for _LANGNAME_ language
CREATE TEXT SEARCH DICTIONARY _DICTNAME_ CREATE TEXT SEARCH DICTIONARY _DICTNAME_
...@@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_ ...@@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_
COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language'; COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language';
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR email, url, host, sfloat, version, uri, file, float, int, uint FOR email, url, host, sfloat, version, uri, file, float, int, uint,
numword, hword_numpart, numhword
WITH simple; WITH simple;
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR lhword, lpart_hword, lword FOR asciiword, hword_asciipart, asciihword
WITH _LATDICTNAME_; WITH _ASCDICTNAME_;
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR hword, nlhword, nlpart_hword, nlword, word, part_hword FOR word, hword_part, hword
WITH _NONLATDICTNAME_; WITH _NONASCDICTNAME_;
/*------------------------------------------------------------------------- /*-------------------------------------------------------------------------
* *
* wparser_def.c * wparser_def.c
* Standard word parser * Default text search parser
* *
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.3 2007/09/07 15:09:55 teodor Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.4 2007/10/23 20:46:12 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -22,79 +22,53 @@ ...@@ -22,79 +22,53 @@
#include "utils/builtins.h" #include "utils/builtins.h"
/* rememder !!!! */ /* Output token categories */
#define LASTNUM 23
#define LATWORD 1 #define ASCIIWORD 1
#define CYRWORD 2 #define WORD_T 2
#define UWORD 3 #define NUMWORD 3
#define EMAIL 4 #define EMAIL 4
#define FURL 5 #define URL_T 5
#define HOST 6 #define HOST 6
#define SCIENTIFIC 7 #define SCIENTIFIC 7
#define VERSIONNUMBER 8 #define VERSIONNUMBER 8
#define PARTHYPHENWORD 9 #define NUMPARTHWORD 9
#define CYRPARTHYPHENWORD 10 #define PARTHWORD 10
#define LATPARTHYPHENWORD 11 #define ASCIIPARTHWORD 11
#define SPACE 12 #define SPACE 12
#define TAG 13 #define TAG_T 13
#define PROTOCOL 14 #define PROTOCOL 14
#define HYPHENWORD 15 #define NUMHWORD 15
#define LATHYPHENWORD 16 #define ASCIIHWORD 16
#define CYRHYPHENWORD 17 #define HWORD 17
#define URI 18 #define URI 18
#define FILEPATH 19 #define FILEPATH 19
#define DECIMAL 20 #define DECIMAL 20
#define SIGNEDINT 21 #define SIGNEDINT 21
#define UNSIGNEDINT 22 #define UNSIGNEDINT 22
#define HTMLENTITY 23 #define HTMLENTITY 23
static const char *lex_descr[] = { #define LASTNUM 23
static const char * const tok_alias[] = {
"", "",
"Latin word", "asciiword",
"Non-latin word",
"Word",
"Email",
"URL",
"Host",
"Scientific notation",
"VERSION",
"Part of hyphenated word",
"Non-latin part of hyphenated word",
"Latin part of hyphenated word",
"Space symbols",
"HTML Tag",
"Protocol head",
"Hyphenated word",
"Latin hyphenated word",
"Non-latin hyphenated word",
"URI",
"File or path name",
"Decimal notation",
"Signed integer",
"Unsigned integer",
"HTML Entity"
};
static const char *tok_alias[] = {
"",
"lword",
"nlword",
"word", "word",
"numword",
"email", "email",
"url", "url",
"host", "host",
"sfloat", "sfloat",
"version", "version",
"part_hword", "hword_numpart",
"nlpart_hword", "hword_part",
"lpart_hword", "hword_asciipart",
"blank", "blank",
"tag", "tag",
"protocol", "protocol",
"numhword",
"asciihword",
"hword", "hword",
"lhword",
"nlhword",
"uri", "uri",
"file", "file",
"float", "float",
...@@ -103,12 +77,42 @@ static const char *tok_alias[] = { ...@@ -103,12 +77,42 @@ static const char *tok_alias[] = {
"entity" "entity"
}; };
static const char * const lex_descr[] = {
"",
"Word, all ASCII",
"Word, all letters",
"Word, letters and digits",
"Email address",
"URL",
"Host",
"Scientific notation",
"Version number",
"Hyphenated word part, letters and digits",
"Hyphenated word part, all letters",
"Hyphenated word part, all ASCII",
"Space symbols",
"HTML tag",
"Protocol head",
"Hyphenated word, letters and digits",
"Hyphenated word, all ASCII",
"Hyphenated word, all letters",
"URI",
"File or path name",
"Decimal notation",
"Signed integer",
"Unsigned integer",
"HTML entity"
};
/* Parser states */
typedef enum typedef enum
{ {
TPS_Base = 0, TPS_Base = 0,
TPS_InUWord, TPS_InNumWord,
TPS_InLatWord, TPS_InAsciiWord,
TPS_InCyrWord, TPS_InWord,
TPS_InUnsignedInt, TPS_InUnsignedInt,
TPS_InSignedIntFirst, TPS_InSignedIntFirst,
TPS_InSignedInt, TPS_InSignedInt,
...@@ -167,20 +171,20 @@ typedef enum ...@@ -167,20 +171,20 @@ typedef enum
TPS_InProtocolFirst, TPS_InProtocolFirst,
TPS_InProtocolSecond, TPS_InProtocolSecond,
TPS_InProtocolEnd, TPS_InProtocolEnd,
TPS_InHyphenLatWordFirst, TPS_InHyphenAsciiWordFirst,
TPS_InHyphenLatWord, TPS_InHyphenAsciiWord,
TPS_InHyphenCyrWordFirst, TPS_InHyphenWordFirst,
TPS_InHyphenCyrWord, TPS_InHyphenWord,
TPS_InHyphenUWordFirst, TPS_InHyphenNumWordFirst,
TPS_InHyphenUWord, TPS_InHyphenNumWord,
TPS_InHyphenValueFirst, TPS_InHyphenValueFirst,
TPS_InHyphenValue, TPS_InHyphenValue,
TPS_InHyphenValueExact, TPS_InHyphenValueExact,
TPS_InParseHyphen, TPS_InParseHyphen,
TPS_InParseHyphenHyphen, TPS_InParseHyphenHyphen,
TPS_InHyphenCyrWordPart, TPS_InHyphenWordPart,
TPS_InHyphenLatWordPart, TPS_InHyphenAsciiWordPart,
TPS_InHyphenUWordPart, TPS_InHyphenNumWordPart,
TPS_InHyphenUnsignedInt, TPS_InHyphenUnsignedInt,
TPS_InHDecimalPartFirst, TPS_InHDecimalPartFirst,
TPS_InHDecimalPart, TPS_InHDecimalPart,
...@@ -192,7 +196,6 @@ typedef enum ...@@ -192,7 +196,6 @@ typedef enum
/* forward declaration */ /* forward declaration */
struct TParser; struct TParser;
typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
* except p_iseq */ * except p_iseq */
typedef void (*TParserSpecial) (struct TParser *); /* special handler for typedef void (*TParserSpecial) (struct TParser *); /* special handler for
...@@ -208,6 +211,16 @@ typedef struct ...@@ -208,6 +211,16 @@ typedef struct
TParserSpecial special; TParserSpecial special;
} TParserStateActionItem; } TParserStateActionItem;
/* Flag bits in TParserStateActionItem.flags */
#define A_NEXT 0x0000
#define A_BINGO 0x0001
#define A_POP 0x0002
#define A_PUSH 0x0004
#define A_RERUN 0x0008
#define A_CLEAR 0x0010
#define A_MERGE 0x0020
#define A_CLRALL 0x0040
typedef struct typedef struct
{ {
TParserState state; TParserState state;
...@@ -255,6 +268,11 @@ typedef struct TParser ...@@ -255,6 +268,11 @@ typedef struct TParser
} TParser; } TParser;
/* forward decls here */
static bool TParserGet(TParser * prs);
static TParserPosition * static TParserPosition *
newTParserPosition(TParserPosition * prev) newTParserPosition(TParserPosition * prev)
{ {
...@@ -303,8 +321,6 @@ TParserInit(char *str, int len) ...@@ -303,8 +321,6 @@ TParserInit(char *str, int len)
return prs; return prs;
} }
static bool TParserGet(TParser * prs);
static void static void
TParserClose(TParser * prs) TParserClose(TParser * prs)
{ {
...@@ -325,10 +341,10 @@ TParserClose(TParser * prs) ...@@ -325,10 +341,10 @@ TParserClose(TParser * prs)
} }
/* /*
* defining support function, equvalent is* macroses, but * Character-type support functions, equivalent to is* macros, but
* working with any possible encodings and locales. Note, * working with any possible encodings and locales. Note,
* that with multibyte encoding and C-locale isw* function may fail * that with multibyte encoding and C-locale isw* function may fail
* or give wrong result. Note 2: multibyte encoding and C-local * or give wrong result. Note 2: multibyte encoding and C-locale
* often are used for Asian languages * often are used for Asian languages
*/ */
...@@ -487,17 +503,13 @@ p_isascii(TParser * prs) ...@@ -487,17 +503,13 @@ p_isascii(TParser * prs)
} }
static int static int
p_islatin(TParser * prs) p_isasclet(TParser * prs)
{ {
return (p_isalpha(prs) && p_isascii(prs)) ? 1 : 0; return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
} }
static int
p_isnonlatin(TParser * prs)
{
return (p_isalpha(prs) && !p_isascii(prs)) ? 1 : 0;
}
/* deliberately suppress unused-function complaints for the above */
void _make_compiler_happy(void); void _make_compiler_happy(void);
void void
_make_compiler_happy(void) _make_compiler_happy(void)
...@@ -638,21 +650,12 @@ p_isURI(TParser * prs) ...@@ -638,21 +650,12 @@ p_isURI(TParser * prs)
* Table of state/action of parser * Table of state/action of parser
*/ */
#define A_NEXT 0x0000
#define A_BINGO 0x0001
#define A_POP 0x0002
#define A_PUSH 0x0004
#define A_RERUN 0x0008
#define A_CLEAR 0x0010
#define A_MERGE 0x0020
#define A_CLRALL 0x0040
static TParserStateActionItem actionTPS_Base[] = { static TParserStateActionItem actionTPS_Base[] = {
{p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL}, {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL}, {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
{p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL}, {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InLatWord, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InCyrWord, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
...@@ -664,37 +667,38 @@ static TParserStateActionItem actionTPS_Base[] = { ...@@ -664,37 +667,38 @@ static TParserStateActionItem actionTPS_Base[] = {
}; };
static TParserStateActionItem actionTPS_InUWord[] = { static TParserStateActionItem actionTPS_InNumWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UWORD, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL}, {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, UWORD, NULL} {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
}; };
static TParserStateActionItem actionTPS_InLatWord[] = { static TParserStateActionItem actionTPS_InAsciiWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, LATWORD, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL}, {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL}, {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, LATWORD, NULL} {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
}; };
static TParserStateActionItem actionTPS_InCyrWord[] = { static TParserStateActionItem actionTPS_InWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, CYRWORD, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_Null, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, CYRWORD, NULL} {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
}; };
static TParserStateActionItem actionTPS_InUnsignedInt[] = { static TParserStateActionItem actionTPS_InUnsignedInt[] = {
...@@ -704,8 +708,8 @@ static TParserStateActionItem actionTPS_InUnsignedInt[] = { ...@@ -704,8 +708,8 @@ static TParserStateActionItem actionTPS_InUnsignedInt[] = {
{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL}, {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL}, {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_islatin, 0, A_PUSH, TPS_InHost, 0, NULL}, {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InUWord, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL} {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
}; };
...@@ -816,13 +820,13 @@ static TParserStateActionItem actionTPS_InMantissa[] = { ...@@ -816,13 +820,13 @@ static TParserStateActionItem actionTPS_InMantissa[] = {
static TParserStateActionItem actionTPS_InHTMLEntityFirst[] = { static TParserStateActionItem actionTPS_InHTMLEntityFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL}, {p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
static TParserStateActionItem actionTPS_InHTMLEntity[] = { static TParserStateActionItem actionTPS_InHTMLEntity[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL}, {p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
...@@ -849,7 +853,7 @@ static TParserStateActionItem actionTPS_InTagFirst[] = { ...@@ -849,7 +853,7 @@ static TParserStateActionItem actionTPS_InTagFirst[] = {
{p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
{p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL}, {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
{p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL}, {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
{p_islatin, 0, A_PUSH, TPS_InTagName, 0, NULL}, {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
...@@ -863,7 +867,7 @@ static TParserStateActionItem actionTPS_InXMLBegin[] = { ...@@ -863,7 +867,7 @@ static TParserStateActionItem actionTPS_InXMLBegin[] = {
static TParserStateActionItem actionTPS_InTagCloseFirst[] = { static TParserStateActionItem actionTPS_InTagCloseFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InTagName, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
...@@ -873,7 +877,7 @@ static TParserStateActionItem actionTPS_InTagName[] = { ...@@ -873,7 +877,7 @@ static TParserStateActionItem actionTPS_InTagName[] = {
{p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL}, {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags}, {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
{p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags}, {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
...@@ -888,7 +892,7 @@ static TParserStateActionItem actionTPS_InTag[] = { ...@@ -888,7 +892,7 @@ static TParserStateActionItem actionTPS_InTag[] = {
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags}, {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
{p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL}, {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
{p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL}, {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL}, {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL}, {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
...@@ -924,7 +928,7 @@ static TParserStateActionItem actionTPS_InTagBackSleshed[] = { ...@@ -924,7 +928,7 @@ static TParserStateActionItem actionTPS_InTagBackSleshed[] = {
}; };
static TParserStateActionItem actionTPS_InTagEnd[] = { static TParserStateActionItem actionTPS_InTagEnd[] = {
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL} {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
}; };
static TParserStateActionItem actionTPS_InCommentFirst[] = { static TParserStateActionItem actionTPS_InCommentFirst[] = {
...@@ -962,19 +966,19 @@ static TParserStateActionItem actionTPS_InCloseCommentLast[] = { ...@@ -962,19 +966,19 @@ static TParserStateActionItem actionTPS_InCloseCommentLast[] = {
}; };
static TParserStateActionItem actionTPS_InCommentEnd[] = { static TParserStateActionItem actionTPS_InCommentEnd[] = {
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL} {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
}; };
static TParserStateActionItem actionTPS_InHostFirstDomain[] = { static TParserStateActionItem actionTPS_InHostFirstDomain[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
static TParserStateActionItem actionTPS_InHostDomainSecond[] = { static TParserStateActionItem actionTPS_InHostDomainSecond[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL}, {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
...@@ -984,7 +988,7 @@ static TParserStateActionItem actionTPS_InHostDomainSecond[] = { ...@@ -984,7 +988,7 @@ static TParserStateActionItem actionTPS_InHostDomainSecond[] = {
static TParserStateActionItem actionTPS_InHostDomain[] = { static TParserStateActionItem actionTPS_InHostDomain[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}, {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
{p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL}, {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL}, {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
...@@ -1013,14 +1017,14 @@ static TParserStateActionItem actionTPS_InPort[] = { ...@@ -1013,14 +1017,14 @@ static TParserStateActionItem actionTPS_InPort[] = {
static TParserStateActionItem actionTPS_InHostFirstAN[] = { static TParserStateActionItem actionTPS_InHostFirstAN[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
static TParserStateActionItem actionTPS_InHost[] = { static TParserStateActionItem actionTPS_InHost[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
...@@ -1034,7 +1038,7 @@ static TParserStateActionItem actionTPS_InEmail[] = { ...@@ -1034,7 +1038,7 @@ static TParserStateActionItem actionTPS_InEmail[] = {
static TParserStateActionItem actionTPS_InFileFirst[] = { static TParserStateActionItem actionTPS_InFileFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL}, {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
...@@ -1045,7 +1049,7 @@ static TParserStateActionItem actionTPS_InFileFirst[] = { ...@@ -1045,7 +1049,7 @@ static TParserStateActionItem actionTPS_InFileFirst[] = {
static TParserStateActionItem actionTPS_InFileTwiddle[] = { static TParserStateActionItem actionTPS_InFileTwiddle[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL}, {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
...@@ -1054,7 +1058,7 @@ static TParserStateActionItem actionTPS_InFileTwiddle[] = { ...@@ -1054,7 +1058,7 @@ static TParserStateActionItem actionTPS_InFileTwiddle[] = {
static TParserStateActionItem actionTPS_InPathFirst[] = { static TParserStateActionItem actionTPS_InPathFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL}, {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
...@@ -1079,7 +1083,7 @@ static TParserStateActionItem actionTPS_InPathSecond[] = { ...@@ -1079,7 +1083,7 @@ static TParserStateActionItem actionTPS_InPathSecond[] = {
static TParserStateActionItem actionTPS_InFile[] = { static TParserStateActionItem actionTPS_InFile[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
...@@ -1091,7 +1095,7 @@ static TParserStateActionItem actionTPS_InFile[] = { ...@@ -1091,7 +1095,7 @@ static TParserStateActionItem actionTPS_InFile[] = {
static TParserStateActionItem actionTPS_InFileNext[] = { static TParserStateActionItem actionTPS_InFileNext[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_CLEAR, TPS_InFile, 0, NULL}, {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL}, {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL}, {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
...@@ -1119,7 +1123,7 @@ static TParserStateActionItem actionTPS_InURI[] = { ...@@ -1119,7 +1123,7 @@ static TParserStateActionItem actionTPS_InURI[] = {
static TParserStateActionItem actionTPS_InFURL[] = { static TParserStateActionItem actionTPS_InFURL[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, FURL, SpecialFURL}, {p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
...@@ -1139,54 +1143,52 @@ static TParserStateActionItem actionTPS_InProtocolEnd[] = { ...@@ -1139,54 +1143,52 @@ static TParserStateActionItem actionTPS_InProtocolEnd[] = {
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL} {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
}; };
static TParserStateActionItem actionTPS_InHyphenLatWordFirst[] = { static TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
static TParserStateActionItem actionTPS_InHyphenLatWord[] = { static TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen}, {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen} {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
}; };
static TParserStateActionItem actionTPS_InHyphenCyrWordFirst[] = { static TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
static TParserStateActionItem actionTPS_InHyphenCyrWord[] = { static TParserStateActionItem actionTPS_InHyphenWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen}, {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL}, {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen}
}; };
static TParserStateActionItem actionTPS_InHyphenUWordFirst[] = { static TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
static TParserStateActionItem actionTPS_InHyphenUWord[] = { static TParserStateActionItem actionTPS_InHyphenNumWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}, {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen} {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
}; };
static TParserStateActionItem actionTPS_InHyphenValueFirst[] = { static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
...@@ -1196,26 +1198,26 @@ static TParserStateActionItem actionTPS_InHyphenValueFirst[] = { ...@@ -1196,26 +1198,26 @@ static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
}; };
static TParserStateActionItem actionTPS_InHyphenValue[] = { static TParserStateActionItem actionTPS_InHyphenValue[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}, {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen} {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
}; };
static TParserStateActionItem actionTPS_InHyphenValueExact[] = { static TParserStateActionItem actionTPS_InHyphenValueExact[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}, {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen} {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
}; };
static TParserStateActionItem actionTPS_InParseHyphen[] = { static TParserStateActionItem actionTPS_InParseHyphen[] = {
{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL}, {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
{NULL, 0, A_RERUN, TPS_Base, 0, NULL} {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
...@@ -1227,32 +1229,31 @@ static TParserStateActionItem actionTPS_InParseHyphenHyphen[] = { ...@@ -1227,32 +1229,31 @@ static TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
{NULL, 0, A_POP, TPS_Null, 0, NULL} {NULL, 0, A_POP, TPS_Null, 0, NULL}
}; };
static TParserStateActionItem actionTPS_InHyphenCyrWordPart[] = { static TParserStateActionItem actionTPS_InHyphenWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, CYRPARTHYPHENWORD, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
{NULL, 0, A_BINGO, TPS_InParseHyphen, CYRPARTHYPHENWORD, NULL}
}; };
static TParserStateActionItem actionTPS_InHyphenLatWordPart[] = { static TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, LATPARTHYPHENWORD, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, LATPARTHYPHENWORD, NULL} {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
}; };
static TParserStateActionItem actionTPS_InHyphenUWordPart[] = { static TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, PARTHYPHENWORD, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHYPHENWORD, NULL} {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
}; };
static TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = { static TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL}, {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL} {NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL}
}; };
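The rewritten hyphen states above map directly onto the renamed hyphenated-word categories. A quick way to see where a mixed letters-and-digits token lands is ts_debug; a minimal sketch against the built-in english configuration, with the expected rows written out by hand from the state tables rather than copied from the regression tests:

    SELECT alias, token FROM ts_debug('english', 'pgsql-8x');
    --      alias       |  token
    -- -----------------+----------
    --  numhword        | pgsql-8x
    --  hword_asciipart | pgsql
    --  blank           | -
    --  hword_numpart   | 8x

The whole token reports numhword because one of its parts contains a digit, while each part keeps its own, more specific category.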
...@@ -1284,14 +1285,14 @@ static TParserStateActionItem actionTPS_InHVersionPart[] = { ...@@ -1284,14 +1285,14 @@ static TParserStateActionItem actionTPS_InHVersionPart[] = {
}; };
/* /*
* order should be the same as in typedef enum {} TParserState!! * order must be the same as in typedef enum {} TParserState!!
*/ */
static const TParserStateAction Actions[] = { static const TParserStateAction Actions[] = {
{TPS_Base, actionTPS_Base}, {TPS_Base, actionTPS_Base},
{TPS_InUWord, actionTPS_InUWord}, {TPS_InNumWord, actionTPS_InNumWord},
{TPS_InLatWord, actionTPS_InLatWord}, {TPS_InAsciiWord, actionTPS_InAsciiWord},
{TPS_InCyrWord, actionTPS_InCyrWord}, {TPS_InWord, actionTPS_InWord},
{TPS_InUnsignedInt, actionTPS_InUnsignedInt}, {TPS_InUnsignedInt, actionTPS_InUnsignedInt},
{TPS_InSignedIntFirst, actionTPS_InSignedIntFirst}, {TPS_InSignedIntFirst, actionTPS_InSignedIntFirst},
{TPS_InSignedInt, actionTPS_InSignedInt}, {TPS_InSignedInt, actionTPS_InSignedInt},
...@@ -1350,20 +1351,20 @@ static const TParserStateAction Actions[] = { ...@@ -1350,20 +1351,20 @@ static const TParserStateAction Actions[] = {
{TPS_InProtocolFirst, actionTPS_InProtocolFirst}, {TPS_InProtocolFirst, actionTPS_InProtocolFirst},
{TPS_InProtocolSecond, actionTPS_InProtocolSecond}, {TPS_InProtocolSecond, actionTPS_InProtocolSecond},
{TPS_InProtocolEnd, actionTPS_InProtocolEnd}, {TPS_InProtocolEnd, actionTPS_InProtocolEnd},
{TPS_InHyphenLatWordFirst, actionTPS_InHyphenLatWordFirst}, {TPS_InHyphenAsciiWordFirst, actionTPS_InHyphenAsciiWordFirst},
{TPS_InHyphenLatWord, actionTPS_InHyphenLatWord}, {TPS_InHyphenAsciiWord, actionTPS_InHyphenAsciiWord},
{TPS_InHyphenCyrWordFirst, actionTPS_InHyphenCyrWordFirst}, {TPS_InHyphenWordFirst, actionTPS_InHyphenWordFirst},
{TPS_InHyphenCyrWord, actionTPS_InHyphenCyrWord}, {TPS_InHyphenWord, actionTPS_InHyphenWord},
{TPS_InHyphenUWordFirst, actionTPS_InHyphenUWordFirst}, {TPS_InHyphenNumWordFirst, actionTPS_InHyphenNumWordFirst},
{TPS_InHyphenUWord, actionTPS_InHyphenUWord}, {TPS_InHyphenNumWord, actionTPS_InHyphenNumWord},
{TPS_InHyphenValueFirst, actionTPS_InHyphenValueFirst}, {TPS_InHyphenValueFirst, actionTPS_InHyphenValueFirst},
{TPS_InHyphenValue, actionTPS_InHyphenValue}, {TPS_InHyphenValue, actionTPS_InHyphenValue},
{TPS_InHyphenValueExact, actionTPS_InHyphenValueExact}, {TPS_InHyphenValueExact, actionTPS_InHyphenValueExact},
{TPS_InParseHyphen, actionTPS_InParseHyphen}, {TPS_InParseHyphen, actionTPS_InParseHyphen},
{TPS_InParseHyphenHyphen, actionTPS_InParseHyphenHyphen}, {TPS_InParseHyphenHyphen, actionTPS_InParseHyphenHyphen},
{TPS_InHyphenCyrWordPart, actionTPS_InHyphenCyrWordPart}, {TPS_InHyphenWordPart, actionTPS_InHyphenWordPart},
{TPS_InHyphenLatWordPart, actionTPS_InHyphenLatWordPart}, {TPS_InHyphenAsciiWordPart, actionTPS_InHyphenAsciiWordPart},
{TPS_InHyphenUWordPart, actionTPS_InHyphenUWordPart}, {TPS_InHyphenNumWordPart, actionTPS_InHyphenNumWordPart},
{TPS_InHyphenUnsignedInt, actionTPS_InHyphenUnsignedInt}, {TPS_InHyphenUnsignedInt, actionTPS_InHyphenUnsignedInt},
{TPS_InHDecimalPartFirst, actionTPS_InHDecimalPartFirst}, {TPS_InHDecimalPartFirst, actionTPS_InHDecimalPartFirst},
{TPS_InHDecimalPart, actionTPS_InHDecimalPart}, {TPS_InHDecimalPart, actionTPS_InHDecimalPart},
...@@ -1378,10 +1379,11 @@ TParserGet(TParser * prs) ...@@ -1378,10 +1379,11 @@ TParserGet(TParser * prs)
{ {
TParserStateActionItem *item = NULL; TParserStateActionItem *item = NULL;
Assert(prs->state);
if (prs->state->posbyte >= prs->lenstr) if (prs->state->posbyte >= prs->lenstr)
return false; return false;
Assert(prs->state);
prs->lexeme = prs->str + prs->state->posbyte; prs->lexeme = prs->str + prs->state->posbyte;
prs->state->pushedAtAction = NULL; prs->state->pushedAtAction = NULL;
...@@ -1488,10 +1490,12 @@ TParserGet(TParser * prs) ...@@ -1488,10 +1490,12 @@ TParserGet(TParser * prs)
prs->state->state = item->tostate; prs->state->state = item->tostate;
/* check for go away */ /* check for go away */
if ((item->flags & A_BINGO) || (prs->state->posbyte >= prs->lenstr && (item->flags & A_RERUN) == 0)) if ((item->flags & A_BINGO) ||
(prs->state->posbyte >= prs->lenstr &&
(item->flags & A_RERUN) == 0))
break; break;
/* go to begining of loop if we should rerun or we just restore state */ /* go to beginning of loop if we should rerun or we just restore state */
if (item->flags & (A_RERUN | A_POP)) if (item->flags & (A_RERUN | A_POP))
continue; continue;
...@@ -1557,16 +1561,15 @@ prsd_end(PG_FUNCTION_ARGS) ...@@ -1557,16 +1561,15 @@ prsd_end(PG_FUNCTION_ARGS)
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
#define LEAVETOKEN(x) ( (x)==12 ) #define LEAVETOKEN(x) ( (x)==SPACE )
#define COMPLEXTOKEN(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 ) #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define ENDPUNCTOKEN(x) ( (x)==12 ) #define ENDPUNCTOKEN(x) ( (x)==SPACE )
#define TS_IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 ) #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==HTMLENTITY )
#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 ) #define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 ) #define HTMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) ) #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || TS_IDIGNORE(x) ) #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
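These macros now name the token types symbolically instead of hard-coding tokids. The correspondence can be cross-checked from SQL; a sketch using the default parser, where the WHERE list is simply the set of ids the old macro definitions hard-coded:

    SELECT tokid, alias
      FROM ts_token_type('default')
     WHERE tokid IN (5, 7, 8, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23);
    -- e.g. 12 is blank (SPACE), 5 is url (URL_T), 16 is asciihword (ASCIIHWORD),
    --      22 is uint (UNSIGNEDINT), 23 is entity (HTMLENTITY)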
typedef struct typedef struct
{ {
......
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.435 2007/10/22 20:13:37 tgl Exp $ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.436 2007/10/23 20:46:12 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -53,6 +53,6 @@ ...@@ -53,6 +53,6 @@
*/ */
/* yyyymmddN */ /* yyyymmddN */
#define CATALOG_VERSION_NO 200710221 #define CATALOG_VERSION_NO 200710231
#endif #endif
...@@ -209,8 +209,8 @@ SELECT ts_lexize('synonym', 'Gogle'); ...@@ -209,8 +209,8 @@ SELECT ts_lexize('synonym', 'Gogle');
(1 row) (1 row)
-- Create and simple test thesaurus dictionary -- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize -- More tests in configuration checks because ts_lexize()
-- can not give more tat one word as it may wish thesaurus. -- cannot pass more than one word to thesaurus.
CREATE TEXT SEARCH DICTIONARY thesaurus ( CREATE TEXT SEARCH DICTIONARY thesaurus (
Template=thesaurus, Template=thesaurus,
DictFile=thesaurus_sample, DictFile=thesaurus_sample,
...@@ -227,7 +227,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst ( ...@@ -227,7 +227,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
COPY=english COPY=english
); );
ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
WITH ispell, english_stem; WITH ispell, english_stem;
SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
to_tsvector to_tsvector
...@@ -276,7 +276,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst ( ...@@ -276,7 +276,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
COPY=english COPY=english
); );
ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
lword, lpart_hword, lhword asciiword, hword_asciipart, asciihword
WITH synonym, english_stem; WITH synonym, english_stem;
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre'); SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
to_tsvector to_tsvector
...@@ -296,7 +296,7 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst ( ...@@ -296,7 +296,7 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
COPY=synonym_tst COPY=synonym_tst
); );
ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR
lword, lpart_hword, lhword asciiword, hword_asciipart, asciihword
WITH synonym, thesaurus, english_stem; WITH synonym, thesaurus, english_stem;
SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one'); SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
to_tsvector to_tsvector
......
...@@ -208,31 +208,31 @@ SELECT ts_lexize('english_stem', 'identity'); ...@@ -208,31 +208,31 @@ SELECT ts_lexize('english_stem', 'identity');
(1 row) (1 row)
SELECT * FROM ts_token_type('default'); SELECT * FROM ts_token_type('default');
tokid | alias | description tokid | alias | description
-------+--------------+----------------------------------- -------+-----------------+------------------------------------------
1 | lword | Latin word 1 | asciiword | Word, all ASCII
2 | nlword | Non-latin word 2 | word | Word, all letters
3 | word | Word 3 | numword | Word, letters and digits
4 | email | Email 4 | email | Email address
5 | url | URL 5 | url | URL
6 | host | Host 6 | host | Host
7 | sfloat | Scientific notation 7 | sfloat | Scientific notation
8 | version | VERSION 8 | version | Version number
9 | part_hword | Part of hyphenated word 9 | hword_numpart | Hyphenated word part, letters and digits
10 | nlpart_hword | Non-latin part of hyphenated word 10 | hword_part | Hyphenated word part, all letters
11 | lpart_hword | Latin part of hyphenated word 11 | hword_asciipart | Hyphenated word part, all ASCII
12 | blank | Space symbols 12 | blank | Space symbols
13 | tag | HTML Tag 13 | tag | HTML tag
14 | protocol | Protocol head 14 | protocol | Protocol head
15 | hword | Hyphenated word 15 | numhword | Hyphenated word, letters and digits
16 | lhword | Latin hyphenated word 16 | asciihword | Hyphenated word, all ASCII
17 | nlhword | Non-latin hyphenated word 17 | hword | Hyphenated word, all letters
18 | uri | URI 18 | uri | URI
19 | file | File or path name 19 | file | File or path name
20 | float | Decimal notation 20 | float | Decimal notation
21 | int | Signed integer 21 | int | Signed integer
22 | uint | Unsigned integer 22 | uint | Unsigned integer
23 | entity | HTML Entity 23 | entity | HTML entity
(23 rows) (23 rows)
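With the renamed categories in place, a plain hyphenated ASCII word reports asciihword for the whole token and hword_asciipart for its pieces; a letter group containing any non-ASCII character would report hword and hword_part instead. A minimal sketch against the built-in english configuration, with the expected rows written out by hand:

    SELECT alias, token FROM ts_debug('english', 'foo-bar');
    --      alias       |  token
    -- -----------------+---------
    --  asciihword      | foo-bar
    --  hword_asciipart | foo
    --  blank           | -
    --  hword_asciipart | bar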
SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
......
...@@ -58,8 +58,8 @@ SELECT ts_lexize('synonym', 'PoStGrEs'); ...@@ -58,8 +58,8 @@ SELECT ts_lexize('synonym', 'PoStGrEs');
SELECT ts_lexize('synonym', 'Gogle'); SELECT ts_lexize('synonym', 'Gogle');
-- Create and simple test thesaurus dictionary -- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize -- More tests in configuration checks because ts_lexize()
-- can not give more tat one word as it may wish thesaurus. -- cannot pass more than one word to thesaurus.
CREATE TEXT SEARCH DICTIONARY thesaurus ( CREATE TEXT SEARCH DICTIONARY thesaurus (
Template=thesaurus, Template=thesaurus,
DictFile=thesaurus_sample, DictFile=thesaurus_sample,
...@@ -74,7 +74,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst ( ...@@ -74,7 +74,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
); );
ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
WITH ispell, english_stem; WITH ispell, english_stem;
SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
...@@ -99,7 +99,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst ( ...@@ -99,7 +99,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
); );
ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
lword, lpart_hword, lhword asciiword, hword_asciipart, asciihword
WITH synonym, english_stem; WITH synonym, english_stem;
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre'); SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
...@@ -112,10 +112,9 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst ( ...@@ -112,10 +112,9 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
); );
ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR
lword, lpart_hword, lhword asciiword, hword_asciipart, asciihword
WITH synonym, thesaurus, english_stem; WITH synonym, thesaurus, english_stem;
SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one'); SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)'); SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)');
SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets'); SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
...@@ -3,7 +3,7 @@ package Install; ...@@ -3,7 +3,7 @@ package Install;
# #
# Package that provides 'make install' functionality for msvc builds # Package that provides 'make install' functionality for msvc builds
# #
# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.24 2007/10/16 16:00:00 tgl Exp $ # $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.25 2007/10/23 20:46:12 tgl Exp $
# #
use strict; use strict;
use warnings; use warnings;
...@@ -258,7 +258,7 @@ sub GenerateTsearchFiles ...@@ -258,7 +258,7 @@ sub GenerateTsearchFiles
while ($#pieces > 0) while ($#pieces > 0)
{ {
my $lang = shift @pieces || last; my $lang = shift @pieces || last;
my $latlang = shift @pieces || last; my $asclang = shift @pieces || last;
my $txt = $tmpl; my $txt = $tmpl;
my $stop = ''; my $stop = '';
...@@ -269,8 +269,8 @@ sub GenerateTsearchFiles ...@@ -269,8 +269,8 @@ sub GenerateTsearchFiles
$txt =~ s#_LANGNAME_#${lang}#gs; $txt =~ s#_LANGNAME_#${lang}#gs;
$txt =~ s#_DICTNAME_#${lang}_stem#gs; $txt =~ s#_DICTNAME_#${lang}_stem#gs;
$txt =~ s#_CFGNAME_#${lang}#gs; $txt =~ s#_CFGNAME_#${lang}#gs;
$txt =~ s#_LATDICTNAME_#${latlang}_stem#gs; $txt =~ s#_ASCDICTNAME_#${asclang}_stem#gs;
$txt =~ s#_NONLATDICTNAME_#${lang}_stem#gs; $txt =~ s#_NONASCDICTNAME_#${lang}_stem#gs;
$txt =~ s#_STOPWORDS_#$stop#gs; $txt =~ s#_STOPWORDS_#$stop#gs;
print $F $txt; print $F $txt;
print "."; print ".";
......