Commit abd8c94f authored by Teodor Sigaev

Add prefix support for synonym dictionary

parent 0c738084
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.52 2009/06/17 21:58:49 tgl Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.53 2009/08/14 14:53:20 teodor Exp $ -->
<chapter id="textsearch"> <chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title> <title id="textsearch-title">Full Text Search</title>
...@@ -2288,6 +2288,63 @@ SELECT * FROM ts_debug('english', 'Paris'); ...@@ -2288,6 +2288,63 @@ SELECT * FROM ts_debug('english', 'Paris');
asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris} asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris}
</programlisting> </programlisting>
</para> </para>
<para>
An asterisk (<literal>*</literal>) at the end of a definition word indicates
that the word is a prefix. When the entry is used in
<function>to_tsquery()</function>, the result is a query item in the prefix
search format (see <xref linkend="textsearch-parsing-queries">).
The asterisk is ignored when the entry is used in
<function>to_tsvector()</function>.
</para>
<para>
Contents of <filename>$SHAREDIR/tsearch_data/synonym_sample.syn</>:
</para>
<programlisting>
postgres pgsql
postgresql pgsql
postgre pgsql
gogle googl
indices index*
</programlisting>
<para>
Results:
</para>
<programlisting>
=# create text search dictionary syn( template=synonym,synonyms='synonym_sample');
=# select ts_lexize('syn','indices');
ts_lexize
-----------
{index}
(1 row)
=# create text search configuration tst ( copy=simple);
=# alter text search configuration tst alter mapping for asciiword with syn;
=# select to_tsquery('tst','indices');
to_tsquery
------------
'index':*
(1 row)
=# select 'indexes are very useful'::tsvector;
tsvector
---------------------------------
'are' 'indexes' 'useful' 'very'
(1 row)
=# select 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices');
?column?
----------
t
(1 row)
=# select to_tsvector('tst','indices');
to_tsvector
-------------
'index':1
(1 row)
</programlisting>
<para> <para>
The only parameter required by the <literal>synonym</> template is The only parameter required by the <literal>synonym</> template is
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.10 2009/01/01 17:23:48 momjian Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.11 2009/08/14 14:53:20 teodor Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -23,6 +23,8 @@ typedef struct ...@@ -23,6 +23,8 @@ typedef struct
{ {
char *in; char *in;
char *out; char *out;
int outlen;
uint16 flags;
} Syn; } Syn;
typedef struct typedef struct
...@@ -36,11 +38,14 @@ typedef struct ...@@ -36,11 +38,14 @@ typedef struct
* Finds the next whitespace-delimited word within the 'in' string. * Finds the next whitespace-delimited word within the 'in' string.
* Returns a pointer to the first character of the word, and a pointer * Returns a pointer to the first character of the word, and a pointer
* to the next byte after the last character in the word (in *end). * to the next byte after the last character in the word (in *end).
* Character '*' at the end of word will not be treated as word
* character if flags is not NULL.
*/ */
static char * static char *
findwrd(char *in, char **end) findwrd(char *in, char **end, uint16 *flags)
{ {
char *start; char *start;
char *lastchar;
/* Skip leading spaces */ /* Skip leading spaces */
while (*in && t_isspace(in)) while (*in && t_isspace(in))
...@@ -53,13 +58,27 @@ findwrd(char *in, char **end) ...@@ -53,13 +58,27 @@ findwrd(char *in, char **end)
return NULL; return NULL;
} }
start = in; lastchar = start = in;
/* Find end of word */ /* Find end of word */
while (*in && !t_isspace(in)) while (*in && !t_isspace(in))
{
lastchar = in;
in += pg_mblen(in); in += pg_mblen(in);
}
if ( in - lastchar == 1 && t_iseq(lastchar, '*') && flags )
{
*flags = TSL_PREFIX;
*end = lastchar;
}
else
{
if (flags)
*flags = 0;
*end = in;
}
*end = in;
return start; return start;
} }
...@@ -84,6 +103,7 @@ dsynonym_init(PG_FUNCTION_ARGS) ...@@ -84,6 +103,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
*end = NULL; *end = NULL;
int cur = 0; int cur = 0;
char *line = NULL; char *line = NULL;
uint16 flags = 0;
foreach(l, dictoptions) foreach(l, dictoptions)
{ {
...@@ -117,7 +137,7 @@ dsynonym_init(PG_FUNCTION_ARGS) ...@@ -117,7 +137,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
while ((line = tsearch_readline(&trst)) != NULL) while ((line = tsearch_readline(&trst)) != NULL)
{ {
starti = findwrd(line, &end); starti = findwrd(line, &end, NULL);
if (!starti) if (!starti)
{ {
/* Empty line */ /* Empty line */
...@@ -130,7 +150,7 @@ dsynonym_init(PG_FUNCTION_ARGS) ...@@ -130,7 +150,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
} }
*end = '\0'; *end = '\0';
starto = findwrd(end + 1, &end); starto = findwrd(end + 1, &end, &flags);
if (!starto) if (!starto)
{ {
/* A line with only one word (+whitespace). Ignore silently. */ /* A line with only one word (+whitespace). Ignore silently. */
...@@ -168,6 +188,9 @@ dsynonym_init(PG_FUNCTION_ARGS) ...@@ -168,6 +188,9 @@ dsynonym_init(PG_FUNCTION_ARGS)
d->syn[cur].out = lowerstr(starto); d->syn[cur].out = lowerstr(starto);
} }
d->syn[cur].outlen = strlen(starto);
d->syn[cur].flags = flags;
cur++; cur++;
skipline: skipline:
...@@ -212,7 +235,8 @@ dsynonym_lexize(PG_FUNCTION_ARGS) ...@@ -212,7 +235,8 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(NULL); PG_RETURN_POINTER(NULL);
res = palloc0(sizeof(TSLexeme) * 2); res = palloc0(sizeof(TSLexeme) * 2);
res[0].lexeme = pstrdup(found->out); res[0].lexeme = pnstrdup(found->out, found->outlen);
res[0].flags = found->flags;
PG_RETURN_POINTER(res); PG_RETURN_POINTER(res);
} }
...@@ -2,3 +2,4 @@ postgres pgsql ...@@ -2,3 +2,4 @@ postgres pgsql
postgresql pgsql postgresql pgsql
postgre pgsql postgre pgsql
gogle googl gogle googl
indices index*
...@@ -208,6 +208,12 @@ SELECT ts_lexize('synonym', 'Gogle'); ...@@ -208,6 +208,12 @@ SELECT ts_lexize('synonym', 'Gogle');
{googl} {googl}
(1 row) (1 row)
SELECT ts_lexize('synonym', 'indices');
ts_lexize
-----------
{index}
(1 row)
-- Create and simple test thesaurus dictionary -- Create and simple test thesaurus dictionary
-- More tests in configuration checks because ts_lexize() -- More tests in configuration checks because ts_lexize()
-- cannot pass more than one word to thesaurus. -- cannot pass more than one word to thesaurus.
...@@ -290,6 +296,18 @@ SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead ...@@ -290,6 +296,18 @@ SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead
'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6 'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6
(1 row) (1 row)
SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
to_tsvector
----------------------------------------------
'form':8 'index':1,3,10 'plural':7 'right':6
(1 row)
SELECT to_tsquery('synonym_tst', 'Index & indices');
to_tsquery
---------------------
'index' & 'index':*
(1 row)
-- test thesaurus in configuration -- test thesaurus in configuration
-- see thesaurus_sample.ths to understand 'odd' resulting tsvector -- see thesaurus_sample.ths to understand 'odd' resulting tsvector
CREATE TEXT SEARCH CONFIGURATION thesaurus_tst ( CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
......
...@@ -56,6 +56,7 @@ CREATE TEXT SEARCH DICTIONARY synonym ( ...@@ -56,6 +56,7 @@ CREATE TEXT SEARCH DICTIONARY synonym (
SELECT ts_lexize('synonym', 'PoStGrEs'); SELECT ts_lexize('synonym', 'PoStGrEs');
SELECT ts_lexize('synonym', 'Gogle'); SELECT ts_lexize('synonym', 'Gogle');
SELECT ts_lexize('synonym', 'indices');
-- Create and simple test thesaurus dictionary -- Create and simple test thesaurus dictionary
-- More tests in configuration checks because ts_lexize() -- More tests in configuration checks because ts_lexize()
...@@ -104,6 +105,8 @@ ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR ...@@ -104,6 +105,8 @@ ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre'); SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google'); SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google');
SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
SELECT to_tsquery('synonym_tst', 'Index & indices');
-- test thesaurus in configuration -- test thesaurus in configuration
-- see thesaurus_sample.ths to understand 'odd' resulting tsvector -- see thesaurus_sample.ths to understand 'odd' resulting tsvector
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment