Commit 25bd9ce3 authored by Tom Lane

Add matchorig, matchsynonyms, and keepsynonyms options to contrib/dict_xsyn.

Sergey Karpov
parent 23dc89d2
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Copyright (c) 2007-2009, PostgreSQL Global Development Group * Copyright (c) 2007-2009, PostgreSQL Global Development Group
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.6 2009/01/01 17:23:32 momjian Exp $ * $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.7 2009/08/05 18:06:49 tgl Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -33,7 +33,10 @@ typedef struct ...@@ -33,7 +33,10 @@ typedef struct
int len; int len;
Syn *syn; Syn *syn;
bool matchorig;
bool keeporig; bool keeporig;
bool matchsynonyms;
bool keepsynonyms;
} DictSyn; } DictSyn;
...@@ -88,7 +91,8 @@ read_dictionary(DictSyn *d, char *filename) ...@@ -88,7 +91,8 @@ read_dictionary(DictSyn *d, char *filename)
{ {
char *value; char *value;
char *key; char *key;
char *end = NULL; char *pos;
char *end;
if (*line == '\0') if (*line == '\0')
continue; continue;
...@@ -96,26 +100,36 @@ read_dictionary(DictSyn *d, char *filename) ...@@ -96,26 +100,36 @@ read_dictionary(DictSyn *d, char *filename)
value = lowerstr(line); value = lowerstr(line);
pfree(line); pfree(line);
key = find_word(value, &end); pos = value;
if (!key) while ((key = find_word(pos, &end)) != NULL)
{ {
pfree(value); /* Enlarge syn structure if full */
continue; if (cur == d->len)
} {
d->len = (d->len > 0) ? 2 * d->len : 16;
if (d->syn)
d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
else
d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
}
if (cur == d->len) /* Save first word only if we will match it */
{ if (pos != value || d->matchorig)
d->len = (d->len > 0) ? 2 * d->len : 16; {
if (d->syn) d->syn[cur].key = pnstrdup(key, end - key);
d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len); d->syn[cur].value = pstrdup(value);
else
d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
}
d->syn[cur].key = pnstrdup(key, end - key); cur++;
d->syn[cur].value = value; }
pos = end;
cur++; /* Don't bother scanning synonyms if we will not match them */
if (!d->matchsynonyms)
break;
}
pfree(value);
} }
tsearch_readline_end(&trst); tsearch_readline_end(&trst);
...@@ -133,23 +147,40 @@ dxsyn_init(PG_FUNCTION_ARGS) ...@@ -133,23 +147,40 @@ dxsyn_init(PG_FUNCTION_ARGS)
List *dictoptions = (List *) PG_GETARG_POINTER(0); List *dictoptions = (List *) PG_GETARG_POINTER(0);
DictSyn *d; DictSyn *d;
ListCell *l; ListCell *l;
char *filename = NULL;
d = (DictSyn *) palloc0(sizeof(DictSyn)); d = (DictSyn *) palloc0(sizeof(DictSyn));
d->len = 0; d->len = 0;
d->syn = NULL; d->syn = NULL;
d->matchorig = true;
d->keeporig = true; d->keeporig = true;
d->matchsynonyms = false;
d->keepsynonyms = true;
foreach(l, dictoptions) foreach(l, dictoptions)
{ {
DefElem *defel = (DefElem *) lfirst(l); DefElem *defel = (DefElem *) lfirst(l);
if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0) if (pg_strcasecmp(defel->defname, "MATCHORIG") == 0)
{
d->matchorig = defGetBoolean(defel);
}
else if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
{ {
d->keeporig = defGetBoolean(defel); d->keeporig = defGetBoolean(defel);
} }
else if (pg_strcasecmp(defel->defname, "MATCHSYNONYMS") == 0)
{
d->matchsynonyms = defGetBoolean(defel);
}
else if (pg_strcasecmp(defel->defname, "KEEPSYNONYMS") == 0)
{
d->keepsynonyms = defGetBoolean(defel);
}
else if (pg_strcasecmp(defel->defname, "RULES") == 0) else if (pg_strcasecmp(defel->defname, "RULES") == 0)
{ {
read_dictionary(d, defGetString(defel)); /* we can't read the rules before parsing all options! */
filename = defGetString(defel);
} }
else else
{ {
...@@ -160,6 +191,9 @@ dxsyn_init(PG_FUNCTION_ARGS) ...@@ -160,6 +191,9 @@ dxsyn_init(PG_FUNCTION_ARGS)
} }
} }
if (filename)
read_dictionary(d, filename);
PG_RETURN_POINTER(d); PG_RETURN_POINTER(d);
} }
...@@ -194,41 +228,33 @@ dxsyn_lexize(PG_FUNCTION_ARGS) ...@@ -194,41 +228,33 @@ dxsyn_lexize(PG_FUNCTION_ARGS)
/* Parse string of synonyms and return array of words */ /* Parse string of synonyms and return array of words */
{ {
char *value = pstrdup(found->value); char *value = found->value;
int value_length = strlen(value); char *syn;
char *pos = value; char *pos;
char *end;
int nsyns = 0; int nsyns = 0;
bool is_first = true;
res = palloc(0); res = palloc(sizeof(TSLexeme));
while (pos < value + value_length) pos = value;
while ((syn = find_word(pos, &end)) != NULL)
{ {
char *end;
char *syn = find_word(pos, &end);
if (!syn)
break;
*end = '\0';
res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2)); res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
res[nsyns].lexeme = NULL;
/* first word is added to result only if KEEPORIG flag is set */ /* The first word is output only if keeporig=true */
if (d->keeporig || !is_first) if (pos != value || d->keeporig)
{ {
res[nsyns].lexeme = pstrdup(syn); res[nsyns].lexeme = pnstrdup(syn, end - syn);
res[nsyns + 1].lexeme = NULL;
nsyns++; nsyns++;
} }
is_first = false; pos = end;
pos = end + 1; /* Stop if we are not to output the synonyms */
if (!d->keepsynonyms)
break;
} }
res[nsyns].lexeme = NULL;
pfree(value);
} }
PG_RETURN_POINTER(res); PG_RETURN_POINTER(res);
......
...@@ -5,10 +5,76 @@ ...@@ -5,10 +5,76 @@
SET client_min_messages = warning; SET client_min_messages = warning;
\set ECHO none \set ECHO none
RESET client_min_messages; RESET client_min_messages;
--configuration -- default configuration - match first word and return it among with all synonyms
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false); ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
--lexize --lexize
SELECT ts_lexize('xsyn', 'supernova'); SELECT ts_lexize('xsyn', 'supernova');
ts_lexize
--------------------------
{supernova,sn,sne,1987a}
(1 row)
SELECT ts_lexize('xsyn', 'sn');
ts_lexize
-----------
(1 row)
SELECT ts_lexize('xsyn', 'grb');
ts_lexize
-----------
(1 row)
-- the same, but return only synonyms
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
SELECT ts_lexize('xsyn', 'supernova');
ts_lexize
----------------
{sn,sne,1987a}
(1 row)
SELECT ts_lexize('xsyn', 'sn');
ts_lexize
-----------
(1 row)
SELECT ts_lexize('xsyn', 'grb');
ts_lexize
-----------
(1 row)
-- match any word and return all words
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
SELECT ts_lexize('xsyn', 'supernova');
ts_lexize
--------------------------
{supernova,sn,sne,1987a}
(1 row)
SELECT ts_lexize('xsyn', 'sn');
ts_lexize
--------------------------
{supernova,sn,sne,1987a}
(1 row)
SELECT ts_lexize('xsyn', 'grb');
ts_lexize
-----------
(1 row)
-- match any word and return all words except first one
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
SELECT ts_lexize('xsyn', 'supernova');
ts_lexize
----------------
{sn,sne,1987a}
(1 row)
SELECT ts_lexize('xsyn', 'sn');
ts_lexize ts_lexize
---------------- ----------------
{sn,sne,1987a} {sn,sne,1987a}
...@@ -20,3 +86,63 @@ SELECT ts_lexize('xsyn', 'grb'); ...@@ -20,3 +86,63 @@ SELECT ts_lexize('xsyn', 'grb');
(1 row) (1 row)
-- match any synonym but not first word, and return first word instead
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
SELECT ts_lexize('xsyn', 'supernova');
ts_lexize
-----------
(1 row)
SELECT ts_lexize('xsyn', 'sn');
ts_lexize
-------------
{supernova}
(1 row)
SELECT ts_lexize('xsyn', 'grb');
ts_lexize
-----------
(1 row)
-- do not match or return anything
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false);
SELECT ts_lexize('xsyn', 'supernova');
ts_lexize
-----------
(1 row)
SELECT ts_lexize('xsyn', 'sn');
ts_lexize
-----------
(1 row)
SELECT ts_lexize('xsyn', 'grb');
ts_lexize
-----------
(1 row)
-- match any word but return nothing
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
SELECT ts_lexize('xsyn', 'supernova');
ts_lexize
-----------
{}
(1 row)
SELECT ts_lexize('xsyn', 'sn');
ts_lexize
-----------
{}
(1 row)
SELECT ts_lexize('xsyn', 'grb');
ts_lexize
-----------
(1 row)
...@@ -8,9 +8,46 @@ SET client_min_messages = warning; ...@@ -8,9 +8,46 @@ SET client_min_messages = warning;
\set ECHO all \set ECHO all
RESET client_min_messages; RESET client_min_messages;
--configuration -- default configuration - match first word and return it among with all synonyms
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false); ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
--lexize --lexize
SELECT ts_lexize('xsyn', 'supernova'); SELECT ts_lexize('xsyn', 'supernova');
SELECT ts_lexize('xsyn', 'sn');
SELECT ts_lexize('xsyn', 'grb');
-- the same, but return only synonyms
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false);
SELECT ts_lexize('xsyn', 'supernova');
SELECT ts_lexize('xsyn', 'sn');
SELECT ts_lexize('xsyn', 'grb');
-- match any word and return all words
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
SELECT ts_lexize('xsyn', 'supernova');
SELECT ts_lexize('xsyn', 'sn');
SELECT ts_lexize('xsyn', 'grb');
-- match any word and return all words except first one
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true);
SELECT ts_lexize('xsyn', 'supernova');
SELECT ts_lexize('xsyn', 'sn');
SELECT ts_lexize('xsyn', 'grb');
-- match any synonym but not first word, and return first word instead
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
SELECT ts_lexize('xsyn', 'supernova');
SELECT ts_lexize('xsyn', 'sn');
SELECT ts_lexize('xsyn', 'grb');
-- do not match or return anything
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false);
SELECT ts_lexize('xsyn', 'supernova');
SELECT ts_lexize('xsyn', 'sn');
SELECT ts_lexize('xsyn', 'grb');
-- match any word but return nothing
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true);
SELECT ts_lexize('xsyn', 'supernova');
SELECT ts_lexize('xsyn', 'sn');
SELECT ts_lexize('xsyn', 'grb'); SELECT ts_lexize('xsyn', 'grb');
<!-- $PostgreSQL: pgsql/doc/src/sgml/dict-xsyn.sgml,v 1.2 2007/12/06 04:12:10 tgl Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/dict-xsyn.sgml,v 1.3 2009/08/05 18:06:49 tgl Exp $ -->
<sect1 id="dict-xsyn"> <sect1 id="dict-xsyn">
<title>dict_xsyn</title> <title>dict_xsyn</title>
...@@ -23,9 +23,26 @@ ...@@ -23,9 +23,26 @@
<itemizedlist> <itemizedlist>
<listitem> <listitem>
<para> <para>
<literal>keeporig</> controls whether the original word is included (if <literal>matchorig</> controls whether the original word is accepted by
<literal>true</>), or only its synonyms (if <literal>false</>). Default the dictionary. Default is <literal>true</>.
is <literal>true</>. </para>
</listitem>
<listitem>
<para>
<literal>matchsynonyms</> controls whether the synonyms are
accepted by the dictionary. Default is <literal>false</>.
</para>
</listitem>
<listitem>
<para>
<literal>keeporig</> controls whether the original word is included in
the dictionary's output. Default is <literal>true</>.
</para>
</listitem>
<listitem>
<para>
<literal>keepsynonyms</> controls whether the synonyms are included in
the dictionary's output. Default is <literal>true</>.
</para> </para>
</listitem> </listitem>
<listitem> <listitem>
...@@ -87,13 +104,37 @@ ALTER TEXT SEARCH DICTIONARY ...@@ -87,13 +104,37 @@ ALTER TEXT SEARCH DICTIONARY
To test the dictionary, you can try To test the dictionary, you can try
<programlisting> <programlisting>
mydb=# SELECT ts_lexize('xsyn', 'word');
ts_lexize
-----------------------
{syn1,syn2,syn3}
mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true);
ALTER TEXT SEARCH DICTIONARY
mydb=# SELECT ts_lexize('xsyn', 'word'); mydb=# SELECT ts_lexize('xsyn', 'word');
ts_lexize ts_lexize
----------------------- -----------------------
{word,syn1,syn2,syn3} {word,syn1,syn2,syn3}
mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=false, MATCHSYNONYMS=true);
ALTER TEXT SEARCH DICTIONARY
mydb=# SELECT ts_lexize('xsyn', 'syn1');
ts_lexize
-----------------------
{syn1,syn2,syn3}
mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (RULES='my_rules', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false);
ALTER TEXT SEARCH DICTIONARY
mydb=# SELECT ts_lexize('xsyn', 'syn1');
ts_lexize
-----------------------
{word}
</programlisting> </programlisting>
but real-world usage will involve including it in a text search Real-world usage will involve including it in a text search
configuration as described in <xref linkend="textsearch">. configuration as described in <xref linkend="textsearch">.
That might look like this: That might look like this:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment