Commit 92e05bc6 authored by Teodor Sigaev's avatar Teodor Sigaev

Unaccent dictionary.

parent a88a4801
# $PostgreSQL: pgsql/contrib/Makefile,v 1.88 2009/08/07 20:50:21 petere Exp $
# $PostgreSQL: pgsql/contrib/Makefile,v 1.89 2009/08/18 10:34:39 teodor Exp $
subdir = contrib
top_builddir = ..
......@@ -39,6 +39,7 @@ SUBDIRS = \
tablefunc \
test_parser \
tsearch2 \
unaccent \
vacuumlo
ifeq ($(with_openssl),yes)
......
......@@ -169,6 +169,10 @@ tsearch2 -
Pavel Stehule <pavel.stehule@gmail.com>, based on code originally by
Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
unaccent -
Unaccent dictionary for text search
Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
uuid-ossp -
UUID generation functions
by Peter Eisentraut <peter_e@gmx.net>
......
# $PostgreSQL: pgsql/contrib/unaccent/Makefile,v 1.1 2009/08/18 10:34:39 teodor Exp $
# Build rules for the unaccent text search dictionary module.
MODULE_big = unaccent
OBJS = unaccent.o
DATA_built = unaccent.sql
DATA = uninstall_unaccent.sql
DATA_TSEARCH = unaccent.rules
REGRESS = unaccent
ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
# subdir must name this module's own directory (it was copied from pg_trgm)
subdir = contrib/unaccent
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif
# Redefine REGRESS_OPTS because the regression test needs a UTF8 database
REGRESS_OPTS = --dbname=$(CONTRIB_TESTDB) --multibyte=UTF8 --no-locale
SET client_min_messages = warning;
\set ECHO none
RESET client_min_messages;
SET client_encoding TO 'KOI8';
SELECT unaccent('foobar');
unaccent
----------
foobar
(1 row)
SELECT unaccent('ёлка');
unaccent
----------
елка
(1 row)
SELECT unaccent('ЁЖИК');
unaccent
----------
ЕЖИК
(1 row)
SELECT unaccent('unaccent', 'foobar');
unaccent
----------
foobar
(1 row)
SELECT unaccent('unaccent', 'ёлка');
unaccent
----------
елка
(1 row)
SELECT unaccent('unaccent', 'ЁЖИК');
unaccent
----------
ЕЖИК
(1 row)
SELECT ts_lexize('unaccent', 'foobar');
ts_lexize
-----------
(1 row)
SELECT ts_lexize('unaccent', 'ёлка');
ts_lexize
-----------
{елка}
(1 row)
SELECT ts_lexize('unaccent', 'ЁЖИК');
ts_lexize
-----------
{ЕЖИК}
(1 row)
(1 row)
SET client_min_messages = warning;
\set ECHO none
\i unaccent.sql
\set ECHO all
RESET client_min_messages;
SET client_encoding TO 'KOI8';
SELECT unaccent('foobar');
SELECT unaccent('ёлка');
SELECT unaccent('ЁЖИК');
SELECT unaccent('unaccent', 'foobar');
SELECT unaccent('unaccent', 'ёлка');
SELECT unaccent('unaccent', 'ЁЖИК');
SELECT ts_lexize('unaccent', 'foobar');
SELECT ts_lexize('unaccent', 'ёлка');
SELECT ts_lexize('unaccent', 'ЁЖИК');
/*-------------------------------------------------------------------------
*
* unaccent.c
* Text search unaccent dictionary
*
* Copyright (c) 2009, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.1 2009/08/18 10:34:39 teodor Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "mb/pg_wchar.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"
PG_MODULE_MAGIC;
/*
 * The unaccent dictionary looks up the character to replace in an
 * uncompressed, byte-indexed tree.  Every node of the tree is an array
 * of 256 SuffixChar entries; the n-th entry of a node corresponds to
 * the byte value n at that position of a (possibly multibyte) character.
 */
typedef struct SuffixChar
{
	struct SuffixChar *nextChar;	/* node for the following byte, or NULL */
	char	   *replaceTo;			/* replacement bytes (not NUL-terminated),
									 * or NULL when no rule ends here */
	int			replacelen;			/* number of bytes in replaceTo */
} SuffixChar;
/*
 * placeChar - store one rule in the tree.
 *
 * Walks the tree one byte of str at a time, creating any node that does
 * not exist yet, and attaches a copy of replaceTo (replacelen bytes) to
 * the entry of the last byte.  If that entry already carries a
 * replacement, the existing one is kept and a WARNING is emitted.
 *
 * node is the root to insert into, or NULL to have a new root allocated.
 * Returns the root node.
 */
static SuffixChar*
placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
{
	SuffixChar *slot;

	if (node == NULL)
	{
		node = palloc(sizeof(SuffixChar) * 256);
		memset(node, 0, sizeof(SuffixChar) * 256);
	}

	/* descend until only the final byte of str is left */
	slot = node + *str;
	while (lenstr != 1)
	{
		if (slot->nextChar == NULL)
		{
			slot->nextChar = palloc(sizeof(SuffixChar) * 256);
			memset(slot->nextChar, 0, sizeof(SuffixChar) * 256);
		}
		str++;
		lenstr--;
		slot = slot->nextChar + *str;
	}

	if (slot->replaceTo != NULL)
	{
		elog(WARNING, "duplicate TO argument, use first one");
		return node;
	}

	slot->replacelen = replacelen;
	slot->replaceTo = palloc(replacelen);
	memcpy(slot->replaceTo, replaceTo, replacelen);

	return node;
}
/*
 * initSuffixTree - build the replacement tree from a rules file.
 *
 * filename is the base name of the rules file; it is resolved through
 * get_tsearch_config_filename() with the "rules" extension.  The file is
 * read with tsearch_readline(), which converts it into the current
 * encoding.  Each line holding two whitespace-separated tokens becomes a
 * rule (source -> replacement); any other line is ignored.
 *
 * Returns the root of the tree, or NULL if the file contains no usable
 * rule.  Raises ERROR if the file cannot be opened, and re-throws every
 * error raised while reading except ERRCODE_UNTRANSLATABLE_CHARACTER.
 */
static SuffixChar*
initSuffixTree(char *filename)
{
	/*
	 * Both variables are modified inside PG_TRY and read after it, so they
	 * must be volatile to survive the longjmp reliably.
	 */
	SuffixChar *volatile rootSuffixTree = NULL;
	volatile bool skip;
	MemoryContext ccxt = CurrentMemoryContext;
	tsearch_readline_state trst;

	filename = get_tsearch_config_filename(filename, "rules");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open unaccent file \"%s\": %m",
						filename)));

	do
	{
		char		src[4096];
		char		trg[4096];

		/*
		 * pg_do_encoding_conversion() (called by tsearch_readline()) will
		 * emit exception if it finds untranslatable characters in current
		 * locale. We just skip such lines and continue with the next one.
		 *
		 * skip stays true until the read loop reaches end of file without
		 * an error.  Clearing it only there (instead of after each parsed
		 * line) guarantees termination on an empty file and keeps loading
		 * the rest of the file after an untranslatable line.
		 */
		skip = true;

		PG_TRY();
		{
			char	   *line;

			while ((line = tsearch_readline(&trst)) != NULL)
			{
				/* field widths keep the tokens inside src[] and trg[] */
				if (sscanf(line, "%4095s\t%4095s\n", src, trg) == 2)
				{
					int			srclen = strlen(src);
					int			trglen = strlen(trg);

					rootSuffixTree = placeChar(rootSuffixTree,
											   (unsigned char *) src, srclen,
											   trg, trglen);
				}

				/* free the line whether or not it contained a rule */
				pfree(line);
			}

			/* clean end of file: we are done */
			skip = false;
		}
		PG_CATCH();
		{
			ErrorData  *errdata;
			MemoryContext ecxt;

			ecxt = MemoryContextSwitchTo(ccxt);
			errdata = CopyErrorData();
			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
			{
				/* swallow the error; the outer loop resumes reading */
				FlushErrorState();
			}
			else
			{
				MemoryContextSwitchTo(ecxt);
				PG_RE_THROW();
			}
		}
		PG_END_TRY();
	}
	while (skip);

	tsearch_readline_end(&trst);

	return rootSuffixTree;
}
/*
 * findReplaceTo - look up one multibyte character in the tree.
 *
 * Follows the srclen bytes starting at src from the given root node.
 * Returns the entry reached by the last byte (its replaceTo may still be
 * NULL), or NULL if the path ends before all bytes are consumed.
 */
static SuffixChar *
findReplaceTo( SuffixChar *node, unsigned char *src, int srclen )
{
	SuffixChar *level = node;

	while (level != NULL)
	{
		SuffixChar *slot = level + *src;

		if (srclen == 1)
			return slot;

		level = slot->nextChar;
		src++;
		srclen--;
	}

	return NULL;
}
PG_FUNCTION_INFO_V1(unaccent_init);
Datum unaccent_init(PG_FUNCTION_ARGS);
Datum
unaccent_init(PG_FUNCTION_ARGS)
{
List *dictoptions = (List *) PG_GETARG_POINTER(0);
SuffixChar *rootSuffixTree;
bool fileloaded = false;
ListCell *l;
foreach(l, dictoptions)
{
DefElem *defel = (DefElem *) lfirst(l);
if (pg_strcasecmp("Rules", defel->defname) == 0)
{
if (fileloaded)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple Rules parameters")));
rootSuffixTree = initSuffixTree(defGetString(defel));
fileloaded = true;
}
else
{
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized Unaccent parameter: \"%s\"",
defel->defname)));
}
}
if (!fileloaded)
{
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("missing Rules parameter")));
}
PG_RETURN_POINTER(rootSuffixTree);
}
PG_FUNCTION_INFO_V1(unaccent_lexize);
Datum unaccent_lexize(PG_FUNCTION_ARGS);

/*
 * unaccent_lexize - dictionary lexize function.
 *
 * Arguments: 0 - root of the replacement tree built by unaccent_init,
 *            1 - pointer to the input bytes,
 *            2 - length of the input in bytes.
 *
 * Returns NULL when no character of the input has a replacement rule.
 * Otherwise returns a palloc'd, zero-filled array of two TSLexeme whose
 * first element carries the rewritten, NUL-terminated string and the
 * TSL_FILTER flag.
 *
 * The work is done in two passes: the first one measures the output, the
 * second one writes it.  A replacement may be longer than the widest
 * character of the database encoding, so the output size cannot be derived
 * from the input length alone.
 */
Datum
unaccent_lexize(PG_FUNCTION_ARGS)
{
	SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
	char	   *srcstart = (char *) PG_GETARG_POINTER(1);
	int32		len = PG_GETARG_INT32(2);
	char	   *srcend = srcstart + len;
	char	   *srcchar;
	char	   *trgchar;
	int			charlen;
	int			nreplaced = 0;
	size_t		trglen = 0;
	TSLexeme   *res;
	SuffixChar *node;

	/* Pass 1: count replacements and compute the exact output length */
	for (srcchar = srcstart; srcchar < srcend; srcchar += charlen)
	{
		charlen = pg_mblen(srcchar);
		/* never step past the input on a truncated multibyte character */
		if (charlen > (int) (srcend - srcchar))
			charlen = (int) (srcend - srcchar);

		node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
		if (node && node->replaceTo)
		{
			nreplaced++;
			trglen += node->replacelen;
		}
		else
			trglen += charlen;
	}

	/* nothing to replace: tell the caller the input is unchanged */
	if (nreplaced == 0)
		PG_RETURN_POINTER(NULL);

	res = palloc0(sizeof(TSLexeme) * 2);
	res->lexeme = trgchar = palloc(trglen + 1 /* \0 */ );
	res->flags = TSL_FILTER;

	/* Pass 2: emit replacements, copy every other character verbatim */
	for (srcchar = srcstart; srcchar < srcend; srcchar += charlen)
	{
		charlen = pg_mblen(srcchar);
		if (charlen > (int) (srcend - srcchar))
			charlen = (int) (srcend - srcchar);

		node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
		if (node && node->replaceTo)
		{
			memcpy(trgchar, node->replaceTo, node->replacelen);
			trgchar += node->replacelen;
		}
		else
		{
			memcpy(trgchar, srcchar, charlen);
			trgchar += charlen;
		}
	}
	*trgchar = '\0';

	PG_RETURN_POINTER(res);
}
/*
 * unaccent_dict - SQL-callable wrapper that applies a dictionary to text.
 *
 * Called with one argument (text) it uses the dictionary named "unaccent";
 * called with two arguments it uses the dictionary whose OID is argument 0
 * and takes the text from argument 1.  Returns the first lexeme produced
 * by the dictionary's lexize function, or a copy of the input when the
 * dictionary returns nothing.
 */
PG_FUNCTION_INFO_V1(unaccent_dict);
Datum unaccent_dict(PG_FUNCTION_ARGS);
Datum
unaccent_dict(PG_FUNCTION_ARGS)
{
	int			textArgNo;
	Oid			dictId;
	text	   *input;
	TSDictionaryCacheEntry *entry;
	TSLexeme   *lexemes;

	if (PG_NARGS() == 1)
	{
		dictId = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
		textArgNo = 0;
	}
	else
	{
		dictId = PG_GETARG_OID(0);
		textArgNo = 1;
	}

	input = PG_GETARG_TEXT_P(textArgNo);
	entry = lookup_ts_dictionary_cache(dictId);

	lexemes = (TSLexeme *) DatumGetPointer(FunctionCall4(&(entry->lexize),
									PointerGetDatum(entry->dictData),
									PointerGetDatum(VARDATA(input)),
									Int32GetDatum(VARSIZE(input) - VARHDRSZ),
									PointerGetDatum(NULL)));

	PG_FREE_IF_COPY(input, textArgNo);

	if (lexemes != NULL && lexemes->lexeme != NULL)
	{
		/* the dictionary produced a rewritten string: return it as text */
		text	   *result = cstring_to_text(lexemes->lexeme);

		pfree(lexemes->lexeme);
		pfree(lexemes);
		PG_RETURN_TEXT_P(result);
	}

	/* no usable lexeme: release the array (if any) and return the input */
	if (lexemes != NULL)
		pfree(lexemes);

	PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(textArgNo));
}
À A
Á A
 A
à A
Ä A
Å A
Æ A
à a
á a
â a
ã a
ä a
å a
æ a
Ā A
ā a
Ă A
ă a
Ą A
ą a
Ç C
ç c
Ć C
ć c
Ĉ C
ĉ c
Ċ C
ċ c
Č C
č c
Ď D
ď d
Đ D
đ d
È E
É E
Ê E
Ë E
è e
é e
ê e
ë e
Ē E
ē e
Ĕ E
ĕ e
Ė E
ė e
Ę E
ę e
Ě E
ě e
Ĝ G
ĝ g
Ğ G
ğ g
Ġ G
ġ g
Ģ G
ģ g
Ĥ H
ĥ h
Ħ H
ħ h
Ĩ I
Ì I
Í I
Î I
Ï I
ì i
í i
î i
ï i
ĩ i
Ī I
ī i
Ĭ I
ĭ i
Į I
į i
İ I
ı i
Ĳ I
ĳ i
Ĵ J
ĵ j
Ķ K
ķ k
ĸ k
Ĺ L
ĺ l
Ļ L
ļ l
Ľ L
ľ l
Ŀ L
ŀ l
Ł L
ł l
Ñ N
ñ n
Ń N
ń n
Ņ N
ņ n
Ň N
ň n
ŉ n
Ŋ N
ŋ n
Ò O
Ó O
Ô O
Õ O
Ö O
ò o
ó o
ô o
õ o
ö o
Ō O
ō o
Ŏ O
ŏ o
Ő O
ő o
Œ E
œ e
Ø O
ø o
Ŕ R
ŕ r
Ŗ R
ŗ r
Ř R
ř r
ß S
Ś S
ś s
Ŝ S
ŝ s
Ş S
ş s
Š S
š s
Ţ T
ţ t
Ť T
ť t
Ŧ T
ŧ t
Ù U
Ú U
Û U
Ü U
ù u
ú u
û u
ü u
Ũ U
ũ u
Ū U
ū u
Ŭ U
ŭ u
Ů U
ů u
Ű U
ű u
Ų U
ų u
Ŵ W
ŵ w
Ý Y
ý y
ÿ y
Ŷ Y
ŷ y
Ÿ Y
Ź Z
ź z
Ż Z
ż z
Ž Z
ž z
ё е
Ё Е
/* $PostgreSQL: pgsql/contrib/unaccent/unaccent.sql.in,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
-- Installation script for the unaccent module.

-- unaccent(dictionary, text): apply the given dictionary to the text.
-- NOTE(review): declared IMMUTABLE, yet the result depends on the
-- dictionary's RULES file, which can be changed later -- confirm whether
-- STABLE would be the more accurate volatility.
CREATE OR REPLACE FUNCTION unaccent(regdictionary, text)
RETURNS text
AS 'MODULE_PATHNAME', 'unaccent_dict'
LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
-- unaccent(text): same C entry point; with a single argument the
-- dictionary named "unaccent" is used.
CREATE OR REPLACE FUNCTION unaccent(text)
RETURNS text
AS 'MODULE_PATHNAME', 'unaccent_dict'
LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
-- Support functions referenced by the text search template below.
CREATE OR REPLACE FUNCTION unaccent_init(internal)
RETURNS internal
AS 'MODULE_PATHNAME', 'unaccent_init'
LANGUAGE C;
CREATE OR REPLACE FUNCTION unaccent_lexize(internal,internal,internal,internal)
RETURNS internal
AS 'MODULE_PATHNAME', 'unaccent_lexize'
LANGUAGE C;
-- Template tying the init and lexize functions together.
CREATE TEXT SEARCH TEMPLATE unaccent (
INIT = unaccent_init,
LEXIZE = unaccent_lexize
);
-- Default dictionary built on the template, using the rules file 'unaccent'.
CREATE TEXT SEARCH DICTIONARY unaccent (
TEMPLATE = unaccent,
RULES = 'unaccent'
);
/* $PostgreSQL: pgsql/contrib/unaccent/uninstall_unaccent.sql,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
-- Removal script for the unaccent module: drops the wrapper functions
-- first, then the dictionary and its template, and finally the template's
-- support functions.  IF EXISTS lets the script run on a partial install.
DROP FUNCTION IF EXISTS unaccent(regdictionary, text) CASCADE;
DROP FUNCTION IF EXISTS unaccent(text) CASCADE;
DROP TEXT SEARCH DICTIONARY IF EXISTS unaccent CASCADE;
DROP TEXT SEARCH TEMPLATE IF EXISTS unaccent CASCADE;
DROP FUNCTION IF EXISTS unaccent_init(internal) CASCADE;
DROP FUNCTION IF EXISTS unaccent_lexize(internal,internal,internal,internal) CASCADE;
<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.13 2009/04/27 16:27:35 momjian Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.14 2009/08/18 10:34:39 teodor Exp $ -->
<appendix id="contrib">
<title>Additional Supplied Modules</title>
......@@ -113,6 +113,7 @@ psql -d dbname -f <replaceable>SHAREDIR</>/contrib/<replaceable>module</>.sql
&tablefunc;
&test-parser;
&tsearch2;
&unaccent;
&uuid-ossp;
&vacuumlo;
&xml2;
......
<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.63 2009/08/17 22:14:44 petere Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.64 2009/08/18 10:34:39 teodor Exp $ -->
<!entity history SYSTEM "history.sgml">
<!entity info SYSTEM "info.sgml">
......@@ -126,6 +126,7 @@
<!entity tablefunc SYSTEM "tablefunc.sgml">
<!entity test-parser SYSTEM "test-parser.sgml">
<!entity tsearch2 SYSTEM "tsearch2.sgml">
<!entity unaccent SYSTEM "unaccent.sgml">
<!entity uuid-ossp SYSTEM "uuid-ossp.sgml">
<!entity vacuumlo SYSTEM "vacuumlo.sgml">
<!entity xml2 SYSTEM "xml2.sgml">
......
<sect1 id="unaccent">
<title>unaccent</title>
<indexterm zone="unaccent">
<primary>unaccent</primary>
</indexterm>
<para>
<filename>unaccent</> removes accents (diacritic signs) from a lexeme.
It is a filtering dictionary, which means that its output is
always passed to the next dictionary (if any), contrary to the standard
behaviour. Currently, it supports the most important accents of European
languages.
</para>
<para>
Limitation: The current implementation of the <filename>unaccent</>
dictionary cannot be used as a normalizing dictionary for the
<filename>thesaurus</filename> dictionary.
</para>
<sect2>
<title>Configuration</title>
<para>
An <literal>unaccent</> dictionary accepts the following options:
</para>
<itemizedlist>
<listitem>
<para>
<literal>RULES</> is the base name of the file containing the list of
translation rules. This file must be stored in
<filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means
the <productname>PostgreSQL</> installation's shared-data directory).
Its name must end in <literal>.rules</> (which is not to be included in
the <literal>RULES</> parameter).
</para>
</listitem>
</itemizedlist>
<para>
The rules file has the following format:
</para>
<itemizedlist>
<listitem>
<para>
Each line represents a pair: a character with an accent, followed by the character without the accent
<programlisting>
&Agrave; A
&Aacute; A
&Acirc; A
&Atilde; A
&Auml; A
&Aring; A
&AElig; A
</programlisting>
</para>
</listitem>
</itemizedlist>
<para>
Look at <filename>unaccent.rules</>, which is installed in
<filename>$SHAREDIR/tsearch_data/</>, for an example.
</para>
</sect2>
<sect2>
<title>Usage</title>
<para>
Running the installation script creates a text search template
<literal>unaccent</> and a dictionary <literal>unaccent</>
based on it, with default parameters. You can alter the
parameters, for example
<programlisting>
=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules');
</programlisting>
or create new dictionaries based on the template.
</para>
<para>
To test the dictionary, you can try
<programlisting>
=# select ts_lexize('unaccent','Hôtel');
ts_lexize
-----------
{Hotel}
(1 row)
</programlisting>
</para>
<para>
Filtering dictionaries are useful for the correct operation of the
<function>ts_headline</function> function.
<programlisting>
=# CREATE TEXT SEARCH CONFIGURATION fr ( COPY = french );
=# ALTER TEXT SEARCH CONFIGURATION fr
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, french_stem;
=# select to_tsvector('fr','Hôtels de la Mer');
to_tsvector
-------------------
'hotel':1 'mer':4
(1 row)
=# select to_tsvector('fr','Hôtel de la Mer') @@ to_tsquery('fr','Hotels');
?column?
----------
t
(1 row)
=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels'));
ts_headline
------------------------
&lt;b&gt;Hôtel&lt;/b&gt; de la Mer
(1 row)
</programlisting>
</para>
</sect2>
<sect2>
<title>Function</title>
<para>
The <function>unaccent</> function removes accents (diacritic signs) from
the argument string. Basically, it is a wrapper around the
<filename>unaccent</> dictionary.
</para>
<indexterm>
<primary>unaccent</primary>
</indexterm>
<synopsis>
unaccent(<optional><replaceable class="PARAMETER">dictionary</replaceable>,
</optional> <replaceable class="PARAMETER">string</replaceable>)
returns <type>text</type>
</synopsis>
<para>
<programlisting>
SELECT unaccent('unaccent','Hôtel');
SELECT unaccent('Hôtel');
</programlisting>
</para>
</sect2>
</sect1>
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment