Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
Postgres FD Implementation
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Abuhujair Javed
Postgres FD Implementation
Commits
92e05bc6
Commit
92e05bc6
authored
Aug 18, 2009
by
Teodor Sigaev
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Unaccent dictionary.
parent
a88a4801
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
808 additions
and
3 deletions
+808
-3
contrib/Makefile
contrib/Makefile
+2
-1
contrib/README
contrib/README
+4
-0
contrib/unaccent/Makefile
contrib/unaccent/Makefile
+24
-0
contrib/unaccent/expected/unaccent.out
contrib/unaccent/expected/unaccent.out
+58
-0
contrib/unaccent/sql/unaccent.sql
contrib/unaccent/sql/unaccent.sql
+19
-0
contrib/unaccent/unaccent.c
contrib/unaccent/unaccent.c
+318
-0
contrib/unaccent/unaccent.rules
contrib/unaccent/unaccent.rules
+187
-0
contrib/unaccent/unaccent.sql.in
contrib/unaccent/unaccent.sql.in
+33
-0
contrib/unaccent/uninstall_unaccent.sql
contrib/unaccent/uninstall_unaccent.sql
+9
-0
doc/src/sgml/contrib.sgml
doc/src/sgml/contrib.sgml
+2
-1
doc/src/sgml/filelist.sgml
doc/src/sgml/filelist.sgml
+2
-1
doc/src/sgml/unaccent.sgml
doc/src/sgml/unaccent.sgml
+150
-0
No files found.
contrib/Makefile
View file @
92e05bc6
# $PostgreSQL: pgsql/contrib/Makefile,v 1.8
8 2009/08/07 20:50:21 petere
Exp $
# $PostgreSQL: pgsql/contrib/Makefile,v 1.8
9 2009/08/18 10:34:39 teodor
Exp $
subdir
=
contrib
subdir
=
contrib
top_builddir
=
..
top_builddir
=
..
...
@@ -39,6 +39,7 @@ SUBDIRS = \
...
@@ -39,6 +39,7 @@ SUBDIRS = \
tablefunc
\
tablefunc
\
test_parser
\
test_parser
\
tsearch2
\
tsearch2
\
unaccent
\
vacuumlo
vacuumlo
ifeq
($(with_openssl),yes)
ifeq
($(with_openssl),yes)
...
...
contrib/README
View file @
92e05bc6
...
@@ -169,6 +169,10 @@ tsearch2 -
...
@@ -169,6 +169,10 @@ tsearch2 -
Pavel Stehule <pavel.stehule@gmail.com>, based on code originally by
Pavel Stehule <pavel.stehule@gmail.com>, based on code originally by
Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
unaccent -
Unaccent dictionary for text search
Teodor Sigaev <teodor@sigaev.ru> and Oleg Bartunov <oleg@sai.msu.su>.
uuid-ossp -
uuid-ossp -
UUID generation functions
UUID generation functions
by Peter Eisentraut <peter_e@gmx.net>
by Peter Eisentraut <peter_e@gmx.net>
...
...
contrib/unaccent/Makefile
0 → 100644
View file @
92e05bc6
# $PostgreSQL: pgsql/contrib/unaccent/Makefile,v 1.1 2009/08/18 10:34:39 teodor Exp $
# Build rules for the unaccent text search dictionary module.
MODULE_big = unaccent
OBJS = unaccent.o

# unaccent.sql is generated from unaccent.sql.in; the rules file is
# installed into the tsearch data directory.
DATA_built = unaccent.sql
DATA = uninstall_unaccent.sql
DATA_TSEARCH = unaccent.rules
REGRESS = unaccent

ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
# subdir must name this module's own directory (it previously said
# contrib/pg_trgm, which pointed in-tree builds at the wrong module).
subdir = contrib/unaccent
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

# Redefine REGRESS_OPTS because the regression test needs a UTF8 database
REGRESS_OPTS = --dbname=$(CONTRIB_TESTDB) --multibyte=UTF8 --no-locale
contrib/unaccent/expected/unaccent.out
0 → 100644
View file @
92e05bc6
SET client_min_messages = warning;
\set ECHO none
RESET client_min_messages;
SET client_encoding TO 'KOI8';
SELECT unaccent('foobar');
unaccent
----------
foobar
(1 row)
SELECT unaccent('L肆');
unaccent
----------
盘肆
(1 row)
SELECT unaccent('出殡');
unaccent
----------
弼殡
(1 row)
SELECT unaccent('unaccent', 'foobar');
unaccent
----------
foobar
(1 row)
SELECT unaccent('unaccent', 'L肆');
unaccent
----------
盘肆
(1 row)
SELECT unaccent('unaccent', '出殡');
unaccent
----------
弼殡
(1 row)
SELECT ts_lexize('unaccent', 'foobar');
ts_lexize
-----------
(1 row)
SELECT ts_lexize('unaccent', 'L肆');
ts_lexize
-----------
{盘肆}
(1 row)
SELECT ts_lexize('unaccent', '出殡');
ts_lexize
-----------
{弼殡}
(1 row)
contrib/unaccent/sql/unaccent.sql
0 → 100644
View file @
92e05bc6
-- Regression script for the unaccent module.

-- Load the module's install script with echo and notices suppressed.
SET client_min_messages = warning;
\set ECHO none
\i unaccent.sql
\set ECHO all
RESET client_min_messages;

-- The remaining statements are sent in the KOI8 client encoding.
-- NOTE(review): the non-ASCII literals below look garbled when this file
-- is viewed in any other encoding; presumably they are Cyrillic words that
-- exercise the rules for the Cyrillic letter "yo" -- confirm against the
-- original KOI8 bytes.  Do not re-encode this file.
SET client_encoding TO 'KOI8';

-- One-argument form.
SELECT unaccent('foobar');
SELECT unaccent('L肆');
SELECT unaccent('出殡');

-- Two-argument form with an explicit dictionary name.
SELECT unaccent('unaccent', 'foobar');
SELECT unaccent('unaccent', 'L肆');
SELECT unaccent('unaccent', '出殡');

-- Direct dictionary access through ts_lexize.
SELECT ts_lexize('unaccent', 'foobar');
SELECT ts_lexize('unaccent', 'L肆');
SELECT ts_lexize('unaccent', '出殡');
contrib/unaccent/unaccent.c
0 → 100644
View file @
92e05bc6
/*-------------------------------------------------------------------------
*
* unaccent.c
* Text search unaccent dictionary
*
* Copyright (c) 2009, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.1 2009/08/18 10:34:39 teodor Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "mb/pg_wchar.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"
PG_MODULE_MAGIC
;
/*
* Unaccent dictionary uses uncompressed suffix tree to find a
* character to replace. Each node of tree is an array of
* SuffixChar struct with length = 256 (n-th element of array
* corresponds to byte)
*/
typedef
struct
SuffixChar
{
struct
SuffixChar
*
nextChar
;
char
*
replaceTo
;
int
replacelen
;
}
SuffixChar
;
/*
 * placeChar - put str into the tree's structure, byte by byte.
 *
 * node is the root array of the (sub)tree, or NULL if it does not exist
 * yet; str/lenstr is the source byte sequence; replaceTo/replacelen is the
 * replacement to store at the end of that sequence.  Missing 256-entry
 * arrays are allocated (zero-filled) along the way.
 *
 * Returns the root array, which is newly allocated when node was NULL.
 * If a replacement is already stored for str, a WARNING is emitted and
 * the existing one is kept.
 */
static SuffixChar *
placeChar(SuffixChar *node, unsigned char *str, int lenstr,
		  char *replaceTo, int replacelen)
{
	SuffixChar **link = &node;
	SuffixChar *curnode;

	/* An empty key has no terminating byte; there is nothing to insert. */
	if (lenstr <= 0)
		return node;

	/* Descend one level per byte, creating levels that do not exist yet. */
	for (;;)
	{
		if (*link == NULL)
			*link = palloc0(sizeof(SuffixChar) * 256);

		curnode = *link + *str;

		if (lenstr == 1)
			break;

		link = &curnode->nextChar;
		str++;
		lenstr--;
	}

	if (curnode->replaceTo)
		elog(WARNING, "duplicate TO argument, use first one");
	else
	{
		/* Keep a private copy; the caller's buffer is reused per line. */
		curnode->replacelen = replacelen;
		curnode->replaceTo = palloc(replacelen);
		memcpy(curnode->replaceTo, replaceTo, replacelen);
	}

	return node;
}
/*
* initSuffixTree - create suffix tree from file. Function converts
* UTF8-encoded file into current encoding.
*/
static
SuffixChar
*
initSuffixTree
(
char
*
filename
)
{
SuffixChar
*
rootSuffixTree
=
NULL
;
MemoryContext
ccxt
=
CurrentMemoryContext
;
tsearch_readline_state
trst
;
bool
skip
;
filename
=
get_tsearch_config_filename
(
filename
,
"rules"
);
if
(
!
tsearch_readline_begin
(
&
trst
,
filename
))
ereport
(
ERROR
,
(
errcode
(
ERRCODE_CONFIG_FILE_ERROR
),
errmsg
(
"could not open unaccent file
\"
%s
\"
: %m"
,
filename
)));
do
{
char
src
[
4096
];
char
trg
[
4096
];
int
srclen
;
int
trglen
;
char
*
line
=
NULL
;
skip
=
true
;
PG_TRY
();
{
/*
* pg_do_encoding_conversion() (called by tsearch_readline())
* will emit exception if it finds untranslatable characters in current locale.
* We just skip such characters.
*/
while
((
line
=
tsearch_readline
(
&
trst
))
!=
NULL
)
{
if
(
sscanf
(
line
,
"%s
\t
%s
\n
"
,
src
,
trg
)
!=
2
)
continue
;
srclen
=
strlen
(
src
);
trglen
=
strlen
(
trg
);
rootSuffixTree
=
placeChar
(
rootSuffixTree
,
(
unsigned
char
*
)
src
,
srclen
,
trg
,
trglen
);
skip
=
false
;
pfree
(
line
);
}
}
PG_CATCH
();
{
ErrorData
*
errdata
;
MemoryContext
ecxt
;
ecxt
=
MemoryContextSwitchTo
(
ccxt
);
errdata
=
CopyErrorData
();
if
(
errdata
->
sqlerrcode
==
ERRCODE_UNTRANSLATABLE_CHARACTER
)
{
FlushErrorState
();
}
else
{
MemoryContextSwitchTo
(
ecxt
);
PG_RE_THROW
();
}
}
PG_END_TRY
();
}
while
(
skip
);
tsearch_readline_end
(
&
trst
);
return
rootSuffixTree
;
}
/*
 * findReplaceTo - look up one multibyte character in the tree.
 *
 * Follows the srclen bytes at src through the tree rooted at node and
 * returns the entry reached by the last byte.  Returns NULL if the tree
 * has no level for one of the bytes.  The returned entry may have a NULL
 * replaceTo; callers must check it.
 */
static SuffixChar *
findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
{
	SuffixChar *level = node;
	int			pos;

	for (pos = 0; level != NULL; pos++)
	{
		SuffixChar *entry = &level[src[pos]];

		/* Last byte of the character: this entry is the answer. */
		if (srclen - pos == 1)
			return entry;

		level = entry->nextChar;
	}

	return NULL;
}
PG_FUNCTION_INFO_V1(unaccent_init);
Datum		unaccent_init(PG_FUNCTION_ARGS);

/*
 * unaccent_init - dictionary template INIT function.
 *
 * Argument 0 is the list of dictionary options (DefElem nodes).  The only
 * recognized option is "Rules" (matched case-insensitively), which names
 * the rules file to load; it must appear exactly once.
 *
 * Returns the root of the lookup tree built from the rules file.
 * Raises ERRCODE_INVALID_PARAMETER_VALUE for a repeated, unknown, or
 * missing option.
 */
Datum
unaccent_init(PG_FUNCTION_ARGS)
{
	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
	/* Initialized so no path can read an indeterminate pointer. */
	SuffixChar *rootSuffixTree = NULL;
	bool		fileloaded = false;
	ListCell   *l;

	foreach(l, dictoptions)
	{
		DefElem    *defel = (DefElem *) lfirst(l);

		if (pg_strcasecmp("Rules", defel->defname) == 0)
		{
			if (fileloaded)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("multiple Rules parameters")));
			rootSuffixTree = initSuffixTree(defGetString(defel));
			fileloaded = true;
		}
		else
		{
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized Unaccent parameter: \"%s\"",
							defel->defname)));
		}
	}

	if (!fileloaded)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing Rules parameter")));
	}

	PG_RETURN_POINTER(rootSuffixTree);
}
PG_FUNCTION_INFO_V1
(
unaccent_lexize
);
Datum
unaccent_lexize
(
PG_FUNCTION_ARGS
);
Datum
unaccent_lexize
(
PG_FUNCTION_ARGS
)
{
SuffixChar
*
rootSuffixTree
=
(
SuffixChar
*
)
PG_GETARG_POINTER
(
0
);
char
*
srcchar
=
(
char
*
)
PG_GETARG_POINTER
(
1
);
int32
len
=
PG_GETARG_INT32
(
2
);
char
*
srcstart
,
*
trgchar
;
int
charlen
;
TSLexeme
*
res
=
NULL
;
SuffixChar
*
node
;
srcstart
=
srcchar
;
while
(
srcchar
-
srcstart
<
len
)
{
charlen
=
pg_mblen
(
srcchar
);
node
=
findReplaceTo
(
rootSuffixTree
,
(
unsigned
char
*
)
srcchar
,
charlen
);
if
(
node
&&
node
->
replaceTo
)
{
if
(
!
res
)
{
/* allocate res only it it's needed */
res
=
palloc0
(
sizeof
(
TSLexeme
)
*
2
);
res
->
lexeme
=
trgchar
=
palloc
(
len
*
pg_database_encoding_max_length
()
+
1
/* \0 */
);
res
->
flags
=
TSL_FILTER
;
if
(
srcchar
!=
srcstart
)
{
memcpy
(
trgchar
,
srcstart
,
srcchar
-
srcstart
);
trgchar
+=
(
srcchar
-
srcstart
);
}
}
memcpy
(
trgchar
,
node
->
replaceTo
,
node
->
replacelen
);
trgchar
+=
node
->
replacelen
;
}
else
if
(
res
)
{
memcpy
(
trgchar
,
srcchar
,
charlen
);
trgchar
+=
charlen
;
}
srcchar
+=
charlen
;
}
if
(
res
)
*
trgchar
=
'\0'
;
PG_RETURN_POINTER
(
res
);
}
/*
 * unaccent_dict - SQL-callable wrapper around an unaccent dictionary.
 *
 * Called with one argument (text), the dictionary named "unaccent" is
 * looked up; called with two, argument 0 is the dictionary OID and
 * argument 1 the text.  The dictionary's lexize function is applied to
 * the whole string.
 *
 * Returns the rewritten text, or a copy of the input when the dictionary
 * reports no change.
 */
PG_FUNCTION_INFO_V1(unaccent_dict);
Datum		unaccent_dict(PG_FUNCTION_ARGS);

Datum
unaccent_dict(PG_FUNCTION_ARGS)
{
	int			textArgNo = (PG_NARGS() == 1) ? 0 : 1;
	Oid			dictId;
	text	   *input;
	TSDictionaryCacheEntry *entry;
	TSLexeme   *lexemes;
	text	   *result;

	if (textArgNo == 0)
		dictId = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"),
									   false);
	else
		dictId = PG_GETARG_OID(0);

	input = PG_GETARG_TEXT_P(textArgNo);
	entry = lookup_ts_dictionary_cache(dictId);

	lexemes = (TSLexeme *)
		DatumGetPointer(FunctionCall4(&(entry->lexize),
									  PointerGetDatum(entry->dictData),
									  PointerGetDatum(VARDATA(input)),
									  Int32GetDatum(VARSIZE(input) - VARHDRSZ),
									  PointerGetDatum(NULL)));

	PG_FREE_IF_COPY(input, textArgNo);

	if (lexemes != NULL && lexemes->lexeme != NULL)
	{
		/* The dictionary rewrote the string: hand back its output. */
		result = cstring_to_text(lexemes->lexeme);
		pfree(lexemes->lexeme);
		pfree(lexemes);
	}
	else
	{
		/* Nothing changed: return a copy of the original argument. */
		if (lexemes != NULL)
			pfree(lexemes);
		result = PG_GETARG_TEXT_P_COPY(textArgNo);
	}

	PG_RETURN_TEXT_P(result);
}
contrib/unaccent/unaccent.rules
0 → 100644
View file @
92e05bc6
À A
Á A
 A
à A
Ä A
Å A
Æ A
à a
á a
â a
ã a
ä a
å a
æ a
Ā A
ā a
Ă A
ă a
Ą A
ą a
Ç C
ç c
Ć C
ć c
Ĉ C
ĉ c
Ċ C
ċ c
Č C
č c
Ď D
ď d
Đ D
đ d
È E
É E
Ê E
Ë E
è e
é e
ê e
ë e
Ē E
ē e
Ĕ E
ĕ e
Ė E
ė e
Ę E
ę e
Ě E
ě e
Ĝ G
ĝ g
Ğ G
ğ g
Ġ G
ġ g
Ģ G
ģ g
Ĥ H
ĥ h
Ħ H
ħ h
Ĩ I
Ì I
Í I
Î I
Ï I
ì i
í i
î i
ï i
ĩ i
Ī I
ī i
Ĭ I
ĭ i
Į I
į i
İ I
ı i
IJ I
ij i
Ĵ J
ĵ j
Ķ K
ķ k
ĸ k
Ĺ L
ĺ l
Ļ L
ļ l
Ľ L
ľ l
Ŀ L
ŀ l
Ł L
ł l
Ñ N
ñ n
Ń N
ń n
Ņ N
ņ n
Ň N
ň n
ʼn n
Ŋ N
ŋ n
Ò O
Ó O
Ô O
Õ O
Ö O
ò o
ó o
ô o
õ o
ö o
Ō O
ō o
Ŏ O
ŏ o
Ő O
ő o
Œ E
œ e
Ø O
ø o
Ŕ R
ŕ r
Ŗ R
ŗ r
Ř R
ř r
ß S
Ś S
ś s
Ŝ S
ŝ s
Ş S
ş s
Š S
š s
Ţ T
ţ t
Ť T
ť t
Ŧ T
ŧ t
Ù U
Ú U
Û U
Ü U
ù u
ú u
û u
ü u
Ũ U
ũ u
Ū U
ū u
Ŭ U
ŭ u
Ů U
ů u
Ű U
ű u
Ų U
ų u
Ŵ W
ŵ w
Ý Y
ý y
ÿ y
Ŷ Y
ŷ y
Ÿ Y
Ź Z
ź z
Ż Z
ż z
Ž Z
ž z
ё е
Ё Е
contrib/unaccent/unaccent.sql.in
0 → 100644
View file @
92e05bc6
/* $PostgreSQL: pgsql/contrib/unaccent/unaccent.sql.in,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
-- unaccent(dictionary, text): apply the given dictionary to a string.
-- Shares the C entry point unaccent_dict with the one-argument form.
CREATE OR REPLACE FUNCTION unaccent(regdictionary, text)
RETURNS text
AS 'MODULE_PATHNAME', 'unaccent_dict'
LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
-- unaccent(text): same entry point, called with a single argument.
CREATE OR REPLACE FUNCTION unaccent(text)
RETURNS text
AS 'MODULE_PATHNAME', 'unaccent_dict'
LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
-- Support functions referenced by the text search template below.
CREATE OR REPLACE FUNCTION unaccent_init(internal)
RETURNS internal
AS 'MODULE_PATHNAME', 'unaccent_init'
LANGUAGE C;
CREATE OR REPLACE FUNCTION unaccent_lexize(internal,internal,internal,internal)
RETURNS internal
AS 'MODULE_PATHNAME', 'unaccent_lexize'
LANGUAGE C;
-- Template wiring the init and lexize functions together.
CREATE TEXT SEARCH TEMPLATE unaccent (
INIT = unaccent_init,
LEXIZE = unaccent_lexize
);
-- Default dictionary built on the template, using the 'unaccent' rules file.
CREATE TEXT SEARCH DICTIONARY unaccent (
TEMPLATE = unaccent,
RULES = 'unaccent'
);
contrib/unaccent/uninstall_unaccent.sql
0 → 100644
View file @
92e05bc6
/* $PostgreSQL: pgsql/contrib/unaccent/uninstall_unaccent.sql,v 1.1 2009/08/18 10:34:39 teodor Exp $ */
-- Remove the SQL-callable wrapper functions.
DROP FUNCTION IF EXISTS unaccent(regdictionary, text) CASCADE;
DROP FUNCTION IF EXISTS unaccent(text) CASCADE;

-- Remove the dictionary, then the template it is built on.
DROP TEXT SEARCH DICTIONARY IF EXISTS unaccent CASCADE;
DROP TEXT SEARCH TEMPLATE IF EXISTS unaccent CASCADE;

-- Finally remove the support functions the template referenced.
DROP FUNCTION IF EXISTS unaccent_init(internal) CASCADE;
DROP FUNCTION IF EXISTS unaccent_lexize(internal, internal, internal, internal) CASCADE;
doc/src/sgml/contrib.sgml
View file @
92e05bc6
<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.1
3 2009/04/27 16:27:35 momjian
Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.1
4 2009/08/18 10:34:39 teodor
Exp $ -->
<appendix id="contrib">
<appendix id="contrib">
<title>Additional Supplied Modules</title>
<title>Additional Supplied Modules</title>
...
@@ -113,6 +113,7 @@ psql -d dbname -f <replaceable>SHAREDIR</>/contrib/<replaceable>module</>.sql
...
@@ -113,6 +113,7 @@ psql -d dbname -f <replaceable>SHAREDIR</>/contrib/<replaceable>module</>.sql
&tablefunc;
&tablefunc;
&test-parser;
&test-parser;
&tsearch2;
&tsearch2;
&unaccent;
&uuid-ossp;
&uuid-ossp;
&vacuumlo;
&vacuumlo;
&xml2;
&xml2;
...
...
doc/src/sgml/filelist.sgml
View file @
92e05bc6
<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.6
3 2009/08/17 22:14:44 petere
Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.6
4 2009/08/18 10:34:39 teodor
Exp $ -->
<!entity history SYSTEM "history.sgml">
<!entity history SYSTEM "history.sgml">
<!entity info SYSTEM "info.sgml">
<!entity info SYSTEM "info.sgml">
...
@@ -126,6 +126,7 @@
...
@@ -126,6 +126,7 @@
<!entity tablefunc SYSTEM "tablefunc.sgml">
<!entity tablefunc SYSTEM "tablefunc.sgml">
<!entity test-parser SYSTEM "test-parser.sgml">
<!entity test-parser SYSTEM "test-parser.sgml">
<!entity tsearch2 SYSTEM "tsearch2.sgml">
<!entity tsearch2 SYSTEM "tsearch2.sgml">
<!entity unaccent SYSTEM "unaccent.sgml">
<!entity uuid-ossp SYSTEM "uuid-ossp.sgml">
<!entity uuid-ossp SYSTEM "uuid-ossp.sgml">
<!entity vacuumlo SYSTEM "vacuumlo.sgml">
<!entity vacuumlo SYSTEM "vacuumlo.sgml">
<!entity xml2 SYSTEM "xml2.sgml">
<!entity xml2 SYSTEM "xml2.sgml">
...
...
doc/src/sgml/unaccent.sgml
0 → 100644
View file @
92e05bc6
<sect1 id="unaccent">
<title>unaccent</title>
<indexterm zone="unaccent">
<primary>unaccent</primary>
</indexterm>
<para>
<filename>unaccent</> removes accents (diacritic signs) from a lexeme.
It is a filtering dictionary, which means that its output is
always passed to the next dictionary (if any), contrary to the standard
behaviour. Currently, it supports the most important accents from European
languages.
</para>
<para>
Limitation: The current implementation of the <filename>unaccent</>
dictionary cannot be used as a normalizing dictionary for the
<filename>thesaurus</filename> dictionary.
</para>
<sect2>
<title>Configuration</title>
<para>
An <literal>unaccent</> dictionary accepts the following options:
</para>
<itemizedlist>
<listitem>
<para>
<literal>RULES</> is the base name of the file containing the list of
translation rules. This file must be stored in
<filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means
the <productname>PostgreSQL</> installation's shared-data directory).
Its name must end in <literal>.rules</> (which is not to be included in
the <literal>RULES</> parameter).
</para>
</listitem>
</itemizedlist>
<para>
The rules file has the following format:
</para>
<itemizedlist>
<listitem>
<para>
Each line represents a pair: character_with_accent character_without_accent
<programlisting>
À A
Á A
 A
à A
Ä A
Å A
Æ A
</programlisting>
</para>
</listitem>
</itemizedlist>
<para>
Look at <filename>unaccent.rules</>, which is installed in
<filename>$SHAREDIR/tsearch_data/</>, for an example.
</para>
</sect2>
<sect2>
<title>Usage</title>
<para>
Running the installation script creates a text search template
<literal>unaccent</> and a dictionary <literal>unaccent</>
based on it, with default parameters. You can alter the
parameters, for example
<programlisting>
=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules');
</programlisting>
or create new dictionaries based on the template.
</para>
<para>
To test the dictionary, you can try
<programlisting>
=# select ts_lexize('unaccent','Hôtel');
ts_lexize
-----------
{Hotel}
(1 row)
</programlisting>
</para>
<para>
Filtering dictionaries are useful for the correct operation of the
<function>ts_headline</function> function.
<programlisting>
=# CREATE TEXT SEARCH CONFIGURATION fr ( COPY = french );
=# ALTER TEXT SEARCH CONFIGURATION fr
ALTER MAPPING FOR hword, hword_part, word
WITH unaccent, french_stem;
=# select to_tsvector('fr','Hôtels de la Mer');
to_tsvector
-------------------
'hotel':1 'mer':4
(1 row)
=# select to_tsvector('fr','Hôtel de la Mer') @@ to_tsquery('fr','Hotels');
?column?
----------
t
(1 row)
=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels'));
ts_headline
------------------------
<b>Hôtel</b>de la Mer
(1 row)
</programlisting>
</para>
</sect2>
<sect2>
<title>Function</title>
<para>
The <function>unaccent</> function removes accents (diacritic signs) from
the argument string. Basically, it's a wrapper around the
<filename>unaccent</> dictionary.
</para>
<indexterm>
<primary>unaccent</primary>
</indexterm>
<synopsis>
unaccent(<optional><replaceable class="PARAMETER">dictionary</replaceable>,
</optional> <replaceable class="PARAMETER">string</replaceable>)
returns <type>text</type>
</synopsis>
<para>
<programlisting>
SELECT unaccent('unaccent','Hôtel');
SELECT unaccent('Hôtel');
</programlisting>
</para>
</sect2>
</sect1>
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment