Commit 22505f47 authored by Teodor Sigaev's avatar Teodor Sigaev

Add thesaurus dictionary which can replace N>0 lexemes by M>0 lexemes.

It required some changes in lexize algorithm, but interface with
dictionaries stays compatible with old dictionaries.

Funded by Georgia Public Library Service and LibLime, Inc.
parent 3b7ed9ba
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.14 2006/05/02 11:28:54 teodor Exp $
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.15 2006/05/31 14:05:31 teodor Exp $
MODULE_big = tsearch2
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
dict_snowball.o dict_ispell.o dict_syn.o \
dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o \
wparser.o wparser_def.o \
ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
tsvector_op.o rank.o ts_stat.o \
query_util.o query_support.o query_rewrite.o query_gist.o \
ts_locale.o ginidx.o
ts_locale.o ts_lexize.o ginidx.o
SUBDIRS := snowball ispell wordparser
SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o)
......@@ -16,7 +16,7 @@ OBJS += $(SUBDIROBJS)
PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser
DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8
DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8 thesaurus
DATA_built = tsearch2.sql untsearch2.sql
DOCS = README.tsearch2
REGRESS = tsearch2
......
......@@ -5,6 +5,7 @@
#include "catalog/pg_proc.h"
#include "catalog/pg_namespace.h"
#include "utils/syscache.h"
#include "miscadmin.h"
#include "ts_cfg.h"
#include "dict.h"
......@@ -163,3 +164,23 @@ get_oidnamespace(Oid funcoid)
return nspoid;
}
/* if path is relative, take it as relative to share dir */
char *
to_absfilename(char *filename) {
if (!is_absolute_path(filename)) {
char sharepath[MAXPGPATH];
char *absfn;
#ifdef WIN32
char delim = '\\';
#else
char delim = '/';
#endif
get_share_path(my_exec_path, sharepath);
absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
sprintf(absfn, "%s%c%s", sharepath, delim, filename);
filename = absfn;
}
return filename;
}
......@@ -16,6 +16,8 @@ text *mtextdup(text *in);
int text_cmp(text *a, text *b);
char * to_absfilename(char *filename);
#define NEXTVAL(x) ( (text*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) )
#define ARRNELEMS(x) ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))
......
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.11 2006/03/11 04:38:30 momjian Exp $ */
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.12 2006/05/31 14:05:31 teodor Exp $ */
/*
* interface functions to dictionary
......@@ -50,16 +50,19 @@ init_dict(Oid id, DictInfo * dict)
Datum opt;
Oid oid = InvalidOid;
/* setup dictlexize method */
oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
if (isnull || oid == InvalidOid)
ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
/* setup and call dictinit method, optinally */
oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull));
if (!(isnull || oid == InvalidOid))
{
opt = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 2, &isnull);
dict->dictionary = (void *) DatumGetPointer(OidFunctionCall1(oid, opt));
}
oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
if (isnull || oid == InvalidOid)
ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
dict->dict_id = id;
}
else
......@@ -98,6 +101,29 @@ comparedict(const void *a, const void *b)
return (((DictInfo *) a)->dict_id < ((DictInfo *) b)->dict_id) ? -1 : 1;
}
static void
insertdict(Oid id) {
DictInfo newdict;
if (DList.len == DList.reallen)
{
DictInfo *tmp;
int reallen = (DList.reallen) ? 2 * DList.reallen : 16;
tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
if (!tmp)
ts_error(ERROR, "No memory");
DList.reallen = reallen;
DList.list = tmp;
}
init_dict(id, &newdict);
DList.list[DList.len] = newdict;
DList.len++;
qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
}
DictInfo *
finddict(Oid id)
{
......@@ -117,23 +143,8 @@ finddict(Oid id)
return DList.last_dict;
}
/* last chance */
if (DList.len == DList.reallen)
{
DictInfo *tmp;
int reallen = (DList.reallen) ? 2 * DList.reallen : 16;
tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
if (!tmp)
ts_error(ERROR, "No memory");
DList.reallen = reallen;
DList.list = tmp;
}
DList.last_dict = &(DList.list[DList.len]);
init_dict(id, DList.last_dict);
DList.len++;
qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
/* insert new dictionary */
insertdict(id);
return finddict(id); /* qsort changed order!! */ ;
}
......@@ -190,17 +201,32 @@ lexize(PG_FUNCTION_ARGS)
*ptr;
Datum *da;
ArrayType *a;
DictSubState dstate = { false, false, NULL };
SET_FUNCOID();
dict = finddict(PG_GETARG_OID(0));
ptr = res = (TSLexeme *) DatumGetPointer(
FunctionCall3(&(dict->lexize_info),
FunctionCall4(&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
PointerGetDatum(VARDATA(in)),
Int32GetDatum(VARSIZE(in) - VARHDRSZ),
PointerGetDatum(&dstate)
)
);
if (dstate.getnext) {
dstate.isend = true;
ptr = res = (TSLexeme *) DatumGetPointer(
FunctionCall4(&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
PointerGetDatum(VARDATA(in)),
Int32GetDatum(VARSIZE(in) - VARHDRSZ)
Int32GetDatum(VARSIZE(in) - VARHDRSZ),
PointerGetDatum(&dstate)
)
);
}
PG_FREE_IF_COPY(in, 1);
if (!res)
{
......
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.6 2006/03/11 04:38:30 momjian Exp $ */
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.7 2006/05/31 14:05:31 teodor Exp $ */
#ifndef __DICT_H__
#define __DICT_H__
#include "postgres.h"
#include "fmgr.h"
#include "ts_cfg.h"
typedef struct
{
......@@ -29,6 +30,11 @@ DictInfo *finddict(Oid id);
Oid name2id_dict(text *name);
void reset_dict(void);
typedef struct {
bool isend; /* in: marks for lexize_info about text end is reached */
bool getnext; /* out: dict wants next lexeme */
void *private; /* internal dict state between calls with getnext == true */
} DictSubState;
/* simple parser of cfg string */
typedef struct
......@@ -45,17 +51,61 @@ typedef struct
/*
* number of variant of split word , for example Word 'fotballklubber'
* (norwegian) has two varian to split: ( fotball, klubb ) and ( fot,
* ball, klubb ). So, dictionary should return: nvariant lexeme 1
* fotball 1 klubb 2 fot 2 ball 2 klubb
*
* ball, klubb ). So, dictionary should return:
* nvariant lexeme
* 1 fotball
* 1 klubb
* 2 fot
* 2 ball
* 2 klubb
*/
uint16 nvariant;
/* currently unused */
uint16 flags;
/* C-string */
char *lexeme;
} TSLexeme;
#define TSL_ADDPOS 0x01
/*
* Lexize subsystem
*/
typedef struct ParsedLex {
int type;
char *lemm;
int lenlemm;
bool resfollow;
struct ParsedLex *next;
} ParsedLex;
typedef struct ListParsedLex {
ParsedLex *head;
ParsedLex *tail;
} ListParsedLex;
typedef struct {
TSCfgInfo *cfg;
Oid curDictId;
int posDict;
DictSubState dictState;
ParsedLex *curSub;
ListParsedLex towork; /* current list to work */
ListParsedLex waste; /* list of lexemes that already lexized */
/* fields to store last variant to lexize (basically, thesaurus
or similar to, which wants several lexemes */
ParsedLex *lastRes;
TSLexeme *tmpRes;
} LexizeData;
void LexizeInit(LexizeData *ld, TSCfgInfo *cfg);
void LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm);
TSLexeme* LexizeExec(LexizeData *ld, ParsedLex **correspondLexem);
#endif
This diff is collapsed.
......@@ -4,21 +4,21 @@
--
\set ECHO none
psql:tsearch2.sql:13: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_dict_pkey" for table "pg_ts_dict"
psql:tsearch2.sql:158: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
psql:tsearch2.sql:257: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
psql:tsearch2.sql:264: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
psql:tsearch2.sql:370: NOTICE: type "tsvector" is not yet defined
psql:tsearch2.sql:177: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
psql:tsearch2.sql:276: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
psql:tsearch2.sql:283: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
psql:tsearch2.sql:389: NOTICE: type "tsvector" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:375: NOTICE: argument type tsvector is only a shell
psql:tsearch2.sql:429: NOTICE: type "tsquery" is not yet defined
psql:tsearch2.sql:394: NOTICE: argument type tsvector is only a shell
psql:tsearch2.sql:448: NOTICE: type "tsquery" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:434: NOTICE: argument type tsquery is only a shell
psql:tsearch2.sql:592: NOTICE: type "gtsvector" is not yet defined
psql:tsearch2.sql:453: NOTICE: argument type tsquery is only a shell
psql:tsearch2.sql:611: NOTICE: type "gtsvector" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:597: NOTICE: argument type gtsvector is only a shell
psql:tsearch2.sql:1087: NOTICE: type "gtsq" is not yet defined
psql:tsearch2.sql:616: NOTICE: argument type gtsvector is only a shell
psql:tsearch2.sql:1106: NOTICE: type "gtsq" is not yet defined
DETAIL: Creating a shell type definition.
psql:tsearch2.sql:1092: NOTICE: argument type gtsq is only a shell
psql:tsearch2.sql:1111: NOTICE: argument type gtsq is only a shell
--tsvector
SELECT '1'::tsvector;
tsvector
......
......@@ -4,8 +4,6 @@
*/
#include "postgres.h"
#include "miscadmin.h"
#include "common.h"
#include "dict.h"
#include "ts_locale.h"
......@@ -36,30 +34,11 @@ readstoplist(text *in, StopList * s)
s->len = 0;
if (in && VARSIZE(in) - VARHDRSZ > 0)
{
char *filename = text2char(in);
char *filename = to_absfilename(text2char(in));
FILE *hin;
char buf[STOPBUFLEN];
int reallen = 0;
/* if path is relative, take it as relative to share dir */
if (!is_absolute_path(filename))
{
char sharepath[MAXPGPATH];
char *absfn;
#ifdef WIN32
char delim = '\\';
#else
char delim = '/';
#endif
get_share_path(my_exec_path, sharepath);
absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
sprintf(absfn, "%s%c%s", sharepath, delim, filename);
pfree(filename);
filename = absfn;
}
if ((hin = fopen(filename, "r")) == NULL)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
......
#
# Theasurus config file. Character ':' splits
# string to part:
# to be substituted string
# substituting string
#
#one two three : 123
#one two : 12
#one : 1
#two : 2
#foo bar : blah blah
#f bar : fbar
#e bar : ebar
#g bar bar : gbarbar
#asd:sdffff
#qwerty:qwer wert erty
......@@ -281,15 +281,15 @@ name2id_cfg(text *name)
return id;
}
void
parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
{
int type,
lenlemm,
i;
lenlemm;
char *lemm = NULL;
WParserInfo *prsobj = findprs(cfg->prs_id);
LexizeData ldata;
TSLexeme *norms;
prsobj->prs = (void *) DatumGetPointer(
FunctionCall2(
......@@ -299,14 +299,16 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
)
);
while ((type = DatumGetInt32(FunctionCall3(
LexizeInit(&ldata, cfg);
do {
type = DatumGetInt32(FunctionCall3(
&(prsobj->getlexeme_info),
PointerGetDatum(prsobj->prs),
PointerGetDatum(&lemm),
PointerGetDatum(&lenlemm)))) != 0)
{
PointerGetDatum(&lenlemm)));
if (lenlemm >= MAXSTRLEN)
if (type>0 && lenlemm >= MAXSTRLEN)
{
#ifdef IGNORE_LONGLEXEME
ereport(NOTICE,
......@@ -320,25 +322,11 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
#endif
}
if (type >= cfg->len) /* skip this type of lexeme */
continue;
LexizeAddLemm(&ldata, type, lemm, lenlemm);
for (i = 0; i < cfg->map[type].len; i++)
while( (norms = LexizeExec(&ldata, NULL)) != NULL )
{
DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
TSLexeme *norms,
*ptr;
norms = ptr = (TSLexeme *) DatumGetPointer(
FunctionCall3(
&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
PointerGetDatum(lemm),
PointerGetDatum(lenlemm)
)
);
if (!norms) /* dictionary doesn't know this lexeme */
continue;
TSLexeme *ptr = norms;
prs->pos++; /* set pos */
......@@ -350,6 +338,8 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD));
}
if ( ptr->flags & TSL_ADDPOS )
prs->pos++;
prs->words[prs->curwords].len = strlen(ptr->lexeme);
prs->words[prs->curwords].word = ptr->lexeme;
prs->words[prs->curwords].nvariant = ptr->nvariant;
......@@ -359,9 +349,8 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
prs->curwords++;
}
pfree(norms);
break; /* lexeme already normalized or is stop word */
}
}
} while(type>0);
FunctionCall1(
&(prsobj->end_info),
......@@ -417,14 +406,47 @@ hlfinditem(HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int buflen)
}
}
static void
addHLParsedLex(HLPRSTEXT *prs, QUERYTYPE * query, ParsedLex *lexs, TSLexeme *norms) {
ParsedLex *tmplexs;
TSLexeme *ptr;
while( lexs ) {
if ( lexs->type > 0 )
hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
ptr = norms;
while( ptr && ptr->lexeme ) {
hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
ptr++;
}
tmplexs = lexs->next;
pfree( lexs );
lexs = tmplexs;
}
if ( norms ) {
ptr = norms;
while( ptr->lexeme ) {
pfree( ptr->lexeme );
ptr++;
}
pfree(norms);
}
}
void
hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 buflen)
{
int type,
lenlemm,
i;
lenlemm;
char *lemm = NULL;
WParserInfo *prsobj = findprs(cfg->prs_id);
LexizeData ldata;
TSLexeme *norms;
ParsedLex *lexs;
prsobj->prs = (void *) DatumGetPointer(
FunctionCall2(
......@@ -434,14 +456,16 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
)
);
while ((type = DatumGetInt32(FunctionCall3(
LexizeInit(&ldata, cfg);
do {
type = DatumGetInt32(FunctionCall3(
&(prsobj->getlexeme_info),
PointerGetDatum(prsobj->prs),
PointerGetDatum(&lemm),
PointerGetDatum(&lenlemm)))) != 0)
{
PointerGetDatum(&lenlemm)));
if (lenlemm >= MAXSTRLEN)
if (type>0 && lenlemm >= MAXSTRLEN)
{
#ifdef IGNORE_LONGLEXEME
ereport(NOTICE,
......@@ -455,38 +479,16 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
#endif
}
hladdword(prs, lemm, lenlemm, type);
LexizeAddLemm(&ldata, type, lemm, lenlemm);
if (type >= cfg->len)
continue;
do {
if ( (norms = LexizeExec(&ldata,&lexs)) != NULL )
addHLParsedLex(prs, query, lexs, norms);
else
addHLParsedLex(prs, query, lexs, NULL);
} while( norms );
for (i = 0; i < cfg->map[type].len; i++)
{
DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
TSLexeme *norms,
*ptr;
norms = ptr = (TSLexeme *) DatumGetPointer(
FunctionCall3(
&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
PointerGetDatum(lemm),
PointerGetDatum(lenlemm)
)
);
if (!norms) /* dictionary doesn't know this lexeme */
continue;
while (ptr->lexeme)
{
hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
pfree(ptr->lexeme);
ptr++;
}
pfree(norms);
break; /* lexeme already normalized or is stop word */
}
}
} while( type>0 );
FunctionCall1(
&(prsobj->end_info),
......
/*
* lexize stream of lexemes
* Teodor Sigaev <teodor@sigaev.ru>
*/
#include "postgres.h"
#include <ctype.h>
#include <locale.h>
#include "ts_cfg.h"
#include "dict.h"
void
LexizeInit(LexizeData *ld, TSCfgInfo *cfg) {
ld->cfg = cfg;
ld->curDictId = InvalidOid;
ld->posDict = 0;
ld->towork.head = ld->towork.tail = ld->curSub = NULL;
ld->waste.head = ld->waste.tail = NULL;
ld->lastRes=NULL;
ld->tmpRes=NULL;
}
static void
LPLAddTail(ListParsedLex *list, ParsedLex *newpl) {
if ( list->tail ) {
list->tail->next = newpl;
list->tail = newpl;
} else
list->head = list->tail = newpl;
newpl->next = NULL;
}
static ParsedLex*
LPLRemoveHead(ListParsedLex *list) {
ParsedLex *res = list->head;
if ( list->head )
list->head = list->head->next;
if ( list->head == NULL )
list->tail = NULL;
return res;
}
void
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm) {
ParsedLex *newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );
newpl = (ParsedLex*)palloc( sizeof(ParsedLex) );
newpl->type = type;
newpl->lemm = lemm;
newpl->lenlemm = lenlemm;
LPLAddTail(&ld->towork, newpl);
ld->curSub = ld->towork.tail;
}
static void
RemoveHead(LexizeData *ld) {
LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
ld->posDict = 0;
}
static void
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem) {
if ( correspondLexem ) {
*correspondLexem = ld->waste.head;
} else {
ParsedLex *tmp, *ptr = ld->waste.head;
while(ptr) {
tmp = ptr->next;
pfree(ptr);
ptr = tmp;
}
}
ld->waste.head = ld->waste.tail = NULL;
}
static void
moveToWaste(LexizeData *ld, ParsedLex *stop) {
bool go = true;
while( ld->towork.head && go) {
if (ld->towork.head == stop) {
ld->curSub = stop->next;
go = false;
}
RemoveHead(ld);
}
}
static void
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res) {
if ( ld->tmpRes ) {
TSLexeme *ptr;
for( ptr=ld->tmpRes; ptr->lexeme; ptr++ )
pfree( ptr->lexeme );
pfree( ld->tmpRes );
}
ld->tmpRes = res;
ld->lastRes = lex;
}
TSLexeme*
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem) {
int i;
ListDictionary *map;
DictInfo *dict;
TSLexeme *res;
if ( ld->curDictId == InvalidOid ) {
/*
* usial mode: dictionary wants only one word,
* but we should keep in mind that we should go through
* all stack
*/
while( ld->towork.head ) {
ParsedLex *curVal = ld->towork.head;
map = ld->cfg->map + curVal->type;
if (curVal->type == 0 || curVal->type >= ld->cfg->len || map->len == 0 ) {
/* skip this type of lexeme */
RemoveHead(ld);
continue;
}
for (i = ld->posDict; i < map->len; i++) {
dict = finddict(DatumGetObjectId(map->dict_id[i]));
ld->dictState.isend = ld->dictState.getnext = false;
ld->dictState.private = NULL;
res = (TSLexeme *) DatumGetPointer( FunctionCall4(
&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
PointerGetDatum(curVal->lemm),
Int32GetDatum(curVal->lenlemm),
PointerGetDatum(&ld->dictState)
));
if ( ld->dictState.getnext ) {
/*
* dictinary wants next word, so setup and store
* current position and go to multiword mode
*/
ld->curDictId = DatumGetObjectId(map->dict_id[i]);
ld->posDict = i+1;
ld->curSub = curVal->next;
if ( res )
setNewTmpRes(ld, curVal, res);
return LexizeExec(ld, correspondLexem);
}
if (!res) /* dictionary doesn't know this lexeme */
continue;
RemoveHead(ld);
setCorrLex(ld, correspondLexem);
return res;
}
RemoveHead(ld);
}
} else { /* curDictId is valid */
dict = finddict(ld->curDictId);
/*
* Dictionary ld->curDictId asks us about following words
*/
while( ld->curSub ) {
ParsedLex *curVal = ld->curSub;
map = ld->cfg->map + curVal->type;
if (curVal->type != 0) {
bool dictExists = false;
if (curVal->type >= ld->cfg->len || map->len == 0 ) {
/* skip this type of lexeme */
ld->curSub = curVal->next;
continue;
}
/*
* We should be sure that current type of lexeme is recognized by
* our dictinonary: we just check is it exist in
* list of dictionaries ?
*/
for(i=0;i < map->len && !dictExists; i++)
if ( ld->curDictId == DatumGetObjectId(map->dict_id[i]) )
dictExists = true;
if ( !dictExists ) {
/*
* Dictionary can't work with current tpe of lexeme,
* return to basic mode and redo all stored lexemes
*/
ld->curDictId = InvalidOid;
return LexizeExec(ld, correspondLexem);
}
}
ld->dictState.isend = (curVal->type==0) ? true : false;
ld->dictState.getnext = false;
res = (TSLexeme *) DatumGetPointer( FunctionCall4(
&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
PointerGetDatum(curVal->lemm),
Int32GetDatum(curVal->lenlemm),
PointerGetDatum(&ld->dictState)
));
if ( ld->dictState.getnext ) {
/* Dictionary wants one more */
ld->curSub = curVal->next;
if ( res )
setNewTmpRes(ld, curVal, res);
continue;
}
if ( res || ld->tmpRes ) {
/*
* Dictionary normalizes lexemes,
* so we remove from stack all used lexemes ,
* return to basic mode and redo end of stack (if it exists)
*/
if ( res ) {
moveToWaste( ld, ld->curSub );
} else {
res = ld->tmpRes;
moveToWaste( ld, ld->lastRes );
}
/* reset to initial state */
ld->curDictId = InvalidOid;
ld->posDict = 0;
ld->lastRes = NULL;
ld->tmpRes = NULL;
setCorrLex(ld, correspondLexem);
return res;
}
/* Dict don't want next lexem and didn't recognize anything,
redo from ld->towork.head */
ld->curDictId = InvalidOid;
return LexizeExec(ld, correspondLexem);
}
}
setCorrLex(ld, correspondLexem);
return NULL;
}
......@@ -146,6 +146,25 @@ insert into pg_ts_dict select
'Example of synonym dictionary'
;
CREATE FUNCTION thesaurus_init(internal)
RETURNS internal
as 'MODULE_PATHNAME'
LANGUAGE C;
CREATE FUNCTION thesaurus_lexize(internal,internal,int4,internal)
RETURNS internal
as 'MODULE_PATHNAME'
LANGUAGE C
RETURNS NULL ON NULL INPUT;
insert into pg_ts_dict select
'thesaurus_template',
'thesaurus_init(internal)',
null,
'thesaurus_lexize(internal,internal,int4,internal)',
'Thesaurus template, must be pointed Dictionary and DictFile'
;
--dict conf
CREATE TABLE pg_ts_parser (
prs_name text not null primary key,
......@@ -1193,7 +1212,11 @@ AS
--example of ISpell dictionary
--update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template';
--example of synonym dict
--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5;
--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_name='synonym';
--example of thesaurus dict
--update pg_ts_dict set dict_initoption='DictFile="contrib/thesaurus", Dictionary="en_stem"' where dict_name='thesaurus_template';
--update pg_ts_cfgmap set dict_name = '{thesaurus_template,en_stem}' where dict_name = '{en_stem}';
END;
......@@ -41,6 +41,8 @@ DROP FUNCTION snb_lexize(internal,internal,int4);
DROP FUNCTION snb_ru_init(internal);
DROP FUNCTION spell_init(internal);
DROP FUNCTION spell_lexize(internal,internal,int4);
DROP FUNCTION thesaurus_init(internal);
DROP FUNCTION thesaurus_lexize(internal,internal,int4);
DROP FUNCTION syn_init(internal);
DROP FUNCTION syn_lexize(internal,internal,int4);
DROP FUNCTION set_curprs(int);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment