Commit 324300bc authored by Teodor Sigaev

improve support of agglutinative languages (query with compound words).

regression=# select to_tsquery( '\'fotballklubber\'');
                   to_tsquery
------------------------------------------------
 'fotball' & 'klubb' | 'fot' & 'ball' & 'klubb'
(1 row)

So the dictionary interface has changed: the lexize method of a dictionary
should now return a pointer to an array of TSLexeme structs instead of char **.
The last element of the array must have TSLexeme->lexeme == NULL.

typedef struct {
        /* number of the split variant this lexeme belongs to, for example:
                the word 'fotballklubber' (Norwegian) has two variants of splitting:
                ( fotball, klubb ) and ( fot, ball, klubb ). So the dictionary
                should return:
                nvariant        lexeme
                1               fotball
                1               klubb
                2               fot
                2               ball
                2               klubb

        */
        uint16  nvariant;

        /* currently unused */
        uint16  flags;

        /* C-string */
        char    *lexeme;
} TSLexeme;
parent d314616d
......@@ -183,15 +183,15 @@ lexize(PG_FUNCTION_ARGS)
{
text *in = PG_GETARG_TEXT_P(1);
DictInfo *dict;
char **res,
**ptr;
TSLexeme *res,
*ptr;
Datum *da;
ArrayType *a;
SET_FUNCOID();
dict = finddict(PG_GETARG_OID(0));
ptr = res = (char **) DatumGetPointer(
ptr = res = (TSLexeme *) DatumGetPointer(
FunctionCall3(&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
PointerGetDatum(VARDATA(in)),
......@@ -207,13 +207,13 @@ lexize(PG_FUNCTION_ARGS)
PG_RETURN_NULL();
}
while (*ptr)
while (ptr->lexeme)
ptr++;
da = (Datum *) palloc(sizeof(Datum) * (ptr - res + 1));
ptr = res;
while (*ptr)
while (ptr->lexeme)
{
da[ptr - res] = PointerGetDatum(char2text(*ptr));
da[ptr - res] = PointerGetDatum(char2text(ptr->lexeme));
ptr++;
}
......@@ -227,10 +227,10 @@ lexize(PG_FUNCTION_ARGS)
);
ptr = res;
while (*ptr)
while (ptr->lexeme)
{
pfree(DatumGetPointer(da[ptr - res]));
pfree(*ptr);
pfree(ptr->lexeme);
ptr++;
}
pfree(res);
......
......@@ -38,4 +38,27 @@ typedef struct
void parse_cfgdict(text *in, Map ** m);
/*
 * Return struct for any lexize function.
 *
 * A lexize method returns a pointer to an array of TSLexeme; the array is
 * terminated by an element whose lexeme field is NULL.
 */
typedef struct {
/* Number of the split variant this lexeme belongs to. For example,
the word 'fotballklubber' (Norwegian) has two variants of splitting:
( fotball, klubb ) and ( fot, ball, klubb ). So the dictionary
should return:
nvariant lexeme
1 fotball
1 klubb
2 fot
2 ball
2 klubb
Lexemes sharing the same nvariant form one alternative.
*/
uint16 nvariant;
/* currently unused */
uint16 flags;
/* NUL-terminated C-string; NULL in the last (terminating) array element */
char *lexeme;
} TSLexeme;
#endif
......@@ -54,16 +54,16 @@ dex_lexize(PG_FUNCTION_ARGS)
DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1);
char *txt = pnstrdup(in, PG_GETARG_INT32(2));
char **res = palloc(sizeof(char *) * 2);
TSLexeme *res = palloc(sizeof(TSLexeme) * 2);
memset(res,0,sizeof(TSLexeme) * 2);
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
{
pfree(txt);
res[0] = NULL;
}
else
res[0] = txt;
res[1] = NULL;
res[0].lexeme = txt;
PG_RETURN_POINTER(res);
}
......@@ -159,14 +159,13 @@ spell_lexize(PG_FUNCTION_ARGS)
DictISpell *d = (DictISpell *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1);
char *txt;
char **res;
char **ptr,
**cptr;
TSLexeme *res;
TSLexeme *ptr,
*cptr;
if (!PG_GETARG_INT32(2))
PG_RETURN_POINTER(NULL);
res = palloc(sizeof(char *) * 2);
txt = pnstrdup(in, PG_GETARG_INT32(2));
res = NINormalizeWord(&(d->obj), txt);
pfree(txt);
......@@ -175,22 +174,22 @@ spell_lexize(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(NULL);
ptr = cptr = res;
while (*ptr)
while (ptr->lexeme)
{
if (searchstoplist(&(d->stoplist), *ptr))
if (searchstoplist(&(d->stoplist), ptr->lexeme))
{
pfree(*ptr);
*ptr = NULL;
pfree(ptr->lexeme);
ptr->lexeme = NULL;
ptr++;
}
else
{
*cptr = *ptr;
memcpy(cptr, ptr, sizeof(TSLexeme));
cptr++;
ptr++;
}
}
*cptr = NULL;
cptr->lexeme = NULL;
PG_RETURN_POINTER(res);
}
......@@ -105,12 +105,12 @@ snb_lexize(PG_FUNCTION_ARGS)
DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1);
char *txt = pnstrdup(in, PG_GETARG_INT32(2));
char **res = palloc(sizeof(char *) * 2);
TSLexeme *res = palloc(sizeof(TSLexeme) * 2);
memset(res, 0, sizeof(TSLexeme) * 2);
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
{
pfree(txt);
res[0] = NULL;
}
else
{
......@@ -122,10 +122,8 @@ snb_lexize(PG_FUNCTION_ARGS)
memcpy(txt, d->z->p, d->z->l);
txt[d->z->l] = '\0';
}
res[0] = txt;
res->lexeme = txt;
}
res[1] = NULL;
PG_RETURN_POINTER(res);
}
......@@ -162,7 +162,7 @@ syn_lexize(PG_FUNCTION_ARGS)
char *in = (char *) PG_GETARG_POINTER(1);
Syn key,
*found;
char **res = NULL;
TSLexeme *res = NULL;
if (!PG_GETARG_INT32(2))
PG_RETURN_POINTER(NULL);
......@@ -176,10 +176,9 @@ syn_lexize(PG_FUNCTION_ARGS)
if (!found)
PG_RETURN_POINTER(NULL);
res = palloc(sizeof(char *) * 2);
res[0] = pstrdup(found->out);
res[1] = NULL;
res = palloc(sizeof(TSLexeme) * 2);
memset(res,0,sizeof(TSLexeme) * 2);
res[0].lexeme = pstrdup(found->out);
PG_RETURN_POINTER(res);
}
......@@ -52,15 +52,15 @@ dlexize_CFG_MODNAME(PG_FUNCTION_ARGS) {
HASINIT DictExample *d = (DictExample*)PG_GETARG_POINTER(0);
char *in = (char*)PG_GETARG_POINTER(1);
char *txt = pnstrdup(in, PG_GETARG_INT32(2));
char **res=palloc(sizeof(char*)*2);
TSLexeme *res=palloc(sizeof(TSLexeme*)*2);
/* Your INIT dictionary code */
/* Your LEXIZE dictionary code */
HASINIT if ( *txt=='\0' || searchstoplist(&(d->stoplist),txt) ) {
HASINIT pfree(txt);
HASINIT res[0]=NULL;
HASINIT res[0].lexeme=NULL;
HASINIT } else
res[0]=txt;
res[1]=NULL;
res[0].lexeme=txt;
res[1].lexeme=NULL;
PG_RETURN_POINTER(res);
}
......@@ -1119,17 +1119,32 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
return var;
}
char **
TSLexeme *
NINormalizeWord(IspellDict * Conf, char *word)
{
char **res = NormalizeSubWord(Conf, word, 0);
TSLexeme *lcur=NULL, *lres=NULL;
u_int16_t NVariant=1;
if (res) {
char **ptr = res;
lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) );
while(*ptr) {
lcur->lexeme=*ptr;
lcur->flags=0;
lcur->nvariant = NVariant++;
lcur++;
ptr++;
}
lcur->lexeme=NULL;
pfree(res);
}
if (Conf->compoundcontrol != '\t')
{
int wordlen = strlen(word);
SplitVar *ptr,
*var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
char **cur = res;
int i;
while (var)
......@@ -1140,30 +1155,31 @@ NINormalizeWord(IspellDict * Conf, char *word)
if (subres)
{
char **ptr = subres;
char **subptr = subres;
if ( !lcur )
lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) );
while(*subptr) {
for(i=0;i<var->nstem-1;i++) {
lcur->lexeme=(subptr==subres) ? var->stem[ i ] : pstrdup(var->stem[ i ]);
lcur->flags=0;
lcur->nvariant = NVariant;
lcur++;
}
if (cur)
{
while (*cur)
cur++;
}
else
res = cur = (char **) palloc(MAX_NORM * sizeof(char *));
lcur->lexeme=*subptr;
lcur->flags=0;
lcur->nvariant = NVariant;
lcur++;
subptr++;
NVariant++;
}
for (i = 0; i < var->nstem - 1; i++)
{
*cur = var->stem[i];
cur++;
}
while (*ptr)
{
*cur = *ptr;
cur++;
ptr++;
}
*cur = NULL;
lcur->lexeme=NULL;
pfree(subres);
var->stem[0] = NULL;
pfree( var->stem[ var->nstem-1 ] );
}
}
......@@ -1175,7 +1191,7 @@ NINormalizeWord(IspellDict * Conf, char *word)
var = ptr;
}
}
return res;
return lres;
}
......
......@@ -3,10 +3,11 @@
#include <sys/types.h>
#include "regex/regex.h"
#include "regis.h"
#include "c.h"
#include "regis.h"
#include "dict.h"
struct SPNode;
......@@ -116,7 +117,7 @@ typedef struct
} IspellDict;
char **NINormalizeWord(IspellDict * Conf, char *word);
TSLexeme *NINormalizeWord(IspellDict * Conf, char *word);
int NIImportAffixes(IspellDict * Conf, const char *filename);
int NIImportDictionary(IspellDict * Conf, const char *filename);
......
......@@ -265,6 +265,7 @@ pushval_morph(QPRS_STATE * state, int typeval, char *strval, int lenval, int2 we
{
int4 count = 0;
PRSTEXT prs;
uint32 variant, pos, cntvar=0, cntpos=0, cnt=0;
prs.lenwords = 32;
prs.curwords = 0;
......@@ -273,17 +274,39 @@ pushval_morph(QPRS_STATE * state, int typeval, char *strval, int lenval, int2 we
parsetext_v2(findcfg(state->cfg_id), &prs, strval, lenval);
for (count = 0; count < prs.curwords; count++)
{
pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight);
pfree(prs.words[count].word);
if (count)
pushquery(state, OPR, (int4) '&', 0, 0, 0);
}
pfree(prs.words);
if ( prs.curwords>0 ) {
while (count < prs.curwords) {
pos = prs.words[count].pos.pos;
cntvar=0;
while(count < prs.curwords && pos==prs.words[count].pos.pos) {
variant = prs.words[count].nvariant;
cnt=0;
while(count < prs.curwords && pos==prs.words[count].pos.pos && variant==prs.words[count].nvariant) {
pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight);
pfree(prs.words[count].word);
if ( cnt )
pushquery(state, OPR, (int4) '&', 0, 0, 0);
cnt++;
count++;
}
if ( cntvar )
pushquery(state, OPR, (int4) '|', 0, 0, 0);
cntvar++;
}
if (cntpos)
pushquery(state, OPR, (int4) '&', 0, 0, 0);
cntpos++;
}
pfree(prs.words);
/* XXX */
if (prs.curwords == 0)
} else
pushval_asis(state, VALSTOP, NULL, 0, 0);
}
......
......@@ -321,10 +321,10 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
for (i = 0; i < cfg->map[type].len; i++)
{
DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
char **norms,
**ptr;
TSLexeme *norms,
*ptr;
norms = ptr = (char **) DatumGetPointer(
norms = ptr = (TSLexeme *) DatumGetPointer(
FunctionCall3(
&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
......@@ -337,7 +337,7 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
prs->pos++; /* set pos */
while (*ptr)
while (ptr->lexeme)
{
if (prs->curwords == prs->lenwords)
{
......@@ -345,8 +345,9 @@ parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD));
}
prs->words[prs->curwords].len = strlen(*ptr);
prs->words[prs->curwords].word = *ptr;
prs->words[prs->curwords].len = strlen(ptr->lexeme);
prs->words[prs->curwords].word = ptr->lexeme;
prs->words[prs->curwords].nvariant = ptr->nvariant;
prs->words[prs->curwords].alen = 0;
prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
ptr++;
......@@ -458,10 +459,10 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
for (i = 0; i < cfg->map[type].len; i++)
{
DictInfo *dict = finddict(DatumGetObjectId(cfg->map[type].dict_id[i]));
char **norms,
**ptr;
TSLexeme *norms,
*ptr;
norms = ptr = (char **) DatumGetPointer(
norms = ptr = (TSLexeme *) DatumGetPointer(
FunctionCall3(
&(dict->lexize_info),
PointerGetDatum(dict->dictionary),
......@@ -472,10 +473,10 @@ hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4
if (!norms) /* dictionary doesn't know this lexem */
continue;
while (*ptr)
while (ptr->lexeme)
{
hlfinditem(prs, query, *ptr, strlen(*ptr));
pfree(*ptr);
hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
pfree(ptr->lexeme);
ptr++;
}
pfree(norms);
......
......@@ -27,6 +27,7 @@ void reset_cfg(void);
typedef struct
{
uint16 len;
uint16 nvariant;
union
{
uint16 pos;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment