Commit c63c1946 authored by Teodor Sigaev

Optimize. Improve ispell support for compound words. This work was sponsored by ABC Startsiden AS.

parent 6a04c571
...@@ -27,7 +27,7 @@ Datum spell_lexize(PG_FUNCTION_ARGS); ...@@ -27,7 +27,7 @@ Datum spell_lexize(PG_FUNCTION_ARGS);
static void static void
freeDictISpell(DictISpell * d) freeDictISpell(DictISpell * d)
{ {
FreeIspell(&(d->obj)); NIFree(&(d->obj));
freestoplist(&(d->stoplist)); freestoplist(&(d->stoplist));
free(d); free(d);
} }
...@@ -71,7 +71,7 @@ spell_init(PG_FUNCTION_ARGS) ...@@ -71,7 +71,7 @@ spell_init(PG_FUNCTION_ARGS)
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("dictionary already loaded"))); errmsg("dictionary already loaded")));
} }
if (ImportDictionary(&(d->obj), pcfg->value)) if (NIImportDictionary(&(d->obj), pcfg->value))
{ {
freeDictISpell(d); freeDictISpell(d);
ereport(ERROR, ereport(ERROR,
...@@ -90,7 +90,7 @@ spell_init(PG_FUNCTION_ARGS) ...@@ -90,7 +90,7 @@ spell_init(PG_FUNCTION_ARGS)
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("affixes already loaded"))); errmsg("affixes already loaded")));
} }
if (ImportAffixes(&(d->obj), pcfg->value)) if (NIImportAffixes(&(d->obj), pcfg->value))
{ {
freeDictISpell(d); freeDictISpell(d);
ereport(ERROR, ereport(ERROR,
...@@ -132,8 +132,8 @@ spell_init(PG_FUNCTION_ARGS) ...@@ -132,8 +132,8 @@ spell_init(PG_FUNCTION_ARGS)
if (affloaded && dictloaded) if (affloaded && dictloaded)
{ {
SortDictionary(&(d->obj)); NISortDictionary(&(d->obj));
SortAffixes(&(d->obj)); NISortAffixes(&(d->obj));
} }
else if (!affloaded) else if (!affloaded)
{ {
...@@ -168,7 +168,7 @@ spell_lexize(PG_FUNCTION_ARGS) ...@@ -168,7 +168,7 @@ spell_lexize(PG_FUNCTION_ARGS)
res = palloc(sizeof(char *) * 2); res = palloc(sizeof(char *) * 2);
txt = pnstrdup(in, PG_GETARG_INT32(2)); txt = pnstrdup(in, PG_GETARG_INT32(2));
res = NormalizeWord(&(d->obj), txt); res = NINormalizeWord(&(d->obj), txt);
pfree(txt); pfree(txt);
if (res == NULL) if (res == NULL)
......
...@@ -7,15 +7,26 @@ ...@@ -7,15 +7,26 @@
#include "spell.h" #include "spell.h"
#define MAXNORMLEN 56 #define MAX_NORM 1024
#define MAXNORMLEN 256
#define STRNCASECMP(x,y) (strncasecmp(x,y,strlen(y))) #define STRNCASECMP(x,y) (strncasecmp(x,y,strlen(y)))
/*
 * GETWCHAR: byte N of the L-byte buffer W.  When T is 'p' the index counts
 * from the front (W[N]); for any other T it counts from the back
 * (W[L - 1 - N]).
 */
#define GETWCHAR(W,L,N,T) ( ((u_int8_t*)(W))[ ((T)=='p') ? (N) : ( (L) - 1 - (N) ) ] )

/* GETCHAR: GETWCHAR applied to the repl/replen fields of the affix A. */
#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )

/*
 * MEMOUT: raise an out-of-memory ERROR through ereport when X is NULL/zero.
 *
 * Wrapped in do { } while (0) so the macro is a single statement: with a bare
 * `if` expansion, an `else` written after `MEMOUT(x);` would attach to the
 * macro's hidden `if` instead of the caller's.
 */
#define MEMOUT(X) \
    do { \
        if ( !(X) ) \
            ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); \
    } while (0)
static int static int
cmpspell(const void *s1, const void *s2) cmpspell(const void *s1, const void *s2)
{ {
return (strcmp(((const SPELL *) s1)->word, ((const SPELL *) s2)->word)); return (strcmp(((const SPELL *) s1)->word, ((const SPELL *) s2)->word));
} }
static int
cmpspellaffix(const void *s1, const void *s2)
{
return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag));
}
static void static void
strlower(char *str) strlower(char *str)
...@@ -29,6 +40,13 @@ strlower(char *str) ...@@ -29,6 +40,13 @@ strlower(char *str)
} }
} }
/*
 * Return a palloc'd, NUL-terminated copy of the first len bytes of s.
 * Exactly len bytes are copied; no check is made that s is that long.
 */
static char *
strndup(char *s, int len)
{
    char *copy;

    copy = (char *) palloc(len + 1);
    memcpy(copy, s, len);
    copy[len] = '\0';

    return copy;
}
/* backward string compare for suffix tree operations */ /* backward string compare for suffix tree operations */
static int static int
strbcmp(const char *s1, const char *s2) strbcmp(const char *s1, const char *s2)
...@@ -92,7 +110,7 @@ cmpaffix(const void *s1, const void *s2) ...@@ -92,7 +110,7 @@ cmpaffix(const void *s1, const void *s2)
} }
int int
AddSpell(IspellDict * Conf, const char *word, const char *flag) NIAddSpell(IspellDict * Conf, const char *word, const char *flag)
{ {
if (Conf->nspell >= Conf->mspell) if (Conf->nspell >= Conf->mspell)
{ {
...@@ -106,24 +124,18 @@ AddSpell(IspellDict * Conf, const char *word, const char *flag) ...@@ -106,24 +124,18 @@ AddSpell(IspellDict * Conf, const char *word, const char *flag)
Conf->mspell = 1024 * 20; Conf->mspell = 1024 * 20;
Conf->Spell = (SPELL *) malloc(Conf->mspell * sizeof(SPELL)); Conf->Spell = (SPELL *) malloc(Conf->mspell * sizeof(SPELL));
} }
if (Conf->Spell == NULL) MEMOUT(Conf->Spell);
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
} }
Conf->Spell[Conf->nspell].word = strdup(word); Conf->Spell[Conf->nspell].word = strdup(word);
if (!Conf->Spell[Conf->nspell].word) MEMOUT(Conf->Spell[Conf->nspell].word);
ereport(ERROR, strncpy(Conf->Spell[Conf->nspell].p.flag, flag, 16);
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
strncpy(Conf->Spell[Conf->nspell].flag, flag, 10);
Conf->nspell++; Conf->nspell++;
return (0); return (0);
} }
int int
ImportDictionary(IspellDict * Conf, const char *filename) NIImportDictionary(IspellDict * Conf, const char *filename)
{ {
unsigned char str[BUFSIZ]; unsigned char str[BUFSIZ];
FILE *dict; FILE *dict;
...@@ -143,7 +155,7 @@ ImportDictionary(IspellDict * Conf, const char *filename) ...@@ -143,7 +155,7 @@ ImportDictionary(IspellDict * Conf, const char *filename)
flag = s; flag = s;
while (*s) while (*s)
{ {
if (((*s >= 'A') && (*s <= 'Z')) || ((*s >= 'a') && (*s <= 'z'))) if (isprint(*s) && !isspace(*s))
s++; s++;
else else
{ {
...@@ -166,65 +178,49 @@ ImportDictionary(IspellDict * Conf, const char *filename) ...@@ -166,65 +178,49 @@ ImportDictionary(IspellDict * Conf, const char *filename)
*s = 0; *s = 0;
s++; s++;
} }
AddSpell(Conf, str, flag); NIAddSpell(Conf, str, flag);
} }
fclose(dict); fclose(dict);
return (0); return (0);
} }
static SPELL * static int
FindWord(IspellDict * Conf, const char *word, int affixflag) FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly)
{ {
int l, SPNode *node = Conf->Dictionary;
c, SPNodeData *StopLow, *StopHigh, *StopMiddle;
r, int level=0, wrdlen=strlen(word);
resc,
resl, while( node && level<wrdlen) {
resr, StopLow = node->data;
i; StopHigh = node->data+node->length;
while (StopLow < StopHigh) {
i = (int) (*word) & 255; StopMiddle = StopLow + (StopHigh - StopLow) / 2;
l = Conf->SpellTree.Left[i]; if ( StopMiddle->val == ((u_int8_t*)(word))[level] ) {
r = Conf->SpellTree.Right[i]; if ( wrdlen==level+1 && StopMiddle->isword ) {
if (l == -1) if ( compoundonly && !StopMiddle->compoundallow )
return (NULL); return 0;
while (l <= r) if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
{ return 1;
c = (l + r) >> 1; }
resc = strcmp(Conf->Spell[c].word, word); node=StopMiddle->node;
if ((resc == 0) && level++;
((affixflag == 0) || (strchr(Conf->Spell[c].flag, affixflag) != NULL))) break;
return (&Conf->Spell[c]); } else if ( StopMiddle->val < ((u_int8_t*)(word))[level] ) {
resl = strcmp(Conf->Spell[l].word, word); StopLow = StopMiddle + 1;
if ((resl == 0) && } else {
((affixflag == 0) || (strchr(Conf->Spell[l].flag, affixflag) != NULL))) StopHigh = StopMiddle;
return (&Conf->Spell[l]); }
resr = strcmp(Conf->Spell[r].word, word);
if ((resr == 0) &&
((affixflag == 0) || (strchr(Conf->Spell[r].flag, affixflag) != NULL)))
return (&Conf->Spell[r]);
if (resc < 0)
{
l = c + 1;
r--;
}
else if (resc > 0)
{
r = c - 1;
l++;
}
else
{
l++;
r--;
} }
if ( StopLow >= StopHigh )
break;
} }
return (NULL); return 0;
} }
int int
AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const char *repl, int type) NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
{ {
if (Conf->naffixes >= Conf->maffixes) if (Conf->naffixes >= Conf->maffixes)
{ {
...@@ -238,16 +234,14 @@ AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const ...@@ -238,16 +234,14 @@ AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const
Conf->maffixes = 16; Conf->maffixes = 16;
Conf->Affix = (AFFIX *) malloc(Conf->maffixes * sizeof(AFFIX)); Conf->Affix = (AFFIX *) malloc(Conf->maffixes * sizeof(AFFIX));
} }
if (Conf->Affix == NULL) MEMOUT(Conf->Affix);
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
} }
if (type == 's') if (type == 's')
sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask); sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
else else
sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask); sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
Conf->Affix[Conf->naffixes].compile = 1; Conf->Affix[Conf->naffixes].compile = 1;
Conf->Affix[Conf->naffixes].flagflags = flagflags;
Conf->Affix[Conf->naffixes].flag = flag; Conf->Affix[Conf->naffixes].flag = flag;
Conf->Affix[Conf->naffixes].type = type; Conf->Affix[Conf->naffixes].type = type;
...@@ -281,7 +275,7 @@ remove_spaces(char *dist, char *src) ...@@ -281,7 +275,7 @@ remove_spaces(char *dist, char *src)
int int
ImportAffixes(IspellDict * Conf, const char *filename) NIImportAffixes(IspellDict * Conf, const char *filename)
{ {
unsigned char str[BUFSIZ]; unsigned char str[BUFSIZ];
unsigned char flag = 0; unsigned char flag = 0;
...@@ -292,13 +286,24 @@ ImportAffixes(IspellDict * Conf, const char *filename) ...@@ -292,13 +286,24 @@ ImportAffixes(IspellDict * Conf, const char *filename)
int i; int i;
int suffixes = 0; int suffixes = 0;
int prefixes = 0; int prefixes = 0;
unsigned char flagflags = 0;
FILE *affix; FILE *affix;
if (!(affix = fopen(filename, "r"))) if (!(affix = fopen(filename, "r")))
return (1); return (1);
Conf->compoundcontrol='\t';
while (fgets(str, sizeof(str), affix)) while (fgets(str, sizeof(str), affix))
{ {
if (STRNCASECMP(str, "compoundwords")==0) {
s=strchr(str, 'l');
if ( s ) {
while( *s!=' ' ) s++;
while( *s==' ' ) s++;
Conf->compoundcontrol = *s;
continue;
}
}
if (!STRNCASECMP(str, "suffixes")) if (!STRNCASECMP(str, "suffixes"))
{ {
suffixes = 1; suffixes = 1;
...@@ -314,8 +319,18 @@ ImportAffixes(IspellDict * Conf, const char *filename) ...@@ -314,8 +319,18 @@ ImportAffixes(IspellDict * Conf, const char *filename)
if (!STRNCASECMP(str, "flag ")) if (!STRNCASECMP(str, "flag "))
{ {
s = str + 5; s = str + 5;
while (strchr("* ", *s)) flagflags=0;
while( *s==' ' ) s++;
if ( *s=='*' ) {
flagflags|=FF_CROSSPRODUCT;
s++;
} else if ( *s=='~' ) {
flagflags|=FF_COMPOUNDONLYAFX;
s++; s++;
}
if ( *s=='\\' ) s++;
flag = *s; flag = *s;
continue; continue;
} }
...@@ -351,7 +366,7 @@ ImportAffixes(IspellDict * Conf, const char *filename) ...@@ -351,7 +366,7 @@ ImportAffixes(IspellDict * Conf, const char *filename)
continue; continue;
} }
AddAffix(Conf, (int) flag, mask, find, repl, suffixes ? 's' : 'p'); NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? 's' : 'p');
} }
fclose(affix); fclose(affix);
...@@ -359,87 +374,266 @@ ImportAffixes(IspellDict * Conf, const char *filename) ...@@ -359,87 +374,266 @@ ImportAffixes(IspellDict * Conf, const char *filename)
return (0); return (0);
} }
/*
 * Append to Conf->AffixData a new string made of entries a1 and a2 joined by
 * a single space, and return the index of the new entry.
 *
 * Conf->AffixData is a NULL-terminated array of malloc'd strings.  The array
 * is grown by one slot and the terminator is rewritten after the new entry.
 * Allocation failures are reported through MEMOUT.
 */
static int
MergeAffix(IspellDict *Conf, int a1, int a2)
{
    int     naffix = 0;
    char  **grown;
    char   *merged;

    /* count the existing entries up to the NULL terminator */
    while (Conf->AffixData[naffix])
        naffix++;

    /*
     * Existing entries + the new one + terminator.  Keep the old pointer
     * until realloc is known to have succeeded, so a failure neither loses
     * the array nor leaves Conf->AffixData NULL.
     */
    grown = (char **) realloc(Conf->AffixData, (naffix + 2) * sizeof(char *));
    MEMOUT(grown);
    Conf->AffixData = grown;

    /* check the string allocation itself, not the address of its slot */
    merged = (char *) malloc(strlen(Conf->AffixData[a1]) +
                             strlen(Conf->AffixData[a2]) +
                             1 /* space */ + 1 /* \0 */ );
    MEMOUT(merged);
    sprintf(merged, "%s %s", Conf->AffixData[a1], Conf->AffixData[a2]);

    Conf->AffixData[naffix] = merged;
    Conf->AffixData[naffix + 1] = NULL;

    return naffix;
}
/*
 * Build one node of the dictionary tree for the entries Conf->Spell[low..high)
 * at character position `level`, recursing for deeper positions.
 *
 * Returns a malloc'd, zero-filled SPNode with one data slot per distinct
 * character found at position `level`, or NULL when no entry in the range is
 * longer than `level`.  Allocation failure is reported through MEMOUT.
 *
 * NOTE(review): the counting below only detects a change of character between
 * neighbouring entries, so it presumably relies on the range being sorted by
 * word -- confirm against the caller.
 */
static SPNode*
mkSPNode(IspellDict *Conf, int low, int high, int level) {
int i;
int nchar=0;
/* '\0' doubles as "no character seen yet" in both passes */
char lastchar='\0';
SPNode *rs;
SPNodeData *data;
/* first entry of the run that will form the next child subtree */
int lownew=low;
/* pass 1: count how many data slots this node needs */
for(i=low; i<high; i++)
if ( Conf->Spell[i].p.d.len>level && lastchar!=Conf->Spell[i].word[level] ) {
nchar++;
lastchar=Conf->Spell[i].word[level];
}
if (!nchar)
return NULL;
rs=(SPNode*)malloc(SPNHRDSZ+nchar*sizeof(SPNodeData));
MEMOUT(rs);
memset(rs,0,SPNHRDSZ+nchar*sizeof(SPNodeData));
rs->length = nchar;
data=rs->data;
lastchar='\0';
/* pass 2: fill the slots; entries not longer than `level` are skipped */
for(i=low; i<high; i++)
if ( Conf->Spell[i].p.d.len>level ) {
if ( lastchar!=Conf->Spell[i].word[level] ) {
if ( lastchar ) {
/* character changed: close the previous run by building its child, then move to the next slot */
data->node = mkSPNode(Conf, lownew, i, level+1);
lownew=i;
data++;
}
lastchar=Conf->Spell[i].word[level];
}
data->val=((u_int8_t*)(Conf->Spell[i].word))[level];
/* this entry's word ends exactly at this character */
if ( Conf->Spell[i].p.d.len == level+1 ) {
if ( data->isword && data->affix!=Conf->Spell[i].p.d.affix) {
/*
fprintf(stderr,"Word already exists: %s (affixes: '%s' and '%s')\n",
Conf->Spell[i].word,
Conf->AffixData[data->affix],
Conf->AffixData[Conf->Spell[i].p.d.affix]
);
*/
/* MergeAffix called a few times */
/* same word already recorded with another affix index: store a merged affix entry instead */
data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i].p.d.affix);
} else
data->affix = Conf->Spell[i].p.d.affix;
data->isword=1;
/* mark the word as compound-allowed when its affix string contains Conf->compoundcontrol */
if ( strchr( Conf->AffixData[ data->affix ], Conf->compoundcontrol ) )
data->compoundallow=1;
}
}
/* child subtree for the last run of entries */
data->node = mkSPNode(Conf, lownew, high, level+1);
return rs;
}
void void
SortDictionary(IspellDict * Conf) NISortDictionary(IspellDict * Conf)
{ {
int CurLet = -1,
Let;
size_t i; size_t i;
int naffix=3;
/* compress affixes */
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspellaffix);
for (i = 1; i < Conf->nspell; i++)
if ( strcmp(Conf->Spell[i].p.flag,Conf->Spell[i-1].p.flag) )
naffix++;
Conf->AffixData=(char**)malloc( naffix*sizeof(char*) );
MEMOUT(Conf->AffixData);
memset(Conf->AffixData, 0, naffix*sizeof(char*));
naffix=1;
Conf->AffixData[0]=strdup("");
MEMOUT(Conf->AffixData[0]);
Conf->AffixData[1]=strdup( Conf->Spell[0].p.flag );
MEMOUT(Conf->AffixData[1]);
Conf->Spell[0].p.d.affix = 1;
Conf->Spell[0].p.d.len = strlen(Conf->Spell[0].word);
for (i = 1; i < Conf->nspell; i++) {
if ( strcmp(Conf->Spell[i].p.flag, Conf->AffixData[naffix]) ) {
naffix++;
Conf->AffixData[naffix] = strdup( Conf->Spell[i].p.flag );
MEMOUT(Conf->AffixData[naffix]);
}
Conf->Spell[i].p.d.affix = naffix;
Conf->Spell[i].p.d.len = strlen(Conf->Spell[i].word);
}
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspell); qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspell);
Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
for (i = 0; i < Conf->nspell; i++)
free( Conf->Spell[i].word );
free( Conf->Spell );
Conf->Spell=NULL;
}
for (i = 0; i < 256; i++) static AffixNode*
Conf->SpellTree.Left[i] = -1; mkANode(IspellDict *Conf, int low, int high, int level, int type) {
int i;
int nchar=0;
u_int8_t lastchar='\0';
AffixNode *rs;
AffixNodeData *data;
int lownew=low;
for(i=low; i<high; i++)
if ( Conf->Affix[i].replen>level && lastchar!=GETCHAR( Conf->Affix + i, level, type ) ) {
nchar++;
lastchar=GETCHAR( Conf->Affix + i, level, type );
}
for (i = 0; i < Conf->nspell; i++) if (!nchar)
{ return NULL;
Let = (int) (*(Conf->Spell[i].word)) & 255;
if (CurLet != Let) rs=(AffixNode*)malloc(ANHRDSZ+nchar*sizeof(AffixNodeData));
{ MEMOUT(rs);
Conf->SpellTree.Left[Let] = i; memset(rs,0,ANHRDSZ+nchar*sizeof(AffixNodeData));
CurLet = Let; rs->length = nchar;
data=rs->data;
lastchar='\0';
for(i=low; i<high; i++)
if ( Conf->Affix[i].replen>level ) {
if ( lastchar!=GETCHAR( Conf->Affix + i, level, type ) ) {
if ( lastchar ) {
data->node = mkANode(Conf, lownew, i, level+1, type);
lownew=i;
data++;
}
lastchar=GETCHAR( Conf->Affix + i, level, type );
}
data->val=GETCHAR( Conf->Affix + i, level, type );
if ( Conf->Affix[i].replen == level+1 ) { /* affix stopped */
if ( !data->naff )
data->aff=(AFFIX**)malloc(sizeof(AFFIX*)*(high-i+1));
MEMOUT(data);
data->aff[ data->naff ] = Conf->Affix + i;
data->naff++;
}
} }
Conf->SpellTree.Right[Let] = i;
} data->node = mkANode(Conf, lownew, high, level+1, type);
return rs;
} }
void void
SortAffixes(IspellDict * Conf) NISortAffixes(IspellDict * Conf)
{ {
int CurLetP = -1,
CurLetS = -1,
Let;
AFFIX *Affix; AFFIX *Affix;
size_t i; size_t i;
CMPDAffix* ptr;
int firstsuffix=-1;
if (Conf->naffixes > 1) if (Conf->naffixes > 1)
qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix); qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
for (i = 0; i < 256; i++)
{
Conf->PrefixTree.Left[i] = Conf->PrefixTree.Right[i] = -1;
Conf->SuffixTree.Left[i] = Conf->SuffixTree.Right[i] = -1;
}
for (i = 0; i < Conf->naffixes; i++) Conf->CompoundAffix = ptr = (CMPDAffix*)malloc( sizeof(CMPDAffix) * Conf->naffixes );
{ MEMOUT(Conf->CompoundAffix);
ptr->affix=NULL;
for (i = 0; i < Conf->naffixes; i++) {
Affix = &(((AFFIX *) Conf->Affix)[i]); Affix = &(((AFFIX *) Conf->Affix)[i]);
if (Affix->type == 'p') if ( Affix->type == 's' ) {
{ if ( firstsuffix<0 ) firstsuffix=i;
Let = (int) (*(Affix->repl)) & 255; if ( Affix->flagflags & FF_COMPOUNDONLYAFX ) {
if (CurLetP != Let) if ( !ptr->affix || strbncmp((ptr-1)->affix, Affix->repl, (ptr-1)->len) ) {
{ /* leave only unique and minimals suffixes */
Conf->PrefixTree.Left[Let] = i; ptr->affix=Affix->repl;
CurLetP = Let; ptr->len=Affix->replen;
ptr++;
}
} }
Conf->PrefixTree.Right[Let] = i;
} }
else }
{ ptr->affix = NULL;
Let = (Affix->replen) ? (int) (Affix->repl[Affix->replen - 1]) & 255 : 0; Conf->CompoundAffix = (CMPDAffix*)realloc( Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr-Conf->CompoundAffix+1) );
if (CurLetS != Let)
{ Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p');
Conf->SuffixTree.Left[Let] = i; Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's');
CurLetS = Let; }
static AffixNodeData*
FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type) {
AffixNodeData *StopLow, *StopHigh, *StopMiddle;
u_int8_t symbol;
while( node && *level<wrdlen) {
StopLow = node->data;
StopHigh = node->data+node->length;
while (StopLow < StopHigh) {
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
symbol = GETWCHAR(word,wrdlen,*level,type);
if ( StopMiddle->val == symbol ) {
if ( StopMiddle->naff )
return StopMiddle;
node=StopMiddle->node;
(*level)++;
break;
} else if ( StopMiddle->val < symbol ) {
StopLow = StopMiddle + 1;
} else {
StopHigh = StopMiddle;
} }
Conf->SuffixTree.Right[Let] = i;
} }
if ( StopLow >= StopHigh )
break;
} }
return NULL;
} }
static char * static char *
CheckSuffix(const char *word, size_t len, AFFIX * Affix, int *res, IspellDict * Conf) CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) {
{
regmatch_t subs[2]; /* workaround for apache&linux */ regmatch_t subs[2]; /* workaround for apache&linux */
char newword[2 * MAXNORMLEN] = "";
int err; int err;
*res = strbncmp(word, Affix->repl, Affix->replen); if ( flagflags & FF_COMPOUNDONLYAFX ) {
if (*res < 0) if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 )
return NULL; return NULL;
if (*res > 0) } else {
return NULL; if ( Affix->flagflags & FF_COMPOUNDONLYAFX )
strcpy(newword, word); return NULL;
strcpy(newword + len - Affix->replen, Affix->find); }
if ( Affix->type=='s' ) {
strcpy(newword, word);
strcpy(newword + len - Affix->replen, Affix->find);
} else {
strcpy(newword, Affix->find);
strcat(newword, word + Affix->replen);
}
if (Affix->compile) if (Affix->compile)
{ {
...@@ -452,205 +646,364 @@ CheckSuffix(const char *word, size_t len, AFFIX * Affix, int *res, IspellDict * ...@@ -452,205 +646,364 @@ CheckSuffix(const char *word, size_t len, AFFIX * Affix, int *res, IspellDict *
} }
Affix->compile = 0; Affix->compile = 0;
} }
if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0))) if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0)))
{ return newword;
if (FindWord(Conf, newword, Affix->flag))
return pstrdup(newword);
}
return NULL; return NULL;
} }
#define NS 1
#define MAX_NORM 512 static char **
static int NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
CheckPrefix(const char *word, size_t len, AFFIX * Affix, IspellDict * Conf, int pi, AffixNodeData *suffix=NULL, *prefix=NULL;
char **forms, char ***cur) int slevel=0, plevel=0;
{ int wrdlen = strlen(word), swrdlen;
regmatch_t subs[NS * 2]; char **forms;
char **cur;
char newword[2 * MAXNORMLEN] = ""; char newword[2 * MAXNORMLEN] = "";
int err, char pnewword[2 * MAXNORMLEN] = "";
ls, AffixNode *snode = Conf->Suffix, *pnode;
res, int i,j;
lres;
size_t newlen;
AFFIX *CAffix = Conf->Affix;
res = strncmp(word, Affix->repl, Affix->replen);
if (res != 0)
return res;
strcpy(newword, Affix->find);
strcat(newword, word + Affix->replen);
if (Affix->compile) if (wrdlen > MAXNORMLEN) return NULL;
{ strlower(word);
err = regcomp(&(Affix->reg), Affix->mask, REG_EXTENDED | REG_ICASE | REG_NOSUB); cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
if (err) *cur = NULL;
{
/* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */
regfree(&(Affix->reg)); /* Check that the word itself is normal form */
return (0); if (FindWord(Conf, word, 0, flag & FF_COMPOUNDWORD)) {
} *cur = pstrdup(word);
Affix->compile = 0; cur++;
*cur = NULL;
} }
if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0)))
{
SPELL *curspell;
if ((curspell = FindWord(Conf, newword, Affix->flag))) /* Find all other NORMAL forms of the 'word' (check only prefix)*/
{ pnode=Conf->Prefix;
if ((*cur - forms) < (MAX_NORM - 1)) plevel=0;
{ while(pnode) {
**cur = pstrdup(newword); prefix=FinfAffixes(pnode, word, wrdlen, &plevel,'p');
(*cur)++; if (!prefix) break;
**cur = NULL; for(j=0;j<prefix->naff;j++) {
if ( CheckAffix(word,wrdlen,prefix->aff[j], flag, newword) ) {
/* prefix success */
if ( FindWord(Conf, newword, prefix->aff[j]->flag, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) {
/* word search success */
*cur = pstrdup(newword);
cur++;
*cur=NULL;
}
} }
} }
newlen = strlen(newword); pnode = prefix->node;
ls = Conf->SuffixTree.Left[pi]; plevel++;
if (ls >= 0 && ((*cur - forms) < (MAX_NORM - 1))) }
{
**cur = CheckSuffix(newword, newlen, &CAffix[ls], &lres, Conf); /* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/
if (**cur) while( snode ) {
{ /* find possible suffix */
(*cur)++; suffix = FinfAffixes(snode, word, wrdlen, &slevel, 's');
**cur = NULL; if (!suffix) break;
/* foreach suffix check affix */
for(i=0;i<suffix->naff;i++) {
if ( CheckAffix(word, wrdlen, suffix->aff[i], flag, newword) ) {
/* suffix success */
if ( FindWord(Conf, newword, suffix->aff[i]->flag, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) {
/* word search success */
*cur = pstrdup(newword);
cur++;
*cur=NULL;
}
/* now we will look changed word with prefixes */
pnode=Conf->Prefix;
plevel=0;
swrdlen=strlen(newword);
while(pnode) {
prefix=FinfAffixes(pnode, newword, swrdlen, &plevel,'p');
if (!prefix) break;
for(j=0;j<prefix->naff;j++) {
if ( CheckAffix(newword,swrdlen,prefix->aff[j], flag, pnewword) ) {
/* prefix success */
int ff=( prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT ) ?
0 : prefix->aff[j]->flag;
if ( FindWord(Conf, pnewword, ff, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) {
/* word search success */
*cur = pstrdup(pnewword);
cur++;
*cur=NULL;
}
}
}
pnode = prefix->node;
plevel++;
}
} }
} }
}
return 0;
}
snode=suffix->node;
slevel++;
}
char ** if (cur == forms) {
NormalizeWord(IspellDict * Conf, char *word) free(forms);
{
/*regmatch_t subs[NS];*/
size_t len;
char **forms;
char **cur;
AFFIX *Affix;
int ri,
pi,
ipi,
lp,
rp,
cp,
ls,
rs;
int lres,
rres,
cres = 0;
SPELL *spell;
len = strlen(word);
if (len > MAXNORMLEN)
return (NULL); return (NULL);
}
return (forms);
}
strlower(word); typedef struct SplitVar {
int nstem;
forms = (char **) palloc(MAX_NORM * sizeof(char **)); char **stem;
cur = forms; struct SplitVar *next;
*cur = NULL; } SplitVar;
ri = (int) (*word) & 255; static int
pi = (int) (word[strlen(word) - 1]) & 255; CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len) {
Affix = (AFFIX *) Conf->Affix; while( (*ptr)->affix ) {
if ( len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len)==0 ) {
/* Check that the word itself is normal form */ len = (*ptr)->len;
if ((spell = FindWord(Conf, word, 0))) (*ptr)++;
{ return len;
*cur = pstrdup(word); }
cur++; (*ptr)++;
*cur = NULL;
} }
return 0;
}
/* Find all other NORMAL forms of the 'word' */ static SplitVar*
CopyVar(SplitVar *s, int makedup) {
SplitVar *v = (SplitVar*)palloc(sizeof(SplitVar));
v->stem=(char**)palloc( sizeof(char*) * (MAX_NORM) );
v->next=NULL;
if ( s ) {
int i;
v->nstem = s->nstem;
for(i=0;i<s->nstem;i++)
v->stem[i] = (makedup) ? pstrdup( s->stem[i] ) : s->stem[i];
} else {
v->nstem=0;
}
return v;
}
for (ipi = 0; ipi <= pi; ipi += pi)
{
/* check prefix */ static SplitVar*
lp = Conf->PrefixTree.Left[ri]; SplitToVariants( IspellDict * Conf, SPNode *snode, SplitVar * orig, char *word, int wordlen, int startpos, int minpos ) {
rp = Conf->PrefixTree.Right[ri]; SplitVar *var=NULL;
while (lp >= 0 && lp <= rp) SPNodeData *StopLow, *StopHigh, *StopMiddle;
{ SPNode *node = (snode) ? snode : Conf->Dictionary;
cp = (lp + rp) >> 1; int level=(snode) ? minpos : startpos; /* recursive minpos==level*/
cres = 0; int lenaff;
if ((cur - forms) < (MAX_NORM - 1)) CMPDAffix *caff;
cres = CheckPrefix(word, len, &Affix[cp], Conf, ipi, forms, &cur); char notprobed[wordlen];
if ((lp < cp) && ((cur - forms) < (MAX_NORM - 1)))
lres = CheckPrefix(word, len, &Affix[lp], Conf, ipi, forms, &cur); memset(notprobed,1,wordlen);
if ((rp > cp) && ((cur - forms) < (MAX_NORM - 1))) var = CopyVar(orig,1);
rres = CheckPrefix(word, len, &Affix[rp], Conf, ipi, forms, &cur);
if (cres < 0) while( node && level<wordlen) {
{ StopLow = node->data;
rp = cp - 1; StopHigh = node->data+node->length;
lp++; while (StopLow < StopHigh) {
} StopMiddle = StopLow + (StopHigh - StopLow) / 2;
else if (cres > 0) if ( StopMiddle->val == ((u_int8_t*)(word))[level] ) {
{ break;
lp = cp + 1; } else if ( StopMiddle->val < ((u_int8_t*)(word))[level] ) {
rp--; StopLow = StopMiddle + 1;
} else {
StopHigh = StopMiddle;
} }
else }
{ if ( StopLow >= StopHigh )
lp++; break;
rp--;
/* find word with epenthetic */
caff = Conf->CompoundAffix;
while ( level>startpos && (lenaff=CheckCompoundAffixes( &caff, word + level, wordlen - level ))>0 ) {
/* there is one of compound suffixes, so check word for existings */
char buf[MAXNORMLEN];
char **subres;
lenaff=level-startpos+lenaff;
if ( !notprobed[startpos+lenaff-1] )
continue;
if ( level+lenaff-1 <= minpos )
continue;
memcpy(buf, word+startpos, lenaff);
buf[lenaff]='\0';
subres = NormalizeSubWord(Conf, buf, FF_COMPOUNDWORD | FF_COMPOUNDONLYAFX);
if ( subres ) {
/* Yes, it was a word from dictionary */
SplitVar *new=CopyVar(var,0);
SplitVar *ptr=var;
char **sptr=subres;
notprobed[startpos+lenaff-1]=0;
while(*sptr) {
new->stem[ new->nstem ] = *sptr;
new->nstem++;
sptr++;
}
free(subres);
while( ptr->next )
ptr = ptr->next;
ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos+lenaff, startpos+lenaff);
free(new->stem);
free(new);
} }
} }
/* check suffix */ /* find infinitive */
ls = Conf->SuffixTree.Left[ipi]; if ( StopMiddle->isword && StopMiddle->compoundallow && notprobed[level] ) {
rs = Conf->SuffixTree.Right[ipi]; /* ok, we found full compoundallowed word*/
while (ls >= 0 && ls <= rs) if ( level>minpos ) {
{ /* and its length more than minimal */
if (((cur - forms) < (MAX_NORM - 1))) if ( wordlen==level+1 ) {
{ /* well, it was last word */
*cur = CheckSuffix(word, len, &Affix[ls], &lres, Conf); var->stem[ var->nstem ] = strndup(word + startpos, wordlen - startpos);
if (*cur) var->nstem++;
{ return var;
cur++; } else {
*cur = NULL; /* then we will search more big word at the same point */
SplitVar *ptr=var;
while( ptr->next )
ptr = ptr->next;
ptr->next=SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
/* we can find next word */
level++;
var->stem[ var->nstem ] = strndup(word + startpos, level - startpos);
var->nstem++;
node = Conf->Dictionary;
startpos=level;
continue;
} }
} }
if ((rs > ls) && ((cur - forms) < (MAX_NORM - 1))) }
{ level++;
*cur = CheckSuffix(word, len, &Affix[rs], &rres, Conf); node=StopMiddle->node;
if (*cur) }
{
cur++; var->stem[ var->nstem ] = strndup(word + startpos, wordlen - startpos);
*cur = NULL; var->nstem++;
return var;
}
char **
NINormalizeWord(IspellDict * Conf, char *word) {
char **res= NormalizeSubWord(Conf, word, 0);
if ( Conf->compoundcontrol != '\t' ) {
int wordlen=strlen(word);
SplitVar *ptr, *var = SplitToVariants(Conf,NULL,NULL, word, wordlen, 0, -1);
char **cur=res;
int i;
while(var) {
if ( var->nstem > 1 ) {
char **subres = NormalizeSubWord(Conf, var->stem[ var->nstem-1 ], FF_COMPOUNDWORD);
if ( subres ) {
char **ptr=subres;
if ( cur ) {
while(*cur)
cur++;
} else {
res=cur=(char **) palloc(MAX_NORM * sizeof(char *));
}
for(i=0;i<var->nstem-1;i++) {
*cur=var->stem[ i ];
cur++;
}
while(*ptr) {
*cur=*ptr;
cur++; ptr++;
}
*cur=NULL;
free(subres);
var->stem[ 0 ] = NULL;
} }
} }
ls++;
rs--; for(i=0;i<var->nstem && var->stem[ i ];i++)
} /* end while */ free( var->stem[i] );
ptr = var->next;
free(var->stem);
free(var);
var=ptr;
}
}
return res;
}
} /* for ipi */
if (cur == forms) static void freeSPNode(SPNode *node) {
{ SPNodeData *data;
pfree(forms);
return (NULL); if (!node) return;
data=node->data;
while( node->length ) {
freeSPNode(data->node);
data++;
node->length--;
} }
return (forms); free(node);
} }
/*
 * Recursively release an affix tree node: each slot's child subtree, each
 * slot's aff array (only when that slot's naff is non-zero), and finally the
 * node itself.  A NULL node is ignored.
 */
static void
freeANode(AffixNode *node)
{
    AffixNodeData *slot;
    AffixNodeData *end;

    if (node == NULL)
        return;

    end = node->data + node->length;
    for (slot = node->data; slot < end; slot++)
    {
        freeANode(slot->node);
        if (slot->naff)
            free(slot->aff);
    }

    free(node);
}
void void
FreeIspell(IspellDict * Conf) NIFree(IspellDict * Conf)
{ {
int i; int i;
AFFIX *Affix = (AFFIX *) Conf->Affix; AFFIX *Affix = (AFFIX *) Conf->Affix;
char** aff = Conf->AffixData;
if ( aff ) {
while(*aff) {
free(*aff);
aff++;
}
free(Conf->AffixData);
}
for (i = 0; i < Conf->naffixes; i++) for (i = 0; i < Conf->naffixes; i++)
{ {
if (Affix[i].compile == 0) if (Affix[i].compile == 0)
regfree(&(Affix[i].reg)); regfree(&(Affix[i].reg));
} }
for (i = 0; i < Conf->naffixes; i++) if (Conf->Spell) {
free(Conf->Spell[i].word); for (i = 0; i < Conf->nspell; i++)
free(Conf->Affix); free(Conf->Spell[i].word);
free(Conf->Spell); free(Conf->Spell);
}
if (Conf->Affix) free(Conf->Affix);
if ( Conf->CompoundAffix ) free(Conf->CompoundAffix);
freeSPNode(Conf->Dictionary);
freeANode(Conf->Suffix);
freeANode(Conf->Prefix);
memset((void *) Conf, 0, sizeof(IspellDict)); memset((void *) Conf, 0, sizeof(IspellDict));
return; return;
} }
...@@ -4,15 +4,43 @@ ...@@ -4,15 +4,43 @@
#include <sys/types.h> #include <sys/types.h>
#include <regex.h> #include <regex.h>
struct SPNode;

/*
 * One slot of a dictionary tree node.
 *   val           - byte value this slot matches at the node's position
 *   isword        - set when a dictionary word ends at this slot
 *   compoundallow - set when that word's affix string contains the
 *                   dictionary's compound-control character
 *   affix         - index into the dictionary's AffixData array
 *   node          - subtree for the next character position, or NULL
 */
typedef struct {
    u_int32_t
        val:8,
        isword:1,
        compoundallow:1,
        affix:22;
    struct SPNode *node;
} SPNodeData;

/*
 * Dictionary tree node: `length` slots stored inline after the header.
 * Nodes are allocated as SPNHRDSZ + length * sizeof(SPNodeData).
 */
typedef struct SPNode {
    u_int32_t length;
    SPNodeData data[1];
} SPNode;

/*
 * Size of the SPNode header, i.e. the offset of data[0].  This is not simply
 * sizeof(u_int32_t): SPNodeData contains a pointer, so where pointers are
 * more strictly aligned than u_int32_t there is padding after `length`, and
 * an allocation sized from sizeof(u_int32_t) would be too small for the last
 * slot.
 */
#define SPNHRDSZ (sizeof(SPNode) - sizeof(SPNodeData))
typedef struct spell_struct typedef struct spell_struct
{ {
char *word; char *word;
char flag[10]; union {
char flag[16];
struct {
int affix;
int len;
} d;
} p;
} SPELL; } SPELL;
typedef struct aff_struct typedef struct aff_struct
{ {
char flag; char flag;
char flagflags;
char type; char type;
char mask[33]; char mask[33];
char find[16]; char find[16];
...@@ -22,35 +50,66 @@ typedef struct aff_struct ...@@ -22,35 +50,66 @@ typedef struct aff_struct
char compile; char compile;
} AFFIX; } AFFIX;
#define FF_CROSSPRODUCT 0x01
#define FF_COMPOUNDWORD 0x02
#define FF_COMPOUNDONLYAFX 0x04
struct AffixNode;

/*
 * One slot of an affix tree node.
 *   val  - byte value this slot matches at the node's position
 *   naff - number of entries in aff
 *   aff  - malloc'd array of pointers to the AFFIX records that end at this
 *          slot; only allocated when naff is non-zero
 *   node - subtree for the next character position, or NULL
 */
typedef struct {
    u_int32_t
        val:8,
        naff:24;
    AFFIX **aff;
    struct AffixNode *node;
} AffixNodeData;

/*
 * Affix tree node: `length` slots stored inline after the header.
 * Nodes are allocated as ANHRDSZ + length * sizeof(AffixNodeData).
 */
typedef struct AffixNode {
    u_int32_t length;
    AffixNodeData data[1];
} AffixNode;

/*
 * Size of the AffixNode header, i.e. the offset of data[0].  This is not
 * simply sizeof(u_int32_t): AffixNodeData contains pointers, so where
 * pointers are more strictly aligned than u_int32_t there is padding after
 * `length`, and an allocation sized from sizeof(u_int32_t) would be too
 * small for the last slot.
 */
#define ANHRDSZ (sizeof(AffixNode) - sizeof(AffixNodeData))
typedef struct Tree_struct typedef struct Tree_struct
{ {
int Left[256], int Left[256],
Right[256]; Right[256];
} Tree_struct; } Tree_struct;
/*
 * Entry of the compound-affix table (IspellDict.CompoundAffix).
 * The table is terminated by an entry whose affix is NULL.
 */
typedef struct {
/* affix text; points at the repl buffer of an AFFIX record, not a separate copy */
char *affix;
/* length of the affix text (the AFFIX record's replen) */
int len;
} CMPDAffix;
typedef struct typedef struct
{ {
int maffixes; int maffixes;
int naffixes; int naffixes;
AFFIX *Affix; AFFIX *Affix;
char compoundcontrol;
int nspell; int nspell;
int mspell; int mspell;
SPELL *Spell; SPELL *Spell;
Tree_struct SpellTree;
Tree_struct PrefixTree; AffixNode *Suffix;
Tree_struct SuffixTree; AffixNode *Prefix;
SPNode *Dictionary;
char **AffixData;
CMPDAffix *CompoundAffix;
} IspellDict; } IspellDict;
char **NormalizeWord(IspellDict * Conf, char *word); char **NINormalizeWord(IspellDict * Conf, char *word);
int ImportAffixes(IspellDict * Conf, const char *filename); int NIImportAffixes(IspellDict * Conf, const char *filename);
int ImportDictionary(IspellDict * Conf, const char *filename); int NIImportDictionary(IspellDict * Conf, const char *filename);
int AddSpell(IspellDict * Conf, const char *word, const char *flag); int NIAddSpell(IspellDict * Conf, const char *word, const char *flag);
int AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const char *repl, int type); int NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type);
void SortDictionary(IspellDict * Conf); void NISortDictionary(IspellDict * Conf);
void SortAffixes(IspellDict * Conf); void NISortAffixes(IspellDict * Conf);
void FreeIspell(IspellDict * Conf); void NIFree(IspellDict * Conf);
#endif #endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment