Commit dde94572 authored by Teodor Sigaev

Fix and improve compound word support. These changes cannot be applied to

a previous version without recreating tsvector fields...

Thanks to Alexander Presber <aljoscha@weisshuhn.de> for discovering the problem.
parent 21e2544a
...@@ -737,9 +737,9 @@ NISortAffixes(IspellDict * Conf) ...@@ -737,9 +737,9 @@ NISortAffixes(IspellDict * Conf)
{ {
if (firstsuffix < 0) if (firstsuffix < 0)
firstsuffix = i; firstsuffix = i;
if (Affix->flagflags & FF_COMPOUNDONLYAFX) if ((Affix->flagflags & FF_COMPOUNDONLYAFX) && Affix->replen>0 )
{ {
if (!ptr->affix || if (ptr == Conf->CompoundAffix ||
strbncmp((const unsigned char *) (ptr - 1)->affix, strbncmp((const unsigned char *) (ptr - 1)->affix,
(const unsigned char *) Affix->repl, (const unsigned char *) Affix->repl,
(ptr - 1)->len)) (ptr - 1)->len))
...@@ -1024,17 +1024,31 @@ typedef struct SplitVar ...@@ -1024,17 +1024,31 @@ typedef struct SplitVar
} SplitVar; } SplitVar;
static int static int
CheckCompoundAffixes(CMPDAffix ** ptr, char *word, int len) CheckCompoundAffixes(CMPDAffix ** ptr, char *word, int len, bool CheckInPlace)
{ {
while ((*ptr)->affix) if ( CheckInPlace ) {
{ while ((*ptr)->affix)
if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0) {
if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
{
len = (*ptr)->len;
(*ptr)++;
return len;
}
(*ptr)++;
}
} else {
char *affbegin;
while ((*ptr)->affix)
{ {
len = (*ptr)->len; if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
{
len = (*ptr)->len + (affbegin-word);
(*ptr)++;
return len;
}
(*ptr)++; (*ptr)++;
return len;
} }
(*ptr)++;
} }
return 0; return 0;
} }
...@@ -1078,26 +1092,11 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word, ...@@ -1078,26 +1092,11 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
memset(notprobed, 1, wordlen); memset(notprobed, 1, wordlen);
var = CopyVar(orig, 1); var = CopyVar(orig, 1);
while (node && level < wordlen) while (level < wordlen)
{ {
StopLow = node->data; /* find word with epenthetic or/and compound suffix */
StopHigh = node->data + node->length;
while (StopLow < StopHigh)
{
StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
if (StopMiddle->val == ((uint8 *) (word))[level])
break;
else if (StopMiddle->val < ((uint8 *) (word))[level])
StopLow = StopMiddle + 1;
else
StopHigh = StopMiddle;
}
if (StopLow >= StopHigh)
break;
/* find word with epenthetic */
caff = Conf->CompoundAffix; caff = Conf->CompoundAffix;
while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level)) > 0) while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) > 0)
{ {
/* /*
* there is one of compound suffixes, so check word for existings * there is one of compound suffixes, so check word for existings
...@@ -1143,41 +1142,61 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word, ...@@ -1143,41 +1142,61 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
} }
} }
/* find infinitive */ if ( !node )
if (StopMiddle->isword && StopMiddle->compoundallow && notprobed[level]) break;
StopLow = node->data;
StopHigh = node->data + node->length;
while (StopLow < StopHigh)
{ {
/* ok, we found full compoundallowed word */ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
if (level > minpos) if (StopMiddle->val == ((uint8 *) (word))[level])
break;
else if (StopMiddle->val < ((uint8 *) (word))[level])
StopLow = StopMiddle + 1;
else
StopHigh = StopMiddle;
}
if (StopLow < StopHigh) {
/* find infinitive */
if (StopMiddle->isword && StopMiddle->compoundallow && notprobed[level])
{ {
/* and its length more than minimal */ /* ok, we found full compoundallowed word */
if (wordlen == level + 1) if (level > minpos)
{
/* well, it was last word */
var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
var->nstem++;
pfree(notprobed);
return var;
}
else
{ {
/* then we will search more big word at the same point */ /* and its length more than minimal */
SplitVar *ptr = var; if (wordlen == level + 1)
{
while (ptr->next) /* well, it was last word */
ptr = ptr->next; var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level); var->nstem++;
/* we can find next word */ pfree(notprobed);
level++; return var;
var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos); }
var->nstem++; else
node = Conf->Dictionary; {
startpos = level; /* then we will search more big word at the same point */
continue; SplitVar *ptr = var;
while (ptr->next)
ptr = ptr->next;
ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
/* we can find next word */
level++;
var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
var->nstem++;
node = Conf->Dictionary;
startpos = level;
continue;
}
} }
} }
} node = StopMiddle->node;
} else
node = NULL;
level++; level++;
node = StopMiddle->node;
} }
var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos); var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment