Commit de55c0ce authored by Teodor Sigaev

1 Fix affixes with void replacement (AFAIK, this only affects Russian)

2 Optimize regex execution
parent 153d5d31
# $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.5 2003/11/29 19:51:36 pgsql Exp $ # $PostgreSQL: pgsql/contrib/tsearch2/ispell/Makefile,v 1.6 2004/06/23 11:06:11 teodor Exp $
subdir = contrib/tsearch2/ispell subdir = contrib/tsearch2/ispell
top_builddir = ../../.. top_builddir = ../../..
...@@ -8,7 +8,7 @@ include $(top_builddir)/src/Makefile.global ...@@ -8,7 +8,7 @@ include $(top_builddir)/src/Makefile.global
PG_CPPFLAGS = -I$(srcdir)/.. $(CPPFLAGS) PG_CPPFLAGS = -I$(srcdir)/.. $(CPPFLAGS)
override CFLAGS += $(CFLAGS_SL) override CFLAGS += $(CFLAGS_SL)
SUBOBJS = spell.o SUBOBJS = spell.o regis.o
all: SUBSYS.o all: SUBSYS.o
......
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "regis.h"
#include "common.h"
/*
 * RS_isRegis
 *	Tell whether a mask can be handled by the simplified "regis" matcher.
 *
 * A mask qualifies when every byte is either alphabetic (as reported by
 * isalpha() in the current locale) or one of the three meta characters
 * '[', ']' and '^'.  Only the character set is checked here; bracket
 * balancing is not verified.
 *
 * Returns 1 when the whole string qualifies (this includes the empty string
 * and a NULL pointer), 0 as soon as a foreign character is met.
 */
int
RS_isRegis(const char *str)
{
	const unsigned char *cur;

	if (str == NULL)
		return 1;

	for (cur = (const unsigned char *) str; *cur != '\0'; cur++)
	{
		int			is_meta = (*cur == '[' || *cur == ']' || *cur == '^');

		if (!is_meta && !isalpha(*cur))
			return 0;
	}

	return 1;
}
#define RS_IN_ONEOF 1
#define RS_IN_ONEOF_IN 2
#define RS_IN_NONEOF 3
#define RS_IN_WAIT 4
/*
 * newRegisNode
 *	Allocate a zero-filled RegisNode able to hold up to 'len' characters
 *	plus a terminating NUL in its data[] array, and append it to 'prev'.
 *
 * prev - node the new one is chained after (prev->next is set); may be NULL
 *        when the new node starts a chain.
 * len  - maximum number of characters the node's data[] has to hold.
 *
 * Returns the new node.  An allocation failure is reported through
 * ts_error(ERROR, ...).
 *
 * The size is derived from sizeof(RegisNode) rather than RNHDRSZ:
 * RNHDRSZ adds up member sizes and ignores alignment padding, so on
 * platforms where 'next' is padded (e.g. 64-bit pointers after a 32-bit
 * bitfield) data[] starts beyond RNHDRSZ and the old allocation was too
 * short.  sizeof(RegisNode) already accounts for padding and for data[1],
 * which provides the byte for the terminating NUL.
 */
static RegisNode*
newRegisNode(RegisNode *prev, int len) {
	size_t		size = sizeof(RegisNode) + len;
	RegisNode  *ptr;

	ptr = (RegisNode*)malloc(size);
	if (!ptr)
		ts_error(ERROR, "No memory");

	/* zero fill: next = NULL, len = 0 and data[] is NUL-terminated */
	memset(ptr,0,size);

	if (prev)
		prev->next=ptr;

	return ptr;
}
/*
 * RS_compile
 *	Compile a regis pattern into a chain of RegisNode items stored in *r.
 *
 * The accepted syntax is a sequence of items, each matching exactly one
 * character of the subject string:
 *		x		- a single alphabetic character
 *		[xyz]	- any one of the listed characters
 *		[^xyz]	- any character except the listed ones
 *
 * r        - output; it is zeroed first, any previous content is NOT freed.
 * issuffix - non-zero if the pattern is to be matched against the end of
 *            a word, zero for the beginning.
 * str      - NUL-terminated pattern.
 *
 * Returns 0.  Syntax errors are reported through ts_error(ERROR, ...)
 * (presumably non-returning - confirm against ts_error's definition).
 * A pattern that ends inside an unterminated "[..." is now reported as an
 * error too, instead of being silently accepted.
 */
int
RS_compile(Regis *r, int issuffix, const char *str) {
	int i,len = strlen(str);
	int state = RS_IN_WAIT;
	RegisNode *ptr=NULL;

	memset(r,0,sizeof(Regis));
	r->issuffix = (issuffix) ? 1 : 0;

	for(i=0;i<len;i++) {
		unsigned char c = *( ( (unsigned char*)str ) + i );

		if ( state == RS_IN_WAIT ) {
			if ( isalpha(c) ) {
				/* plain letter: a one-character "one of" node */
				if ( ptr )
					ptr = newRegisNode(ptr,len);
				else
					ptr = r->node = newRegisNode(NULL,len);
				ptr->data[ 0 ] = c;
				ptr->type = RSF_ONEOF;
				ptr->len=1;
			} else if ( c=='[' ) {
				/* start of a character class; type may become NONEOF below */
				if ( ptr )
					ptr = newRegisNode(ptr,len);
				else
					ptr = r->node = newRegisNode(NULL,len);
				ptr->type = RSF_ONEOF;
				state=RS_IN_ONEOF;
			} else
				ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
		} else if ( state == RS_IN_ONEOF ) {
			if ( c=='^' ) {
				ptr->type = RSF_NONEOF;
				state=RS_IN_NONEOF;
			} else if ( isalpha(c) ) {
				ptr->data[ 0 ] = c;
				ptr->len=1;
				state=RS_IN_ONEOF_IN;
			} else
				ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
		} else if ( state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF ) {
			if ( isalpha(c) ) {
				ptr->data[ ptr->len ] = c;
				ptr->len++;
			} else if ( c==']' ) {
				state=RS_IN_WAIT;
			} else
				ts_error(ERROR,"Error in regis: %s at pos %d\n", str, i+1);
		} else
			ts_error(ERROR,"Internal error in RS_compile: %d\n", state);
	}

	/* the pattern must not end in the middle of a "[...]" class */
	if ( state != RS_IN_WAIT )
		ts_error(ERROR,"Error in regis: %s at pos %d\n", str, len+1);

	/* every node consumes exactly one character of the subject string */
	ptr = r->node;
	while(ptr) {
		r->nchar++;
		ptr=ptr->next;
	}

	return 0;
}
/*
 * RS_free
 *	Release every node of a compiled regis and reset r->node to NULL.
 *
 * Only the node chain is freed; the Regis structure itself belongs to the
 * caller and its other fields are left untouched.
 */
void
RS_free(Regis *r)
{
	RegisNode  *node = r->node;

	while (node != NULL)
	{
		/* remember the successor before the current node goes away */
		RegisNode  *following = node->next;

		free(node);
		node = following;
	}

	r->node = NULL;
}
/*
 * RS_execute
 *	Match a string against a compiled regis.
 *
 * r   - compiled pattern (see RS_compile).
 * str - subject string.
 * len - length of str in bytes; a negative value means "use strlen(str)".
 *
 * For a suffix regis the last r->nchar bytes of the string are examined,
 * otherwise the first r->nchar bytes.  Each node is compared with exactly
 * one byte.
 *
 * Returns 1 on match, 0 on mismatch or if the string is shorter than the
 * pattern.  An unknown node type is reported through ts_error(ERROR, ...).
 */
int
RS_execute(Regis *r, const char *str, int len)
{
	unsigned char *cur;
	RegisNode  *node;

	if (len < 0)
		len = strlen(str);

	/* a string shorter than the pattern can never match */
	if (len < r->nchar)
		return 0;

	cur = (unsigned char *) str;
	if (r->issuffix)
		cur += len - r->nchar;

	for (node = r->node; node != NULL; node = node->next, cur++)
	{
		int			hit;

		switch (node->type)
		{
			case RSF_ONEOF:
			case RSF_NONEOF:
				/* is the current byte a member of the node's set? */
				if (node->len == 0)
					hit = (*cur == *(node->data));
				else
					hit = (strchr((char *) node->data, *cur) != NULL);

				/* ONEOF requires membership, NONEOF forbids it */
				if (hit == (node->type == RSF_NONEOF))
					return 0;
				break;
			default:
				ts_error(ERROR,"RS_execute: Unknown type node: %d\n", node->type);
		}
	}

	return 1;
}
#ifndef __REGIS_H__
#define __REGIS_H__

#include "postgres.h"

#include <stddef.h>				/* offsetof */

/*
 * "regis" is a simplified regular expression: a sequence of items, each
 * matching exactly one character.  An item is either a set of allowed
 * characters (RSF_ONEOF) or a set of forbidden characters (RSF_NONEOF).
 */

/* One item of a compiled regis; items are chained through 'next'. */
typedef struct RegisNode {
	uint32
		type:2,					/* RSF_ONEOF or RSF_NONEOF */
		len:16,					/* number of characters stored in data[] */
		unused:14;
	struct RegisNode *next;		/* following item, NULL at the end */
	unsigned char	data[1];	/* characters of the set, variable length */
} RegisNode;

/*
 * Size of a RegisNode up to (not including) data[].  It must be computed
 * with offsetof: adding up the member sizes ignores alignment padding
 * (e.g. before an 8-byte pointer following the 4-byte bitfield) and leads
 * to allocations that are too small.
 */
#define RNHDRSZ	(offsetof(RegisNode, data))

#define RSF_ONEOF	1
#define RSF_NONEOF	2

/* A compiled regis. */
typedef struct Regis {
	RegisNode	*node;			/* first item, NULL for an empty pattern */
	uint32
		issuffix:1,				/* 1: match at end of string, 0: at start */
		nchar:16,				/* number of items == characters matched */
		unused:15;
} Regis;

/* Returns 1 if str contains only characters allowed in a regis, else 0. */
int RS_isRegis(const char *str);

/* Compiles str into *r; issuffix selects end-of-string matching. Returns 0. */
int RS_compile(Regis *r, int issuffix, const char *str);

/* Frees the node chain of a compiled regis. */
void RS_free(Regis *r);

/* Returns 1 if str matches r, 0 otherwise; len < 0 means strlen(str). */
int RS_execute(Regis *r, const char *str, int len);

#endif
...@@ -190,24 +190,24 @@ FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly) ...@@ -190,24 +190,24 @@ FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly)
{ {
SPNode *node = Conf->Dictionary; SPNode *node = Conf->Dictionary;
SPNodeData *StopLow, *StopHigh, *StopMiddle; SPNodeData *StopLow, *StopHigh, *StopMiddle;
int level=0, wrdlen=strlen(word); uint8 *ptr =(uint8*)word;
while( node && level<wrdlen) { while( node && *ptr) {
StopLow = node->data; StopLow = node->data;
StopHigh = node->data+node->length; StopHigh = node->data+node->length;
while (StopLow < StopHigh) { while (StopLow < StopHigh) {
StopMiddle = StopLow + (StopHigh - StopLow) / 2; StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
if ( StopMiddle->val == ((uint8*)(word))[level] ) { if ( StopMiddle->val == *ptr ) {
if ( wrdlen==level+1 && StopMiddle->isword ) { if ( *(ptr+1)=='\0' && StopMiddle->isword ) {
if ( compoundonly && !StopMiddle->compoundallow ) if ( compoundonly && !StopMiddle->compoundallow )
return 0; return 0;
if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL)) if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
return 1; return 1;
} }
node=StopMiddle->node; node=StopMiddle->node;
level++; ptr++;
break; break;
} else if ( StopMiddle->val < ((uint8*)(word))[level] ) { } else if ( StopMiddle->val < *ptr ) {
StopLow = StopMiddle + 1; StopLow = StopMiddle + 1;
} else { } else {
StopHigh = StopMiddle; StopHigh = StopMiddle;
...@@ -236,10 +236,23 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const ...@@ -236,10 +236,23 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
} }
MEMOUT(Conf->Affix); MEMOUT(Conf->Affix);
} }
if (type == 's')
if ( strcmp(mask,".")==0 ) {
Conf->Affix[Conf->naffixes].issimple=1;
Conf->Affix[Conf->naffixes].isregis=0;
*( Conf->Affix[Conf->naffixes].mask )='\0';
} else if ( RS_isRegis(mask) ) {
Conf->Affix[Conf->naffixes].issimple=0;
Conf->Affix[Conf->naffixes].isregis=1;
strcpy(Conf->Affix[Conf->naffixes].mask, mask);
} else {
Conf->Affix[Conf->naffixes].issimple=0;
Conf->Affix[Conf->naffixes].isregis=0;
if (type == FF_SUFFIX)
sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask); sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
else else
sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask); sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
}
Conf->Affix[Conf->naffixes].compile = 1; Conf->Affix[Conf->naffixes].compile = 1;
Conf->Affix[Conf->naffixes].flagflags = flagflags; Conf->Affix[Conf->naffixes].flagflags = flagflags;
Conf->Affix[Conf->naffixes].flag = flag; Conf->Affix[Conf->naffixes].flag = flag;
...@@ -366,7 +379,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename) ...@@ -366,7 +379,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
continue; continue;
} }
NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? 's' : 'p'); NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
} }
fclose(affix); fclose(affix);
...@@ -550,6 +563,46 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type) { ...@@ -550,6 +563,46 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type) {
return rs; return rs;
} }
/*
 * mkVoidAffix
 *	Put a "void" node in front of the prefix or suffix tree.  The node
 *	collects all affixes whose replacement is empty (replen == 0), which
 *	cannot be reached by walking the tree character by character.
 *
 * Conf        - dictionary being built; Conf->Suffix or Conf->Prefix is
 *               replaced by the new node, which links to the old tree root.
 * issuffix    - non-zero to process suffixes, zero for prefixes.
 * startsuffix - index in Conf->Affix of the first suffix; prefixes occupy
 *               [0, startsuffix), suffixes [startsuffix, Conf->naffixes).
 *
 * The node is installed even when no void affix exists; in that case its
 * naff stays 0.
 *
 * The node is sized with sizeof(AffixNode), which covers the header, any
 * alignment padding and the single data[] element.  The former
 * ANHRDSZ + sizeof(AffixNodeData) ignores padding between the header and
 * data[] and is therefore too small where AffixNodeData needs stricter
 * alignment than uint32 (presumably the case on 64-bit platforms, since it
 * holds pointers).
 */
static void
mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix) {
	int i,cnt=0;
	int start = (issuffix) ? startsuffix : 0;
	int end = (issuffix) ? Conf->naffixes : startsuffix;
	AffixNode *Affix = (AffixNode*)malloc( sizeof(AffixNode) );

	MEMOUT(Affix);
	memset(Affix, 0, sizeof(AffixNode) );
	Affix->length=1;
	Affix->isvoid=1;

	/* chain the existing tree behind the void node */
	if (issuffix) {
		Affix->data->node=Conf->Suffix;
		Conf->Suffix = Affix;
	} else {
		Affix->data->node=Conf->Prefix;
		Conf->Prefix = Affix;
	}

	/* first pass: count affixes with empty replacement */
	for(i=start;i<end;i++)
		if (Conf->Affix[i].replen==0)
			cnt++;

	if ( cnt==0 )
		return;

	Affix->data->aff = (AFFIX**)malloc( sizeof(AFFIX*) * cnt );
	MEMOUT(Affix->data->aff);
	Affix->data->naff = (uint32)cnt;

	/* second pass: collect pointers to them */
	cnt=0;
	for(i=start;i<end;i++)
		if (Conf->Affix[i].replen==0) {
			Affix->data->aff[cnt] = Conf->Affix + i;
			cnt++;
		}
}
void void
NISortAffixes(IspellDict * Conf) NISortAffixes(IspellDict * Conf)
{ {
...@@ -584,6 +637,8 @@ NISortAffixes(IspellDict * Conf) ...@@ -584,6 +637,8 @@ NISortAffixes(IspellDict * Conf)
Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p'); Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p');
Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's'); Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's');
mkVoidAffix(Conf, 1, firstsuffix);
mkVoidAffix(Conf, 0, firstsuffix);
} }
static AffixNodeData* static AffixNodeData*
...@@ -591,17 +646,23 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type) ...@@ -591,17 +646,23 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
AffixNodeData *StopLow, *StopHigh, *StopMiddle; AffixNodeData *StopLow, *StopHigh, *StopMiddle;
uint8 symbol; uint8 symbol;
if ( node->isvoid ) { /* search void affixes */
if (node->data->naff)
return node->data;
node = node->data->node;
}
while( node && *level<wrdlen) { while( node && *level<wrdlen) {
StopLow = node->data; StopLow = node->data;
StopHigh = node->data+node->length; StopHigh = node->data+node->length;
while (StopLow < StopHigh) { while (StopLow < StopHigh) {
StopMiddle = StopLow + (StopHigh - StopLow) / 2; StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
symbol = GETWCHAR(word,wrdlen,*level,type); symbol = GETWCHAR(word,wrdlen,*level,type);
if ( StopMiddle->val == symbol ) { if ( StopMiddle->val == symbol ) {
(*level)++;
if ( StopMiddle->naff ) if ( StopMiddle->naff )
return StopMiddle; return StopMiddle;
node=StopMiddle->node; node=StopMiddle->node;
(*level)++;
break; break;
} else if ( StopMiddle->val < symbol ) { } else if ( StopMiddle->val < symbol ) {
StopLow = StopMiddle + 1; StopLow = StopMiddle + 1;
...@@ -617,11 +678,6 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type) ...@@ -617,11 +678,6 @@ FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
static char * static char *
CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) { CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) {
regmatch_t subs[2]; /* workaround for apache&linux */
int err;
pg_wchar *data;
size_t data_len;
int dat_len;
if ( flagflags & FF_COMPOUNDONLYAFX ) { if ( flagflags & FF_COMPOUNDONLYAFX ) {
if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 ) if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 )
...@@ -631,7 +687,7 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne ...@@ -631,7 +687,7 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
return NULL; return NULL;
} }
if ( Affix->type=='s' ) { if ( Affix->type==FF_SUFFIX ) {
strcpy(newword, word); strcpy(newword, word);
strcpy(newword + len - Affix->replen, Affix->find); strcpy(newword + len - Affix->replen, Affix->find);
} else { } else {
...@@ -639,6 +695,21 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne ...@@ -639,6 +695,21 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
strcat(newword, word + Affix->replen); strcat(newword, word + Affix->replen);
} }
if ( Affix->issimple ) {
return newword;
} else if ( Affix->isregis ) {
if (Affix->compile) {
RS_compile(&(Affix->reg.regis), (Affix->type==FF_SUFFIX) ? 1 : 0, Affix->mask);
Affix->compile = 0;
}
if ( RS_execute(&(Affix->reg.regis), newword, -1) )
return newword;
} else {
regmatch_t subs[2]; /* workaround for apache&linux */
int err;
pg_wchar *data;
size_t data_len;
int dat_len;
if (Affix->compile) if (Affix->compile)
{ {
int wmasklen,masklen = strlen(Affix->mask); int wmasklen,masklen = strlen(Affix->mask);
...@@ -646,12 +717,12 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne ...@@ -646,12 +717,12 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar)); mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen); wmasklen = pg_mb2wchar_with_len( Affix->mask, mask, masklen);
err = pg_regcomp(&(Affix->reg), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB); err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
pfree(mask); pfree(mask);
if (err) if (err)
{ {
/* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */ /* regerror(err, &(Affix->reg.regex), regerrstr, ERRSTRSIZE); */
pg_regfree(&(Affix->reg)); pg_regfree(&(Affix->reg.regex));
return (NULL); return (NULL);
} }
Affix->compile = 0; Affix->compile = 0;
...@@ -662,11 +733,12 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne ...@@ -662,11 +733,12 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
data_len = pg_mb2wchar_with_len(newword, data, dat_len); data_len = pg_mb2wchar_with_len(newword, data, dat_len);
if (!(err = pg_regexec(&(Affix->reg), data,dat_len,NULL, 1, subs, 0))) { if (!(err = pg_regexec(&(Affix->reg.regex), data,dat_len,NULL, 1, subs, 0))) {
pfree(data); pfree(data);
return newword; return newword;
} }
pfree(data); pfree(data);
}
return NULL; return NULL;
} }
...@@ -715,7 +787,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) { ...@@ -715,7 +787,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
} }
} }
pnode = prefix->node; pnode = prefix->node;
plevel++;
} }
/* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/ /* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/
...@@ -754,13 +825,11 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) { ...@@ -754,13 +825,11 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
} }
} }
pnode = prefix->node; pnode = prefix->node;
plevel++;
} }
} }
} }
snode=suffix->node; snode=suffix->node;
slevel++;
} }
if (cur == forms) { if (cur == forms) {
...@@ -1013,8 +1082,12 @@ NIFree(IspellDict * Conf) ...@@ -1013,8 +1082,12 @@ NIFree(IspellDict * Conf)
for (i = 0; i < Conf->naffixes; i++) for (i = 0; i < Conf->naffixes; i++)
{ {
if (Affix[i].compile == 0) if (Affix[i].compile == 0) {
pg_regfree(&(Affix[i].reg)); if ( Affix[i].isregis )
RS_free(&(Affix[i].reg.regis));
else
pg_regfree(&(Affix[i].reg.regex));
}
} }
if (Conf->Spell) { if (Conf->Spell) {
for (i = 0; i < Conf->nspell; i++) for (i = 0; i < Conf->nspell; i++)
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include <sys/types.h> #include <sys/types.h>
#include "regex/regex.h" #include "regex/regex.h"
#include "regis.h"
#include "c.h" #include "c.h"
...@@ -40,20 +41,29 @@ typedef struct spell_struct ...@@ -40,20 +41,29 @@ typedef struct spell_struct
typedef struct aff_struct typedef struct aff_struct
{ {
char flag; uint32
char flagflags; flag:8,
char type; type:2,
char mask[33]; compile:1,
flagflags:3,
issimple:1,
isregis:1,
unused:1,
replen:16;
char mask[32];
char find[16]; char find[16];
char repl[16]; char repl[16];
regex_t reg; union {
size_t replen; regex_t regex;
char compile; Regis regis;
} reg;
} AFFIX; } AFFIX;
#define FF_CROSSPRODUCT 0x01 #define FF_CROSSPRODUCT 0x01
#define FF_COMPOUNDWORD 0x02 #define FF_COMPOUNDWORD 0x02
#define FF_COMPOUNDONLYAFX 0x04 #define FF_COMPOUNDONLYAFX 0x04
#define FF_SUFFIX 2
#define FF_PREFIX 1
struct AffixNode; struct AffixNode;
...@@ -66,18 +76,13 @@ typedef struct { ...@@ -66,18 +76,13 @@ typedef struct {
} AffixNodeData; } AffixNodeData;
typedef struct AffixNode { typedef struct AffixNode {
uint32 length; uint32 isvoid:1,
length:31;
AffixNodeData data[1]; AffixNodeData data[1];
} AffixNode; } AffixNode;
#define ANHRDSZ (sizeof(uint32)) #define ANHRDSZ (sizeof(uint32))
typedef struct Tree_struct
{
int Left[256],
Right[256];
} Tree_struct;
typedef struct { typedef struct {
char *affix; char *affix;
int len; int len;
......
...@@ -816,7 +816,7 @@ CREATE OPERATOR CLASS tsvector_ops ...@@ -816,7 +816,7 @@ CREATE OPERATOR CLASS tsvector_ops
FUNCTION 1 tsvector_cmp(tsvector, tsvector); FUNCTION 1 tsvector_cmp(tsvector, tsvector);
--example of ISpell dictionary --example of ISpell dictionary
--update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_id=4; --update pg_ts_dict set dict_initoption='DictFile="/usr/local/share/ispell/russian.dict" ,AffFile ="/usr/local/share/ispell/russian.aff", StopFile="/usr/local/share/ispell/russian.stop"' where dict_name='ispell_template';
--example of synonym dict --example of synonym dict
--update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5; --update pg_ts_dict set dict_initoption='/usr/local/share/ispell/english.syn' where dict_id=5;
END; END;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment