Commit 7ac8a4be authored by Teodor Sigaev

Multibyte encodings support for ISpell dictionary

parent e3b98527
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <ctype.h>
#include "regis.h" #include "regis.h"
#include "ts_locale.h"
#include "common.h" #include "common.h"
int bool
RS_isRegis(const char *str) RS_isRegis(const char *str)
{ {
unsigned char *ptr = (unsigned char *) str; unsigned char *ptr = (unsigned char *) str;
while (ptr && *ptr) while (ptr && *ptr)
if (isalpha(*ptr) || *ptr == '[' || *ptr == ']' || *ptr == '^') if (t_isalpha(ptr) || t_iseq(ptr,'[') || t_iseq(ptr,']') || t_iseq(ptr, '^'))
ptr++; ptr+=pg_mblen(ptr);
else else
return 0; return false;
return 1;
return true;
} }
#define RS_IN_ONEOF 1 #define RS_IN_ONEOF 1
...@@ -38,34 +39,32 @@ newRegisNode(RegisNode * prev, int len) ...@@ -38,34 +39,32 @@ newRegisNode(RegisNode * prev, int len)
return ptr; return ptr;
} }
int void
RS_compile(Regis * r, int issuffix, const char *str) RS_compile(Regis * r, bool issuffix, char *str)
{ {
int i, int len = strlen(str);
len = strlen(str);
int state = RS_IN_WAIT; int state = RS_IN_WAIT;
char *c = (char*)str;
RegisNode *ptr = NULL; RegisNode *ptr = NULL;
memset(r, 0, sizeof(Regis)); memset(r, 0, sizeof(Regis));
r->issuffix = (issuffix) ? 1 : 0; r->issuffix = (issuffix) ? 1 : 0;
for (i = 0; i < len; i++) while(*c)
{ {
unsigned char c = *(((unsigned char *) str) + i);
if (state == RS_IN_WAIT) if (state == RS_IN_WAIT)
{ {
if (isalpha(c)) if (t_isalpha(c))
{ {
if (ptr) if (ptr)
ptr = newRegisNode(ptr, len); ptr = newRegisNode(ptr, len);
else else
ptr = r->node = newRegisNode(NULL, len); ptr = r->node = newRegisNode(NULL, len);
ptr->data[0] = c; COPYCHAR(ptr->data, c);
ptr->type = RSF_ONEOF; ptr->type = RSF_ONEOF;
ptr->len = 1; ptr->len = pg_mblen(c);
} }
else if (c == '[') else if (t_iseq(c,'['))
{ {
if (ptr) if (ptr)
ptr = newRegisNode(ptr, len); ptr = newRegisNode(ptr, len);
...@@ -75,38 +74,39 @@ RS_compile(Regis * r, int issuffix, const char *str) ...@@ -75,38 +74,39 @@ RS_compile(Regis * r, int issuffix, const char *str)
state = RS_IN_ONEOF; state = RS_IN_ONEOF;
} }
else else
ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1); ts_error(ERROR, "Error in regis: %s", str );
} }
else if (state == RS_IN_ONEOF) else if (state == RS_IN_ONEOF)
{ {
if (c == '^') if (t_iseq(c,'^'))
{ {
ptr->type = RSF_NONEOF; ptr->type = RSF_NONEOF;
state = RS_IN_NONEOF; state = RS_IN_NONEOF;
} }
else if (isalpha(c)) else if (t_isalpha(c))
{ {
ptr->data[0] = c; COPYCHAR(ptr->data, c);
ptr->len = 1; ptr->len = pg_mblen(c);
state = RS_IN_ONEOF_IN; state = RS_IN_ONEOF_IN;
} }
else else
ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1); ts_error(ERROR, "Error in regis: %s", str);
} }
else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF) else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
{ {
if (isalpha(c)) if (t_isalpha(c))
{ {
ptr->data[ptr->len] = c; COPYCHAR(ptr->data+ptr->len, c);
ptr->len++; ptr->len+=pg_mblen(c);
} }
else if (c == ']') else if (t_iseq(c,']'))
state = RS_IN_WAIT; state = RS_IN_WAIT;
else else
ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1); ts_error(ERROR, "Error in regis: %s", str);
} }
else else
ts_error(ERROR, "Internal error in RS_compile: %d\n", state); ts_error(ERROR, "Internal error in RS_compile: %d", state);
c += pg_mblen(c);
} }
ptr = r->node; ptr = r->node;
...@@ -115,8 +115,6 @@ RS_compile(Regis * r, int issuffix, const char *str) ...@@ -115,8 +115,6 @@ RS_compile(Regis * r, int issuffix, const char *str)
r->nchar++; r->nchar++;
ptr = ptr->next; ptr = ptr->next;
} }
return 0;
} }
void void
...@@ -135,51 +133,77 @@ RS_free(Regis * r) ...@@ -135,51 +133,77 @@ RS_free(Regis * r)
r->node = NULL; r->node = NULL;
} }
int #ifdef TS_USE_WIDE
RS_execute(Regis * r, const char *str, int len) static bool
mb_strchr(char *str, char *c) {
int clen = pg_mblen(c), plen,i;
char *ptr =str;
bool res=false;
clen = pg_mblen(c);
while( *ptr && !res) {
plen = pg_mblen(ptr);
if ( plen == clen ) {
i=plen;
res = true;
while(i--)
if ( *(ptr+i) != *(c+i) ) {
res = false;
break;
}
}
ptr += plen;
}
return res;
}
#else
#define mb_strchr(s,c) ( (strchr((s),*(c)) == NULL) ? false : true )
#endif
bool
RS_execute(Regis * r, char *str)
{ {
RegisNode *ptr = r->node; RegisNode *ptr = r->node;
unsigned char *c; char *c = str;
int len=0;
if (len < 0) while(*c) {
len = strlen(str); len++;
c += pg_mblen(c);
}
if (len < r->nchar) if (len < r->nchar)
return 0; return 0;
if (r->issuffix) c = str;
c = ((unsigned char *) str) + len - r->nchar; if (r->issuffix) {
else len -= r->nchar;
c = (unsigned char *) str; while(len-- > 0)
c += pg_mblen(c);
}
while (ptr) while (ptr)
{ {
switch (ptr->type) switch (ptr->type)
{ {
case RSF_ONEOF: case RSF_ONEOF:
if (ptr->len == 0) if ( mb_strchr((char *) ptr->data, c) != true )
{ return false;
if (*c != *(ptr->data))
return 0;
}
else if (strchr((char *) ptr->data, *c) == NULL)
return 0;
break; break;
case RSF_NONEOF: case RSF_NONEOF:
if (ptr->len == 0) if ( mb_strchr((char *) ptr->data, c) == true )
{ return false;
if (*c == *(ptr->data))
return 0;
}
else if (strchr((char *) ptr->data, *c) != NULL)
return 0;
break; break;
default: default:
ts_error(ERROR, "RS_execute: Unknown type node: %d\n", ptr->type); ts_error(ERROR, "RS_execute: Unknown type node: %d\n", ptr->type);
} }
ptr = ptr->next; ptr = ptr->next;
c++; c+=pg_mblen(c);
} }
return 1; return true;
} }
...@@ -27,12 +27,12 @@ typedef struct Regis ...@@ -27,12 +27,12 @@ typedef struct Regis
unused:15; unused:15;
} Regis; } Regis;
int RS_isRegis(const char *str); bool RS_isRegis(const char *str);
int RS_compile(Regis * r, int issuffix, const char *str); void RS_compile(Regis * r, bool issuffix, char *str);
void RS_free(Regis * r); void RS_free(Regis * r);
/* 1 */ /*returns true if matches */
int RS_execute(Regis * r, const char *str, int len); bool RS_execute(Regis * r, char *str);
#endif #endif
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include "postgres.h" #include "postgres.h"
#include "spell.h" #include "spell.h"
#include "common.h"
#include "ts_locale.h" #include "ts_locale.h"
#define MAX_NORM 1024 #define MAX_NORM 1024
...@@ -13,7 +14,7 @@ ...@@ -13,7 +14,7 @@
#define ERRSTRSIZE 1024 #define ERRSTRSIZE 1024
#define STRNCASECMP(x,y) pg_strncasecmp(x, y, strlen(y)) #define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
#define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] ) #define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T ) #define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
...@@ -41,6 +42,18 @@ strnduplicate(char *s, int len) ...@@ -41,6 +42,18 @@ strnduplicate(char *s, int len)
return d; return d;
} }
/*
 * Walk the string one multibyte character at a time (stepping by
 * pg_mblen) and return a pointer to the first character for which
 * t_iseq(cursor, c) holds.  Returns NULL when the terminating NUL is
 * reached without a match.
 */
static char *
findchar(char *str, int c)
{
	char	   *cursor;

	for (cursor = str; *cursor != '\0'; cursor += pg_mblen(cursor))
	{
		if (t_iseq(cursor, c))
			return cursor;
	}

	return NULL;
}
/* backward string compare for suffix tree operations */ /* backward string compare for suffix tree operations */
static int static int
strbcmp(const unsigned char *s1, const unsigned char *s2) strbcmp(const unsigned char *s1, const unsigned char *s2)
...@@ -145,15 +158,17 @@ NIImportDictionary(IspellDict * Conf, const char *filename) ...@@ -145,15 +158,17 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
char *s; char *s;
const char *flag; const char *flag;
pg_verifymbstr( str, strlen(str), false);
flag = NULL; flag = NULL;
if ((s = strchr(str, '/'))) if ((s = findchar(str, '/')))
{ {
*s++ = '\0'; *s++ = '\0';
flag = s; flag = s;
while (*s) while (*s)
{ {
if (isprint((unsigned char) *s) && /* we allow only single encoded flags for faster works */
!isspace((unsigned char) *s)) if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
s++; s++;
else else
{ {
...@@ -164,16 +179,19 @@ NIImportDictionary(IspellDict * Conf, const char *filename) ...@@ -164,16 +179,19 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
} }
else else
flag = ""; flag = "";
lowerstr(str);
/* Dont load words if first letter is not required */
/* It allows to optimize loading at search time */
s = str; s = str;
while (*s) while (*s)
{ {
if (*s == '\r' || *s == '\n') if (t_isspace(s)) {
*s = '\0'; *s = '\0';
s++; break;
}
s+=pg_mblen(s);
} }
lowerstr(str);
NIAddSpell(Conf, str, flag); NIAddSpell(Conf, str, flag);
} }
fclose(dict); fclose(dict);
...@@ -253,9 +271,10 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const ...@@ -253,9 +271,10 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
} }
else else
{ {
int masklen = strlen(mask);
Conf->Affix[Conf->naffixes].issimple = 0; Conf->Affix[Conf->naffixes].issimple = 0;
Conf->Affix[Conf->naffixes].isregis = 0; Conf->Affix[Conf->naffixes].isregis = 0;
Conf->Affix[Conf->naffixes].mask = (char *) malloc(strlen(mask) + 2); Conf->Affix[Conf->naffixes].mask = (char *) malloc(masklen + 2);
if (type == FF_SUFFIX) if (type == FF_SUFFIX)
sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask); sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
else else
...@@ -277,37 +296,93 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const ...@@ -277,37 +296,93 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
return (0); return (0);
} }
static char * #define PAE_WAIT_MASK 0
remove_spaces(char *dist, char *src) #define PAE_INMASK 1
{ #define PAE_WAIT_FIND 2
char *d, #define PAE_INFIND 3
*s; #define PAE_WAIT_REPL 4
#define PAE_INREPL 5
static bool
parse_affentry( char *str, char *mask, char *find, char *repl ) {
int state = PAE_WAIT_MASK;
char *pmask=mask, *pfind=find, *prepl=repl;
*mask = *find = *repl = '\0';
while(*str) {
if ( state == PAE_WAIT_MASK ) {
if ( t_iseq(str,'#') )
return false;
else if (!t_isspace(str)) {
COPYCHAR(pmask, str);
pmask += pg_mblen(str);
state = PAE_INMASK;
}
} else if ( state == PAE_INMASK ) {
if ( t_iseq(str,'>') ) {
*pmask='\0';
state = PAE_WAIT_FIND;
} else if (!t_isspace(str)) {
COPYCHAR(pmask, str);
pmask += pg_mblen(str);
}
} else if ( state == PAE_WAIT_FIND ) {
if ( t_iseq(str,'-') ) {
state = PAE_INFIND;
} else if (t_isalpha(str)) {
COPYCHAR(prepl,str);
prepl += pg_mblen(str);
state = PAE_INREPL;
} else if (!t_isspace(str))
ts_error(ERROR, "Affix parse error");
} else if ( state == PAE_INFIND ) {
if ( t_iseq(str,',') ) {
*pfind='\0';
state = PAE_WAIT_REPL;
} else if (t_isalpha(str)) {
COPYCHAR(pfind,str);
pfind += pg_mblen(str);
} else if (!t_isspace(str))
ts_error(ERROR, "Affix parse error");
} else if ( state == PAE_WAIT_REPL ) {
if ( t_iseq(str,'-') ) {
break; /* void repl */
} else if ( t_isalpha(str) ) {
COPYCHAR(prepl,str);
prepl += pg_mblen(str);
state = PAE_INREPL;
} else if (!t_isspace(str))
ts_error(ERROR, "Affix parse error");
} else if ( state == PAE_INREPL ) {
if ( t_iseq(str,'#') ) {
*prepl = '\0';
break;
} else if ( t_isalpha(str) ) {
COPYCHAR(prepl,str);
prepl += pg_mblen(str);
} else if (!t_isspace(str))
ts_error(ERROR, "Affix parse error");
} else
ts_error(ERROR, "Unknown state in parse_affentry: %d", state);
d = dist; str += pg_mblen(str);
s = src;
while (*s)
{
if (*s != ' ' && *s != '-' && *s != '\t')
{
*d = *s;
d++;
}
s++;
} }
*d = 0;
return (dist);
}
*pmask = *pfind = *prepl = '\0';
return ( *mask && ( *find || *repl) ) ? true : false;
}
int int
NIImportAffixes(IspellDict * Conf, const char *filename) NIImportAffixes(IspellDict * Conf, const char *filename)
{ {
char str[BUFSIZ]; char str[BUFSIZ];
char tmpstr[BUFSIZ];
char mask[BUFSIZ]; char mask[BUFSIZ];
char find[BUFSIZ]; char find[BUFSIZ];
char repl[BUFSIZ]; char repl[BUFSIZ];
char *s; char *s;
int i;
int suffixes = 0; int suffixes = 0;
int prefixes = 0; int prefixes = 0;
int flag = 0; int flag = 0;
...@@ -320,37 +395,45 @@ NIImportAffixes(IspellDict * Conf, const char *filename) ...@@ -320,37 +395,45 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
while (fgets(str, sizeof(str), affix)) while (fgets(str, sizeof(str), affix))
{ {
if (STRNCASECMP(str, "compoundwords") == 0) pg_verifymbstr( str, strlen(str), false);
memcpy(tmpstr, str, 32); /* compoundwords... */
tmpstr[32]='\0';
lowerstr(tmpstr);
if (STRNCMP(tmpstr, "compoundwords") == 0)
{ {
s = strchr(str, 'l'); s = findchar(str, 'l');
if (s) if (s)
{ {
while (*s != ' ') while (*s && !t_isspace(s)) s++;
s++; while (*s && t_isspace(s)) s++;
while (*s == ' ') if ( *s && pg_mblen(s) == 1 )
s++;
Conf->compoundcontrol = *s; Conf->compoundcontrol = *s;
continue; continue;
} }
} }
if (STRNCASECMP(str, "suffixes") == 0) if (STRNCMP(tmpstr, "suffixes") == 0)
{ {
suffixes = 1; suffixes = 1;
prefixes = 0; prefixes = 0;
continue; continue;
} }
if (STRNCASECMP(str, "prefixes") == 0) if (STRNCMP(tmpstr, "prefixes") == 0)
{ {
suffixes = 0; suffixes = 0;
prefixes = 1; prefixes = 1;
continue; continue;
} }
if (STRNCASECMP(str, "flag ") == 0) if (STRNCMP(tmpstr, "flag") == 0)
{ {
s = str + 5; s = str + 4;
flagflags = 0; flagflags = 0;
while (*s == ' ')
s++; while (*s && t_isspace(s)) s++;
/* allow only single-encoded flags */
if ( pg_mblen(s) != 1 )
continue;
if (*s == '*') if (*s == '*')
{ {
flagflags |= FF_CROSSPRODUCT; flagflags |= FF_CROSSPRODUCT;
...@@ -365,43 +448,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename) ...@@ -365,43 +448,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
if (*s == '\\') if (*s == '\\')
s++; s++;
/* allow only single-encoded flags */
if ( pg_mblen(s) != 1 ) {
flagflags = 0;
continue;
}
flag = (unsigned char) *s; flag = (unsigned char) *s;
continue; continue;
} }
if ((!suffixes) && (!prefixes)) if ((!suffixes) && (!prefixes))
continue; continue;
if ((s = strchr(str, '#')))
*s = 0;
if (!*str)
continue;
lowerstr(str); lowerstr(str);
strcpy(mask, ""); if ( !parse_affentry(str, mask, find, repl) )
strcpy(find, "");
strcpy(repl, "");
i = sscanf(str, "%[^>\n]>%[^,\n],%[^\n]", mask, find, repl);
remove_spaces(str, repl);
strcpy(repl, str);
remove_spaces(str, find);
strcpy(find, str);
remove_spaces(str, mask);
strcpy(mask, str);
switch (i)
{
case 3:
break;
case 2:
if (*find != '\0')
{
strcpy(repl, find);
strcpy(find, "");
}
break;
default:
continue; continue;
}
NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX); NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
} }
fclose(affix); fclose(affix);
...@@ -768,30 +831,28 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne ...@@ -768,30 +831,28 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
{ {
if (Affix->compile) if (Affix->compile)
{ {
RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? 1 : 0, Affix->mask); RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? true : false, Affix->mask);
Affix->compile = 0; Affix->compile = 0;
} }
if (RS_execute(&(Affix->reg.regis), newword, -1)) if (RS_execute(&(Affix->reg.regis), newword))
return newword; return newword;
} }
else else
{ {
regmatch_t subs[2]; /* workaround for apache&linux */
int err; int err;
pg_wchar *data; pg_wchar *data;
size_t data_len; size_t data_len;
int dat_len; int newword_len;
if (Affix->compile) if (Affix->compile)
{ {
int wmasklen, int wmasklen,
masklen = strlen(Affix->mask); masklen = strlen(Affix->mask);
pg_wchar *mask; pg_wchar *mask;
mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar)); mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
wmasklen = pg_mb2wchar_with_len(Affix->mask, mask, masklen); wmasklen = pg_mb2wchar_with_len(Affix->mask, mask, masklen);
err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB); err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_ADVANCED | REG_NOSUB);
pfree(mask); pfree(mask);
if (err) if (err)
{ {
...@@ -804,11 +865,11 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne ...@@ -804,11 +865,11 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
} }
/* Convert data string to wide characters */ /* Convert data string to wide characters */
dat_len = strlen(newword); newword_len = strlen(newword);
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
data_len = pg_mb2wchar_with_len(newword, data, dat_len); data_len = pg_mb2wchar_with_len(newword, data, newword_len);
if (!(err = pg_regexec(&(Affix->reg.regex), data, dat_len, 0, NULL, 1, subs, 0))) if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0)))
{ {
pfree(data); pfree(data);
return newword; return newword;
......
...@@ -4,8 +4,6 @@ ...@@ -4,8 +4,6 @@
*/ */
#include "postgres.h" #include "postgres.h"
#include <ctype.h>
#include "miscadmin.h" #include "miscadmin.h"
#include "common.h" #include "common.h"
...@@ -71,6 +69,8 @@ readstoplist(text *in, StopList * s) ...@@ -71,6 +69,8 @@ readstoplist(text *in, StopList * s)
while (fgets(buf, STOPBUFLEN, hin)) while (fgets(buf, STOPBUFLEN, hin))
{ {
buf[strlen(buf) - 1] = '\0'; buf[strlen(buf) - 1] = '\0';
pg_verifymbstr( buf, strlen(buf), false );
lowerstr(buf);
if (*buf == '\0') if (*buf == '\0')
continue; continue;
......
...@@ -57,7 +57,7 @@ int _t_isprint( char *ptr ); ...@@ -57,7 +57,7 @@ int _t_isprint( char *ptr );
int lll = pg_mblen( s ); \ int lll = pg_mblen( s ); \
\ \
while( lll-- ) \ while( lll-- ) \
TOUCHAR(d+lll) = TOUCHAR(s+lll); \ TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
} while(0) } while(0)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment