Commit 7ac8a4be authored by Teodor Sigaev

Multibyte encodings support for ISpell dictionary

parent e3b98527
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "regis.h"
#include "ts_locale.h"
#include "common.h"
/*
 * RS_isRegis
 *		Decide whether "str" uses only the characters this module accepts in
 *		a pattern: alphabetic characters plus '[', ']' and '^'.
 *
 * NOTE: this is a diff rendering without +/- markers.  Each removed
 * (pre-commit) line is immediately followed by its added (post-commit)
 * replacement: the return type goes from int to bool, the per-byte
 * isalpha()/ptr++ test becomes a per-character t_isalpha()/t_iseq() test
 * advanced by pg_mblen(), and 0/1 become false/true.
 */
int
bool
RS_isRegis(const char *str)
{
unsigned char *ptr = (unsigned char *) str;
while (ptr && *ptr)
/* removed variant: inspects and advances one byte at a time */
if (isalpha(*ptr) || *ptr == '[' || *ptr == ']' || *ptr == '^')
ptr++;
/* added variant: inspects one character and advances by pg_mblen(ptr) bytes */
if (t_isalpha(ptr) || t_iseq(ptr,'[') || t_iseq(ptr,']') || t_iseq(ptr, '^'))
ptr+=pg_mblen(ptr);
else
return 0;
return 1;
/* added variant of the two returns above */
return false;
return true;
}
#define RS_IN_ONEOF 1
......@@ -38,34 +39,32 @@ newRegisNode(RegisNode * prev, int len)
return ptr;
}
int
RS_compile(Regis * r, int issuffix, const char *str)
void
RS_compile(Regis * r, bool issuffix, char *str)
{
int i,
len = strlen(str);
int len = strlen(str);
int state = RS_IN_WAIT;
char *c = (char*)str;
RegisNode *ptr = NULL;
memset(r, 0, sizeof(Regis));
r->issuffix = (issuffix) ? 1 : 0;
for (i = 0; i < len; i++)
while(*c)
{
unsigned char c = *(((unsigned char *) str) + i);
if (state == RS_IN_WAIT)
{
if (isalpha(c))
if (t_isalpha(c))
{
if (ptr)
ptr = newRegisNode(ptr, len);
else
ptr = r->node = newRegisNode(NULL, len);
ptr->data[0] = c;
COPYCHAR(ptr->data, c);
ptr->type = RSF_ONEOF;
ptr->len = 1;
ptr->len = pg_mblen(c);
}
else if (c == '[')
else if (t_iseq(c,'['))
{
if (ptr)
ptr = newRegisNode(ptr, len);
......@@ -75,38 +74,39 @@ RS_compile(Regis * r, int issuffix, const char *str)
state = RS_IN_ONEOF;
}
else
ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
ts_error(ERROR, "Error in regis: %s", str );
}
else if (state == RS_IN_ONEOF)
{
if (c == '^')
if (t_iseq(c,'^'))
{
ptr->type = RSF_NONEOF;
state = RS_IN_NONEOF;
}
else if (isalpha(c))
else if (t_isalpha(c))
{
ptr->data[0] = c;
ptr->len = 1;
COPYCHAR(ptr->data, c);
ptr->len = pg_mblen(c);
state = RS_IN_ONEOF_IN;
}
else
ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
ts_error(ERROR, "Error in regis: %s", str);
}
else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
{
if (isalpha(c))
if (t_isalpha(c))
{
ptr->data[ptr->len] = c;
ptr->len++;
COPYCHAR(ptr->data+ptr->len, c);
ptr->len+=pg_mblen(c);
}
else if (c == ']')
else if (t_iseq(c,']'))
state = RS_IN_WAIT;
else
ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
ts_error(ERROR, "Error in regis: %s", str);
}
else
ts_error(ERROR, "Internal error in RS_compile: %d\n", state);
ts_error(ERROR, "Internal error in RS_compile: %d", state);
c += pg_mblen(c);
}
ptr = r->node;
......@@ -115,8 +115,6 @@ RS_compile(Regis * r, int issuffix, const char *str)
r->nchar++;
ptr = ptr->next;
}
return 0;
}
void
......@@ -135,51 +133,77 @@ RS_free(Regis * r)
r->node = NULL;
}
int
RS_execute(Regis * r, const char *str, int len)
#ifdef TS_USE_WIDE
/*
 * mb_strchr
 *		Report whether the character starting at "c" occurs anywhere in the
 *		NUL-terminated string "str".
 *
 * Both arguments are walked in units of pg_mblen() bytes, so a candidate in
 * "str" matches only when it has the same byte length as the character at
 * "c" and all of its bytes are equal.  Only the first character of "c" is
 * examined.
 *
 * Returns true on the first match, false if the end of "str" is reached.
 */
static bool
mb_strchr(char *str, char *c)
{
	/* length of the needle is loop-invariant: compute it exactly once */
	int			clen = pg_mblen(c);
	char	   *ptr;

	for (ptr = str; *ptr;)
	{
		int			plen = pg_mblen(ptr);

		if (plen == clen)
		{
			int			i = plen;

			/* compare from the last byte backwards, as many bytes as match */
			while (i > 0 && ptr[i - 1] == c[i - 1])
				i--;

			/* every byte matched */
			if (i == 0)
				return true;
		}

		/* step to the next character of the haystack */
		ptr += plen;
	}

	return false;
}
#else
#define mb_strchr(s,c) ( (strchr((s),*(c)) == NULL) ? false : true )
#endif
/*
 * RS_execute
 *		Match "str" against the compiled pattern "r"; the header comment for
 *		this function says it returns true if it matches.
 *
 * NOTE: this is a diff rendering without +/- markers, so removed
 * (pre-commit) and added (post-commit) lines are interleaved: "c" is
 * declared twice, and each "return 0"/"return 1" is followed by its
 * "return false"/"return true" replacement.
 *
 * In the added lines, "len" is a count of characters (one increment per
 * pg_mblen() step), not bytes.  When r->issuffix is set, the cursor is moved
 * forward len - r->nchar characters, so that r->nchar characters remain
 * before the end of the string.  Each node is then tested against one
 * character with mb_strchr(): RSF_ONEOF requires the character to be present
 * in ptr->data, RSF_NONEOF requires it to be absent.
 */
bool
RS_execute(Regis * r, char *str)
{
RegisNode *ptr = r->node;
unsigned char *c;
char *c = str;
int len=0;
/* removed variant: caller could pass len < 0 to request strlen() */
if (len < 0)
len = strlen(str);
/* added variant: count characters by stepping pg_mblen() bytes at a time */
while(*c) {
len++;
c += pg_mblen(c);
}
/* string has fewer characters than the pattern has nodes: cannot match */
if (len < r->nchar)
return 0;
/* removed variant: position the cursor by byte arithmetic */
if (r->issuffix)
c = ((unsigned char *) str) + len - r->nchar;
else
c = (unsigned char *) str;
/* added variant: restart at the beginning and skip whole characters */
c = str;
if (r->issuffix) {
len -= r->nchar;
while(len-- > 0)
c += pg_mblen(c);
}
while (ptr)
{
switch (ptr->type)
{
case RSF_ONEOF:
/* removed variant: single-byte comparison / strchr() */
if (ptr->len == 0)
{
if (*c != *(ptr->data))
return 0;
}
else if (strchr((char *) ptr->data, *c) == NULL)
return 0;
/* added variant: character at c must occur in ptr->data */
if ( mb_strchr((char *) ptr->data, c) != true )
return false;
break;
case RSF_NONEOF:
/* removed variant: single-byte comparison / strchr() */
if (ptr->len == 0)
{
if (*c == *(ptr->data))
return 0;
}
else if (strchr((char *) ptr->data, *c) != NULL)
return 0;
/* added variant: character at c must not occur in ptr->data */
if ( mb_strchr((char *) ptr->data, c) == true )
return false;
break;
default:
ts_error(ERROR, "RS_execute: Unknown type node: %d\n", ptr->type);
}
ptr = ptr->next;
/* removed: advance one byte; added: advance one whole character */
c++;
c+=pg_mblen(c);
}
return 1;
return true;
}
......@@ -27,12 +27,12 @@ typedef struct Regis
unused:15;
} Regis;
int RS_isRegis(const char *str);
bool RS_isRegis(const char *str);
int RS_compile(Regis * r, int issuffix, const char *str);
void RS_compile(Regis * r, bool issuffix, char *str);
void RS_free(Regis * r);
/* 1 */
int RS_execute(Regis * r, const char *str, int len);
/*returns true if matches */
bool RS_execute(Regis * r, char *str);
#endif
......@@ -6,6 +6,7 @@
#include "postgres.h"
#include "spell.h"
#include "common.h"
#include "ts_locale.h"
#define MAX_NORM 1024
......@@ -13,7 +14,7 @@
#define ERRSTRSIZE 1024
#define STRNCASECMP(x,y) pg_strncasecmp(x, y, strlen(y))
#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
#define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
......@@ -41,6 +42,18 @@ strnduplicate(char *s, int len)
return d;
}
/*
 * findchar
 *		Scan "str" one character at a time (advancing by pg_mblen() bytes)
 *		and return a pointer to the first position for which t_iseq()
 *		reports a match with "c".
 *
 * Returns NULL when the terminating NUL is reached without a match.
 */
static char *
findchar(char *str, int c)
{
	char	   *cur;

	for (cur = str; *cur; cur += pg_mblen(cur))
	{
		if (t_iseq(cur, c))
			return cur;
	}

	return NULL;
}
/* backward string compare for suffix tree operations */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
......@@ -145,15 +158,17 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
char *s;
const char *flag;
pg_verifymbstr( str, strlen(str), false);
flag = NULL;
if ((s = strchr(str, '/')))
if ((s = findchar(str, '/')))
{
*s++ = '\0';
flag = s;
while (*s)
{
if (isprint((unsigned char) *s) &&
!isspace((unsigned char) *s))
/* we allow only single encoded flags for faster works */
if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
s++;
else
{
......@@ -164,16 +179,19 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
}
else
flag = "";
lowerstr(str);
/* Dont load words if first letter is not required */
/* It allows to optimize loading at search time */
s = str;
while (*s)
{
if (*s == '\r' || *s == '\n')
if (t_isspace(s)) {
*s = '\0';
s++;
break;
}
s+=pg_mblen(s);
}
lowerstr(str);
NIAddSpell(Conf, str, flag);
}
fclose(dict);
......@@ -253,9 +271,10 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
}
else
{
int masklen = strlen(mask);
Conf->Affix[Conf->naffixes].issimple = 0;
Conf->Affix[Conf->naffixes].isregis = 0;
Conf->Affix[Conf->naffixes].mask = (char *) malloc(strlen(mask) + 2);
Conf->Affix[Conf->naffixes].mask = (char *) malloc(masklen + 2);
if (type == FF_SUFFIX)
sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
else
......@@ -277,37 +296,93 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
return (0);
}
static char *
remove_spaces(char *dist, char *src)
{
char *d,
*s;
#define PAE_WAIT_MASK 0
#define PAE_INMASK 1
#define PAE_WAIT_FIND 2
#define PAE_INFIND 3
#define PAE_WAIT_REPL 4
#define PAE_INREPL 5
/*
 * parse_affentry
 *		Split one affix-file line "str" into its mask, find and repl parts,
 *		copying whole characters (COPYCHAR + pg_mblen) into the caller's
 *		buffers.
 *
 * The parser is a state machine over the PAE_* states:
 *	PAE_WAIT_MASK	skip whitespace; '#' as the first non-space character
 *					rejects the line; anything else starts the mask
 *	PAE_INMASK		collect non-space characters until '>'
 *	PAE_WAIT_FIND	'-' introduces a find part; an alphabetic character
 *					starts repl directly (no find part)
 *	PAE_INFIND		collect alphabetic characters until ','
 *	PAE_WAIT_REPL	'-' means an empty repl; an alphabetic character
 *					starts repl
 *	PAE_INREPL		collect alphabetic characters until '#' or end of line
 * Whitespace is skipped in every state; any other unexpected character
 * is reported through ts_error(ERROR, ...).
 *
 * Returns true when mask is non-empty and at least one of find/repl is
 * non-empty, false otherwise.
 *
 * NOTE: this is a diff rendering without +/- markers.  The lines using
 * "d", "s", "dist" and "src" near the end belong to the removed
 * remove_spaces() function and are interleaved with this function's tail.
 */
static bool
parse_affentry( char *str, char *mask, char *find, char *repl ) {
int state = PAE_WAIT_MASK;
char *pmask=mask, *pfind=find, *prepl=repl;
/* start with all three outputs empty */
*mask = *find = *repl = '\0';
while(*str) {
if ( state == PAE_WAIT_MASK ) {
if ( t_iseq(str,'#') )
return false;
else if (!t_isspace(str)) {
COPYCHAR(pmask, str);
pmask += pg_mblen(str);
state = PAE_INMASK;
}
} else if ( state == PAE_INMASK ) {
if ( t_iseq(str,'>') ) {
*pmask='\0';
state = PAE_WAIT_FIND;
} else if (!t_isspace(str)) {
COPYCHAR(pmask, str);
pmask += pg_mblen(str);
}
} else if ( state == PAE_WAIT_FIND ) {
if ( t_iseq(str,'-') ) {
state = PAE_INFIND;
} else if (t_isalpha(str)) {
/* no '-' seen: this text goes to repl and find stays empty */
COPYCHAR(prepl,str);
prepl += pg_mblen(str);
state = PAE_INREPL;
} else if (!t_isspace(str))
ts_error(ERROR, "Affix parse error");
} else if ( state == PAE_INFIND ) {
if ( t_iseq(str,',') ) {
*pfind='\0';
state = PAE_WAIT_REPL;
} else if (t_isalpha(str)) {
COPYCHAR(pfind,str);
pfind += pg_mblen(str);
} else if (!t_isspace(str))
ts_error(ERROR, "Affix parse error");
} else if ( state == PAE_WAIT_REPL ) {
if ( t_iseq(str,'-') ) {
break; /* void repl */
} else if ( t_isalpha(str) ) {
COPYCHAR(prepl,str);
prepl += pg_mblen(str);
state = PAE_INREPL;
} else if (!t_isspace(str))
ts_error(ERROR, "Affix parse error");
} else if ( state == PAE_INREPL ) {
if ( t_iseq(str,'#') ) {
*prepl = '\0';
break;
} else if ( t_isalpha(str) ) {
COPYCHAR(prepl,str);
prepl += pg_mblen(str);
} else if (!t_isspace(str))
ts_error(ERROR, "Affix parse error");
} else
ts_error(ERROR, "Unknown state in parse_affentry: %d", state);
/* the d/s lines below are the removed remove_spaces() body */
d = dist;
s = src;
while (*s)
{
if (*s != ' ' && *s != '-' && *s != '\t')
{
*d = *s;
d++;
}
s++;
/* added line: advance the input by one whole character */
str += pg_mblen(str);
}
*d = 0;
return (dist);
}
/* terminate all three outputs at their current write positions */
*pmask = *pfind = *prepl = '\0';
return ( *mask && ( *find || *repl) ) ? true : false;
}
int
NIImportAffixes(IspellDict * Conf, const char *filename)
{
char str[BUFSIZ];
char tmpstr[BUFSIZ];
char mask[BUFSIZ];
char find[BUFSIZ];
char repl[BUFSIZ];
char *s;
int i;
int suffixes = 0;
int prefixes = 0;
int flag = 0;
......@@ -320,37 +395,45 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
while (fgets(str, sizeof(str), affix))
{
if (STRNCASECMP(str, "compoundwords") == 0)
pg_verifymbstr( str, strlen(str), false);
memcpy(tmpstr, str, 32); /* compoundwords... */
tmpstr[32]='\0';
lowerstr(tmpstr);
if (STRNCMP(tmpstr, "compoundwords") == 0)
{
s = strchr(str, 'l');
s = findchar(str, 'l');
if (s)
{
while (*s != ' ')
s++;
while (*s == ' ')
s++;
while (*s && !t_isspace(s)) s++;
while (*s && t_isspace(s)) s++;
if ( *s && pg_mblen(s) == 1 )
Conf->compoundcontrol = *s;
continue;
}
}
if (STRNCASECMP(str, "suffixes") == 0)
if (STRNCMP(tmpstr, "suffixes") == 0)
{
suffixes = 1;
prefixes = 0;
continue;
}
if (STRNCASECMP(str, "prefixes") == 0)
if (STRNCMP(tmpstr, "prefixes") == 0)
{
suffixes = 0;
prefixes = 1;
continue;
}
if (STRNCASECMP(str, "flag ") == 0)
if (STRNCMP(tmpstr, "flag") == 0)
{
s = str + 5;
s = str + 4;
flagflags = 0;
while (*s == ' ')
s++;
while (*s && t_isspace(s)) s++;
/* allow only single-encoded flags */
if ( pg_mblen(s) != 1 )
continue;
if (*s == '*')
{
flagflags |= FF_CROSSPRODUCT;
......@@ -365,43 +448,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
if (*s == '\\')
s++;
/* allow only single-encoded flags */
if ( pg_mblen(s) != 1 ) {
flagflags = 0;
continue;
}
flag = (unsigned char) *s;
continue;
}
if ((!suffixes) && (!prefixes))
continue;
if ((s = strchr(str, '#')))
*s = 0;
if (!*str)
continue;
lowerstr(str);
strcpy(mask, "");
strcpy(find, "");
strcpy(repl, "");
i = sscanf(str, "%[^>\n]>%[^,\n],%[^\n]", mask, find, repl);
remove_spaces(str, repl);
strcpy(repl, str);
remove_spaces(str, find);
strcpy(find, str);
remove_spaces(str, mask);
strcpy(mask, str);
switch (i)
{
case 3:
break;
case 2:
if (*find != '\0')
{
strcpy(repl, find);
strcpy(find, "");
}
break;
default:
if ( !parse_affentry(str, mask, find, repl) )
continue;
}
NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
}
fclose(affix);
......@@ -768,30 +831,28 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
{
if (Affix->compile)
{
RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? 1 : 0, Affix->mask);
RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? true : false, Affix->mask);
Affix->compile = 0;
}
if (RS_execute(&(Affix->reg.regis), newword, -1))
if (RS_execute(&(Affix->reg.regis), newword))
return newword;
}
else
{
regmatch_t subs[2]; /* workaround for apache&linux */
int err;
pg_wchar *data;
size_t data_len;
int dat_len;
int newword_len;
if (Affix->compile)
{
int wmasklen,
masklen = strlen(Affix->mask);
pg_wchar *mask;
mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
wmasklen = pg_mb2wchar_with_len(Affix->mask, mask, masklen);
err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_ADVANCED | REG_NOSUB);
pfree(mask);
if (err)
{
......@@ -804,11 +865,11 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
}
/* Convert data string to wide characters */
dat_len = strlen(newword);
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
data_len = pg_mb2wchar_with_len(newword, data, dat_len);
newword_len = strlen(newword);
data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
data_len = pg_mb2wchar_with_len(newword, data, newword_len);
if (!(err = pg_regexec(&(Affix->reg.regex), data, dat_len, 0, NULL, 1, subs, 0)))
if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0)))
{
pfree(data);
return newword;
......
......@@ -4,8 +4,6 @@
*/
#include "postgres.h"
#include <ctype.h>
#include "miscadmin.h"
#include "common.h"
......@@ -71,6 +69,8 @@ readstoplist(text *in, StopList * s)
while (fgets(buf, STOPBUFLEN, hin))
{
buf[strlen(buf) - 1] = '\0';
pg_verifymbstr( buf, strlen(buf), false );
lowerstr(buf);
if (*buf == '\0')
continue;
......
......@@ -57,7 +57,7 @@ int _t_isprint( char *ptr );
int lll = pg_mblen( s ); \
\
while( lll-- ) \
TOUCHAR(d+lll) = TOUCHAR(s+lll); \
TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
} while(0)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment