Multibyte encoding support for the ISpell dictionary

7ac8a4be · Teodor Sigaev · e3b98527 · 7ac8a4be · 7ac8a4be · 7ac8a4be
Commit 7ac8a4be authored Dec 21, 2005 by Teodor Sigaev
5 changed files
--- a/contrib/tsearch2/ispell/regis.c
+++ b/contrib/tsearch2/ispell/regis.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <ctype.h>

 #include "regis.h"
+#include "ts_locale.h"
 #include "common.h"

-int
+bool
 RS_isRegis(const char *str)
 {
 	unsigned char *ptr = (unsigned char *) str;

 	while (ptr && *ptr)
-		if (isalpha(*ptr) || *ptr == '[' || *ptr == ']' || *ptr == '^')
-			ptr++;
+		if (t_isalpha(ptr) || t_iseq(ptr,'[') || t_iseq(ptr,']') || t_iseq(ptr, '^'))
+			ptr+=pg_mblen(ptr);
 		else
-			return 0;
-	return 1;
+			return false;
+
+	return true;
 }

 #define RS_IN_ONEOF 1
@@ -38,34 +39,32 @@ newRegisNode(RegisNode * prev, int len)
 	return ptr;
 }

-int
-RS_compile(Regis * r, int issuffix, const char *str)
+void
+RS_compile(Regis * r, bool issuffix, char *str)
 {
-	int			i,
-				len = strlen(str);
+	int			len = strlen(str);
 	int			state = RS_IN_WAIT;
+	char			*c = (char*)str;
 	RegisNode  *ptr = NULL;

 	memset(r, 0, sizeof(Regis));
 	r->issuffix = (issuffix) ? 1 : 0;

-	for (i = 0; i < len; i++)
+	while(*c)
 	{
-		unsigned char c = *(((unsigned char *) str) + i);
-
 		if (state == RS_IN_WAIT)
 		{
-			if (isalpha(c))
+			if (t_isalpha(c))
 			{
 				if (ptr)
 					ptr = newRegisNode(ptr, len);
 				else
 					ptr = r->node = newRegisNode(NULL, len);
-				ptr->data[0] = c;
+				COPYCHAR(ptr->data, c);
 				ptr->type = RSF_ONEOF;
-				ptr->len = 1;
+				ptr->len = pg_mblen(c);
 			}
-			else if (c == '[')
+			else if (t_iseq(c,'['))
 			{
 				if (ptr)
 					ptr = newRegisNode(ptr, len);
@@ -75,38 +74,39 @@ RS_compile(Regis * r, int issuffix, const char *str)
 				state = RS_IN_ONEOF;
 			}
 			else
-				ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
+				ts_error(ERROR, "Error in regis: %s", str );
 		}
 		else if (state == RS_IN_ONEOF)
 		{
-			if (c == '^')
+			if (t_iseq(c,'^'))
 			{
 				ptr->type = RSF_NONEOF;
 				state = RS_IN_NONEOF;
 			}
-			else if (isalpha(c))
+			else if (t_isalpha(c))
 			{
-				ptr->data[0] = c;
-				ptr->len = 1;
+				COPYCHAR(ptr->data, c);
+				ptr->len = pg_mblen(c);
 				state = RS_IN_ONEOF_IN;
 			}
 			else
-				ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
+				ts_error(ERROR, "Error in regis: %s", str);
 		}
 		else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
 		{
-			if (isalpha(c))
+			if (t_isalpha(c))
 			{
-				ptr->data[ptr->len] = c;
-				ptr->len++;
+				COPYCHAR(ptr->data+ptr->len,  c);
+				ptr->len+=pg_mblen(c);
 			}
-			else if (c == ']')
+			else if (t_iseq(c,']'))
 				state = RS_IN_WAIT;
 			else
-				ts_error(ERROR, "Error in regis: %s at pos %d\n", str, i + 1);
+				ts_error(ERROR, "Error in regis: %s", str);
 		}
 		else
-			ts_error(ERROR, "Internal error in RS_compile: %d\n", state);
+			ts_error(ERROR, "Internal error in RS_compile: %d", state);
+		c += pg_mblen(c);
 	}

 	ptr = r->node;
@@ -115,8 +115,6 @@ RS_compile(Regis * r, int issuffix, const char *str)
 		r->nchar++;
 		ptr = ptr->next;
 	}
-
-	return 0;
 }

 void
@@ -135,51 +133,77 @@ RS_free(Regis * r)
 	r->node = NULL;
 }

-int
-RS_execute(Regis * r, const char *str, int len)
+#ifdef TS_USE_WIDE
+static bool
+mb_strchr(char *str, char *c) {
+	int clen = pg_mblen(c), plen,i;
+	char 	*ptr =str;
+	bool	res=false;
+
+	clen = pg_mblen(c);
+	while( *ptr && !res) {
+		plen = pg_mblen(ptr);
+		if ( plen == clen ) {
+			i=plen;
+			res = true;
+			while(i--)
+				if ( *(ptr+i) != *(c+i) ) {
+					res = false;
+					break; 
+				}
+		}
+		
+		ptr += plen;
+	}	 
+
+	return res;	
+}
+#else
+#define mb_strchr(s,c)	( (strchr((s),*(c)) == NULL) ? false : true )
+#endif
+
+
+bool
+RS_execute(Regis * r, char *str)
 {
 	RegisNode  *ptr = r->node;
-	unsigned char *c;
+	char *c = str;
+	int len=0;

-	if (len < 0)
-		len = strlen(str);
+	while(*c) {
+		len++;
+		c += pg_mblen(c);
+	}	

 	if (len < r->nchar)
 		return 0;

-	if (r->issuffix)
-		c = ((unsigned char *) str) + len - r->nchar;
-	else
-		c = (unsigned char *) str;
+	c = str;
+	if (r->issuffix) {
+		len -= r->nchar;
+		while(len-- > 0)
+			c += pg_mblen(c);
+	}
+

 	while (ptr)
 	{
 		switch (ptr->type)
 		{
 			case RSF_ONEOF:
-				if (ptr->len == 0)
-				{
-					if (*c != *(ptr->data))
-						return 0;
-				}
-				else if (strchr((char *) ptr->data, *c) == NULL)
-					return 0;
+				if ( mb_strchr((char *) ptr->data, c) != true )
+					return false;
 				break;
 			case RSF_NONEOF:
-				if (ptr->len == 0)
-				{
-					if (*c == *(ptr->data))
-						return 0;
-				}
-				else if (strchr((char *) ptr->data, *c) != NULL)
-					return 0;
+				if ( mb_strchr((char *) ptr->data, c) == true )
+					return false;
 				break;
 			default:
 				ts_error(ERROR, "RS_execute: Unknown type node: %d\n", ptr->type);
 		}
 		ptr = ptr->next;
-		c++;
+		c+=pg_mblen(c);
 	}

-	return 1;
+	return true;
 }
--- a/contrib/tsearch2/ispell/regis.h
+++ b/contrib/tsearch2/ispell/regis.h
@@ -27,12 +27,12 @@ typedef struct Regis
 				unused:15;
 }	Regis;

-int			RS_isRegis(const char *str);
+bool			RS_isRegis(const char *str);

-int			RS_compile(Regis * r, int issuffix, const char *str);
+void			RS_compile(Regis * r, bool issuffix, char *str);
 void		RS_free(Regis * r);

-/* 1   */
-int			RS_execute(Regis * r, const char *str, int len);
+/* returns true if str matches the compiled regis */
+bool			RS_execute(Regis * r, char *str);

 #endif
--- a/contrib/tsearch2/ispell/spell.c
+++ b/contrib/tsearch2/ispell/spell.c
@@ -6,6 +6,7 @@
 #include "postgres.h"

 #include "spell.h"
+#include "common.h"
 #include "ts_locale.h"

 #define MAX_NORM 1024
@@ -13,7 +14,7 @@

 #define ERRSTRSIZE	1024

-#define STRNCASECMP(x,y)		pg_strncasecmp(x, y, strlen(y))
+#define STRNCMP(s,p)	strncmp( (s), (p), strlen(p) )
 #define GETWCHAR(W,L,N,T) ( ((uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
 #define GETCHAR(A,N,T)	  GETWCHAR( (A)->repl, (A)->replen, N, T )

@@ -41,6 +42,18 @@ strnduplicate(char *s, int len)
 	return d;
 }

+static char *
+findchar(char *str, int c) {
+	while( *str ) {
+		if ( t_iseq(str, c) ) 
+			return str;
+		str+=pg_mblen(str);
+	}
+
+	return NULL;
+}
+		
+
 /* backward string compare for suffix tree operations */
 static int
 strbcmp(const unsigned char *s1, const unsigned char *s2)
@@ -145,15 +158,17 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
 		char	   *s;
 		const char *flag;

+		pg_verifymbstr( str, strlen(str), false);
+
 		flag = NULL;
-		if ((s = strchr(str, '/')))
+		if ((s = findchar(str, '/')))
 		{
 			*s++ = '\0';
 			flag = s;
 			while (*s)
 			{
-				if (isprint((unsigned char) *s) &&
-					!isspace((unsigned char) *s))
+				/* only single-byte flags are allowed, to keep flag handling fast */
+				if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
 					s++;
 				else
 				{
@@ -164,16 +179,19 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
 		}
 		else
 			flag = "";
-		lowerstr(str);
-		/* Dont load words if first letter is not required */
-		/* It allows to optimize loading at  search time   */
+
+
 		s = str;
 		while (*s)
 		{
-			if (*s == '\r' || *s == '\n')
+			if (t_isspace(s)) {
 				*s = '\0';
-			s++;
+				break;
+			}
+			s+=pg_mblen(s);
 		}
+		lowerstr(str);
+
 		NIAddSpell(Conf, str, flag);
 	}
 	fclose(dict);
@@ -253,9 +271,10 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
 	}
 	else
 	{
+		int masklen = strlen(mask);
 		Conf->Affix[Conf->naffixes].issimple = 0;
 		Conf->Affix[Conf->naffixes].isregis = 0;
-		Conf->Affix[Conf->naffixes].mask = (char *) malloc(strlen(mask) + 2);
+		Conf->Affix[Conf->naffixes].mask = (char *) malloc(masklen + 2);
 		if (type == FF_SUFFIX) 
 			sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask); 
 		else
@@ -277,37 +296,93 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const
 	return (0);
 }

-static char *
-remove_spaces(char *dist, char *src)
-{
-	char	   *d,
-			   *s;
+#define PAE_WAIT_MASK	0
+#define PAE_INMASK 	1
+#define PAE_WAIT_FIND 	2
+#define PAE_INFIND 	3
+#define PAE_WAIT_REPL 	4
+#define PAE_INREPL 	5
+
+static bool
+parse_affentry( char *str, char *mask, char *find, char *repl ) {
+	int state = PAE_WAIT_MASK;
+	char 	*pmask=mask, *pfind=find, *prepl=repl;
+
+	*mask = *find = *repl = '\0';
+
+	while(*str) {
+		if ( state == PAE_WAIT_MASK ) {
+			if ( t_iseq(str,'#') ) 
+				return false;
+			else if (!t_isspace(str)) {
+				COPYCHAR(pmask, str);
+				pmask += pg_mblen(str);
+				state = PAE_INMASK;
+			}
+		} else if ( state == PAE_INMASK ) {
+			if ( t_iseq(str,'>') ) {
+				*pmask='\0';
+				state = PAE_WAIT_FIND;
+			} else if (!t_isspace(str)) {
+				COPYCHAR(pmask, str);
+				pmask += pg_mblen(str);
+			}
+		} else if ( state == PAE_WAIT_FIND ) {
+			if ( t_iseq(str,'-') ) {
+				state = PAE_INFIND;
+			} else if (t_isalpha(str)) {
+				COPYCHAR(prepl,str);
+				prepl += pg_mblen(str);
+				state = PAE_INREPL;
+			} else if (!t_isspace(str))
+				ts_error(ERROR, "Affix parse error");
+		} else if ( state == PAE_INFIND ) {
+			if ( t_iseq(str,',') ) {
+				*pfind='\0';
+				state = PAE_WAIT_REPL;
+			} else if (t_isalpha(str)) {
+				COPYCHAR(pfind,str);
+				pfind += pg_mblen(str);
+			} else if (!t_isspace(str))
+				ts_error(ERROR, "Affix parse error");
+		} else if ( state == PAE_WAIT_REPL ) {
+			if ( t_iseq(str,'-') ) {
+				break; /* empty repl */
+			} else if ( t_isalpha(str) ) {
+				COPYCHAR(prepl,str);
+				prepl += pg_mblen(str);
+				state = PAE_INREPL;
+			} else if (!t_isspace(str))
+				ts_error(ERROR, "Affix parse error");
+		} else if ( state == PAE_INREPL ) {
+			if ( t_iseq(str,'#') ) {
+				*prepl = '\0';
+				break;
+			} else if ( t_isalpha(str) ) { 
+				COPYCHAR(prepl,str);
+				prepl += pg_mblen(str);
+			} else if (!t_isspace(str))
+				ts_error(ERROR, "Affix parse error");
+		} else
+			ts_error(ERROR, "Unknown state in parse_affentry: %d", state);

-	d = dist;
-	s = src;
-	while (*s)
-	{
-		if (*s != ' ' && *s != '-' && *s != '\t')
-		{
-			*d = *s;
-			d++;
-		}
-		s++;
+		str += pg_mblen(str);
 	}
-	*d = 0;
-	return (dist);
-}

+	*pmask = *pfind = *prepl = '\0';
+
+	return ( *mask && ( *find || *repl) ) ? true : false;
+} 

 int
 NIImportAffixes(IspellDict * Conf, const char *filename)
 {
 	char		str[BUFSIZ];
+	char		tmpstr[BUFSIZ];
 	char		mask[BUFSIZ];
 	char		find[BUFSIZ];
 	char		repl[BUFSIZ];
 	char	   *s;
-	int			i;
 	int			suffixes = 0;
 	int			prefixes = 0;
 	int			flag = 0;
@@ -320,37 +395,45 @@ NIImportAffixes(IspellDict * Conf, const char *filename)

 	while (fgets(str, sizeof(str), affix))
 	{
-		if (STRNCASECMP(str, "compoundwords") == 0)
+		pg_verifymbstr( str, strlen(str), false);
+		memcpy(tmpstr, str, 32); /* compoundwords... */
+		tmpstr[32]='\0';
+		lowerstr(tmpstr);
+		if (STRNCMP(tmpstr, "compoundwords") == 0)
 		{
-			s = strchr(str, 'l');
+			s = findchar(str, 'l');
 			if (s)
 			{
-				while (*s != ' ')
-					s++;
-				while (*s == ' ')
-					s++;
+				while (*s && !t_isspace(s)) s++;
+				while (*s && t_isspace(s)) s++;
+				if ( *s && pg_mblen(s) == 1 ) 
 					Conf->compoundcontrol = *s;
 				continue;
 			}
 		}
-		if (STRNCASECMP(str, "suffixes") == 0)
+		if (STRNCMP(tmpstr, "suffixes") == 0)
 		{
 			suffixes = 1;
 			prefixes = 0;
 			continue;
 		}
-		if (STRNCASECMP(str, "prefixes") == 0)
+		if (STRNCMP(tmpstr, "prefixes") == 0)
 		{
 			suffixes = 0;
 			prefixes = 1;
 			continue;
 		}
-		if (STRNCASECMP(str, "flag ") == 0)
+		if (STRNCMP(tmpstr, "flag") == 0)
 		{
-			s = str + 5;
+			s = str + 4;
 			flagflags = 0;
-			while (*s == ' ')
-				s++;
+
+			while (*s && t_isspace(s)) s++;
+
+			/* allow only single-byte flags */
+			if ( pg_mblen(s) != 1 )
+				continue;			
+
 			if (*s == '*')
 			{
 				flagflags |= FF_CROSSPRODUCT;
@@ -365,43 +448,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
 			if (*s == '\\')
 				s++;

+			/* allow only single-byte flags */
+			if ( pg_mblen(s) != 1 ) {
+				flagflags = 0;
+				continue;
+			}
+
 			flag = (unsigned char) *s;
 			continue;
 		}
 		if ((!suffixes) && (!prefixes))
 			continue;
-		if ((s = strchr(str, '#')))
-			*s = 0;
-		if (!*str)
-			continue;
+
 		lowerstr(str);
-		strcpy(mask, "");
-		strcpy(find, "");
-		strcpy(repl, "");
-		i = sscanf(str, "%[^>\n]>%[^,\n],%[^\n]", mask, find, repl);
-		remove_spaces(str, repl);
-		strcpy(repl, str);
-		remove_spaces(str, find);
-		strcpy(find, str);
-		remove_spaces(str, mask);
-		strcpy(mask, str);
-		switch (i)
-		{
-			case 3:
-				break;
-			case 2:
-				if (*find != '\0')
-				{
-					strcpy(repl, find);
-					strcpy(find, "");
-				}
-				break;
-			default:
+		if ( !parse_affentry(str, mask, find, repl) )
 			continue;
-		}

 		NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
-
 	}
 	fclose(affix);

@@ -768,30 +831,28 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
 	{
 		if (Affix->compile)
 		{
-			RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? 1 : 0, Affix->mask);
+			RS_compile(&(Affix->reg.regis), (Affix->type == FF_SUFFIX) ? true : false, Affix->mask);
 			Affix->compile = 0;
 		}
-		if (RS_execute(&(Affix->reg.regis), newword, -1))
+		if (RS_execute(&(Affix->reg.regis), newword))
 			return newword;
 	}
 	else
 	{
-		regmatch_t	subs[2];	/* workaround for apache&linux */
 		int			err;
 		pg_wchar   *data;
 		size_t		data_len;
-		int			dat_len;
+		int			newword_len;

 		if (Affix->compile)
 		{
 			int			wmasklen,
 						masklen = strlen(Affix->mask);
 			pg_wchar   *mask;
-
 			mask = (pg_wchar *) palloc((masklen + 1) * sizeof(pg_wchar));
 			wmasklen = pg_mb2wchar_with_len(Affix->mask, mask, masklen);

-			err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_EXTENDED | REG_ICASE | REG_NOSUB);
+			err = pg_regcomp(&(Affix->reg.regex), mask, wmasklen, REG_ADVANCED | REG_NOSUB);
 			pfree(mask);
 			if (err)
 			{
@@ -804,11 +865,11 @@ CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *ne
 		}

 		/* Convert data string to wide characters */
-		dat_len = strlen(newword);
-		data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
-		data_len = pg_mb2wchar_with_len(newword, data, dat_len);
+		newword_len = strlen(newword);
+		data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
+		data_len = pg_mb2wchar_with_len(newword, data, newword_len);

-		if (!(err = pg_regexec(&(Affix->reg.regex), data, dat_len, 0, NULL, 1, subs, 0)))
+		if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0)))
 		{
 			pfree(data);
 			return newword;

--- a/contrib/tsearch2/stopword.c
+++ b/contrib/tsearch2/stopword.c
@@ -4,8 +4,6 @@
 */
 #include "postgres.h"

-#include <ctype.h>
-
 #include "miscadmin.h"

 #include "common.h"
@@ -71,6 +69,8 @@ readstoplist(text *in, StopList * s)
 		while (fgets(buf, STOPBUFLEN, hin))
 		{
 			buf[strlen(buf) - 1] = '\0';
+			pg_verifymbstr( buf, strlen(buf), false );	
+			lowerstr(buf);
 			if (*buf == '\0')
 				continue;


--- a/contrib/tsearch2/ts_locale.h
+++ b/contrib/tsearch2/ts_locale.h
@@ -57,7 +57,7 @@ int _t_isprint( char *ptr );
 	int lll = pg_mblen( s );			\
 							\
 	while( lll-- ) 					\
-		TOUCHAR(d+lll) = TOUCHAR(s+lll);	\
+		TOUCHAR((d)+lll) = TOUCHAR((s)+lll);	\
 } while(0)