Fix bug http://archives.postgresql.org/pgsql-bugs/2006-10/msg00258.php.

Fix the string length calculation for recoding, and fix lowerstr() so that it no longer relies on a wrong assumption about the length of the recoded string (it assumed that the recoded string is never longer than the source, which may not be true for multibyte encodings). Thanks to Thomas H. <me@alternize.com> and Magnus Hagander <mha@sollentuna.net>.

Fix bug http://archives.postgresql.org/pgsql-bugs/2006-10/msg00258.php.
Fix the string length calculation for recoding, and fix lowerstr() so that it no longer relies on a wrong assumption about the length of the recoded string (it assumed that the recoded string is never longer than the source, which may not be true for multibyte encodings). Thanks to Thomas H. <me@alternize.com> and Magnus Hagander <mha@sollentuna.net>.
419fe7cd · Teodor Sigaev · 1a5c450f · 419fe7cd · 419fe7cd · 419fe7cd
Commit 419fe7cd authored Nov 20, 2006 by Teodor Sigaev
6 changed files
--- a/contrib/tsearch2/dict_ex.c
+++ b/contrib/tsearch2/dict_ex.c
-/* $PostgreSQL: pgsql/contrib/tsearch2/dict_ex.c,v 1.8 2006/03/11 04:38:30 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict_ex.c,v 1.9 2006/11/20 14:03:30 teodor Exp $ */
 /*
 * example of dictionary
@@ -52,9 +52,11 @@ dex_lexize(PG_FUNCTION_ARGS)
 {
 	DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
 	char	   *in = (char *) PG_GETARG_POINTER(1);
-	char	   *txt = pnstrdup(in, PG_GETARG_INT32(2));
+	char	   *utxt = pnstrdup(in, PG_GETARG_INT32(2));
 	TSLexeme   *res = palloc(sizeof(TSLexeme) * 2);
+	char	   *txt = lowerstr(utxt);
+	pfree(utxt);
 	memset(res, 0, sizeof(TSLexeme) * 2);
 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))

--- a/contrib/tsearch2/dict_snowball.c
+++ b/contrib/tsearch2/dict_snowball.c
-/* $PostgreSQL: pgsql/contrib/tsearch2/dict_snowball.c,v 1.12 2006/07/11 16:35:31 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict_snowball.c,v 1.13 2006/11/20 14:03:30 teodor Exp $ */
 /*
 * example of Snowball dictionary
@@ -142,9 +142,11 @@ snb_lexize(PG_FUNCTION_ARGS)
 {
 	DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
 	char	   *in = (char *) PG_GETARG_POINTER(1);
-	char	   *txt = pnstrdup(in, PG_GETARG_INT32(2));
+	char	   *utxt = pnstrdup(in, PG_GETARG_INT32(2));
 	TSLexeme   *res = palloc(sizeof(TSLexeme) * 2);
+	char	   *txt = lowerstr(utxt);
+	pfree(utxt);
 	memset(res, 0, sizeof(TSLexeme) * 2);
 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
 	{

--- a/contrib/tsearch2/dict_syn.c
+++ b/contrib/tsearch2/dict_syn.c
-/* $PostgreSQL: pgsql/contrib/tsearch2/dict_syn.c,v 1.9 2006/03/11 04:38:30 momjian Exp $ */
+/* $PostgreSQL: pgsql/contrib/tsearch2/dict_syn.c,v 1.10 2006/11/20 14:03:30 teodor Exp $ */
 /*
 * ISpell interface
@@ -132,8 +132,8 @@ syn_init(PG_FUNCTION_ARGS)
 			continue;
 		*end = '\0';
-		d->syn[cur].in = strdup(lowerstr(starti));
+		d->syn[cur].in = lowerstr(starti);
-		d->syn[cur].out = strdup(lowerstr(starto));
+		d->syn[cur].out = lowerstr(starto);
 		if (!(d->syn[cur].in && d->syn[cur].out))
 		{
 			fclose(fin);
@@ -163,12 +163,15 @@ syn_lexize(PG_FUNCTION_ARGS)
 	Syn			key,
 			   *found;
 	TSLexeme   *res = NULL;
+	char	   *wrd;
 	if (!PG_GETARG_INT32(2))
 		PG_RETURN_POINTER(NULL);
 	key.out = NULL;
-	key.in = lowerstr(pnstrdup(in, PG_GETARG_INT32(2)));
+	wrd = pnstrdup(in, PG_GETARG_INT32(2));
+	key.in = lowerstr(wrd);
+	pfree(wrd);
 	found = (Syn *) bsearch(&key, d->syn, d->len, sizeof(Syn), compareSyn);
 	pfree(key.in);

--- a/contrib/tsearch2/ispell/spell.c
+++ b/contrib/tsearch2/ispell/spell.c
@@ -147,7 +147,7 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag)
 int
 NIImportDictionary(IspellDict * Conf, const char *filename)
 {
-	char		str[BUFSIZ];
+	char		str[BUFSIZ], *pstr;
 	FILE	   *dict;
 	if (!(dict = fopen(filename, "r")))
@@ -190,9 +190,10 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
 			}
 			s += pg_mblen(s);
 		}
-		lowerstr(str);
+		pstr = lowerstr(str);
-		NIAddSpell(Conf, str, flag);
+		NIAddSpell(Conf, pstr, flag);
+		pfree(pstr);
 	}
 	fclose(dict);
 	return (0);
@@ -418,8 +419,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl, int line)
 int
 NIImportAffixes(IspellDict * Conf, const char *filename)
 {
-	char		str[BUFSIZ];
+	char		str[BUFSIZ], *pstr = NULL;
-	char		tmpstr[BUFSIZ];
 	char		mask[BUFSIZ];
 	char		find[BUFSIZ];
 	char		repl[BUFSIZ];
@@ -439,11 +439,14 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
 	while (fgets(str, sizeof(str), affix))
 	{
 		line++;
+		if ( *str == '#' || *str == '\n' )
+			continue;
 		pg_verifymbstr(str, strlen(str), false);
-		memcpy(tmpstr, str, 32);	/* compoundwords... */
+		if ( pstr )
-		tmpstr[32] = '\0';
+			pfree( pstr );
-		lowerstr(tmpstr);
+		pstr = lowerstr(str);
-		if (STRNCMP(tmpstr, "compoundwords") == 0)
+		if (STRNCMP(pstr, "compoundwords") == 0)
 		{
 			s = findchar(str, 'l');
 			if (s)
@@ -458,21 +461,21 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
 				continue;
 			}
 		}
-		if (STRNCMP(tmpstr, "suffixes") == 0)
+		if (STRNCMP(pstr, "suffixes") == 0)
 		{
 			suffixes = 1;
 			prefixes = 0;
 			oldformat++;
 			continue;
 		}
-		if (STRNCMP(tmpstr, "prefixes") == 0)
+		if (STRNCMP(pstr, "prefixes") == 0)
 		{
 			suffixes = 0;
 			prefixes = 1;
 			oldformat++;
 			continue;
 		}
-		if (STRNCMP(tmpstr, "flag") == 0)
+		if (STRNCMP(pstr, "flag") == 0)
 		{
 			s = str + 4;
 			flagflags = 0;
@@ -523,14 +526,16 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
 		if ((!suffixes) && (!prefixes))
 			continue;
-		lowerstr(str);
+		if (!parse_affentry(pstr, mask, find, repl, line)) 
-		if (!parse_affentry(str, mask, find, repl, line))
 			continue;
 		NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
 	}
 	fclose(affix);
+	if ( pstr )
+		pfree( pstr );
 	return (0);
 }
@@ -538,11 +543,11 @@ int
 NIImportOOAffixes(IspellDict * Conf, const char *filename)
 {
 	char		str[BUFSIZ];
-	char		type[BUFSIZ];
+	char		type[BUFSIZ], *ptype = NULL;
 	char		sflag[BUFSIZ];
-	char		mask[BUFSIZ];
+	char		mask[BUFSIZ], *pmask;
-	char		find[BUFSIZ];
+	char		find[BUFSIZ], *pfind;
-	char		repl[BUFSIZ];
+	char		repl[BUFSIZ], *prepl;
 	bool		isSuffix = false;
 	int			flag = 0;
 	char		flagflags = 0;
@@ -577,8 +582,10 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
 		scanread = sscanf(str, scanbuf, type, sflag, find, repl, mask);
-		lowerstr(type);
+		if (ptype)
-		if (scanread < 4 || (STRNCMP(type, "sfx") && STRNCMP(type, "pfx")))
+			pfree(ptype);
+		ptype = lowerstr(type);
+		if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
 			continue;
 		if (scanread == 4)
@@ -586,29 +593,35 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
 			if (strlen(sflag) != 1)
 				continue;
 			flag = *sflag;
-			isSuffix = (STRNCMP(type, "sfx") == 0) ? true : false;
+			isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
-			lowerstr(find);
+			pfind = lowerstr(find);
 			if (t_iseq(find, 'y'))
 				flagflags |= FF_CROSSPRODUCT;
 			else
 				flagflags = 0;
+			pfree(pfind);
 		}
 		else
 		{
 			if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
 				continue;
-			lowerstr(repl);
+			prepl = lowerstr(repl);
-			lowerstr(find);
+			pfind = lowerstr(find);
-			lowerstr(mask);
+			pmask = lowerstr(mask);
 			if (t_iseq(find, '0'))
 				*find = '\0';
 			if (t_iseq(repl, '0'))
 				*repl = '\0';
 			NIAddAffix(Conf, flag, flagflags, mask, find, repl, isSuffix ? FF_SUFFIX : FF_PREFIX);
+			pfree(prepl);
+			pfree(pfind);
+			pfree(pmask);
 		}
 	}
+	if (ptype)
+		pfree(ptype);
 	fclose(affix);
 	return 0;
@@ -1053,7 +1066,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
 	if (wrdlen > MAXNORMLEN)
 		return NULL;
-	lowerstr(word);
 	cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
 	*cur = NULL;
@@ -1354,13 +1366,17 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
 }
 TSLexeme *
-NINormalizeWord(IspellDict * Conf, char *word)
+NINormalizeWord(IspellDict * Conf, char *uword)
 {
-	char	  **res = NormalizeSubWord(Conf, word, 0);
+	char	  **res;
+	char	   *word;
 	TSLexeme   *lcur = NULL,
 			   *lres = NULL;
 	uint16		NVariant = 1;
+	word = lowerstr(uword);
+	res = NormalizeSubWord(Conf, word, 0);
 	if (res)
 	{
 		char	  **ptr = res;
@@ -1431,6 +1447,9 @@ NINormalizeWord(IspellDict * Conf, char *word)
 			var = ptr;
 		}
 	}
+	pfree(word);
 	return lres;
 }

--- a/contrib/tsearch2/stopword.c
+++ b/contrib/tsearch2/stopword.c
@@ -36,7 +36,7 @@ readstoplist(text *in, StopList * s)
 	{
 		char	   *filename = to_absfilename(text2char(in));
 		FILE	   *hin;
-		char		buf[STOPBUFLEN];
+		char		buf[STOPBUFLEN], *pbuf;
 		int			reallen = 0;
 		if ((hin = fopen(filename, "r")) == NULL)
@@ -49,7 +49,6 @@ readstoplist(text *in, StopList * s)
 		{
 			buf[strlen(buf) - 1] = '\0';
 			pg_verifymbstr(buf, strlen(buf), false);
-			lowerstr(buf);
 			if (*buf == '\0')
 				continue;
@@ -70,7 +69,14 @@ readstoplist(text *in, StopList * s)
 				stop = tmp;
 			}
-			stop[s->len] = strdup(buf);
+			if (s->wordop) 
+			{
+				pbuf = s->wordop(buf);
+				stop[s->len] = strdup(pbuf);
+				pfree(pbuf);
+			} else
+				stop[s->len] = strdup(buf);
 			if (!stop[s->len])
 			{
 				freestoplist(s);
@@ -79,8 +85,6 @@ readstoplist(text *in, StopList * s)
 						(errcode(ERRCODE_OUT_OF_MEMORY),
 						 errmsg("out of memory")));
 			}
-			if (s->wordop)
-				stop[s->len] = (s->wordop) (stop[s->len]);
 			(s->len)++;
 		}
@@ -106,7 +110,5 @@ sortstoplist(StopList * s)
 bool
 searchstoplist(StopList * s, char *key)
 {
-	if (s->wordop)
-		key = (*(s->wordop)) (key);
 	return (s->stop && s->len > 0 && bsearch(&key, s->stop, s->len, sizeof(char *), comparestr)) ? true : false;
 }
--- a/contrib/tsearch2/ts_locale.c
+++ b/contrib/tsearch2/ts_locale.c
@@ -14,21 +14,12 @@ wchar2char(char *to, const wchar_t *from, size_t len)
 {
 	if (GetDatabaseEncoding() == PG_UTF8)
 	{
-		int			r,
+		int			r;
-					nbytes;
 		if (len == 0)
 			return 0;
-		/* in any case, *to should be allocated with enough space */
+		r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
-		nbytes = WideCharToMultiByte(CP_UTF8, 0, from, len, NULL, 0, NULL, NULL);
-		if (nbytes == 0)
-			ereport(ERROR,
-					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-					 errmsg("UTF-16 to UTF-8 translation failed: %lu",
-							GetLastError())));
-		r = WideCharToMultiByte(CP_UTF8, 0, from, len, to, nbytes,
 								NULL, NULL);
 		if (r == 0)
@@ -36,6 +27,8 @@ wchar2char(char *to, const wchar_t *from, size_t len)
 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 					 errmsg("UTF-16 to UTF-8 translation failed: %lu",
 							GetLastError())));
+		Assert(r <= len);
 		return r;
 	}
@@ -56,7 +49,7 @@ char2wchar(wchar_t *to, const char *from, size_t len)
 		if (!r)
 		{
-			pg_verifymbstr(from, len, false);
+			pg_verifymbstr(from, strlen(from), false);
 			ereport(ERROR,
 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 					 errmsg("invalid multibyte character for locale"),
@@ -97,6 +90,11 @@ char *
 lowerstr(char *str)
 {
 	char	   *ptr = str;
+	char	   *out;
+	int			len = strlen(str);
+	if ( len == 0 )
+		return pstrdup("");
 #ifdef TS_USE_WIDE
@@ -110,24 +108,67 @@ lowerstr(char *str)
 	{
 		wchar_t    *wstr,
 				   *wptr;
-		int			len = strlen(str);
+		int		    wlen;
+		/* 
+		 *alloc number of wchar_t for worst case, len contains
+		 * number of bytes <= number of characters and
+		 * alloc 1 wchar_t for 0, because wchar2char(wcstombs in really)
+		 * wants zero-terminated string
+		 */
+		wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len+1));
+		/*
+		 * str SHOULD be cstring, so wlen contains number
+		 * of converted character
+		 */
+		wlen = char2wchar(wstr, str, len);
+		if ( wlen < 0 )
+			ereport(ERROR,
+					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+					 errmsg("transalation failed from server encoding to wchar_t")));
+		Assert(wlen<=len);
+		wstr[wlen] = 0;
-		wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
-		char2wchar(wstr, str, len + 1);
 		while (*wptr)
 		{
 			*wptr = towlower((wint_t) *wptr);
 			wptr++;
 		}
-		wchar2char(str, wstr, len);
+		/*
+		 * Alloc result string for worst case + '\0'
+		 */
+		len = sizeof(char)*pg_database_encoding_max_length()*(wlen+1);
+		out = (char*)palloc(len);
+		/*
+		 * wlen now is number of bytes which is always >= number of characters
+		 */
+		wlen = wchar2char(out, wstr, len);
 		pfree(wstr);
+		if ( wlen < 0 )
+			ereport(ERROR,
+					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+					 errmsg("transalation failed from wchar_t to server encoding %d", errno)));
+		Assert(wlen<=len);
+		out[wlen]='\0';
 	}
 	else
 #endif
+	{
+		char *outptr;
+		outptr = out = (char*)palloc( sizeof(char) * (len+1) );
 		while (*ptr)
 		{
-			*ptr = tolower(*(unsigned char *) ptr);
+			*outptr++ = tolower(*(unsigned char *) ptr);
 			ptr++;
 		}
-	return str;
+		*outptr = '\0';
+	}
+	return out;
 }