Fix usage of char2wchar/wchar2char. Changes:

- pg_wchar and wchar_t can have different sizes, so char2wchar no longer calls pg_mb2wchar_with_len; this prevents an out-of-bounds memory write.
- Make char2wchar/wchar2char symmetric: neither should now be called with the C locale, because mbstowcs/wcstombs often do not work correctly with the C locale.
- The text parser uses pg_mb2wchar_with_len directly in the case of the C locale and a multibyte encoding.

Per bug report by Hiroshi Inoue <inoue@tpf.co.jp> and the following discussion. Backpatch to 8.2, when multibyte support was implemented in tsearch.

Fix usage of char2wchar/wchar2char. Changes:
- pg_wchar and wchar_t can have different sizes, so char2wchar no longer calls pg_mb2wchar_with_len; this prevents an out-of-bounds memory write.
- Make char2wchar/wchar2char symmetric: neither should now be called with the C locale, because mbstowcs/wcstombs often do not work correctly with the C locale.
- The text parser uses pg_mb2wchar_with_len directly in the case of the C locale and a multibyte encoding.

Per bug report by Hiroshi Inoue <inoue@tpf.co.jp> and the following discussion. Backpatch to 8.2, when multibyte support was implemented in tsearch.
32032d42 · Teodor Sigaev · 876b37d5 · 32032d42 · 32032d42
Commit 32032d42 authored Mar 02, 2009 by Teodor Sigaev
Hide whitespace changes
Inline Side-by-side

Showing with 42 additions and 34 deletions

src/backend/tsearch/wparser_def.c src/backend/tsearch/wparser_def.c +33 -18

src/backend/utils/mb/mbutils.c src/backend/utils/mb/mbutils.c +9 -16

No files found.
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.20 2009/01/15 16:33:59 teodor Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.21 2009/03/02 15:10:09 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -240,12 +240,12 @@ typedef struct TParser
 	int			lenstr;			/* length of mbstring */
 #ifdef USE_WIDE_UPPER_LOWER
 	wchar_t    *wstr;			/* wide character string */
-	int			lenwstr;		/* length of wsting */
+	pg_wchar   *pgwstr;			/* wide character string for C-locale */
+	bool		usewide;
 #endif

 	/* State of parse */
 	int			charmaxlen;
-	bool		usewide;
 	TParserPosition *state;
 	bool		ignore;
 	bool		wanthost;
@@ -299,13 +299,24 @@ TParserInit(char *str, int len)
 	if (prs->charmaxlen > 1)
 	{
 		prs->usewide = true;
-		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
-		prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
-								  prs->str, prs->lenstr);
+		if ( lc_ctype_is_c() )
+		{
+			/*
+			 * char2wchar doesn't work for C-locale and
+			 * sizeof(pg_wchar) could be not equal to sizeof(wchar_t)
+			 */
+			prs->pgwstr = (pg_wchar*) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
+			pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
+		}
+		else
+		{
+			prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
+			char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr);
+		}
 	}
 	else
-#endif
 		prs->usewide = false;
+#endif

 	prs->state = newTParserPosition(NULL);
 	prs->state->state = TPS_Base;
@@ -331,6 +342,8 @@ TParserClose(TParser *prs)
 #ifdef USE_WIDE_UPPER_LOWER
 	if (prs->wstr)
 		pfree(prs->wstr);
+	if (prs->pgwstr)
+		pfree(prs->pgwstr);
 #endif

 	pfree(prs);
@@ -338,10 +351,12 @@ TParserClose(TParser *prs)

 /*
 * Character-type support functions, equivalent to is* macros, but
- * working with any possible encodings and locales. Note,
- * that with multibyte encoding and C-locale isw* function may fail
- * or give wrong result. Note 2: multibyte encoding and C-locale
- * often are used for Asian languages
+ * working with any possible encodings and locales. Notes:
+ *  - with multibyte encoding and C-locale isw* function may fail
+ *    or give wrong result. 
+ *  - multibyte encoding and C-locale often are used for 
+ *    Asian languages.
+ *  - if locale is C the we use pgwstr instead of wstr
 */

 #ifdef USE_WIDE_UPPER_LOWER
@@ -352,14 +367,14 @@ p_is##type(TParser *prs) {													\
 	Assert( prs->state );													\
 	if ( prs->usewide )														\
 	{																		\
-		if ( lc_ctype_is_c() )												\
-			return is##type( 0xff & *( prs->wstr + prs->state->poschar) );	\
+		if ( prs->pgwstr )													\
+			return is##type( 0xff & *( prs->pgwstr + prs->state->poschar) );\
 																			\
 		return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) );	\
 	}																		\
 																			\
 	return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
-}	\
+}																			\
 																			\
 static int																	\
 p_isnot##type(TParser *prs) {												\
@@ -373,9 +388,9 @@ p_isalnum(TParser *prs)

 	if (prs->usewide)
 	{
-		if (lc_ctype_is_c())
+		if (prs->pgwstr)
 		{
-			unsigned int c = *(prs->wstr + prs->state->poschar);
+			unsigned int c = *(prs->pgwstr + prs->state->poschar);

 			/*
 			 * any non-ascii symbol with multibyte encoding with C-locale is
@@ -405,9 +420,9 @@ p_isalpha(TParser *prs)

 	if (prs->usewide)
 	{
-		if (lc_ctype_is_c())
+		if (prs->pgwstr)
 		{
-			unsigned int c = *(prs->wstr + prs->state->poschar);
+			unsigned int c = *(prs->pgwstr + prs->state->poschar);

 			/*
 			 * any non-ascii symbol with multibyte encoding with C-locale is

--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -4,7 +4,7 @@
 * (currently mule internal code (mic) is used)
 * Tatsuo Ishii
 *
- * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.78 2009/01/22 10:09:48 mha Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.79 2009/03/02 15:10:09 teodor Exp $
 */
 #include "postgres.h"

@@ -601,7 +601,10 @@ wchar2char(char *to, const wchar_t *from, size_t tolen)
 	}
 	else
 #endif   /* WIN32 */
+	{
+		Assert( !lc_ctype_is_c() );
 		result = wcstombs(to, from, tolen);
+	}
 	return result;
 }

@@ -647,22 +650,12 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
 	else
 #endif   /* WIN32 */
 	{
-		if (lc_ctype_is_c())
-		{
-			/*
-			 * pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
-			 * allocated with sufficient space
-			 */
-			result = pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen);
-		}
-		else
-		{
-			/* mbstowcs requires ending '\0' */
-			char	   *str = pnstrdup(from, fromlen);
+		/* mbstowcs requires ending '\0' */
+		char	   *str = pnstrdup(from, fromlen);

-			result = mbstowcs(to, str, tolen);
-			pfree(str);
-		}
+		Assert( !lc_ctype_is_c() );
+		result = mbstowcs(to, str, tolen);
+		pfree(str);
 	}

 	if (result == -1)