Mop-up for commit 85feb77a.

Adjust commentary in regc_pg_locale.c to remove mention of the possibility of not having the <wctype.h> functions, since we no longer consider that case. Eliminate duplicate code in wparser_def.c by generalizing the p_iswhat macro to take a parameter saying what to return for non-ASCII chars in C locale. (That's not really a consequence of the removal of USE_WIDE_UPPER_LOWER, but I noticed it while doing that.)

Mop-up for commit 85feb77a.
Adjust commentary in regc_pg_locale.c to remove mention of the possibility of not having the <wctype.h> functions, since we no longer consider that case. Eliminate duplicate code in wparser_def.c by generalizing the p_iswhat macro to take a parameter saying what to return for non-ASCII chars in C locale. (That's not really a consequence of the removal of USE_WIDE_UPPER_LOWER, but I noticed it while doing that.)
ed87e198 · Tom Lane · 85feb77a · ed87e198 · ed87e198
Commit ed87e198 authored Sep 22, 2017 by Tom Lane
Hide whitespace changes
Inline Side-by-side

Showing with 40 additions and 97 deletions

src/backend/regex/regc_pg_locale.c src/backend/regex/regc_pg_locale.c +12 -12

src/backend/tsearch/wparser_def.c src/backend/tsearch/wparser_def.c +28 -85

No files found.
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -29,20 +29,20 @@
 *
 * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
 *
- * 2a. When working in UTF8 encoding, we use the <wctype.h> functions if
+ * 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
- * available.  This assumes that every platform uses Unicode codepoints
+ * This assumes that every platform uses Unicode codepoints directly
- * directly as the wchar_t representation of Unicode.  On some platforms
+ * as the wchar_t representation of Unicode.  On some platforms
 * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
 *
- * 2b. In all other encodings, or on machines that lack <wctype.h>, we use
+ * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar
- * the <ctype.h> functions for pg_wchar values up to 255, and punt for values
+ * values up to 255, and punt for values above that.  This is 100% correct
- * above that.  This is only 100% correct in single-byte encodings such as
+ * only in single-byte encodings such as LATINn.  However, non-Unicode
- * LATINn.  However, non-Unicode multibyte encodings are mostly Far Eastern
+ * multibyte encodings are mostly Far Eastern character sets for which the
- * character sets for which the properties being tested here aren't very
+ * properties being tested here aren't very relevant for higher code values
- * relevant for higher code values anyway.  The difficulty with using the
+ * anyway.  The difficulty with using the <wctype.h> functions with
- * <wctype.h> functions with non-Unicode multibyte encodings is that we can
+ * non-Unicode multibyte encodings is that we can have no certainty that
- * have no certainty that the platform's wchar_t representation matches
+ * the platform's wchar_t representation matches what we do in pg_wchar
- * what we do in pg_wchar conversions.
+ * conversions.
 *
 * 3. Other collations are only supported on platforms that HAVE_LOCALE_T.
 * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>

--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -427,94 +427,45 @@ TParserCopyClose(TParser *prs)
 *	- if locale is C then we use pgwstr instead of wstr.
 */
-#define p_iswhat(type)														\
+#define p_iswhat(type, nonascii)											\
+																			\
 static int																	\
-p_is##type(TParser *prs) {													\
+p_is##type(TParser *prs)													\
-	Assert( prs->state );													\
+{																			\
-	if ( prs->usewide )														\
+	Assert(prs->state);														\
+	if (prs->usewide)														\
 	{																		\
-		if ( prs->pgwstr )													\
+		if (prs->pgwstr)													\
 		{																	\
 			unsigned int c = *(prs->pgwstr + prs->state->poschar);			\
-			if ( c > 0x7f )													\
+			if (c > 0x7f)													\
-				return 0;													\
+				return nonascii;											\
-			return is##type( c );											\
+			return is##type(c);												\
 		}																	\
-		return isw##type( *( prs->wstr + prs->state->poschar ) );			\
+		return isw##type(*(prs->wstr + prs->state->poschar));				\
 	}																		\
-																			\
+	return is##type(*(unsigned char *) (prs->str + prs->state->posbyte));	\
-	return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
+}																			\
-}	\
 																			\
 static int																	\
-p_isnot##type(TParser *prs) {												\
+p_isnot##type(TParser *prs)													\
+{																			\
 	return !p_is##type(prs);												\
 }
-static int
+/*
-p_isalnum(TParser *prs)
+ * In C locale with a multibyte encoding, any non-ASCII symbol is considered
-{
+ * an alpha character, but not a member of other char classes.
-	Assert(prs->state);
+ */
+p_iswhat(alnum, 1)
-	if (prs->usewide)
+p_iswhat(alpha, 1)
-	{
+p_iswhat(digit, 0)
-		if (prs->pgwstr)
+p_iswhat(lower, 0)
-		{
+p_iswhat(print, 0)
-			unsigned int c = *(prs->pgwstr + prs->state->poschar);
+p_iswhat(punct, 0)
+p_iswhat(space, 0)
-			/*
+p_iswhat(upper, 0)
-			 * any non-ascii symbol with multibyte encoding with C-locale is
+p_iswhat(xdigit, 0)
-			 * an alpha character
-			 */
-			if (c > 0x7f)
-				return 1;
-			return isalnum(c);
-		}
-		return iswalnum(*(prs->wstr + prs->state->poschar));
-	}
-	return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
-}
-static int
-p_isnotalnum(TParser *prs)
-{
-	return !p_isalnum(prs);
-}
-static int
-p_isalpha(TParser *prs)
-{
-	Assert(prs->state);
-	if (prs->usewide)
-	{
-		if (prs->pgwstr)
-		{
-			unsigned int c = *(prs->pgwstr + prs->state->poschar);
-			/*
-			 * any non-ascii symbol with multibyte encoding with C-locale is
-			 * an alpha character
-			 */
-			if (c > 0x7f)
-				return 1;
-			return isalpha(c);
-		}
-		return iswalpha(*(prs->wstr + prs->state->poschar));
-	}
-	return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
-}
-static int
-p_isnotalpha(TParser *prs)
-{
-	return !p_isalpha(prs);
-}
 /* p_iseq should be used only for ascii symbols */
@@ -525,14 +476,6 @@ p_iseq(TParser *prs, char c)
 	return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
 }
-p_iswhat(digit)
-p_iswhat(lower)
-p_iswhat(print)
-p_iswhat(punct)
-p_iswhat(space)
-p_iswhat(upper)
-p_iswhat(xdigit)
 static int
 p_isEOF(TParser *prs)
 {