Commit ed87e198 authored by Tom Lane

Mop-up for commit 85feb77a.

Adjust commentary in regc_pg_locale.c to remove mention of the possibility
of not having <wctype.h> functions, since we no longer consider that.

Eliminate duplicate code in wparser_def.c by generalizing the p_iswhat
macro to take a parameter saying what to return for non-ASCII chars
in C locale.  (That's not really a consequence of the
USE_WIDE_UPPER_LOWER-ectomy, but I noticed it while doing that.)
parent 85feb77a
...@@ -29,20 +29,20 @@ ...@@ -29,20 +29,20 @@
* *
* 2. In the "default" collation (which is supposed to obey LC_CTYPE): * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
* *
* 2a. When working in UTF8 encoding, we use the <wctype.h> functions if * 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
* available. This assumes that every platform uses Unicode codepoints * This assumes that every platform uses Unicode codepoints directly
* directly as the wchar_t representation of Unicode. On some platforms * as the wchar_t representation of Unicode. On some platforms
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
* *
* 2b. In all other encodings, or on machines that lack <wctype.h>, we use * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar
* the <ctype.h> functions for pg_wchar values up to 255, and punt for values * values up to 255, and punt for values above that. This is 100% correct
* above that. This is only 100% correct in single-byte encodings such as * only in single-byte encodings such as LATINn. However, non-Unicode
* LATINn. However, non-Unicode multibyte encodings are mostly Far Eastern * multibyte encodings are mostly Far Eastern character sets for which the
* character sets for which the properties being tested here aren't very * properties being tested here aren't very relevant for higher code values
* relevant for higher code values anyway. The difficulty with using the * anyway. The difficulty with using the <wctype.h> functions with
* <wctype.h> functions with non-Unicode multibyte encodings is that we can * non-Unicode multibyte encodings is that we can have no certainty that
* have no certainty that the platform's wchar_t representation matches * the platform's wchar_t representation matches what we do in pg_wchar
* what we do in pg_wchar conversions. * conversions.
* *
* 3. Other collations are only supported on platforms that HAVE_LOCALE_T. * 3. Other collations are only supported on platforms that HAVE_LOCALE_T.
* Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h> * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>
......
...@@ -427,94 +427,45 @@ TParserCopyClose(TParser *prs) ...@@ -427,94 +427,45 @@ TParserCopyClose(TParser *prs)
* - if locale is C then we use pgwstr instead of wstr. * - if locale is C then we use pgwstr instead of wstr.
*/ */
#define p_iswhat(type) \ #define p_iswhat(type, nonascii) \
\
static int \ static int \
p_is##type(TParser *prs) { \ p_is##type(TParser *prs) \
Assert( prs->state ); \ { \
if ( prs->usewide ) \ Assert(prs->state); \
if (prs->usewide) \
{ \ { \
if ( prs->pgwstr ) \ if (prs->pgwstr) \
{ \ { \
unsigned int c = *(prs->pgwstr + prs->state->poschar); \ unsigned int c = *(prs->pgwstr + prs->state->poschar); \
if ( c > 0x7f ) \ if (c > 0x7f) \
return 0; \ return nonascii; \
return is##type( c ); \ return is##type(c); \
} \ } \
return isw##type( *( prs->wstr + prs->state->poschar ) ); \ return isw##type(*(prs->wstr + prs->state->poschar)); \
} \ } \
\ return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \ } \
} \
\ \
static int \ static int \
p_isnot##type(TParser *prs) { \ p_isnot##type(TParser *prs) \
{ \
return !p_is##type(prs); \ return !p_is##type(prs); \
} }
static int /*
p_isalnum(TParser *prs) * In C locale with a multibyte encoding, any non-ASCII symbol is considered
{ * an alpha character, but not a member of other char classes.
Assert(prs->state); */
p_iswhat(alnum, 1)
if (prs->usewide) p_iswhat(alpha, 1)
{ p_iswhat(digit, 0)
if (prs->pgwstr) p_iswhat(lower, 0)
{ p_iswhat(print, 0)
unsigned int c = *(prs->pgwstr + prs->state->poschar); p_iswhat(punct, 0)
p_iswhat(space, 0)
/* p_iswhat(upper, 0)
* any non-ascii symbol with multibyte encoding with C-locale is p_iswhat(xdigit, 0)
* an alpha character
*/
if (c > 0x7f)
return 1;
return isalnum(c);
}
return iswalnum(*(prs->wstr + prs->state->poschar));
}
return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
}
static int
p_isnotalnum(TParser *prs)
{
return !p_isalnum(prs);
}
static int
p_isalpha(TParser *prs)
{
Assert(prs->state);
if (prs->usewide)
{
if (prs->pgwstr)
{
unsigned int c = *(prs->pgwstr + prs->state->poschar);
/*
* any non-ascii symbol with multibyte encoding with C-locale is
* an alpha character
*/
if (c > 0x7f)
return 1;
return isalpha(c);
}
return iswalpha(*(prs->wstr + prs->state->poschar));
}
return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
}
static int
p_isnotalpha(TParser *prs)
{
return !p_isalpha(prs);
}
/* p_iseq should be used only for ascii symbols */ /* p_iseq should be used only for ascii symbols */
...@@ -525,14 +476,6 @@ p_iseq(TParser *prs, char c) ...@@ -525,14 +476,6 @@ p_iseq(TParser *prs, char c)
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
} }
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
p_iswhat(punct)
p_iswhat(space)
p_iswhat(upper)
p_iswhat(xdigit)
static int static int
p_isEOF(TParser *prs) p_isEOF(TParser *prs)
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment