Commit 0d323425 authored by Tom Lane

Teach the regular expression functions to do case-insensitive matching and
locale-dependent character classification properly when the database encoding
is UTF8.

The previous coding worked okay in single-byte encodings, or in any case for
ASCII characters, but failed entirely on multibyte characters.  The fix
assumes that the <wctype.h> functions use Unicode code points as the wchar_t
representation for Unicode, i.e., that wchar_t matches pg_wchar.

This is only a partial solution, since we're still stupid about non-ASCII
characters in multibyte encodings other than UTF8.  The practical effect
of that is limited, however, since those cases are generally Far Eastern
glyphs for which concepts like case-folding don't apply anyway.  Certainly
all or nearly all of the field reports of problems have been about UTF8.
A more general solution would require switching to the platform's wchar
representation for all regex operations, which is possible but would have
substantial disadvantages.  Let's try this and see if it's sufficient in
practice.
parent ef51395e
...@@ -47,7 +47,7 @@ ...@@ -47,7 +47,7 @@
* permission to use and distribute the software in accordance with the * permission to use and distribute the software in accordance with the
* terms specified in this license. * terms specified in this license.
* *
* $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.9 2008/02/14 17:33:37 tgl Exp $ * $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.10 2009/12/01 21:00:24 tgl Exp $
*/ */
/* ASCII character-name table */ /* ASCII character-name table */
...@@ -349,67 +349,152 @@ static const struct cname ...@@ -349,67 +349,152 @@ static const struct cname
} }
}; };
/* /*
* some ctype functions with non-ascii-char guard * ctype functions adapted to work on pg_wchar (a/k/a chr)
*
* When working in UTF8 encoding, we use the <wctype.h> functions if
* available. This assumes that every platform uses Unicode codepoints
* directly as the wchar_t representation of Unicode. On some platforms
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
*
* In all other encodings, we use the <ctype.h> functions for pg_wchar
* values up to 255, and punt for values above that. This is only 100%
* correct in single-byte encodings such as LATINn. However, non-Unicode
* multibyte encodings are mostly Far Eastern character sets for which the
* properties being tested here aren't relevant for higher code values anyway.
*
* NB: the coding here assumes pg_wchar is an unsigned type.
*/ */
static int static int
pg_wc_isdigit(pg_wchar c) pg_wc_isdigit(pg_wchar c)
{ {
return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c)); #ifdef USE_WIDE_UPPER_LOWER
if (GetDatabaseEncoding() == PG_UTF8)
{
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswdigit((wint_t) c);
}
#endif
return (c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c));
} }
static int static int
pg_wc_isalpha(pg_wchar c) pg_wc_isalpha(pg_wchar c)
{ {
return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c)); #ifdef USE_WIDE_UPPER_LOWER
if (GetDatabaseEncoding() == PG_UTF8)
{
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswalpha((wint_t) c);
}
#endif
return (c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c));
} }
static int static int
pg_wc_isalnum(pg_wchar c) pg_wc_isalnum(pg_wchar c)
{ {
return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c)); #ifdef USE_WIDE_UPPER_LOWER
if (GetDatabaseEncoding() == PG_UTF8)
{
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswalnum((wint_t) c);
}
#endif
return (c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c));
} }
static int static int
pg_wc_isupper(pg_wchar c) pg_wc_isupper(pg_wchar c)
{ {
return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c)); #ifdef USE_WIDE_UPPER_LOWER
if (GetDatabaseEncoding() == PG_UTF8)
{
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswupper((wint_t) c);
}
#endif
return (c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c));
} }
static int static int
pg_wc_islower(pg_wchar c) pg_wc_islower(pg_wchar c)
{ {
return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c)); #ifdef USE_WIDE_UPPER_LOWER
if (GetDatabaseEncoding() == PG_UTF8)
{
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswlower((wint_t) c);
}
#endif
return (c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c));
} }
static int static int
pg_wc_isgraph(pg_wchar c) pg_wc_isgraph(pg_wchar c)
{ {
return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c)); #ifdef USE_WIDE_UPPER_LOWER
if (GetDatabaseEncoding() == PG_UTF8)
{
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswgraph((wint_t) c);
}
#endif
return (c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c));
} }
static int static int
pg_wc_isprint(pg_wchar c) pg_wc_isprint(pg_wchar c)
{ {
return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c)); #ifdef USE_WIDE_UPPER_LOWER
if (GetDatabaseEncoding() == PG_UTF8)
{
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswprint((wint_t) c);
}
#endif
return (c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c));
} }
static int static int
pg_wc_ispunct(pg_wchar c) pg_wc_ispunct(pg_wchar c)
{ {
return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c)); #ifdef USE_WIDE_UPPER_LOWER
if (GetDatabaseEncoding() == PG_UTF8)
{
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswpunct((wint_t) c);
}
#endif
return (c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c));
} }
static int static int
pg_wc_isspace(pg_wchar c) pg_wc_isspace(pg_wchar c)
{ {
return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c)); #ifdef USE_WIDE_UPPER_LOWER
if (GetDatabaseEncoding() == PG_UTF8)
{
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswspace((wint_t) c);
}
#endif
return (c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c));
} }
static pg_wchar static pg_wchar
pg_wc_toupper(pg_wchar c) pg_wc_toupper(pg_wchar c)
{ {
if (c >= 0 && c <= UCHAR_MAX) #ifdef USE_WIDE_UPPER_LOWER
if (GetDatabaseEncoding() == PG_UTF8)
{
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return towupper((wint_t) c);
}
#endif
if (c <= (pg_wchar) UCHAR_MAX)
return toupper((unsigned char) c); return toupper((unsigned char) c);
return c; return c;
} }
...@@ -417,7 +502,14 @@ pg_wc_toupper(pg_wchar c) ...@@ -417,7 +502,14 @@ pg_wc_toupper(pg_wchar c)
static pg_wchar static pg_wchar
pg_wc_tolower(pg_wchar c) pg_wc_tolower(pg_wchar c)
{ {
if (c >= 0 && c <= UCHAR_MAX) #ifdef USE_WIDE_UPPER_LOWER
if (GetDatabaseEncoding() == PG_UTF8)
{
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return towlower((wint_t) c);
}
#endif
if (c <= (pg_wchar) UCHAR_MAX)
return tolower((unsigned char) c); return tolower((unsigned char) c);
return c; return c;
} }
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* *
* $PostgreSQL: pgsql/src/include/regex/regcustom.h,v 1.7 2008/02/14 17:33:37 tgl Exp $ * $PostgreSQL: pgsql/src/include/regex/regcustom.h,v 1.8 2009/12/01 21:00:24 tgl Exp $
*/ */
/* headers if any */ /* headers if any */
...@@ -34,6 +34,17 @@ ...@@ -34,6 +34,17 @@
#include <ctype.h> #include <ctype.h>
#include <limits.h> #include <limits.h>
/*
* towlower() and friends should be in <wctype.h>, but some pre-C99 systems
* declare them in <wchar.h>.
*/
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#ifdef HAVE_WCTYPE_H
#include <wctype.h>
#endif
#include "mb/pg_wchar.h" #include "mb/pg_wchar.h"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment