Teach regular expression operators to honor collations.

This involves getting the character classification and case-folding functions in the regex library to use the collations infrastructure. Most of this work had been done already in connection with the upper/lower and LIKE logic, so it was a simple matter of transposition. While at it, split out these functions into a separate source file regc_pg_locale.c, so that they can be correctly labeled with the Postgres project's license rather than the Scriptics license. These functions are 100% Postgres-written code whereas what remains in regc_locale.c is still mostly not ours, so lumping them both under the same copyright notice was getting more and more misleading.

Teach regular expression operators to honor collations.
This involves getting the character classification and case-folding functions in the regex library to use the collations infrastructure. Most of this work had been done already in connection with the upper/lower and LIKE logic, so it was a simple matter of transposition. While at it, split out these functions into a separate source file regc_pg_locale.c, so that they can be correctly labeled with the Postgres project's license rather than the Scriptics license. These functions are 100% Postgres-written code whereas what remains in regc_locale.c is still mostly not ours, so lumping them both under the same copyright notice was getting more and more misleading.
1e16a810 · Tom Lane · 210f95f1 · 1e16a810 · 1e16a810 · 1e16a810
Commit 1e16a810 authored Apr 10, 2011 by Tom Lane
12 changed files
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -221,17 +221,21 @@ initdb --locale=sv_SE
     <listitem>
      <para>
-       The ability to use indexes with <literal>LIKE</> clauses
+       The <function>upper</>, <function>lower</>, and <function>initcap</>
-       <indexterm><primary>LIKE</><secondary>and locales</></indexterm>
+       functions
+       <indexterm><primary>upper</><secondary>and locales</></indexterm>
+       <indexterm><primary>lower</><secondary>and locales</></indexterm>
      </para>
     </listitem>
     <listitem>
      <para>
-       The <function>upper</>,  <function>lower</>,  and <function>initcap</>
+       Pattern matching operators (<literal>LIKE</>, <literal>SIMILAR TO</>,
-       functions
+       and POSIX-style regular expressions); locales affect both case
-       <indexterm><primary>upper</><secondary>and locales</></indexterm>
+       insensitive matching and the classification of characters by
-       <indexterm><primary>lower</><secondary>and locales</></indexterm>
+       character-class regular expressions
+       <indexterm><primary>LIKE</><secondary>and locales</></indexterm>
+       <indexterm><primary>regular expressions</><secondary>and locales</></indexterm>
      </para>
     </listitem>
@@ -241,6 +245,12 @@ initdb --locale=sv_SE
       <indexterm><primary>to_char</><secondary>and locales</></indexterm>
      </para>
     </listitem>
+     <listitem>
+      <para>
+       The ability to use indexes with <literal>LIKE</> clauses
+      </para>
+     </listitem>
    </itemizedlist>
   </para>
@@ -319,8 +329,8 @@ initdb --locale=sv_SE
  <indexterm zone="collation"><primary>collation</></>
  <para>
-   The collation feature allows specifying the sort order and certain
+   The collation feature allows specifying the sort order and character
-   other locale aspects of data per-column, or even per-operation.
+   classification behavior of data per-column, or even per-operation.
   This alleviates the restriction that the
   <symbol>LC_COLLATE</symbol> and <symbol>LC_CTYPE</symbol> settings
   of a database cannot be changed after its creation.
@@ -351,8 +361,8 @@ initdb --locale=sv_SE
   </para>
   <para>
-    When the database system has to perform an ordering or a
+    When the database system has to perform an ordering or a character
-    comparison, it uses the collation of the input expression.  This
+    classification, it uses the collation of the input expression.  This
    happens, for example, with <literal>ORDER BY</literal> clauses
    and function or operator calls such as <literal>&lt;</literal>.
    The collation to apply for an <literal>ORDER BY</literal> clause
@@ -361,7 +371,8 @@ initdb --locale=sv_SE
    below.  In addition to comparison operators, collations are taken into
    account by functions that convert between lower and upper case
    letters, such as <function>lower</>, <function>upper</>, and
-    <function>initcap</>.
+    <function>initcap</>; by pattern matching operators; and by
+    <function>to_char</> and related functions.
   </para>
   <para>

--- a/src/backend/libpq/hba.c
+++ b/src/backend/libpq/hba.c
@@ -25,6 +25,7 @@
 #include <arpa/inet.h>
 #include <unistd.h>
+#include "catalog/pg_collation.h"
 #include "libpq/ip.h"
 #include "libpq/libpq.h"
 #include "regex/regex.h"
@@ -1781,7 +1782,7 @@ parse_ident_usermap(List *line, int line_number, const char *usermap_name,
 		 * XXX: Major room for optimization: regexps could be compiled when
 		 * the file is loaded and then re-used in every connection.
 		 */
-		r = pg_regcomp(&re, wstr, wlen, REG_ADVANCED);
+		r = pg_regcomp(&re, wstr, wlen, REG_ADVANCED, C_COLLATION_OID);
 		if (r)
 		{
 			char		errstr[100];

--- a/src/backend/regex/Makefile
+++ b/src/backend/regex/Makefile
@@ -17,6 +17,7 @@ OBJS = regcomp.o regerror.o regexec.o regfree.o
 include $(top_srcdir)/src/backend/common.mk
 # mark inclusion dependencies between .c files explicitly
-regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c regc_locale.c
+regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c \
+        regc_locale.c regc_pg_locale.c
 regexec.o: regexec.c rege_dfa.c
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@@ -350,171 +350,6 @@ static const struct cname
 };
-/*
- * ctype functions adapted to work on pg_wchar (a/k/a chr)
- *
- * When working in UTF8 encoding, we use the <wctype.h> functions if
- * available.  This assumes that every platform uses Unicode codepoints
- * directly as the wchar_t representation of Unicode.  On some platforms
- * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
- *
- * In all other encodings, we use the <ctype.h> functions for pg_wchar
- * values up to 255, and punt for values above that.  This is only 100%
- * correct in single-byte encodings such as LATINn.  However, non-Unicode
- * multibyte encodings are mostly Far Eastern character sets for which the
- * properties being tested here aren't relevant for higher code values anyway.
- *
- * NB: the coding here assumes pg_wchar is an unsigned type.
- */
-static int
-pg_wc_isdigit(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswdigit((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c));
-}
-static int
-pg_wc_isalpha(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswalpha((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c));
-}
-static int
-pg_wc_isalnum(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswalnum((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c));
-}
-static int
-pg_wc_isupper(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswupper((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c));
-}
-static int
-pg_wc_islower(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswlower((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c));
-}
-static int
-pg_wc_isgraph(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswgraph((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c));
-}
-static int
-pg_wc_isprint(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswprint((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c));
-}
-static int
-pg_wc_ispunct(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswpunct((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c));
-}
-static int
-pg_wc_isspace(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswspace((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c));
-}
-static pg_wchar
-pg_wc_toupper(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return towupper((wint_t) c);
-	}
-#endif
-	if (c <= (pg_wchar) UCHAR_MAX)
-		return toupper((unsigned char) c);
-	return c;
-}
-static pg_wchar
-pg_wc_tolower(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return towlower((wint_t) c);
-	}
-#endif
-	if (c <= (pg_wchar) UCHAR_MAX)
-		return tolower((unsigned char) c);
-	return c;
-}
 /*
 * element - map collating-element name to celt
 */

--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
+/*-------------------------------------------------------------------------
+ *
+ * regc_pg_locale.c
+ *	  ctype functions adapted to work on pg_wchar (a/k/a chr)
+ *
+ * This file is #included by regcomp.c; it's not meant to compile standalone.
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/regex/regc_pg_locale.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "catalog/pg_collation.h"
+#include "utils/pg_locale.h"
+/*
+ * To provide as much functionality as possible on a variety of platforms,
+ * without going so far as to implement everything from scratch, we use
+ * several implementation strategies depending on the situation:
+ *
+ * 1. In C/POSIX collations, we use hard-wired code.  We can't depend on
+ * the <ctype.h> functions since those will obey LC_CTYPE.  Note that these
+ * collations ignore multibyte characters entirely: only the ASCII codes
+ * 0..127 have any properties or case mappings.
+ *
+ * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
+ *
+ * 2a. When working in UTF8 encoding, we use the <wctype.h> functions if
+ * available.  This assumes that every platform uses Unicode codepoints
+ * directly as the wchar_t representation of Unicode.  On some platforms
+ * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
+ *
+ * 2b. In all other encodings, or on machines that lack <wctype.h>, we use
+ * the <ctype.h> functions for pg_wchar values up to 255, and punt for values
+ * above that.  This is only 100% correct in single-byte encodings such as
+ * LATINn.  However, non-Unicode multibyte encodings are mostly Far Eastern
+ * character sets for which the properties being tested here aren't very
+ * relevant for higher code values anyway.  The difficulty with using the
+ * <wctype.h> functions with non-Unicode multibyte encodings is that we can
+ * have no certainty that the platform's wchar_t representation matches
+ * what we do in pg_wchar conversions.
+ *
+ * 3. Other collations are only supported on platforms that HAVE_LOCALE_T.
+ * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>
+ * functions, under exactly the same cases as #2.
+ *
+ * There is one notable difference between cases 2 and 3: in the "default"
+ * collation we force ASCII letters to follow ASCII upcase/downcase rules,
+ * while in a non-default collation we just let the library functions do what
+ * they will.  The case where this matters is treatment of I/i in Turkish,
+ * and the behavior is meant to match the upper()/lower() SQL functions.
+ *
+ * We store the active collation setting in static variables.  In principle
+ * it could be passed down to here via the regex library's "struct vars" data
+ * structure; but that would require somewhat invasive changes in the regex
+ * library, and right now there's no real benefit to be gained from that.
+ *
+ * NB: the coding here assumes pg_wchar is an unsigned type.
+ */
+/*
+ * Implementation strategy for the active regex collation, plus the locale
+ * object (if any) that goes with it.  Both variables are assigned by
+ * pg_set_regex_collation() and read by the pg_wc_* functions.
+ */
+typedef enum
+{
+	PG_REGEX_LOCALE_C,			/* C locale (encoding independent) */
+	PG_REGEX_LOCALE_WIDE,		/* Use <wctype.h> functions */
+	PG_REGEX_LOCALE_1BYTE,		/* Use <ctype.h> functions */
+	PG_REGEX_LOCALE_WIDE_L,		/* Use locale_t <wctype.h> functions */
+	PG_REGEX_LOCALE_1BYTE_L		/* Use locale_t <ctype.h> functions */
+} PG_Locale_Strategy;
+static PG_Locale_Strategy pg_regex_strategy;	/* strategy currently in force */
+static pg_locale_t pg_regex_locale;	/* 0 for C and default collations */
+/*
+ * Hard-wired character properties for C locale
+ *
+ * Each PG_ISxxx value is a bit flag stored in pg_char_properties[].
+ * PG_ISALNUM is not a bit of its own: it is the union of PG_ISDIGIT and
+ * PG_ISALPHA, so masking with it is nonzero for either a digit or a letter.
+ */
+#define PG_ISDIGIT	0x01
+#define PG_ISALPHA	0x02
+#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
+#define PG_ISUPPER	0x04
+#define PG_ISLOWER	0x08
+#define PG_ISGRAPH	0x10
+#define PG_ISPRINT	0x20
+#define PG_ISPUNCT	0x40
+#define PG_ISSPACE	0x80
+/*
+ * Property flags for the ASCII codes 0..127, indexed by character code.
+ * There are no entries for larger codes, so the pg_wc_is* functions check
+ * c <= 127 before indexing.  Control characters other than ^I..^M, and DEL,
+ * have no properties; space is printable whitespace but not graphic.
+ */
+static const unsigned char pg_char_properties[128] = {
+	/* NUL */	0,
+	/* ^A */	0,
+	/* ^B */	0,
+	/* ^C */	0,
+	/* ^D */	0,
+	/* ^E */	0,
+	/* ^F */	0,
+	/* ^G */	0,
+	/* ^H */	0,
+	/* ^I */	PG_ISSPACE,
+	/* ^J */	PG_ISSPACE,
+	/* ^K */	PG_ISSPACE,
+	/* ^L */	PG_ISSPACE,
+	/* ^M */	PG_ISSPACE,
+	/* ^N */	0,
+	/* ^O */	0,
+	/* ^P */	0,
+	/* ^Q */	0,
+	/* ^R */	0,
+	/* ^S */	0,
+	/* ^T */	0,
+	/* ^U */	0,
+	/* ^V */	0,
+	/* ^W */	0,
+	/* ^X */	0,
+	/* ^Y */	0,
+	/* ^Z */	0,
+	/* ^[ */	0,
+	/* ^\ */	0,
+	/* ^] */	0,
+	/* ^^ */	0,
+	/* ^_ */	0,
+	/*    */	PG_ISPRINT | PG_ISSPACE,
+	/* !  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* "  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* #  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* $  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* %  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* &  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* '  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* (  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* )  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* *  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* +  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* ,  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* -  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* .  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* /  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* 0  */	PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	/* 1  */	PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	/* 2  */	PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	/* 3  */	PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	/* 4  */	PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	/* 5  */	PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	/* 6  */	PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	/* 7  */	PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	/* 8  */	PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	/* 9  */	PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
+	/* :  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* ;  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* <  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* =  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* >  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* ?  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* @  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* A  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* B  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* C  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* D  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* E  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* F  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* G  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* H  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* I  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* J  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* K  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* L  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* M  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* N  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* O  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* P  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* Q  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* R  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* S  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* T  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* U  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* V  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* W  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* X  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* Y  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* Z  */	PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
+	/* [  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* \  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* ]  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* ^  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* _  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* `  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* a  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* b  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* c  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* d  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* e  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* f  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* g  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* h  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* i  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* j  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* k  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* l  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* m  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* n  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* o  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* p  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* q  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* r  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* s  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* t  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* u  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* v  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* w  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* x  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* y  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* z  */	PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
+	/* {  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* |  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* }  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* ~  */	PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
+	/* DEL */	0
+};
+/*
+ * pg_set_regex_collation: set collation for these functions to obey
+ *
+ * This is called when beginning compilation or execution of a regexp.
+ * Since there's no need for re-entrancy of regexp operations, it's okay
+ * to store the results in static variables.
+ *
+ * collation: OID of the collation whose character classification and
+ * case-folding rules the pg_wc_* functions should follow.
+ *
+ * On return, pg_regex_strategy and pg_regex_locale describe how to do that.
+ * If lc_ctype_is_c() does not claim the collation and its OID is invalid,
+ * an ERRCODE_INDETERMINATE_COLLATION error is raised instead.
+ */
+void
+pg_set_regex_collation(Oid collation)
+{
+	if (lc_ctype_is_c(collation))
+	{
+		/* C/POSIX collations use this path regardless of database encoding */
+		pg_regex_strategy = PG_REGEX_LOCALE_C;
+		pg_regex_locale = 0;
+	}
+	else
+	{
+		/* default collation: no locale_t, so the non-_L strategies apply */
+		if (collation == DEFAULT_COLLATION_OID)
+			pg_regex_locale = 0;
+		else if (OidIsValid(collation))
+		{
+			/*
+			 * NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T;
+			 * the case of pg_regex_locale != 0 but not HAVE_LOCALE_T does
+			 * not have to be considered below.
+			 */
+			pg_regex_locale = pg_newlocale_from_collation(collation);
+		}
+		else
+		{
+			/*
+			 * This typically means that the parser could not resolve a
+			 * conflict of implicit collations, so report it that way.
+			 */
+			ereport(ERROR,
+					(errcode(ERRCODE_INDETERMINATE_COLLATION),
+					 errmsg("could not determine which collation to use for regular expression"),
+					 errhint("Use the COLLATE clause to set the collation explicitly.")));
+		}
+		/*
+		 * Choose wide-character functions for UTF8 when they are available,
+		 * single-byte functions otherwise; the _L variant of either is
+		 * selected when we have a locale_t to pass.
+		 */
+#ifdef USE_WIDE_UPPER_LOWER
+		if (GetDatabaseEncoding() == PG_UTF8)
+		{
+			if (pg_regex_locale)
+				pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
+			else
+				pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
+		}
+		else
+#endif   /* USE_WIDE_UPPER_LOWER */
+		{
+			if (pg_regex_locale)
+				pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L;
+			else
+				pg_regex_strategy = PG_REGEX_LOCALE_1BYTE;
+		}
+	}
+}
+/*
+ * pg_wc_isdigit: is c a digit under the collation last set by
+ * pg_set_regex_collation()?
+ *
+ * Returns nonzero if so, zero if not.  In the C strategy only codes 0..127
+ * are looked up (in pg_char_properties); otherwise the <wctype.h> or
+ * <ctype.h> function, or its locale_t form, makes the decision.
+ */
+static int
+pg_wc_isdigit(pg_wchar c)
+{
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return (c <= (pg_wchar) 127 &&
+					(pg_char_properties[c] & PG_ISDIGIT));
+		case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+			/* a 16-bit wchar_t cannot hold codepoints above 0xFFFF */
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswdigit((wint_t) c);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE:
+			/* <ctype.h> is only consulted for values up to UCHAR_MAX */
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isdigit((unsigned char) c));
+		case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswdigit_l((wint_t) c, pg_regex_locale);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isdigit_l((unsigned char) c, pg_regex_locale));
+#endif
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+/*
+ * pg_wc_isalpha: is c alphabetic under the collation last set by
+ * pg_set_regex_collation()?
+ *
+ * Returns nonzero if so, zero if not.  In the C strategy only codes 0..127
+ * are looked up (in pg_char_properties); otherwise the <wctype.h> or
+ * <ctype.h> function, or its locale_t form, makes the decision.
+ */
+static int
+pg_wc_isalpha(pg_wchar c)
+{
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return (c <= (pg_wchar) 127 &&
+					(pg_char_properties[c] & PG_ISALPHA));
+		case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+			/* a 16-bit wchar_t cannot hold codepoints above 0xFFFF */
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswalpha((wint_t) c);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE:
+			/* <ctype.h> is only consulted for values up to UCHAR_MAX */
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isalpha((unsigned char) c));
+		case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswalpha_l((wint_t) c, pg_regex_locale);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isalpha_l((unsigned char) c, pg_regex_locale));
+#endif
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+/*
+ * pg_wc_isalnum: is c a letter or digit under the collation last set by
+ * pg_set_regex_collation()?
+ *
+ * Returns nonzero if so, zero if not.  In the C strategy only codes 0..127
+ * are looked up (in pg_char_properties); otherwise the <wctype.h> or
+ * <ctype.h> function, or its locale_t form, makes the decision.
+ */
+static int
+pg_wc_isalnum(pg_wchar c)
+{
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			/* PG_ISALNUM is a two-bit mask: either bit set counts */
+			return (c <= (pg_wchar) 127 &&
+					(pg_char_properties[c] & PG_ISALNUM));
+		case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+			/* a 16-bit wchar_t cannot hold codepoints above 0xFFFF */
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswalnum((wint_t) c);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE:
+			/* <ctype.h> is only consulted for values up to UCHAR_MAX */
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isalnum((unsigned char) c));
+		case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswalnum_l((wint_t) c, pg_regex_locale);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isalnum_l((unsigned char) c, pg_regex_locale));
+#endif
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+/*
+ * pg_wc_isupper: is c an upper-case letter under the collation last set by
+ * pg_set_regex_collation()?
+ *
+ * Returns nonzero if so, zero if not.  In the C strategy only codes 0..127
+ * are looked up (in pg_char_properties); otherwise the <wctype.h> or
+ * <ctype.h> function, or its locale_t form, makes the decision.
+ */
+static int
+pg_wc_isupper(pg_wchar c)
+{
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return (c <= (pg_wchar) 127 &&
+					(pg_char_properties[c] & PG_ISUPPER));
+		case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+			/* a 16-bit wchar_t cannot hold codepoints above 0xFFFF */
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswupper((wint_t) c);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE:
+			/* <ctype.h> is only consulted for values up to UCHAR_MAX */
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isupper((unsigned char) c));
+		case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswupper_l((wint_t) c, pg_regex_locale);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isupper_l((unsigned char) c, pg_regex_locale));
+#endif
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+/*
+ * pg_wc_islower: is c a lower-case letter under the collation last set by
+ * pg_set_regex_collation()?
+ *
+ * Returns nonzero if so, zero if not.  In the C strategy only codes 0..127
+ * are looked up (in pg_char_properties); otherwise the <wctype.h> or
+ * <ctype.h> function, or its locale_t form, makes the decision.
+ */
+static int
+pg_wc_islower(pg_wchar c)
+{
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return (c <= (pg_wchar) 127 &&
+					(pg_char_properties[c] & PG_ISLOWER));
+		case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+			/* a 16-bit wchar_t cannot hold codepoints above 0xFFFF */
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswlower((wint_t) c);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE:
+			/* <ctype.h> is only consulted for values up to UCHAR_MAX */
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					islower((unsigned char) c));
+		case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswlower_l((wint_t) c, pg_regex_locale);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					islower_l((unsigned char) c, pg_regex_locale));
+#endif
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+/*
+ * pg_wc_isgraph: is c a graphic character under the collation last set by
+ * pg_set_regex_collation()?
+ *
+ * Returns nonzero if so, zero if not.  In the C strategy only codes 0..127
+ * are looked up (in pg_char_properties); otherwise the <wctype.h> or
+ * <ctype.h> function, or its locale_t form, makes the decision.
+ */
+static int
+pg_wc_isgraph(pg_wchar c)
+{
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return (c <= (pg_wchar) 127 &&
+					(pg_char_properties[c] & PG_ISGRAPH));
+		case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+			/* a 16-bit wchar_t cannot hold codepoints above 0xFFFF */
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswgraph((wint_t) c);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE:
+			/* <ctype.h> is only consulted for values up to UCHAR_MAX */
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isgraph((unsigned char) c));
+		case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswgraph_l((wint_t) c, pg_regex_locale);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isgraph_l((unsigned char) c, pg_regex_locale));
+#endif
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+/*
+ * pg_wc_isprint: is c a printable character under the collation last set by
+ * pg_set_regex_collation()?
+ *
+ * Returns nonzero if so, zero if not.  In the C strategy only codes 0..127
+ * are looked up (in pg_char_properties); otherwise the <wctype.h> or
+ * <ctype.h> function, or its locale_t form, makes the decision.
+ */
+static int
+pg_wc_isprint(pg_wchar c)
+{
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return (c <= (pg_wchar) 127 &&
+					(pg_char_properties[c] & PG_ISPRINT));
+		case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+			/* a 16-bit wchar_t cannot hold codepoints above 0xFFFF */
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswprint((wint_t) c);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE:
+			/* <ctype.h> is only consulted for values up to UCHAR_MAX */
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isprint((unsigned char) c));
+		case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswprint_l((wint_t) c, pg_regex_locale);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isprint_l((unsigned char) c, pg_regex_locale));
+#endif
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+/*
+ * pg_wc_ispunct: is c a punctuation character under the collation last set
+ * by pg_set_regex_collation()?
+ *
+ * Returns nonzero if so, zero if not.  In the C strategy only codes 0..127
+ * are looked up (in pg_char_properties); otherwise the <wctype.h> or
+ * <ctype.h> function, or its locale_t form, makes the decision.
+ */
+static int
+pg_wc_ispunct(pg_wchar c)
+{
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return (c <= (pg_wchar) 127 &&
+					(pg_char_properties[c] & PG_ISPUNCT));
+		case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+			/* a 16-bit wchar_t cannot hold codepoints above 0xFFFF */
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswpunct((wint_t) c);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE:
+			/* <ctype.h> is only consulted for values up to UCHAR_MAX */
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					ispunct((unsigned char) c));
+		case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswpunct_l((wint_t) c, pg_regex_locale);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					ispunct_l((unsigned char) c, pg_regex_locale));
+#endif
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+/*
+ * pg_wc_isspace: is c whitespace under the collation last set by
+ * pg_set_regex_collation()?
+ *
+ * Returns nonzero if so, zero if not.  In the C strategy only codes 0..127
+ * are looked up (in pg_char_properties); otherwise the <wctype.h> or
+ * <ctype.h> function, or its locale_t form, makes the decision.
+ */
+static int
+pg_wc_isspace(pg_wchar c)
+{
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			return (c <= (pg_wchar) 127 &&
+					(pg_char_properties[c] & PG_ISSPACE));
+		case PG_REGEX_LOCALE_WIDE:
+#ifdef USE_WIDE_UPPER_LOWER
+			/* a 16-bit wchar_t cannot hold codepoints above 0xFFFF */
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswspace((wint_t) c);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE:
+			/* <ctype.h> is only consulted for values up to UCHAR_MAX */
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isspace((unsigned char) c));
+		case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return iswspace_l((wint_t) c, pg_regex_locale);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			return (c <= (pg_wchar) UCHAR_MAX &&
+					isspace_l((unsigned char) c, pg_regex_locale));
+#endif
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+/*
+ * pg_wc_toupper: upper-case form of c under the collation last set by
+ * pg_set_regex_collation()
+ *
+ * Returns c itself for codes outside the range the chosen strategy handles.
+ * In the C strategy only codes 0..127 are converted.  In the WIDE and 1BYTE
+ * strategies ASCII codes always go through pg_ascii_toupper() and the
+ * library functions handle the rest; the locale_t strategies hand every
+ * character they cover to the library.
+ */
+static pg_wchar
+pg_wc_toupper(pg_wchar c)
+{
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			if (c <= (pg_wchar) 127)
+				return pg_ascii_toupper((unsigned char) c);
+			return c;
+		case PG_REGEX_LOCALE_WIDE:
+			/* force C behavior for ASCII characters, per comments above */
+			if (c <= (pg_wchar) 127)
+				return pg_ascii_toupper((unsigned char) c);
+#ifdef USE_WIDE_UPPER_LOWER
+			/* a 16-bit wchar_t cannot hold codepoints above 0xFFFF */
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return towupper((wint_t) c);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE:
+			/* force C behavior for ASCII characters, per comments above */
+			if (c <= (pg_wchar) 127)
+				return pg_ascii_toupper((unsigned char) c);
+			if (c <= (pg_wchar) UCHAR_MAX)
+				return toupper((unsigned char) c);
+			return c;
+		case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return towupper_l((wint_t) c, pg_regex_locale);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (c <= (pg_wchar) UCHAR_MAX)
+				return toupper_l((unsigned char) c, pg_regex_locale);
+#endif
+			return c;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
+/*
+ * pg_wc_tolower: lower-case form of c under the collation last set by
+ * pg_set_regex_collation()
+ *
+ * Returns c itself for codes outside the range the chosen strategy handles.
+ * In the C strategy only codes 0..127 are converted.  In the WIDE and 1BYTE
+ * strategies ASCII codes always go through pg_ascii_tolower() and the
+ * library functions handle the rest; the locale_t strategies hand every
+ * character they cover to the library.
+ */
+static pg_wchar
+pg_wc_tolower(pg_wchar c)
+{
+	switch (pg_regex_strategy)
+	{
+		case PG_REGEX_LOCALE_C:
+			if (c <= (pg_wchar) 127)
+				return pg_ascii_tolower((unsigned char) c);
+			return c;
+		case PG_REGEX_LOCALE_WIDE:
+			/* force C behavior for ASCII characters, per comments above */
+			if (c <= (pg_wchar) 127)
+				return pg_ascii_tolower((unsigned char) c);
+#ifdef USE_WIDE_UPPER_LOWER
+			/* a 16-bit wchar_t cannot hold codepoints above 0xFFFF */
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return towlower((wint_t) c);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE:
+			/* force C behavior for ASCII characters, per comments above */
+			if (c <= (pg_wchar) 127)
+				return pg_ascii_tolower((unsigned char) c);
+			if (c <= (pg_wchar) UCHAR_MAX)
+				return tolower((unsigned char) c);
+			return c;
+		case PG_REGEX_LOCALE_WIDE_L:
+#if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
+			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
+				return towlower_l((wint_t) c, pg_regex_locale);
+#endif
+			/* FALL THRU */
+		case PG_REGEX_LOCALE_1BYTE_L:
+#ifdef HAVE_LOCALE_T
+			if (c <= (pg_wchar) UCHAR_MAX)
+				return tolower_l((unsigned char) c, pg_regex_locale);
+#endif
+			return c;
+	}
+	return 0;					/* can't get here, but keep compiler quiet */
+}
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -172,7 +172,7 @@ static void addrange(struct cvec *, chr, chr);
 static struct cvec *getcvec(struct vars *, int, int);
 static void freecvec(struct cvec *);
-/* === regc_locale.c === */
+/* === regc_pg_locale.c === */
 static int	pg_wc_isdigit(pg_wchar c);
 static int	pg_wc_isalpha(pg_wchar c);
 static int	pg_wc_isalnum(pg_wchar c);
@@ -184,6 +184,8 @@ static int	pg_wc_ispunct(pg_wchar c);
 static int	pg_wc_isspace(pg_wchar c);
 static pg_wchar pg_wc_toupper(pg_wchar c);
 static pg_wchar pg_wc_tolower(pg_wchar c);
+/* === regc_locale.c === */
 static celt element(struct vars *, const chr *, const chr *);
 static struct cvec *range(struct vars *, celt, celt, int);
 static int	before(celt, celt);
@@ -281,7 +283,8 @@ int
 pg_regcomp(regex_t *re,
 		   const chr *string,
 		   size_t len,
-		   int flags)
+		   int flags,
+		   Oid collation)
 {
 	struct vars var;
 	struct vars *v = &var;
@@ -307,6 +310,9 @@ pg_regcomp(regex_t *re,
 	if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
 		return REG_INVARG;
+	/* Initialize locale-dependent support */
+	pg_set_regex_collation(collation);
 	/* initial setup (after which freev() is callable) */
 	v->re = re;
 	v->now = string;
@@ -333,6 +339,7 @@ pg_regcomp(regex_t *re,
 	re->re_magic = REMAGIC;
 	re->re_info = 0;			/* bits get set during parse */
 	re->re_csize = sizeof(chr);
+	re->re_collation = collation;
 	re->re_guts = NULL;
 	re->re_fns = VS(&functions);
@@ -1987,4 +1994,5 @@ stid(struct subre * t,
 #include "regc_color.c"
 #include "regc_nfa.c"
 #include "regc_cvec.c"
+#include "regc_pg_locale.c"
 #include "regc_locale.c"
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -192,6 +192,9 @@ pg_regexec(regex_t *re,
 	if (re->re_csize != sizeof(chr))
 		return REG_MIXED;
+	/* Initialize locale-dependent support */
+	pg_set_regex_collation(re->re_collation);
 	/* setup */
 	v->re = re;
 	v->g = (struct guts *) re->re_guts;

--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -14,6 +14,7 @@
 #include "postgres.h"
+#include "catalog/pg_collation.h"
 #include "tsearch/dicts/spell.h"
 #include "tsearch/ts_locale.h"
 #include "utils/memutils.h"
@@ -425,7 +426,9 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
 		wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
 		wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
-		err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen, REG_ADVANCED | REG_NOSUB);
+		err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen,
+						 REG_ADVANCED | REG_NOSUB,
+						 DEFAULT_COLLATION_OID);
 		if (err)
 		{
 			char		errstr[100];

--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -96,6 +96,7 @@ typedef struct cached_re_str
 	char	   *cre_pat;		/* original RE (not null terminated!) */
 	int			cre_pat_len;	/* length of original RE, in bytes */
 	int			cre_flags;		/* compile flags: extended,icase etc */
+	Oid			cre_collation;	/* collation to use */
 	regex_t		cre_re;			/* the compiled regular expression */
 } cached_re_str;
@@ -106,6 +107,7 @@ static cached_re_str re_array[MAX_CACHED_RES];	/* cached re's */
 /* Local functions */
 static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
 					 text *flags,
+					 Oid collation,
 					 bool force_glob,
 					 bool use_subpatterns,
 					 bool ignore_degenerate);
@@ -121,12 +123,13 @@ static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
 *
 *	text_re --- the pattern, expressed as a TEXT object
 *	cflags --- compile options for the pattern
+ *	collation --- collation to use for LC_CTYPE-dependent behavior
 *
 * Pattern is given in the database encoding.  We internally convert to
 * an array of pg_wchar, which is what Spencer's regex package wants.
 */
 static regex_t *
-RE_compile_and_cache(text *text_re, int cflags)
+RE_compile_and_cache(text *text_re, int cflags, Oid collation)
 {
 	int			text_re_len = VARSIZE_ANY_EXHDR(text_re);
 	char	   *text_re_val = VARDATA_ANY(text_re);
@@ -146,6 +149,7 @@ RE_compile_and_cache(text *text_re, int cflags)
 	{
 		if (re_array[i].cre_pat_len == text_re_len &&
 			re_array[i].cre_flags == cflags &&
+			re_array[i].cre_collation == collation &&
 			memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
 		{
 			/*
@@ -176,7 +180,8 @@ RE_compile_and_cache(text *text_re, int cflags)
 	regcomp_result = pg_regcomp(&re_temp.cre_re,
 								pattern,
 								pattern_len,
-								cflags);
+								cflags,
+								collation);
 	pfree(pattern);
@@ -207,6 +212,7 @@ RE_compile_and_cache(text *text_re, int cflags)
 	memcpy(re_temp.cre_pat, text_re_val, text_re_len);
 	re_temp.cre_pat_len = text_re_len;
 	re_temp.cre_flags = cflags;
+	re_temp.cre_collation = collation;
 	/*
 	 * Okay, we have a valid new item in re_temp; insert it into the storage
@@ -313,6 +319,7 @@ RE_execute(regex_t *re, char *dat, int dat_len,
 *	dat --- the data to match against (need not be null-terminated)
 *	dat_len --- the length of the data string
 *	cflags --- compile options for the pattern
+ *	collation --- collation to use for LC_CTYPE-dependent behavior
 *	nmatch, pmatch	--- optional return area for match details
 *
 * Both pattern and data are given in the database encoding.  We internally
@@ -320,12 +327,13 @@ RE_execute(regex_t *re, char *dat, int dat_len,
 */
 static bool
 RE_compile_and_execute(text *text_re, char *dat, int dat_len,
-					   int cflags, int nmatch, regmatch_t *pmatch)
+					   int cflags, Oid collation,
+					   int nmatch, regmatch_t *pmatch)
 {
 	regex_t    *re;
 	/* Compile RE */
-	re = RE_compile_and_cache(text_re, cflags);
+	re = RE_compile_and_cache(text_re, cflags, collation);
 	return RE_execute(re, dat, dat_len, nmatch, pmatch);
 }
@@ -424,6 +432,7 @@ nameregexeq(PG_FUNCTION_ARGS)
 										  NameStr(*n),
 										  strlen(NameStr(*n)),
 										  REG_ADVANCED,
+										  PG_GET_COLLATION(),
 										  0, NULL));
 }
@@ -437,6 +446,7 @@ nameregexne(PG_FUNCTION_ARGS)
 										   NameStr(*n),
 										   strlen(NameStr(*n)),
 										   REG_ADVANCED,
+										   PG_GET_COLLATION(),
 										   0, NULL));
 }
@@ -450,6 +460,7 @@ textregexeq(PG_FUNCTION_ARGS)
 										  VARDATA_ANY(s),
 										  VARSIZE_ANY_EXHDR(s),
 										  REG_ADVANCED,
+										  PG_GET_COLLATION(),
 										  0, NULL));
 }
@@ -463,6 +474,7 @@ textregexne(PG_FUNCTION_ARGS)
 										   VARDATA_ANY(s),
 										   VARSIZE_ANY_EXHDR(s),
 										   REG_ADVANCED,
+										   PG_GET_COLLATION(),
 										   0, NULL));
 }
@@ -483,6 +495,7 @@ nameicregexeq(PG_FUNCTION_ARGS)
 										  NameStr(*n),
 										  strlen(NameStr(*n)),
 										  REG_ADVANCED | REG_ICASE,
+										  PG_GET_COLLATION(),
 										  0, NULL));
 }
@@ -496,6 +509,7 @@ nameicregexne(PG_FUNCTION_ARGS)
 										   NameStr(*n),
 										   strlen(NameStr(*n)),
 										   REG_ADVANCED | REG_ICASE,
+										   PG_GET_COLLATION(),
 										   0, NULL));
 }
@@ -509,6 +523,7 @@ texticregexeq(PG_FUNCTION_ARGS)
 										  VARDATA_ANY(s),
 										  VARSIZE_ANY_EXHDR(s),
 										  REG_ADVANCED | REG_ICASE,
+										  PG_GET_COLLATION(),
 										  0, NULL));
 }
@@ -522,6 +537,7 @@ texticregexne(PG_FUNCTION_ARGS)
 										   VARDATA_ANY(s),
 										   VARSIZE_ANY_EXHDR(s),
 										   REG_ADVANCED | REG_ICASE,
+										   PG_GET_COLLATION(),
 										   0, NULL));
 }
@@ -541,7 +557,7 @@ textregexsubstr(PG_FUNCTION_ARGS)
 				eo;
 	/* Compile RE */
-	re = RE_compile_and_cache(p, REG_ADVANCED);
+	re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
 	/*
 	 * We pass two regmatch_t structs to get info about the overall match and
@@ -597,7 +613,7 @@ textregexreplace_noopt(PG_FUNCTION_ARGS)
 	text	   *r = PG_GETARG_TEXT_PP(2);
 	regex_t    *re;
-	re = RE_compile_and_cache(p, REG_ADVANCED);
+	re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
 	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
 }
@@ -618,7 +634,7 @@ textregexreplace(PG_FUNCTION_ARGS)
 	parse_re_flags(&flags, opt);
-	re = RE_compile_and_cache(p, flags.cflags);
+	re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
 	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
 }
@@ -781,7 +797,9 @@ regexp_matches(PG_FUNCTION_ARGS)
 		/* be sure to copy the input string into the multi-call ctx */
 		matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
-										flags, false, true, false);
+										flags,
+										PG_GET_COLLATION(),
+										false, true, false);
 		/* Pre-create workspace that build_regexp_matches_result needs */
 		matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -830,6 +848,7 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
 */
 static regexp_matches_ctx *
 setup_regexp_matches(text *orig_str, text *pattern, text *flags,
+					 Oid collation,
 					 bool force_glob, bool use_subpatterns,
 					 bool ignore_degenerate)
 {
@@ -868,7 +887,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
 	}
 	/* set up the compiled pattern */
-	cpattern = RE_compile_and_cache(pattern, re_flags.cflags);
+	cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation);
 	/* do we want to remember subpatterns? */
 	if (use_subpatterns && cpattern->re_nsub > 0)
@@ -1039,7 +1058,9 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
 		/* be sure to copy the input string into the multi-call ctx */
 		splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
-										flags, true, false, true);
+										flags,
+										PG_GET_COLLATION(),
+										true, false, true);
 		MemoryContextSwitchTo(oldcontext);
 		funcctx->user_fctx = (void *) splitctx;
@@ -1083,6 +1104,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
 	splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
 									PG_GETARG_TEXT_PP(1),
 									PG_GETARG_TEXT_PP_IF_EXISTS(2),
+									PG_GET_COLLATION(),
 									true, false, true);
 	while (splitctx->next_match <= splitctx->nmatches)

--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -73,6 +73,7 @@ typedef struct
 #define  REG_USHORTEST		 020000
 	int			re_csize;		/* sizeof(character) */
 	char	   *re_endp;		/* backward compatibility kludge */
+	Oid			re_collation;	/* Collation that defines LC_CTYPE behavior */
 	/* the rest is opaque pointers to hidden innards */
 	char	   *re_guts;		/* `char *' is more portable than `void *' */
 	char	   *re_fns;
@@ -161,9 +162,10 @@ typedef struct
 /*
 * the prototypes for exported functions
 */
-extern int	pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
+extern int	pg_regcomp(regex_t *, const pg_wchar *, size_t, int, Oid);
 extern int	pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int);
 extern void pg_regfree(regex_t *);
 extern size_t pg_regerror(int, const regex_t *, char *, size_t);
+extern void pg_set_regex_collation(Oid collation);
 #endif   /* _REGEX_H_ */
--- a/src/test/regress/expected/collate.linux.utf8.out
+++ b/src/test/regress/expected/collate.linux.utf8.out
@@ -319,6 +319,80 @@ SELECT relname FROM pg_class WHERE relname ILIKE 'abc%';
 ---------
 (0 rows)
+-- regular expressions
+SELECT * FROM collate_test1 WHERE b ~ '^abc$';
+ a |  b  
+---+-----
+ 1 | abc
+(1 row)
+SELECT * FROM collate_test1 WHERE b ~ '^abc';
+ a |  b  
+---+-----
+ 1 | abc
+(1 row)
+SELECT * FROM collate_test1 WHERE b ~ 'bc';
+ a |  b  
+---+-----
+ 1 | abc
+ 2 | äbc
+ 3 | bbc
+(3 rows)
+SELECT * FROM collate_test1 WHERE b ~* '^abc$';
+ a |  b  
+---+-----
+ 1 | abc
+ 4 | ABC
+(2 rows)
+SELECT * FROM collate_test1 WHERE b ~* '^abc';
+ a |  b  
+---+-----
+ 1 | abc
+ 4 | ABC
+(2 rows)
+SELECT * FROM collate_test1 WHERE b ~* 'bc';
+ a |  b  
+---+-----
+ 1 | abc
+ 2 | äbc
+ 3 | bbc
+ 4 | ABC
+(4 rows)
+SELECT 'Türkiye' COLLATE "en_US" ~* 'KI' AS "true";
+ true 
+------
+ t
+(1 row)
+SELECT 'Türkiye' COLLATE "tr_TR" ~* 'KI' AS "false";
+ false 
+-------
+ f
+(1 row)
+SELECT 'bıt' ~* 'BIT' COLLATE "en_US" AS "false";
+ false 
+-------
+ f
+(1 row)
+SELECT 'bıt' ~* 'BIT' COLLATE "tr_TR" AS "true";
+ true 
+------
+ t
+(1 row)
+-- The following actually exercises the selectivity estimation for ~*.
+SELECT relname FROM pg_class WHERE relname ~* '^abc';
+ relname 
+---------
+(0 rows)
 -- to_char
 SET lc_time TO 'tr_TR';
 SELECT to_char(date '2010-04-01', 'DD TMMON YYYY');

--- a/src/test/regress/sql/collate.linux.utf8.sql
+++ b/src/test/regress/sql/collate.linux.utf8.sql
@@ -124,6 +124,24 @@ SELECT 'bıt' ILIKE 'BIT' COLLATE "tr_TR" AS "true";
 -- The following actually exercises the selectivity estimation for ILIKE.
 SELECT relname FROM pg_class WHERE relname ILIKE 'abc%';
+-- regular expressions
+SELECT * FROM collate_test1 WHERE b ~ '^abc$';
+SELECT * FROM collate_test1 WHERE b ~ '^abc';
+SELECT * FROM collate_test1 WHERE b ~ 'bc';
+SELECT * FROM collate_test1 WHERE b ~* '^abc$';
+SELECT * FROM collate_test1 WHERE b ~* '^abc';
+SELECT * FROM collate_test1 WHERE b ~* 'bc';
+SELECT 'Türkiye' COLLATE "en_US" ~* 'KI' AS "true";
+SELECT 'Türkiye' COLLATE "tr_TR" ~* 'KI' AS "false";
+SELECT 'bıt' ~* 'BIT' COLLATE "en_US" AS "false";
+SELECT 'bıt' ~* 'BIT' COLLATE "tr_TR" AS "true";
+-- The following actually exercises the selectivity estimation for ~*.
+SELECT relname FROM pg_class WHERE relname ~* '^abc';
 -- to_char