Teach regular expression operators to honor collations.

This involves getting the character classification and case-folding functions in the regex library to use the collations infrastructure. Most of this work had already been done in connection with the upper/lower and LIKE logic, so it was a simple matter of carrying the same approach over to the regex code. While at it, split out these functions into a separate source file, regc_pg_locale.c, so that they can be correctly labeled with the Postgres project's license rather than the Scriptics license. These functions are 100% Postgres-written code, whereas what remains in regc_locale.c is still mostly not ours, so lumping both under the same copyright notice was becoming increasingly misleading.

Teach regular expression operators to honor collations.
This involves getting the character classification and case-folding functions in the regex library to use the collations infrastructure. Most of this work had already been done in connection with the upper/lower and LIKE logic, so it was a simple matter of carrying the same approach over to the regex code. While at it, split out these functions into a separate source file, regc_pg_locale.c, so that they can be correctly labeled with the Postgres project's license rather than the Scriptics license. These functions are 100% Postgres-written code, whereas what remains in regc_locale.c is still mostly not ours, so lumping both under the same copyright notice was becoming increasingly misleading.
1e16a810 · Tom Lane · 210f95f1
Commit 1e16a810 authored Apr 10, 2011 by Tom Lane
12 changed files
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -221,17 +221,21 @@ initdb --locale=sv_SE
     <listitem>
      <para>
-       The ability to use indexes with <literal>LIKE</> clauses
+       The <function>upper</>, <function>lower</>, and <function>initcap</>
-       <indexterm><primary>LIKE</><secondary>and locales</></indexterm>
+       functions
+       <indexterm><primary>upper</><secondary>and locales</></indexterm>
+       <indexterm><primary>lower</><secondary>and locales</></indexterm>
      </para>
     </listitem>
     <listitem>
      <para>
-       The <function>upper</>,  <function>lower</>,  and <function>initcap</>
+       Pattern matching operators (<literal>LIKE</>, <literal>SIMILAR TO</>,
-       functions
+       and POSIX-style regular expressions); locales affect both case
-       <indexterm><primary>upper</><secondary>and locales</></indexterm>
+       insensitive matching and the classification of characters by
-       <indexterm><primary>lower</><secondary>and locales</></indexterm>
+       character-class regular expressions
+       <indexterm><primary>LIKE</><secondary>and locales</></indexterm>
+       <indexterm><primary>regular expressions</><secondary>and locales</></indexterm>
      </para>
     </listitem>
@@ -241,6 +245,12 @@ initdb --locale=sv_SE
       <indexterm><primary>to_char</><secondary>and locales</></indexterm>
      </para>
     </listitem>
+     <listitem>
+      <para>
+       The ability to use indexes with <literal>LIKE</> clauses
+      </para>
+     </listitem>
    </itemizedlist>
   </para>
@@ -319,8 +329,8 @@ initdb --locale=sv_SE
  <indexterm zone="collation"><primary>collation</></>
  <para>
-   The collation feature allows specifying the sort order and certain
+   The collation feature allows specifying the sort order and character
-   other locale aspects of data per-column, or even per-operation.
+   classification behavior of data per-column, or even per-operation.
   This alleviates the restriction that the
   <symbol>LC_COLLATE</symbol> and <symbol>LC_CTYPE</symbol> settings
   of a database cannot be changed after its creation.
@@ -351,8 +361,8 @@ initdb --locale=sv_SE
   </para>
   <para>
-    When the database system has to perform an ordering or a
+    When the database system has to perform an ordering or a character
-    comparison, it uses the collation of the input expression.  This
+    classification, it uses the collation of the input expression.  This
    happens, for example, with <literal>ORDER BY</literal> clauses
    and function or operator calls such as <literal>&lt;</literal>.
    The collation to apply for an <literal>ORDER BY</literal> clause
@@ -361,7 +371,8 @@ initdb --locale=sv_SE
    below.  In addition to comparison operators, collations are taken into
    account by functions that convert between lower and upper case
    letters, such as <function>lower</>, <function>upper</>, and
-    <function>initcap</>.
+    <function>initcap</>; by pattern matching operators; and by
+    <function>to_char</> and related functions.
   </para>
   <para>

--- a/src/backend/libpq/hba.c
+++ b/src/backend/libpq/hba.c
@@ -25,6 +25,7 @@
 #include <arpa/inet.h>
 #include <unistd.h>
+#include "catalog/pg_collation.h"
 #include "libpq/ip.h"
 #include "libpq/libpq.h"
 #include "regex/regex.h"
@@ -1781,7 +1782,7 @@ parse_ident_usermap(List *line, int line_number, const char *usermap_name,
 		 * XXX: Major room for optimization: regexps could be compiled when
 		 * the file is loaded and then re-used in every connection.
 		 */
-		r = pg_regcomp(&re, wstr, wlen, REG_ADVANCED);
+		r = pg_regcomp(&re, wstr, wlen, REG_ADVANCED, C_COLLATION_OID);
 		if (r)
 		{
 			char		errstr[100];

--- a/src/backend/regex/Makefile
+++ b/src/backend/regex/Makefile
@@ -17,6 +17,7 @@ OBJS = regcomp.o regerror.o regexec.o regfree.o
 include $(top_srcdir)/src/backend/common.mk
 # mark inclusion dependencies between .c files explicitly
-regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c regc_locale.c
+regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c \
+        regc_locale.c regc_pg_locale.c
 regexec.o: regexec.c rege_dfa.c
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@@ -350,171 +350,6 @@ static const struct cname
 };
-/*
- * ctype functions adapted to work on pg_wchar (a/k/a chr)
- *
- * When working in UTF8 encoding, we use the <wctype.h> functions if
- * available.  This assumes that every platform uses Unicode codepoints
- * directly as the wchar_t representation of Unicode.  On some platforms
- * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
- *
- * In all other encodings, we use the <ctype.h> functions for pg_wchar
- * values up to 255, and punt for values above that.  This is only 100%
- * correct in single-byte encodings such as LATINn.  However, non-Unicode
- * multibyte encodings are mostly Far Eastern character sets for which the
- * properties being tested here aren't relevant for higher code values anyway.
- *
- * NB: the coding here assumes pg_wchar is an unsigned type.
- */
-static int
-pg_wc_isdigit(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswdigit((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c));
-}
-static int
-pg_wc_isalpha(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswalpha((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c));
-}
-static int
-pg_wc_isalnum(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswalnum((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c));
-}
-static int
-pg_wc_isupper(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswupper((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c));
-}
-static int
-pg_wc_islower(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswlower((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c));
-}
-static int
-pg_wc_isgraph(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswgraph((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c));
-}
-static int
-pg_wc_isprint(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswprint((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c));
-}
-static int
-pg_wc_ispunct(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswpunct((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c));
-}
-static int
-pg_wc_isspace(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return iswspace((wint_t) c);
-	}
-#endif
-	return (c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c));
-}
-static pg_wchar
-pg_wc_toupper(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return towupper((wint_t) c);
-	}
-#endif
-	if (c <= (pg_wchar) UCHAR_MAX)
-		return toupper((unsigned char) c);
-	return c;
-}
-static pg_wchar
-pg_wc_tolower(pg_wchar c)
-{
-#ifdef USE_WIDE_UPPER_LOWER
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-			return towlower((wint_t) c);
-	}
-#endif
-	if (c <= (pg_wchar) UCHAR_MAX)
-		return tolower((unsigned char) c);
-	return c;
-}
 /*
 * element - map collating-element name to celt
 */

--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -172,7 +172,7 @@ static void addrange(struct cvec *, chr, chr);
 static struct cvec *getcvec(struct vars *, int, int);
 static void freecvec(struct cvec *);
-/* === regc_locale.c === */
+/* === regc_pg_locale.c === */
 static int	pg_wc_isdigit(pg_wchar c);
 static int	pg_wc_isalpha(pg_wchar c);
 static int	pg_wc_isalnum(pg_wchar c);
@@ -184,6 +184,8 @@ static int	pg_wc_ispunct(pg_wchar c);
 static int	pg_wc_isspace(pg_wchar c);
 static pg_wchar pg_wc_toupper(pg_wchar c);
 static pg_wchar pg_wc_tolower(pg_wchar c);
+/* === regc_locale.c === */
 static celt element(struct vars *, const chr *, const chr *);
 static struct cvec *range(struct vars *, celt, celt, int);
 static int	before(celt, celt);
@@ -281,7 +283,8 @@ int
 pg_regcomp(regex_t *re,
 		   const chr *string,
 		   size_t len,
-		   int flags)
+		   int flags,
+		   Oid collation)
 {
 	struct vars var;
 	struct vars *v = &var;
@@ -307,6 +310,9 @@ pg_regcomp(regex_t *re,
 	if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
 		return REG_INVARG;
+	/* Initialize locale-dependent support */
+	pg_set_regex_collation(collation);
 	/* initial setup (after which freev() is callable) */
 	v->re = re;
 	v->now = string;
@@ -333,6 +339,7 @@ pg_regcomp(regex_t *re,
 	re->re_magic = REMAGIC;
 	re->re_info = 0;			/* bits get set during parse */
 	re->re_csize = sizeof(chr);
+	re->re_collation = collation;
 	re->re_guts = NULL;
 	re->re_fns = VS(&functions);
@@ -1987,4 +1994,5 @@ stid(struct subre * t,
 #include "regc_color.c"
 #include "regc_nfa.c"
 #include "regc_cvec.c"
+#include "regc_pg_locale.c"
 #include "regc_locale.c"
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -192,6 +192,9 @@ pg_regexec(regex_t *re,
 	if (re->re_csize != sizeof(chr))
 		return REG_MIXED;
+	/* Initialize locale-dependent support */
+	pg_set_regex_collation(re->re_collation);
 	/* setup */
 	v->re = re;
 	v->g = (struct guts *) re->re_guts;

--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -14,6 +14,7 @@
 #include "postgres.h"
+#include "catalog/pg_collation.h"
 #include "tsearch/dicts/spell.h"
 #include "tsearch/ts_locale.h"
 #include "utils/memutils.h"
@@ -425,7 +426,9 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
 		wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
 		wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
-		err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen, REG_ADVANCED | REG_NOSUB);
+		err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen,
+						 REG_ADVANCED | REG_NOSUB,
+						 DEFAULT_COLLATION_OID);
 		if (err)
 		{
 			char		errstr[100];

--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -96,6 +96,7 @@ typedef struct cached_re_str
 	char	   *cre_pat;		/* original RE (not null terminated!) */
 	int			cre_pat_len;	/* length of original RE, in bytes */
 	int			cre_flags;		/* compile flags: extended,icase etc */
+	Oid			cre_collation;	/* collation to use */
 	regex_t		cre_re;			/* the compiled regular expression */
 } cached_re_str;
@@ -106,6 +107,7 @@ static cached_re_str re_array[MAX_CACHED_RES];	/* cached re's */
 /* Local functions */
 static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
 					 text *flags,
+					 Oid collation,
 					 bool force_glob,
 					 bool use_subpatterns,
 					 bool ignore_degenerate);
@@ -121,12 +123,13 @@ static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
 *
 *	text_re --- the pattern, expressed as a TEXT object
 *	cflags --- compile options for the pattern
+ *	collation --- collation to use for LC_CTYPE-dependent behavior
 *
 * Pattern is given in the database encoding.  We internally convert to
 * an array of pg_wchar, which is what Spencer's regex package wants.
 */
 static regex_t *
-RE_compile_and_cache(text *text_re, int cflags)
+RE_compile_and_cache(text *text_re, int cflags, Oid collation)
 {
 	int			text_re_len = VARSIZE_ANY_EXHDR(text_re);
 	char	   *text_re_val = VARDATA_ANY(text_re);
@@ -146,6 +149,7 @@ RE_compile_and_cache(text *text_re, int cflags)
 	{
 		if (re_array[i].cre_pat_len == text_re_len &&
 			re_array[i].cre_flags == cflags &&
+			re_array[i].cre_collation == collation &&
 			memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
 		{
 			/*
@@ -176,7 +180,8 @@ RE_compile_and_cache(text *text_re, int cflags)
 	regcomp_result = pg_regcomp(&re_temp.cre_re,
 								pattern,
 								pattern_len,
-								cflags);
+								cflags,
+								collation);
 	pfree(pattern);
@@ -207,6 +212,7 @@ RE_compile_and_cache(text *text_re, int cflags)
 	memcpy(re_temp.cre_pat, text_re_val, text_re_len);
 	re_temp.cre_pat_len = text_re_len;
 	re_temp.cre_flags = cflags;
+	re_temp.cre_collation = collation;
 	/*
 	 * Okay, we have a valid new item in re_temp; insert it into the storage
@@ -313,6 +319,7 @@ RE_execute(regex_t *re, char *dat, int dat_len,
 *	dat --- the data to match against (need not be null-terminated)
 *	dat_len --- the length of the data string
 *	cflags --- compile options for the pattern
+ *	collation --- collation to use for LC_CTYPE-dependent behavior
 *	nmatch, pmatch	--- optional return area for match details
 *
 * Both pattern and data are given in the database encoding.  We internally
@@ -320,12 +327,13 @@ RE_execute(regex_t *re, char *dat, int dat_len,
 */
 static bool
 RE_compile_and_execute(text *text_re, char *dat, int dat_len,
-					   int cflags, int nmatch, regmatch_t *pmatch)
+					   int cflags, Oid collation,
+					   int nmatch, regmatch_t *pmatch)
 {
 	regex_t    *re;
 	/* Compile RE */
-	re = RE_compile_and_cache(text_re, cflags);
+	re = RE_compile_and_cache(text_re, cflags, collation);
 	return RE_execute(re, dat, dat_len, nmatch, pmatch);
 }
@@ -424,6 +432,7 @@ nameregexeq(PG_FUNCTION_ARGS)
 										  NameStr(*n),
 										  strlen(NameStr(*n)),
 										  REG_ADVANCED,
+										  PG_GET_COLLATION(),
 										  0, NULL));
 }
@@ -437,6 +446,7 @@ nameregexne(PG_FUNCTION_ARGS)
 										   NameStr(*n),
 										   strlen(NameStr(*n)),
 										   REG_ADVANCED,
+										   PG_GET_COLLATION(),
 										   0, NULL));
 }
@@ -450,6 +460,7 @@ textregexeq(PG_FUNCTION_ARGS)
 										  VARDATA_ANY(s),
 										  VARSIZE_ANY_EXHDR(s),
 										  REG_ADVANCED,
+										  PG_GET_COLLATION(),
 										  0, NULL));
 }
@@ -463,6 +474,7 @@ textregexne(PG_FUNCTION_ARGS)
 										   VARDATA_ANY(s),
 										   VARSIZE_ANY_EXHDR(s),
 										   REG_ADVANCED,
+										   PG_GET_COLLATION(),
 										   0, NULL));
 }
@@ -483,6 +495,7 @@ nameicregexeq(PG_FUNCTION_ARGS)
 										  NameStr(*n),
 										  strlen(NameStr(*n)),
 										  REG_ADVANCED | REG_ICASE,
+										  PG_GET_COLLATION(),
 										  0, NULL));
 }
@@ -496,6 +509,7 @@ nameicregexne(PG_FUNCTION_ARGS)
 										   NameStr(*n),
 										   strlen(NameStr(*n)),
 										   REG_ADVANCED | REG_ICASE,
+										   PG_GET_COLLATION(),
 										   0, NULL));
 }
@@ -509,6 +523,7 @@ texticregexeq(PG_FUNCTION_ARGS)
 										  VARDATA_ANY(s),
 										  VARSIZE_ANY_EXHDR(s),
 										  REG_ADVANCED | REG_ICASE,
+										  PG_GET_COLLATION(),
 										  0, NULL));
 }
@@ -522,6 +537,7 @@ texticregexne(PG_FUNCTION_ARGS)
 										   VARDATA_ANY(s),
 										   VARSIZE_ANY_EXHDR(s),
 										   REG_ADVANCED | REG_ICASE,
+										   PG_GET_COLLATION(),
 										   0, NULL));
 }
@@ -541,7 +557,7 @@ textregexsubstr(PG_FUNCTION_ARGS)
 				eo;
 	/* Compile RE */
-	re = RE_compile_and_cache(p, REG_ADVANCED);
+	re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
 	/*
 	 * We pass two regmatch_t structs to get info about the overall match and
@@ -597,7 +613,7 @@ textregexreplace_noopt(PG_FUNCTION_ARGS)
 	text	   *r = PG_GETARG_TEXT_PP(2);
 	regex_t    *re;
-	re = RE_compile_and_cache(p, REG_ADVANCED);
+	re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
 	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
 }
@@ -618,7 +634,7 @@ textregexreplace(PG_FUNCTION_ARGS)
 	parse_re_flags(&flags, opt);
-	re = RE_compile_and_cache(p, flags.cflags);
+	re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
 	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
 }
@@ -781,7 +797,9 @@ regexp_matches(PG_FUNCTION_ARGS)
 		/* be sure to copy the input string into the multi-call ctx */
 		matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
-										flags, false, true, false);
+										flags,
+										PG_GET_COLLATION(),
+										false, true, false);
 		/* Pre-create workspace that build_regexp_matches_result needs */
 		matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -830,6 +848,7 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
 */
 static regexp_matches_ctx *
 setup_regexp_matches(text *orig_str, text *pattern, text *flags,
+					 Oid collation,
 					 bool force_glob, bool use_subpatterns,
 					 bool ignore_degenerate)
 {
@@ -868,7 +887,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
 	}
 	/* set up the compiled pattern */
-	cpattern = RE_compile_and_cache(pattern, re_flags.cflags);
+	cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation);
 	/* do we want to remember subpatterns? */
 	if (use_subpatterns && cpattern->re_nsub > 0)
@@ -1039,7 +1058,9 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
 		/* be sure to copy the input string into the multi-call ctx */
 		splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
-										flags, true, false, true);
+										flags,
+										PG_GET_COLLATION(),
+										true, false, true);
 		MemoryContextSwitchTo(oldcontext);
 		funcctx->user_fctx = (void *) splitctx;
@@ -1083,6 +1104,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
 	splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
 									PG_GETARG_TEXT_PP(1),
 									PG_GETARG_TEXT_PP_IF_EXISTS(2),
+									PG_GET_COLLATION(),
 									true, false, true);
 	while (splitctx->next_match <= splitctx->nmatches)

--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -73,6 +73,7 @@ typedef struct
 #define  REG_USHORTEST		 020000
 	int			re_csize;		/* sizeof(character) */
 	char	   *re_endp;		/* backward compatibility kludge */
+	Oid			re_collation;	/* Collation that defines LC_CTYPE behavior */
 	/* the rest is opaque pointers to hidden innards */
 	char	   *re_guts;		/* `char *' is more portable than `void *' */
 	char	   *re_fns;
@@ -161,9 +162,10 @@ typedef struct
 /*
 * the prototypes for exported functions
 */
-extern int	pg_regcomp(regex_t *, const pg_wchar *, size_t, int);
+extern int	pg_regcomp(regex_t *, const pg_wchar *, size_t, int, Oid);
 extern int	pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int);
 extern void pg_regfree(regex_t *);
 extern size_t pg_regerror(int, const regex_t *, char *, size_t);
+extern void pg_set_regex_collation(Oid collation);
 #endif   /* _REGEX_H_ */
--- a/src/test/regress/expected/collate.linux.utf8.out
+++ b/src/test/regress/expected/collate.linux.utf8.out
@@ -319,6 +319,80 @@ SELECT relname FROM pg_class WHERE relname ILIKE 'abc%';
 ---------
 (0 rows)
+-- regular expressions
+SELECT * FROM collate_test1 WHERE b ~ '^abc$';
+ a |  b  
+---+-----
+ 1 | abc
+(1 row)
+SELECT * FROM collate_test1 WHERE b ~ '^abc';
+ a |  b  
+---+-----
+ 1 | abc
+(1 row)
+SELECT * FROM collate_test1 WHERE b ~ 'bc';
+ a |  b  
+---+-----
+ 1 | abc
+ 2 | äbc
+ 3 | bbc
+(3 rows)
+SELECT * FROM collate_test1 WHERE b ~* '^abc$';
+ a |  b  
+---+-----
+ 1 | abc
+ 4 | ABC
+(2 rows)
+SELECT * FROM collate_test1 WHERE b ~* '^abc';
+ a |  b  
+---+-----
+ 1 | abc
+ 4 | ABC
+(2 rows)
+SELECT * FROM collate_test1 WHERE b ~* 'bc';
+ a |  b  
+---+-----
+ 1 | abc
+ 2 | äbc
+ 3 | bbc
+ 4 | ABC
+(4 rows)
+SELECT 'Türkiye' COLLATE "en_US" ~* 'KI' AS "true";
+ true 
+------
+ t
+(1 row)
+SELECT 'Türkiye' COLLATE "tr_TR" ~* 'KI' AS "false";
+ false 
+-------
+ f
+(1 row)
+SELECT 'bıt' ~* 'BIT' COLLATE "en_US" AS "false";
+ false 
+-------
+ f
+(1 row)
+SELECT 'bıt' ~* 'BIT' COLLATE "tr_TR" AS "true";
+ true 
+------
+ t
+(1 row)
+-- The following actually exercises the selectivity estimation for ~*.
+SELECT relname FROM pg_class WHERE relname ~* '^abc';
+ relname 
+---------
+(0 rows)
 -- to_char
 SET lc_time TO 'tr_TR';
 SELECT to_char(date '2010-04-01', 'DD TMMON YYYY');

--- a/src/test/regress/sql/collate.linux.utf8.sql
+++ b/src/test/regress/sql/collate.linux.utf8.sql
@@ -124,6 +124,24 @@ SELECT 'bıt' ILIKE 'BIT' COLLATE "tr_TR" AS "true";
 -- The following actually exercises the selectivity estimation for ILIKE.
 SELECT relname FROM pg_class WHERE relname ILIKE 'abc%';
+-- regular expressions
+SELECT * FROM collate_test1 WHERE b ~ '^abc$';
+SELECT * FROM collate_test1 WHERE b ~ '^abc';
+SELECT * FROM collate_test1 WHERE b ~ 'bc';
+SELECT * FROM collate_test1 WHERE b ~* '^abc$';
+SELECT * FROM collate_test1 WHERE b ~* '^abc';
+SELECT * FROM collate_test1 WHERE b ~* 'bc';
+SELECT 'Türkiye' COLLATE "en_US" ~* 'KI' AS "true";
+SELECT 'Türkiye' COLLATE "tr_TR" ~* 'KI' AS "false";
+SELECT 'bıt' ~* 'BIT' COLLATE "en_US" AS "false";
+SELECT 'bıt' ~* 'BIT' COLLATE "tr_TR" AS "true";
+-- The following actually exercises the selectivity estimation for ~*.
+SELECT relname FROM pg_class WHERE relname ~* '^abc';
 -- to_char