Implement regexp_match(), a simplified alternative to regexp_matches().

regexp_match() is like regexp_matches(), but it disallows the 'g' flag and in consequence does not need to return a set. Instead, it returns a simple text array value, or NULL if there's no match. Previously people usually got that behavior with a sub-select, but this way is considerably more efficient.

Documentation adjusted so that regexp_match() is presented first and then regexp_matches() is introduced as a more complicated version. This is a bit historically revisionist but seems pedagogically better.

Still TODO: extend contrib/citext to support this function.

Emre Hasegeli, reviewed by David Johnston

Discussion: <CAE2gYzy42sna2ME_e3y1KLQ-4UBrB-eVF0SWn8QG39sQSeVhEw@mail.gmail.com>

Implement regexp_match(), a simplified alternative to regexp_matches().
regexp_match() is like regexp_matches(), but it disallows the 'g' flag and in consequence does not need to return a set. Instead, it returns a simple text array value, or NULL if there's no match. Previously people usually got that behavior with a sub-select, but this way is considerably more efficient.

Documentation adjusted so that regexp_match() is presented first and then regexp_matches() is introduced as a more complicated version. This is a bit historically revisionist but seems pedagogically better.

Still TODO: extend contrib/citext to support this function.

Emre Hasegeli, reviewed by David Johnston

Discussion: <CAE2gYzy42sna2ME_e3y1KLQ-4UBrB-eVF0SWn8QG39sQSeVhEw@mail.gmail.com>
cf9b0fea · Tom Lane · 2d7e5910 · cf9b0fea · cf9b0fea · cf9b0fea
Commit cf9b0fea authored Aug 17, 2016 by Tom Lane
9 changed files
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
--- a/src/backend/catalog/information_schema.sql
+++ b/src/backend/catalog/information_schema.sql
@@ -2068,7 +2068,7 @@ CREATE VIEW triggers AS
           -- XXX strange hacks follow
           CAST(
             CASE WHEN pg_has_role(c.relowner, 'USAGE')
-               THEN (SELECT m[1] FROM regexp_matches(pg_get_triggerdef(t.oid), E'.{35,} WHEN \\((.+)\\) EXECUTE PROCEDURE') AS rm(m) LIMIT 1)
+               THEN (regexp_match(pg_get_triggerdef(t.oid), E'.{35,} WHEN \\((.+)\\) EXECUTE PROCEDURE'))[1]
               ELSE null END
             AS character_data) AS action_condition,
           CAST(

--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -47,7 +47,7 @@ typedef struct pg_re_flags
 	bool		glob;			/* do it globally (for each occurrence) */
 } pg_re_flags;

-/* cross-call state for regexp_matches(), also regexp_split() */
+/* cross-call state for regexp_match and regexp_split functions */
 typedef struct regexp_matches_ctx
 {
 	text	   *orig_str;		/* data string in original TEXT form */
@@ -57,7 +57,7 @@ typedef struct regexp_matches_ctx
 	/* so the number of entries in match_locs is nmatches * npatterns * 2 */
 	int		   *match_locs;		/* 0-based character indexes */
 	int			next_match;		/* 0-based index of next match to process */
-	/* workspace for build_regexp_matches_result() */
+	/* workspace for build_regexp_match_result() */
 	Datum	   *elems;			/* has npatterns elements */
 	bool	   *nulls;			/* has npatterns elements */
 } regexp_matches_ctx;
@@ -107,13 +107,12 @@ static cached_re_str re_array[MAX_CACHED_RES];	/* cached re's */

 /* Local functions */
 static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
-					 text *flags,
+					 pg_re_flags *flags,
 					 Oid collation,
-					 bool force_glob,
 					 bool use_subpatterns,
 					 bool ignore_degenerate);
 static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
-static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
+static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
 static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);


@@ -350,7 +349,7 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,


 /*
- * parse_re_flags - parse the options argument of regexp_matches and friends
+ * parse_re_flags - parse the options argument of regexp_match and friends
 *
 *	flags --- output argument, filled with desired options
 *	opts --- TEXT object, or NULL for defaults
@@ -840,9 +839,53 @@ similar_escape(PG_FUNCTION_ARGS)
 	PG_RETURN_TEXT_P(result);
 }

+/*
+ * regexp_match()
+ *		Return text array of first match's substring(s), or NULL if no match.
+ */
+Datum
+regexp_match(PG_FUNCTION_ARGS)
+{
+	text	   *orig_str = PG_GETARG_TEXT_PP(0);
+	text	   *pattern = PG_GETARG_TEXT_PP(1);
+	text	   *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
+	pg_re_flags re_flags;
+	regexp_matches_ctx *matchctx;
+
+	/* Determine options (parse_re_flags accepts NULL opts for defaults) */
+	parse_re_flags(&re_flags, flags);
+	/* User mustn't specify 'g': only a single match is ever returned here */
+	if (re_flags.glob)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("regexp_match does not support the global option"),
+				 errhint("Use the regexp_matches function instead.")));
+
+	matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
+									PG_GET_COLLATION(), true, false);
+
+	if (matchctx->nmatches == 0)
+		PG_RETURN_NULL();
+
+	Assert(matchctx->nmatches == 1);	/* non-glob search stops after one match */
+
+	/* Create workspace (npatterns entries each) for build_regexp_match_result */
+	matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
+	matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
+
+	PG_RETURN_DATUM(PointerGetDatum(build_regexp_match_result(matchctx)));
+}
+
+/* This is separate to keep the opr_sanity regression test from complaining */
+Datum
+regexp_match_no_flags(PG_FUNCTION_ARGS)
+{
+	return regexp_match(fcinfo);	/* two-argument form: no flags argument */
+}
+
 /*
 * regexp_matches()
- *		Return a table of matches of a pattern within a string.
+ *		Return a table of all matches of a pattern within a string.
 */
 Datum
 regexp_matches(PG_FUNCTION_ARGS)
@@ -854,18 +897,22 @@ regexp_matches(PG_FUNCTION_ARGS)
 	{
 		text	   *pattern = PG_GETARG_TEXT_PP(1);
 		text	   *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
+		pg_re_flags re_flags;
 		MemoryContext oldcontext;

 		funcctx = SRF_FIRSTCALL_INIT();
 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

+		/* Determine options */
+		parse_re_flags(&re_flags, flags);
+
 		/* be sure to copy the input string into the multi-call ctx */
 		matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
-										flags,
+										&re_flags,
 										PG_GET_COLLATION(),
-										false, true, false);
+										true, false);

-		/* Pre-create workspace that build_regexp_matches_result needs */
+		/* Pre-create workspace that build_regexp_match_result needs */
 		matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
 		matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);

@@ -880,7 +927,7 @@ regexp_matches(PG_FUNCTION_ARGS)
 	{
 		ArrayType  *result_ary;

-		result_ary = build_regexp_matches_result(matchctx);
+		result_ary = build_regexp_match_result(matchctx);
 		matchctx->next_match++;
 		SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
 	}
@@ -899,28 +946,27 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
 }

 /*
- * setup_regexp_matches --- do the initial matching for regexp_matches()
- *		or regexp_split()
+ * setup_regexp_matches --- do the initial matching for regexp_match
+ *		and regexp_split functions
 *
 * To avoid having to re-find the compiled pattern on each call, we do
 * all the matching in one swoop.  The returned regexp_matches_ctx contains
 * the locations of all the substrings matching the pattern.
 *
- * The three bool parameters have only two patterns (one for each caller)
- * but it seems clearer to distinguish the functionality this way than to
- * key it all off one "is_split" flag.
+ * The two bool parameters have only two patterns (one for matching, one for
+ * splitting) but it seems clearer to distinguish the functionality this way
+ * than to key it all off one "is_split" flag.
 */
 static regexp_matches_ctx *
-setup_regexp_matches(text *orig_str, text *pattern, text *flags,
+setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 					 Oid collation,
-					 bool force_glob, bool use_subpatterns,
+					 bool use_subpatterns,
 					 bool ignore_degenerate)
 {
 	regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
 	int			orig_len;
 	pg_wchar   *wide_str;
 	int			wide_len;
-	pg_re_flags re_flags;
 	regex_t    *cpattern;
 	regmatch_t *pmatch;
 	int			pmatch_len;
@@ -937,21 +983,8 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
 	wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
 	wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);

-	/* determine options */
-	parse_re_flags(&re_flags, flags);
-	if (force_glob)
-	{
-		/* user mustn't specify 'g' for regexp_split */
-		if (re_flags.glob)
-			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				 errmsg("regexp_split does not support the global option")));
-		/* but we find all the matches anyway */
-		re_flags.glob = true;
-	}
-
 	/* set up the compiled pattern */
-	cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation);
+	cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);

 	/* do we want to remember subpatterns? */
 	if (use_subpatterns && cpattern->re_nsub > 0)
@@ -970,7 +1003,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
 	pmatch = palloc(sizeof(regmatch_t) * pmatch_len);

 	/* the real output space (grown dynamically if needed) */
-	array_len = re_flags.glob ? 256 : 32;
+	array_len = re_flags->glob ? 256 : 32;
 	matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
 	array_idx = 0;

@@ -1018,7 +1051,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
 		prev_match_end = pmatch[0].rm_eo;

 		/* if not glob, stop after one match */
-		if (!re_flags.glob)
+		if (!re_flags->glob)
 			break;

 		/*
@@ -1057,10 +1090,10 @@ cleanup_regexp_matches(regexp_matches_ctx *matchctx)
 }

 /*
- * build_regexp_matches_result - build output array for current match
+ * build_regexp_match_result - build output array for current match
 */
 static ArrayType *
-build_regexp_matches_result(regexp_matches_ctx *matchctx)
+build_regexp_match_result(regexp_matches_ctx *matchctx)
 {
 	Datum	   *elems = matchctx->elems;
 	bool	   *nulls = matchctx->nulls;
@@ -1114,16 +1147,27 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
 	{
 		text	   *pattern = PG_GETARG_TEXT_PP(1);
 		text	   *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
+		pg_re_flags re_flags;
 		MemoryContext oldcontext;

 		funcctx = SRF_FIRSTCALL_INIT();
 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

+		/* Determine options */
+		parse_re_flags(&re_flags, flags);
+		/* User mustn't specify 'g' */
+		if (re_flags.glob)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("regexp_split_to_table does not support the global option")));
+		/* But we find all the matches anyway */
+		re_flags.glob = true;
+
 		/* be sure to copy the input string into the multi-call ctx */
 		splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
-										flags,
+										&re_flags,
 										PG_GET_COLLATION(),
-										true, false, true);
+										false, true);

 		MemoryContextSwitchTo(oldcontext);
 		funcctx->user_fctx = (void *) splitctx;
@@ -1162,13 +1206,24 @@ Datum
 regexp_split_to_array(PG_FUNCTION_ARGS)
 {
 	ArrayBuildState *astate = NULL;
+	pg_re_flags re_flags;
 	regexp_matches_ctx *splitctx;

+	/* Determine options */
+	parse_re_flags(&re_flags, PG_GETARG_TEXT_PP_IF_EXISTS(2));
+	/* User mustn't specify 'g' */
+	if (re_flags.glob)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+		errmsg("regexp_split_to_array does not support the global option")));
+	/* But we find all the matches anyway */
+	re_flags.glob = true;
+
 	splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
 									PG_GETARG_TEXT_PP(1),
-									PG_GETARG_TEXT_PP_IF_EXISTS(2),
+									&re_flags,
 									PG_GET_COLLATION(),
-									true, false, true);
+									false, true);

 	while (splitctx->next_match <= splitctx->nmatches)
 	{

--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
 */

 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	201608161
+#define CATALOG_VERSION_NO	201608171

 #endif
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -1912,10 +1912,14 @@ DATA(insert OID =  2284 ( regexp_replace	   PGNSP PGUID 12 1 0 0 0 f f f f t f i
 DESCR("replace text using regexp");
 DATA(insert OID =  2285 ( regexp_replace	   PGNSP PGUID 12 1 0 0 0 f f f f t f i s 4 0 25 "25 25 25 25" _null_ _null_ _null_ _null_ _null_ textregexreplace _null_ _null_ _null_ ));
 DESCR("replace text using regexp");
+DATA(insert OID =  3396 ( regexp_match	   PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 1009 "25 25" _null_ _null_ _null_ _null_ _null_ regexp_match_no_flags _null_ _null_ _null_ ));
+DESCR("find first match for regexp");
+DATA(insert OID =  3397 ( regexp_match	   PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 1009 "25 25 25" _null_ _null_ _null_ _null_ _null_ regexp_match _null_ _null_ _null_ ));
+DESCR("find first match for regexp");
 DATA(insert OID =  2763 ( regexp_matches   PGNSP PGUID 12 1 1 0 0 f f f f t t i s 2 0 1009 "25 25" _null_ _null_ _null_ _null_ _null_ regexp_matches_no_flags _null_ _null_ _null_ ));
-DESCR("find all match groups for regexp");
+DESCR("find match(es) for regexp");
 DATA(insert OID =  2764 ( regexp_matches   PGNSP PGUID 12 1 10 0 0 f f f f t t i s 3 0 1009 "25 25 25" _null_ _null_ _null_ _null_ _null_ regexp_matches _null_ _null_ _null_ ));
-DESCR("find all match groups for regexp");
+DESCR("find match(es) for regexp");
 DATA(insert OID =  2088 ( split_part   PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 25 "25 25 23" _null_ _null_ _null_ _null_ _null_	split_text _null_ _null_ _null_ ));
 DESCR("split string by field_sep and return field_num");
 DATA(insert OID =  2765 ( regexp_split_to_table PGNSP PGUID 12 1 1000 0 0 f f f f t t i s 2 0 25 "25 25" _null_ _null_ _null_ _null_ _null_ regexp_split_to_table_no_flags _null_ _null_ _null_ ));

--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -628,6 +628,8 @@ extern Datum textregexsubstr(PG_FUNCTION_ARGS);
 extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS);
 extern Datum textregexreplace(PG_FUNCTION_ARGS);
 extern Datum similar_escape(PG_FUNCTION_ARGS);
+extern Datum regexp_match(PG_FUNCTION_ARGS);
+extern Datum regexp_match_no_flags(PG_FUNCTION_ARGS);
 extern Datum regexp_matches(PG_FUNCTION_ARGS);
 extern Datum regexp_matches_no_flags(PG_FUNCTION_ARGS);
 extern Datum regexp_split_to_table(PG_FUNCTION_ARGS);

--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -90,6 +90,34 @@ select substring('a' from '((a)+)');
 a
 (1 row)

+-- Test regexp_match()
+select regexp_match('abc', '');
+ regexp_match 
+--------------
+ {""}
+(1 row)
+
+select regexp_match('abc', 'bc');
+ regexp_match 
+--------------
+ {bc}
+(1 row)
+
+select regexp_match('abc', 'd') is null;
+ ?column? 
+----------
+ t
+(1 row)
+
+select regexp_match('abc', '(B)(c)', 'i');
+ regexp_match 
+--------------
+ {b,c}
+(1 row)
+
+select regexp_match('abc', 'Bd', 'ig'); -- error
+ERROR:  regexp_match does not support the global option
+HINT:  Use the regexp_matches function instead.
 -- Test lookahead constraints
 select regexp_matches('ab', 'a(?=b)b*');
 regexp_matches 

--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -681,9 +681,9 @@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e',
 ERROR:  invalid regexp option: "z"
 -- global option meaningless for regexp_split
 SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g') AS foo;
-ERROR:  regexp_split does not support the global option
+ERROR:  regexp_split_to_table does not support the global option
 SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g');
-ERROR:  regexp_split does not support the global option
+ERROR:  regexp_split_to_array does not support the global option
 -- change NULL-display back
 \pset null ''
 -- E021-11 position expression

--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -25,6 +25,13 @@ select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)');
 select substring('a' from '((a))+');
 select substring('a' from '((a)+)');

+-- Test regexp_match()
+select regexp_match('abc', '');
+select regexp_match('abc', 'bc');
+select regexp_match('abc', 'd') is null;
+select regexp_match('abc', '(B)(c)', 'i');
+select regexp_match('abc', 'Bd', 'ig'); -- error
+
 -- Test lookahead constraints
 select regexp_matches('ab', 'a(?=b)b*');
 select regexp_matches('a', 'a(?=b)b*');