Commit cf9b0fea authored by Tom Lane

Implement regexp_match(), a simplified alternative to regexp_matches().

regexp_match() is like regexp_matches(), but it disallows the 'g' flag
and in consequence does not need to return a set.  Instead, it returns
a simple text array value, or NULL if there's no match.  Previously people
usually got that behavior with a sub-select, but this way is considerably
more efficient.

Documentation adjusted so that regexp_match() is presented first and then
regexp_matches() is introduced as a more complicated version.  This is
a bit historically revisionist but seems pedagogically better.

Still TODO: extend contrib/citext to support this function.

Emre Hasegeli, reviewed by David Johnston

Discussion: <CAE2gYzy42sna2ME_e3y1KLQ-4UBrB-eVF0SWn8QG39sQSeVhEw@mail.gmail.com>
parent 2d7e5910
This diff is collapsed.
......@@ -2068,7 +2068,7 @@ CREATE VIEW triggers AS
-- XXX strange hacks follow
CAST(
CASE WHEN pg_has_role(c.relowner, 'USAGE')
THEN (SELECT m[1] FROM regexp_matches(pg_get_triggerdef(t.oid), E'.{35,} WHEN \\((.+)\\) EXECUTE PROCEDURE') AS rm(m) LIMIT 1)
THEN (regexp_match(pg_get_triggerdef(t.oid), E'.{35,} WHEN \\((.+)\\) EXECUTE PROCEDURE'))[1]
ELSE null END
AS character_data) AS action_condition,
CAST(
......
......@@ -47,7 +47,7 @@ typedef struct pg_re_flags
bool glob; /* do it globally (for each occurrence) */
} pg_re_flags;
/* cross-call state for regexp_matches(), also regexp_split() */
/* cross-call state for regexp_match and regexp_split functions */
typedef struct regexp_matches_ctx
{
text *orig_str; /* data string in original TEXT form */
......@@ -57,7 +57,7 @@ typedef struct regexp_matches_ctx
/* so the number of entries in match_locs is nmatches * npatterns * 2 */
int *match_locs; /* 0-based character indexes */
int next_match; /* 0-based index of next match to process */
/* workspace for build_regexp_matches_result() */
/* workspace for build_regexp_match_result() */
Datum *elems; /* has npatterns elements */
bool *nulls; /* has npatterns elements */
} regexp_matches_ctx;
......@@ -107,13 +107,12 @@ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */
/* Local functions */
static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
text *flags,
pg_re_flags *flags,
Oid collation,
bool force_glob,
bool use_subpatterns,
bool ignore_degenerate);
static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
......@@ -350,7 +349,7 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
/*
* parse_re_flags - parse the options argument of regexp_matches and friends
* parse_re_flags - parse the options argument of regexp_match and friends
*
* flags --- output argument, filled with desired options
* opts --- TEXT object, or NULL for defaults
......@@ -840,9 +839,53 @@ similar_escape(PG_FUNCTION_ARGS)
PG_RETURN_TEXT_P(result);
}
/*
 * regexp_match()
 *		Locate the first match of a pattern within a string.
 *
 * The result is a text array built from that single match, or SQL NULL if
 * the pattern does not match at all.  The 'g' (global) option is rejected
 * with an error whose hint points the user at regexp_matches().
 */
Datum
regexp_match(PG_FUNCTION_ARGS)
{
	text	   *subject = PG_GETARG_TEXT_PP(0);
	text	   *regex = PG_GETARG_TEXT_PP(1);
	text	   *opts = PG_GETARG_TEXT_PP_IF_EXISTS(2);
	pg_re_flags parsed_opts;
	regexp_matches_ctx *ctx;
	Datum		result;

	/* Parse the option string; asking for global matching is an error here */
	parse_re_flags(&parsed_opts, opts);
	if (parsed_opts.glob)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("regexp_match does not support the global option"),
				 errhint("Use the regexp_matches function instead.")));
	}

	/* Run the match with use_subpatterns = true, ignore_degenerate = false */
	ctx = setup_regexp_matches(subject, regex, &parsed_opts,
							   PG_GET_COLLATION(), true, false);

	/* No match at all: the function result is NULL */
	if (ctx->nmatches == 0)
	{
		PG_RETURN_NULL();
	}

	/* Since the global option is off, matching stopped after one match */
	Assert(ctx->nmatches == 1);

	/* Allocate the per-pattern workspace build_regexp_match_result needs */
	ctx->elems = (Datum *) palloc(sizeof(Datum) * ctx->npatterns);
	ctx->nulls = (bool *) palloc(sizeof(bool) * ctx->npatterns);

	result = PointerGetDatum(build_regexp_match_result(ctx));

	PG_RETURN_DATUM(result);
}
/*
 * regexp_match_no_flags()
 *		Entry point for the two-argument form of regexp_match.
 *
 * All the work is done by regexp_match(); this is a separate C function
 * only to keep the opr_sanity regression test from complaining.
 */
Datum
regexp_match_no_flags(PG_FUNCTION_ARGS)
{
	Datum		result = regexp_match(fcinfo);

	return result;
}
/*
* regexp_matches()
* Return a table of matches of a pattern within a string.
* Return a table of all matches of a pattern within a string.
*/
Datum
regexp_matches(PG_FUNCTION_ARGS)
......@@ -854,18 +897,22 @@ regexp_matches(PG_FUNCTION_ARGS)
{
text *pattern = PG_GETARG_TEXT_PP(1);
text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
pg_re_flags re_flags;
MemoryContext oldcontext;
funcctx = SRF_FIRSTCALL_INIT();
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Determine options */
parse_re_flags(&re_flags, flags);
/* be sure to copy the input string into the multi-call ctx */
matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
flags,
&re_flags,
PG_GET_COLLATION(),
false, true, false);
true, false);
/* Pre-create workspace that build_regexp_matches_result needs */
/* Pre-create workspace that build_regexp_match_result needs */
matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
......@@ -880,7 +927,7 @@ regexp_matches(PG_FUNCTION_ARGS)
{
ArrayType *result_ary;
result_ary = build_regexp_matches_result(matchctx);
result_ary = build_regexp_match_result(matchctx);
matchctx->next_match++;
SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
}
......@@ -899,28 +946,27 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
}
/*
* setup_regexp_matches --- do the initial matching for regexp_matches()
* or regexp_split()
* setup_regexp_matches --- do the initial matching for regexp_match
* and regexp_split functions
*
* To avoid having to re-find the compiled pattern on each call, we do
* all the matching in one swoop. The returned regexp_matches_ctx contains
* the locations of all the substrings matching the pattern.
*
* The three bool parameters have only two patterns (one for each caller)
* but it seems clearer to distinguish the functionality this way than to
* key it all off one "is_split" flag.
* The two bool parameters have only two patterns (one for matching, one for
* splitting) but it seems clearer to distinguish the functionality this way
* than to key it all off one "is_split" flag.
*/
static regexp_matches_ctx *
setup_regexp_matches(text *orig_str, text *pattern, text *flags,
setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
Oid collation,
bool force_glob, bool use_subpatterns,
bool use_subpatterns,
bool ignore_degenerate)
{
regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
int orig_len;
pg_wchar *wide_str;
int wide_len;
pg_re_flags re_flags;
regex_t *cpattern;
regmatch_t *pmatch;
int pmatch_len;
......@@ -937,21 +983,8 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
/* determine options */
parse_re_flags(&re_flags, flags);
if (force_glob)
{
/* user mustn't specify 'g' for regexp_split */
if (re_flags.glob)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("regexp_split does not support the global option")));
/* but we find all the matches anyway */
re_flags.glob = true;
}
/* set up the compiled pattern */
cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation);
cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
/* do we want to remember subpatterns? */
if (use_subpatterns && cpattern->re_nsub > 0)
......@@ -970,7 +1003,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
/* the real output space (grown dynamically if needed) */
array_len = re_flags.glob ? 256 : 32;
array_len = re_flags->glob ? 256 : 32;
matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
array_idx = 0;
......@@ -1018,7 +1051,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
prev_match_end = pmatch[0].rm_eo;
/* if not glob, stop after one match */
if (!re_flags.glob)
if (!re_flags->glob)
break;
/*
......@@ -1057,10 +1090,10 @@ cleanup_regexp_matches(regexp_matches_ctx *matchctx)
}
/*
* build_regexp_matches_result - build output array for current match
* build_regexp_match_result - build output array for current match
*/
static ArrayType *
build_regexp_matches_result(regexp_matches_ctx *matchctx)
build_regexp_match_result(regexp_matches_ctx *matchctx)
{
Datum *elems = matchctx->elems;
bool *nulls = matchctx->nulls;
......@@ -1114,16 +1147,27 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
{
text *pattern = PG_GETARG_TEXT_PP(1);
text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
pg_re_flags re_flags;
MemoryContext oldcontext;
funcctx = SRF_FIRSTCALL_INIT();
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Determine options */
parse_re_flags(&re_flags, flags);
/* User mustn't specify 'g' */
if (re_flags.glob)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("regexp_split_to_table does not support the global option")));
/* But we find all the matches anyway */
re_flags.glob = true;
/* be sure to copy the input string into the multi-call ctx */
splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
flags,
&re_flags,
PG_GET_COLLATION(),
true, false, true);
false, true);
MemoryContextSwitchTo(oldcontext);
funcctx->user_fctx = (void *) splitctx;
......@@ -1162,13 +1206,24 @@ Datum
regexp_split_to_array(PG_FUNCTION_ARGS)
{
ArrayBuildState *astate = NULL;
pg_re_flags re_flags;
regexp_matches_ctx *splitctx;
/* Determine options */
parse_re_flags(&re_flags, PG_GETARG_TEXT_PP_IF_EXISTS(2));
/* User mustn't specify 'g' */
if (re_flags.glob)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("regexp_split_to_array does not support the global option")));
/* But we find all the matches anyway */
re_flags.glob = true;
splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
PG_GETARG_TEXT_PP(1),
PG_GETARG_TEXT_PP_IF_EXISTS(2),
&re_flags,
PG_GET_COLLATION(),
true, false, true);
false, true);
while (splitctx->next_match <= splitctx->nmatches)
{
......
......@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 201608161
#define CATALOG_VERSION_NO 201608171
#endif
......@@ -1912,10 +1912,14 @@ DATA(insert OID = 2284 ( regexp_replace PGNSP PGUID 12 1 0 0 0 f f f f t f i
DESCR("replace text using regexp");
DATA(insert OID = 2285 ( regexp_replace PGNSP PGUID 12 1 0 0 0 f f f f t f i s 4 0 25 "25 25 25 25" _null_ _null_ _null_ _null_ _null_ textregexreplace _null_ _null_ _null_ ));
DESCR("replace text using regexp");
DATA(insert OID = 3396 ( regexp_match PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 1009 "25 25" _null_ _null_ _null_ _null_ _null_ regexp_match_no_flags _null_ _null_ _null_ ));
DESCR("find first match for regexp");
DATA(insert OID = 3397 ( regexp_match PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 1009 "25 25 25" _null_ _null_ _null_ _null_ _null_ regexp_match _null_ _null_ _null_ ));
DESCR("find first match for regexp");
DATA(insert OID = 2763 ( regexp_matches PGNSP PGUID 12 1 1 0 0 f f f f t t i s 2 0 1009 "25 25" _null_ _null_ _null_ _null_ _null_ regexp_matches_no_flags _null_ _null_ _null_ ));
DESCR("find all match groups for regexp");
DESCR("find match(es) for regexp");
DATA(insert OID = 2764 ( regexp_matches PGNSP PGUID 12 1 10 0 0 f f f f t t i s 3 0 1009 "25 25 25" _null_ _null_ _null_ _null_ _null_ regexp_matches _null_ _null_ _null_ ));
DESCR("find all match groups for regexp");
DESCR("find match(es) for regexp");
DATA(insert OID = 2088 ( split_part PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 25 "25 25 23" _null_ _null_ _null_ _null_ _null_ split_text _null_ _null_ _null_ ));
DESCR("split string by field_sep and return field_num");
DATA(insert OID = 2765 ( regexp_split_to_table PGNSP PGUID 12 1 1000 0 0 f f f f t t i s 2 0 25 "25 25" _null_ _null_ _null_ _null_ _null_ regexp_split_to_table_no_flags _null_ _null_ _null_ ));
......
......@@ -628,6 +628,8 @@ extern Datum textregexsubstr(PG_FUNCTION_ARGS);
extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS);
extern Datum textregexreplace(PG_FUNCTION_ARGS);
extern Datum similar_escape(PG_FUNCTION_ARGS);
extern Datum regexp_match(PG_FUNCTION_ARGS);
extern Datum regexp_match_no_flags(PG_FUNCTION_ARGS);
extern Datum regexp_matches(PG_FUNCTION_ARGS);
extern Datum regexp_matches_no_flags(PG_FUNCTION_ARGS);
extern Datum regexp_split_to_table(PG_FUNCTION_ARGS);
......
......@@ -90,6 +90,34 @@ select substring('a' from '((a)+)');
a
(1 row)
-- Test regexp_match()
select regexp_match('abc', '');
regexp_match
--------------
{""}
(1 row)
select regexp_match('abc', 'bc');
regexp_match
--------------
{bc}
(1 row)
select regexp_match('abc', 'd') is null;
?column?
----------
t
(1 row)
select regexp_match('abc', '(B)(c)', 'i');
regexp_match
--------------
{b,c}
(1 row)
select regexp_match('abc', 'Bd', 'ig'); -- error
ERROR: regexp_match does not support the global option
HINT: Use the regexp_matches function instead.
-- Test lookahead constraints
select regexp_matches('ab', 'a(?=b)b*');
regexp_matches
......
......@@ -681,9 +681,9 @@ SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e',
ERROR: invalid regexp option: "z"
-- global option meaningless for regexp_split
SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g') AS foo;
ERROR: regexp_split does not support the global option
ERROR: regexp_split_to_table does not support the global option
SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'g');
ERROR: regexp_split does not support the global option
ERROR: regexp_split_to_array does not support the global option
-- change NULL-display back
\pset null ''
-- E021-11 position expression
......
......@@ -25,6 +25,13 @@ select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)');
select substring('a' from '((a))+');
select substring('a' from '((a)+)');
-- Test regexp_match()
select regexp_match('abc', '');
select regexp_match('abc', 'bc');
select regexp_match('abc', 'd') is null;
select regexp_match('abc', '(B)(c)', 'i');
select regexp_match('abc', 'Bd', 'ig'); -- error
-- Test lookahead constraints
select regexp_matches('ab', 'a(?=b)b*');
select regexp_matches('a', 'a(?=b)b*');
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment