Fix performance issue in new regex match-all detection code.

Commit 824bf719 introduced a new search of the NFAs generated by regex compilation. I failed to think hard about the performance characteristics of that search, with the predictable outcome that it's bad: weird regexes can trigger exponential search time. Worse, there's no check-for-interrupt in that code, so you can't even cancel the query if this happens. Fix by introducing memoization of the search results, so that any one NFA state need be examined in detail just once. This potentially uses a lot of memory, but we can bound the memory usage by putting a limit on the number of states for which we'll try to prove match-all-ness. That is sane because we already have a limit (DUPINF) on the maximum finite string length that a matchall regex can match; and patterns that involve much more than DUPINF states would probably exceed that limit anyway. Also, rearrange the logic so that we check the basic is-the-graph-all-RAINBOW-arcs property before we start the recursive search to determine path lengths. This will ensure that we fall out quickly whenever the NFA couldn't possibly be matchall. Also stick in a check-for-interrupt, just in case these measures don't completely eliminate the risk of slowness. Discussion: https://postgr.es/m/3483895.1619898362@sss.pgh.pa.us

Fix performance issue in new regex match-all detection code.
Commit 824bf719 introduced a new search of the NFAs generated by regex compilation. I failed to think hard about the performance characteristics of that search, with the predictable outcome that it's bad: weird regexes can trigger exponential search time. Worse, there's no check-for-interrupt in that code, so you can't even cancel the query if this happens. Fix by introducing memoization of the search results, so that any one NFA state need be examined in detail just once. This potentially uses a lot of memory, but we can bound the memory usage by putting a limit on the number of states for which we'll try to prove match-all-ness. That is sane because we already have a limit (DUPINF) on the maximum finite string length that a matchall regex can match; and patterns that involve much more than DUPINF states would probably exceed that limit anyway. Also, rearrange the logic so that we check the basic is-the-graph-all-RAINBOW-arcs property before we start the recursive search to determine path lengths. This will ensure that we fall out quickly whenever the NFA couldn't possibly be matchall. Also stick in a check-for-interrupt, just in case these measures don't completely eliminate the risk of slowness. Discussion: https://postgr.es/m/3483895.1619898362@sss.pgh.pa.us
f68970e3 · Tom Lane · b94409a0 · f68970e3 · f68970e3 · f68970e3
Commit f68970e3 authored May 03, 2021 by Tom Lane
4 changed files
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -182,8 +182,7 @@ static void markreachable(struct nfa *, struct state *, struct state *, struct s
 static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
 static long analyze(struct nfa *);
 static void checkmatchall(struct nfa *);
-static bool checkmatchall_recurse(struct nfa *, struct state *,
-								  bool, int, bool *);
+static bool checkmatchall_recurse(struct nfa *, struct state *, bool **);
 static bool check_out_colors_match(struct state *, color, color);
 static bool check_in_colors_match(struct state *, color, color);
 static void compact(struct nfa *, struct cnfa *);

--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -567,6 +567,43 @@ select 'a' ~ '()+\1';
 t
 (1 row)

+-- Add coverage for some cases in checkmatchall
+select regexp_match('xy', '.|...');
+ regexp_match 
+--------------
+ {x}
+(1 row)
+
+select regexp_match('xyz', '.|...');
+ regexp_match 
+--------------
+ {xyz}
+(1 row)
+
+select regexp_match('xy', '.*');
+ regexp_match 
+--------------
+ {xy}
+(1 row)
+
+select regexp_match('fooba', '(?:..)*');
+ regexp_match 
+--------------
+ {foob}
+(1 row)
+
+select regexp_match('xyz', repeat('.', 260));
+ regexp_match 
+--------------
+ 
+(1 row)
+
+select regexp_match('foo', '(?:.|){99}');
+ regexp_match 
+--------------
+ {foo}
+(1 row)
+
 -- Error conditions
 select 'xyz' ~ 'x(\w)(?=\1)';  -- no backrefs in LACONs
 ERROR:  invalid regular expression: invalid backreference number

--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -135,6 +135,14 @@ select 'a' ~ '.. ()|\1';
 select 'a' ~ '()*\1';
 select 'a' ~ '()+\1';

+-- Add coverage for some cases in checkmatchall
+select regexp_match('xy', '.|...');
+select regexp_match('xyz', '.|...');
+select regexp_match('xy', '.*');
+select regexp_match('fooba', '(?:..)*');
+select regexp_match('xyz', repeat('.', 260));
+select regexp_match('foo', '(?:.|){99}');
+
 -- Error conditions
 select 'xyz' ~ 'x(\w)(?=\1)';  -- no backrefs in LACONs
 select 'xyz' ~ 'x(\w)(?=(\1))';