Fix potential infinite loop in regular expression execution.

In cfindloop(), if the initial call to shortest() reports that a zero-length match is possible at the current search start point, but then it is unable to construct any actual match to that, it'll just loop around with the same start point, and thus make no progress. We need to force the start point to be advanced. This is safe because the loop over "begin" points has already tried and failed to match starting at "close", so there is surely no need to try that again. This bug was introduced in commit e2bd9049, wherein we allowed continued searching after we'd run out of match possibilities, but evidently failed to think hard enough about exactly where we needed to search next. Because of the way this code works, such a match failure is only possible in the presence of backrefs --- otherwise, shortest()'s judgment that a match is possible should always be correct. That probably explains how come the bug has escaped detection for several years. The actual fix is a one-liner, but I took the trouble to add/improve some comments related to the loop logic. After fixing that, the submitted test case "()*\1" didn't loop anymore. But it reported failure, though it seems like it ought to match a zero-length string; both Tcl and Perl think it does. That seems to be from overenthusiastic optimization on my part when I rewrote the iteration match logic in commit 173e29aa: we can't just "declare victory" for a zero-length match without bothering to set match data for capturing parens inside the iterator node. Per fuzz testing by Greg Stark. The first part of this is a bug in all supported branches, and the second part is a bug since 9.2 where the iteration rewrite happened.

Fix potential infinite loop in regular expression execution.
In cfindloop(), if the initial call to shortest() reports that a zero-length match is possible at the current search start point, but then it is unable to construct any actual match to that, it'll just loop around with the same start point, and thus make no progress. We need to force the start point to be advanced. This is safe because the loop over "begin" points has already tried and failed to match starting at "close", so there is surely no need to try that again. This bug was introduced in commit e2bd9049, wherein we allowed continued searching after we'd run out of match possibilities, but evidently failed to think hard enough about exactly where we needed to search next. Because of the way this code works, such a match failure is only possible in the presence of backrefs --- otherwise, shortest()'s judgment that a match is possible should always be correct. That probably explains how come the bug has escaped detection for several years. The actual fix is a one-liner, but I took the trouble to add/improve some comments related to the loop logic. After fixing that, the submitted test case "()*\1" didn't loop anymore. But it reported failure, though it seems like it ought to match a zero-length string; both Tcl and Perl think it does. That seems to be from overenthusiastic optimization on my part when I rewrote the iteration match logic in commit 173e29aa: we can't just "declare victory" for a zero-length match without bothering to set match data for capturing parens inside the iterator node. Per fuzz testing by Greg Stark. The first part of this is a bug in all supported branches, and the second part is a bug since 9.2 where the iteration rewrite happened.
f2c4ffc3 · Tom Lane · 9fe8fe9c · f2c4ffc3 · f2c4ffc3 · f2c4ffc3
Commit f2c4ffc3 authored Oct 02, 2015 by Tom Lane
Showing with 68 additions and 13 deletions

src/backend/regex/regexec.c src/backend/regex/regexec.c +35 -13

src/test/regress/expected/regex.out src/test/regress/expected/regex.out +26 -0

src/test/regress/sql/regex.sql src/test/regress/sql/regex.sql +7 -0

No files found.
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -448,6 +448,7 @@ cfindloop(struct vars * v,
 	close = v->search_start;
 	do
 	{
+		/* Search with the search RE for match range at/beyond "close" */
 		MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
 		close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
 		if (ISERR())
@@ -456,10 +457,11 @@ cfindloop(struct vars * v,
 			return v->err;
 		}
 		if (close == NULL)
-			break;				/* NOTE BREAK */
+			break;				/* no more possible match anywhere */
 		assert(cold != NULL);
 		open = cold;
 		cold = NULL;
+		/* Search for matches starting between "open" and "close" inclusive */
 		MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
 		for (begin = open; begin <= close; begin++)
 		{
@@ -468,6 +470,7 @@ cfindloop(struct vars * v,
 			estop = v->stop;
 			for (;;)
 			{
+				/* Here we use the top node's detailed RE */
 				if (shorter)
 					end = shortest(v, d, begin, estart,
 								   estop, (chr **) NULL, &hitend);
@@ -482,8 +485,9 @@ cfindloop(struct vars * v,
 				if (hitend && cold == NULL)
 					cold = begin;
 				if (end == NULL)
-					break;		/* NOTE BREAK OUT */
+					break;		/* no match with this begin point, try next */
 				MDEBUG(("tentative end %ld\n", LOFF(end)));
+				/* Dissect the potential match to see if it really matches */
 				zapallsubs(v->pmatch, v->nmatch);
 				er = cdissect(v, v->g->tree, begin, end);
 				if (er == REG_OKAY)
@@ -502,21 +506,28 @@ cfindloop(struct vars * v,
 					*coldp = cold;
 					return er;
 				}
-				/* try next shorter/longer match with same begin point */
+				/* Try next longer/shorter match with same begin point */
 				if (shorter)
 				{
 					if (end == estop)
-						break;	/* NOTE BREAK OUT */
+						break;	/* no more, so try next begin point */
 					estart = end + 1;
 				}
 				else
 				{
 					if (end == begin)
-						break;	/* NOTE BREAK OUT */
+						break;	/* no more, so try next begin point */
 					estop = end - 1;
 				}
 			}					/* end loop over endpoint positions */
 		}						/* end loop over beginning positions */
+		/*
+		 * If we get here, there is no possible match starting at or before
+		 * "close", so consider matches beyond that.  We'll do a fresh search
+		 * with the search RE to find a new promising match range.
+		 */
+		close++;
 	} while (close < v->stop);
 	*coldp = cold;
@@ -963,17 +974,17 @@ citerdissect(struct vars * v,
 	assert(begin <= end);
 	/*
-	 * If zero matches are allowed, and target string is empty, just declare
+	 * For the moment, assume the minimum number of matches is 1.  If zero
-	 * victory.  OTOH, if target string isn't empty, zero matches can't work
+	 * matches are allowed, and the target string is empty, we are allowed to
-	 * so we pretend the min is 1.
+	 * match regardless of the contents of the iter node --- but we would
+	 * prefer to match once, so that capturing parens get set.  (An example of
+	 * the concern here is a pattern like "()*\1", which historically this
+	 * code has allowed to succeed.)  Therefore, we deal with the zero-matches
+	 * case at the bottom, after failing to find any other way to match.
 	 */
 	min_matches = t->min;
 	if (min_matches <= 0)
-	{
-		if (begin == end)
-			return REG_OKAY;
 		min_matches = 1;
-	}
 	/*
 	 * We need workspace to track the endpoints of each sub-match.  Normally
@@ -1123,8 +1134,19 @@ backtrack:
 	}
 	/* all possibilities exhausted */
-	MDEBUG(("%d failed\n", t->id));
 	FREE(endpts);
+	/*
+	 * Now consider the possibility that we can match to a zero-length string
+	 * by using zero repetitions.
+	 */
+	if (t->min == 0 && begin == end)
+	{
+		MDEBUG(("%d allowing zero matches\n", t->id));
+		return REG_OKAY;
+	}
+	MDEBUG(("%d failed\n", t->id));
 	return REG_NOMATCH;
 }

--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -196,3 +196,29 @@ select regexp_matches('foo/bar/baz',
 {foo,bar,baz}
 (1 row)
+-- Test for infinite loop in cfindloop with zero-length possible match
+-- but no actual match (can only happen in the presence of backrefs)
+select 'a' ~ '$()|^\1';
+ ?column? 
+----------
+ f
+(1 row)
+select 'a' ~ '.. ()|\1';
+ ?column? 
+----------
+ f
+(1 row)
+select 'a' ~ '()*\1';
+ ?column? 
+----------
+ t
+(1 row)
+select 'a' ~ '()+\1';
+ ?column? 
+----------
+ t
+(1 row)
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -50,3 +50,10 @@ select regexp_matches('Programmer', '(\w)(.*?\1)', 'g');
 -- Test for proper matching of non-greedy iteration (bug #11478)
 select regexp_matches('foo/bar/baz',
                      '^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', '');
+-- Test for infinite loop in cfindloop with zero-length possible match
+-- but no actual match (can only happen in the presence of backrefs)
+select 'a' ~ '$()|^\1';
+select 'a' ~ '.. ()|\1';
+select 'a' ~ '()*\1';
+select 'a' ~ '()+\1';