Fix the general case of quantified regex back-references.

Cases where a back-reference is part of a larger subexpression that is quantified have never worked in Spencer's regex engine, because he used a compile-time transformation that neglected the need to check the back-reference match in iterations before the last one. (That was okay for capturing parens, and we still do it if the regex has *only* capturing parens ... but it's not okay for backrefs.) To make this work properly, we have to add an "iteration" node type to the regex engine's vocabulary of sub-regex nodes. Since this is a moderately large change with a fair risk of introducing new bugs of its own, apply to HEAD only, even though it's a fix for a longstanding bug.

Fix the general case of quantified regex back-references.
Cases where a back-reference is part of a larger subexpression that is quantified have never worked in Spencer's regex engine, because he used a compile-time transformation that neglected the need to check the back-reference match in iterations before the last one. (That was okay for capturing parens, and we still do it if the regex has *only* capturing parens ... but it's not okay for backrefs.) To make this work properly, we have to add an "iteration" node type to the regex engine's vocabulary of sub-regex nodes. Since this is a moderately large change with a fair risk of introducing new bugs of its own, apply to HEAD only, even though it's a fix for a longstanding bug.
173e29aa · Tom Lane · 0c9e5d5e · 173e29aa · 173e29aa · 173e29aa
Commit 173e29aa authored Feb 24, 2012 by Tom Lane
6 changed files
--- a/src/backend/regex/README
+++ b/src/backend/regex/README
@@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s).  Leaf tree nodes are
 either plain regular expressions (which are executed as DFAs in the manner
 described above) or back-references (which try to match the input to some
 previous substring).  Non-leaf nodes are capture nodes (which save the
-location of the substring currently matching their child node) or
+location of the substring currently matching their child node),
-concatenation or alternation nodes.  At execution time, the executor
+concatenation, alternation, or iteration nodes.  At execution time, the
-recursively scans the tree.  At concatenation or alternation nodes,
+executor recursively scans the tree.  At concatenation, alternation, or
-it considers each possible alternative way of matching the input string,
+iteration nodes, it considers each possible alternative way of matching the
-ie each place where the string could be split for a concatenation, or each
+input string, that is, each place where the string could be split for a
-child node for an alternation.  It tries the next alternative if the match
+concatenation or iteration, or each child node for an alternation.  It
-fails according to the child nodes.  This is exactly the sort of
+tries the next alternative if the match fails according to the child nodes.
-backtracking search done by a traditional NFA regex engine.  If there are
+This is exactly the sort of backtracking search done by a traditional NFA
-many tree levels it can get very slow.
+regex engine.  If there are many tree levels it can get very slow.
 But all is not lost: we can still be smarter than the average pure NFA
 engine.  To do this, each subre node has an associated DFA, which

--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
 	/*----------
 	 * Prepare a general-purpose state skeleton.
 	 *
-	 *	  ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
+	 * In the no-backrefs case, we want this:
-	 *	 /											  /
-	 * [lp] ----> [s2] ----bypass---------------------
 	 *
-	 * where bypass is an empty, and prefix is some repetitions of atom
+	 * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
+	 *
+	 * where prefix is some repetitions of atom.  In the general case we need
+	 *
+	 * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
+	 *
+	 * where the iterator wraps around [begin] ---atom---> [end]
+	 *
+	 * We make the s state here for both cases; s2 is made below if needed
 	 *----------
 	 */
 	s = newstate(v->nfa);		/* first, new endpoints for the atom */
@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
 	NOERR();
 	atom->begin = s;
 	atom->end = s2;
-	s = newstate(v->nfa);		/* and spots for prefix and bypass */
+	s = newstate(v->nfa);		/* set up starting state */
-	s2 = newstate(v->nfa);
 	NOERR();
 	EMPTYARC(lp, s);
-	EMPTYARC(lp, s2);
 	NOERR();
 	/* break remaining subRE into x{...} and what follows */
@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
 	}
 	/*
-	 * It's quantifier time.  If the atom is just a BACKREF, we'll let it deal
+	 * It's quantifier time.  If the atom is just a backref, we'll let it deal
-	 * with quantifiers internally.  Otherwise, the first step is to turn
+	 * with quantifiers internally.
-	 * x{0,...} into x{1,...}|empty
 	 */
-	if (m == 0 && atomtype != BACKREF)
-	{
-		EMPTYARC(s2, atom->end);	/* the bypass */
-		assert(PREF(qprefer) != 0);
-		f = COMBINE(qprefer, atom->flags);
-		t = subre(v, '|', f, lp, atom->end);
-		NOERR();
-		t->left = atom;
-		t->right = subre(v, '|', PREF(f), s2, atom->end);
-		NOERR();
-		t->right->left = subre(v, '=', 0, s2, atom->end);
-		NOERR();
-		*atomp = t;
-		atomp = &t->left;
-		m = 1;
-	}
-	/* deal with the rest of the quantifier */
 	if (atomtype == BACKREF)
 	{
 		/* special case:  backrefs have internal quantifiers */
@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
 		atom->min = (short) m;
 		atom->max = (short) n;
 		atom->flags |= COMBINE(qprefer, atom->flags);
+		/* rest of branch can be strung starting from atom->end */
+		s2 = atom->end;
 	}
 	else if (m == 1 && n == 1)
 	{
 		/* no/vacuous quantifier:  done */
 		EMPTYARC(s, atom->begin);		/* empty prefix */
+		/* rest of branch can be strung starting from atom->end */
+		s2 = atom->end;
 	}
-	else
+	else if (m > 0 && !(atom->flags & BACKR))
 	{
 		/*
-		 * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the
+		 * If there are no backrefs involved, we can turn x{m,n} into
-		 * second x
+		 * x{m-1,n-1}x, with capturing parens in only the second x.  This
+		 * is valid because we only care about capturing matches from the
+		 * final iteration of the quantifier.  It's a win because we can
+		 * implement the backref-free left side as a plain DFA node, since
+		 * we don't really care where its submatches are.
 		 */
 		dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
 		assert(m >= 1 && m != INFINITY && n >= 1);
@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
 		NOERR();
 		t->right = atom;
 		*atomp = t;
+		/* rest of branch can be strung starting from atom->end */
+		s2 = atom->end;
+	}
+	else
+	{
+		/* general case: need an iteration node */
+		s2 = newstate(v->nfa);
+		NOERR();
+		moveouts(v->nfa, atom->end, s2);
+		NOERR();
+		dupnfa(v->nfa, atom->begin, atom->end, s, s2);
+		repeat(v, s, s2, m, n);
+		f = COMBINE(qprefer, atom->flags);
+		t = subre(v, '*', f, s, s2);
+		NOERR();
+		t->min = (short) m;
+		t->max = (short) n;
+		t->left = atom;
+		*atomp = t;
+		/* rest of branch is to be strung from iteration's end state */
 	}
 	/* and finally, look after that postponed recursion */
 	t = top->right;
 	if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
-		t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
+		t->right = parsebranch(v, stopper, type, s2, rp, 1);
 	else
 	{
-		EMPTYARC(atom->end, rp);
+		EMPTYARC(s2, rp);
-		t->right = subre(v, '=', 0, atom->end, rp);
+		t->right = subre(v, '=', 0, s2, rp);
 	}
 	assert(SEE('|') || SEE(stopper) || SEE(EOS));
 	t->flags |= COMBINE(t->flags, t->right->flags);
@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
 /*
 * repeat - replicate subNFA for quantifiers
 *
+ * The sub-NFA strung from lp to rp is modified to represent m to n
+ * repetitions of its initial contents.
+ *
 * The duplication sequences used here are chosen carefully so that any
 * pointers starting out pointing into the subexpression end up pointing into
 * the last occurrence.  (Note that it may not be strung between the same
@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
 	   int n)
 {
 #define  SOME	 2
-#define  INF 3
+#define  INF	 3
 #define  PAIR(x, y)  ((x)*4 + (y))
 #define  REDUCE(x)	 ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
 	const int	rm = REDUCE(m);
@@ -1603,7 +1619,7 @@ subre(struct vars * v,
 		v->treechain = ret;
 	}
-	assert(strchr("|.b(=", op) != NULL);
+	assert(strchr("=b|.*(", op) != NULL);
 	ret->op = op;
 	ret->flags = flags;

--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -372,10 +372,28 @@ struct cnfa
 /*
 * subexpression tree
+ *
+ * "op" is one of:
+ *		'='  plain regex without interesting substructure (implemented as DFA)
+ *		'b'  back-reference (has no substructure either)
+ *		'('  capture node: captures the match of its single child
+ *		'.'  concatenation: matches a match for left, then a match for right
+ *		'|'  alternation: matches a match for left or a match for right
+ *		'*'  iteration: matches some number of matches of its single child
+ *
+ * Note: the right child of an alternation must be another alternation or
+ * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you
+ * might expect.  This could stand to be changed.  Actually I'd rather see
+ * a single alternation node with N children, but that will take revising
+ * the representation of struct subre.
+ *
+ * Note: when a backref is directly quantified, we stick the min/max counts
+ * into the backref rather than plastering an iteration node on top.  This is
+ * for efficiency: there is no need to search for possible division points.
 */
 struct subre
 {
-	char		op;				/* '|', '.' (concat), 'b' (backref), '(', '=' */
+	char		op;				/* see type codes above */
 	char		flags;
 #define  LONGER  01				/* prefers longer match */
 #define  SHORTER 02				/* prefers shorter match */
@@ -393,8 +411,8 @@ struct subre
 #define  COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
 	short		retry;			/* index into retry memory */
 	int			subno;			/* subexpression number (for 'b' and '(') */
-	short		min;			/* min repetitions, for backref only */
+	short		min;			/* min repetitions for iteration or backref */
-	short		max;			/* max repetitions, for backref only */
+	short		max;			/* max repetitions for iteration or backref */
 	struct subre *left;			/* left child, if any (also freelist chain) */
 	struct subre *right;		/* right child, if any */
 	struct state *begin;		/* outarcs from here... */

--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -34,3 +34,40 @@ select 'b' ~ '^([bc])\1*$' as t;
 t
 (1 row)
+-- Test quantified backref within a larger expression
+select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
+ t 
+---
+ t
+(1 row)
+select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
+ f 
+---
+ f
+(1 row)
+select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
+ f 
+---
+ f
+(1 row)
+select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
+ t 
+---
+ t
+(1 row)
+select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
+ f 
+---
+ f
+(1 row)
+select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
+ f 
+---
+ f
+(1 row)
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -11,3 +11,11 @@ select 'ccc' ~ '^([bc])\1*$' as t;
 select 'xxx' ~ '^([bc])\1*$' as f;
 select 'bbc' ~ '^([bc])\1*$' as f;
 select 'b' ~ '^([bc])\1*$' as t;
+-- Test quantified backref within a larger expression
+select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
+select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
+select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
+select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
+select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
+select 'abc abc abd' ~ '^(.+)( \1)+$' as f;