Fix regex engine to suppress useless concatenation sub-REs.

The comment for parsebranch() claims that it avoids generating unnecessary concatenation nodes in the "subre" tree, but it missed some significant cases. Once we've decided that a given atom is "messy" and can't be bundled with the preceding atom(s) of the current regex branch, parseqatom() always generated two new concat nodes, one to concat the messy atom to what follows it in the branch, and an upper node to concatenate the preceding part of the branch to that one. But one or both of these could be unnecessary, if the messy atom is the first, last, or only one in the branch. Improve the code to suppress such useless concat nodes, along with the no-op child nodes representing empty chunks of a branch. Reducing the number of subre tree nodes offers significant savings not only at execution but during compilation, because each subre node has its own NFA that has to be separately optimized. (Maybe someday we'll figure out how to share the optimization work across multiple tree nodes, but it doesn't look easy.) Eliminating upper tree nodes is especially useful because they tend to have larger NFAs. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us

Fix regex engine to suppress useless concatenation sub-REs.
The comment for parsebranch() claims that it avoids generating unnecessary concatenation nodes in the "subre" tree, but it missed some significant cases. Once we've decided that a given atom is "messy" and can't be bundled with the preceding atom(s) of the current regex branch, parseqatom() always generated two new concat nodes, one to concat the messy atom to what follows it in the branch, and an upper node to concatenate the preceding part of the branch to that one. But one or both of these could be unnecessary, if the messy atom is the first, last, or only one in the branch. Improve the code to suppress such useless concat nodes, along with the no-op child nodes representing empty chunks of a branch. Reducing the number of subre tree nodes offers significant savings not only at execution but during compilation, because each subre node has its own NFA that has to be separately optimized. (Maybe someday we'll figure out how to share the optimization work across multiple tree nodes, but it doesn't look easy.) Eliminating upper tree nodes is especially useful because they tend to have larger NFAs. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
cebc1d34 · Tom Lane · 824bf719 · cebc1d34
Commit cebc1d34 authored Feb 20, 2021 by Tom Lane
Hide whitespace changes
Inline Side-by-side

Showing with 76 additions and 7 deletions

src/backend/regex/regcomp.c src/backend/regex/regcomp.c +76 -7

No files found.
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -1123,14 +1123,24 @@ parseqatom(struct vars *v,
 	t->left = atom;
 	atomp = &t->left;
-	/* here we should recurse... but we must postpone that to the end */
+	/*
+	 * Here we should recurse to fill t->right ... but we must postpone that
+	 * to the end.
+	 */
-	/* split top into prefix and remaining */
+	/*
+	 * Convert top node to a concatenation of the prefix (top->left, covering
+	 * whatever we parsed previously) and remaining (t).  Note that the prefix
+	 * could be empty, in which case this concatenation node is unnecessary.
+	 * To keep things simple, we operate in a general way for now, and get rid
+	 * of unnecessary subres below.
+	 */
 	assert(top->op == '=' && top->left == NULL && top->right == NULL);
 	top->left = subre(v, '=', top->flags, top->begin, lp);
 	NOERR();
 	top->op = '.';
 	top->right = t;
+	/* top->flags will get updated later */
 	/* if it's a backref, now is the time to replicate the subNFA */
 	if (atomtype == BACKREF)
@@ -1220,16 +1230,75 @@ parseqatom(struct vars *v,
 	/* and finally, look after that postponed recursion */
 	t = top->right;
 	if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
+	{
+		/* parse all the rest of the branch, and insert in t->right */
 		t->right = parsebranch(v, stopper, type, s2, rp, 1);
+		NOERR();
+		assert(SEE('|') || SEE(stopper) || SEE(EOS));
+		/* here's the promised update of the flags */
+		t->flags |= COMBINE(t->flags, t->right->flags);
+		top->flags |= COMBINE(top->flags, t->flags);
+		/*
+		 * At this point both top and t are concatenation (op == '.') subres,
+		 * and we have top->left = prefix of branch, top->right = t, t->left =
+		 * messy atom (with quantification superstructure if needed), t->right
+		 * = rest of branch.
+		 *
+		 * If the messy atom was the first thing in the branch, then top->left
+		 * is vacuous and we can get rid of one level of concatenation.  Since
+		 * the caller is holding a pointer to the top node, we can't remove
+		 * that node; but we're allowed to change its properties.
+		 */
+		assert(top->left->op == '=');
+		if (top->left->begin == top->left->end)
+		{
+			assert(!MESSY(top->left->flags));
+			freesubre(v, top->left);
+			top->left = t->left;
+			top->right = t->right;
+			t->left = t->right = NULL;
+			freesubre(v, t);
+		}
+	}
 	else
 	{
+		/*
+		 * There's nothing left in the branch, so we don't need the second
+		 * concatenation node 't'.  Just link s2 straight to rp.
+		 */
 		EMPTYARC(s2, rp);
-		t->right = subre(v, '=', 0, s2, rp);
+		top->right = t->left;
+		top->flags |= COMBINE(top->flags, top->right->flags);
+		t->left = t->right = NULL;
+		freesubre(v, t);
+		/*
+		 * Again, it could be that top->left is vacuous (if the messy atom was
+		 * in fact the only thing in the branch).  In that case we need no
+		 * concatenation at all; just replace top with top->right.
+		 */
+		assert(top->left->op == '=');
+		if (top->left->begin == top->left->end)
+		{
+			assert(!MESSY(top->left->flags));
+			freesubre(v, top->left);
+			t = top->right;
+			top->op = t->op;
+			top->flags = t->flags;
+			top->id = t->id;
+			top->subno = t->subno;
+			top->min = t->min;
+			top->max = t->max;
+			top->left = t->left;
+			top->right = t->right;
+			top->begin = t->begin;
+			top->end = t->end;
+			t->left = t->right = NULL;
+			freesubre(v, t);
+		}
 	}
-	NOERR();
-	assert(SEE('|') || SEE(stopper) || SEE(EOS));
-	t->flags |= COMBINE(t->flags, t->right->flags);
-	top->flags |= COMBINE(top->flags, t->flags);
 }
 /*