Commit 173e29aa authored by Tom Lane

Fix the general case of quantified regex back-references.

Cases where a back-reference is part of a larger subexpression that
is quantified have never worked in Spencer's regex engine, because
he used a compile-time transformation that neglected the need to
check the back-reference match in iterations before the last one.
(That was okay for capturing parens, and we still do it if the
regex has *only* capturing parens ... but it's not okay for backrefs.)

To make this work properly, we have to add an "iteration" node type
to the regex engine's vocabulary of sub-regex nodes.  Since this is a
moderately large change with a fair risk of introducing new bugs of its
own, apply to HEAD only, even though it's a fix for a longstanding bug.
parent 0c9e5d5e
@@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are
 either plain regular expressions (which are executed as DFAs in the manner
 described above) or back-references (which try to match the input to some
 previous substring). Non-leaf nodes are capture nodes (which save the
-location of the substring currently matching their child node) or
-concatenation or alternation nodes. At execution time, the executor
-recursively scans the tree. At concatenation or alternation nodes,
-it considers each possible alternative way of matching the input string,
-ie each place where the string could be split for a concatenation, or each
-child node for an alternation. It tries the next alternative if the match
-fails according to the child nodes. This is exactly the sort of
-backtracking search done by a traditional NFA regex engine. If there are
-many tree levels it can get very slow.
+location of the substring currently matching their child node),
+concatenation, alternation, or iteration nodes. At execution time, the
+executor recursively scans the tree. At concatenation, alternation, or
+iteration nodes, it considers each possible alternative way of matching the
+input string, that is each place where the string could be split for a
+concatenation or iteration, or each child node for an alternation. It
+tries the next alternative if the match fails according to the child nodes.
+This is exactly the sort of backtracking search done by a traditional NFA
+regex engine. If there are many tree levels it can get very slow.
 But all is not lost: we can still be smarter than the average pure NFA
 engine. To do this, each subre node has an associated DFA, which
......
...@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v, ...@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
/*---------- /*----------
* Prepare a general-purpose state skeleton. * Prepare a general-purpose state skeleton.
* *
* ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] * In the no-backrefs case, we want this:
* / /
* [lp] ----> [s2] ----bypass---------------------
* *
* where bypass is an empty, and prefix is some repetitions of atom * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
*
* where prefix is some repetitions of atom. In the general case we need
*
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
*
* where the iterator wraps around [begin] ---atom---> [end]
*
* We make the s state here for both cases; s2 is made below if needed
*---------- *----------
*/ */
s = newstate(v->nfa); /* first, new endpoints for the atom */ s = newstate(v->nfa); /* first, new endpoints for the atom */
...@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v, ...@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
NOERR(); NOERR();
atom->begin = s; atom->begin = s;
atom->end = s2; atom->end = s2;
s = newstate(v->nfa); /* and spots for prefix and bypass */ s = newstate(v->nfa); /* set up starting state */
s2 = newstate(v->nfa);
NOERR(); NOERR();
EMPTYARC(lp, s); EMPTYARC(lp, s);
EMPTYARC(lp, s2);
NOERR(); NOERR();
/* break remaining subRE into x{...} and what follows */ /* break remaining subRE into x{...} and what follows */
...@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v, ...@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
} }
/* /*
* It's quantifier time. If the atom is just a BACKREF, we'll let it deal * It's quantifier time. If the atom is just a backref, we'll let it deal
* with quantifiers internally. Otherwise, the first step is to turn * with quantifiers internally.
* x{0,...} into x{1,...}|empty
*/ */
if (m == 0 && atomtype != BACKREF)
{
EMPTYARC(s2, atom->end); /* the bypass */
assert(PREF(qprefer) != 0);
f = COMBINE(qprefer, atom->flags);
t = subre(v, '|', f, lp, atom->end);
NOERR();
t->left = atom;
t->right = subre(v, '|', PREF(f), s2, atom->end);
NOERR();
t->right->left = subre(v, '=', 0, s2, atom->end);
NOERR();
*atomp = t;
atomp = &t->left;
m = 1;
}
/* deal with the rest of the quantifier */
if (atomtype == BACKREF) if (atomtype == BACKREF)
{ {
/* special case: backrefs have internal quantifiers */ /* special case: backrefs have internal quantifiers */
...@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v, ...@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
atom->min = (short) m; atom->min = (short) m;
atom->max = (short) n; atom->max = (short) n;
atom->flags |= COMBINE(qprefer, atom->flags); atom->flags |= COMBINE(qprefer, atom->flags);
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
} }
else if (m == 1 && n == 1) else if (m == 1 && n == 1)
{ {
/* no/vacuous quantifier: done */ /* no/vacuous quantifier: done */
EMPTYARC(s, atom->begin); /* empty prefix */ EMPTYARC(s, atom->begin); /* empty prefix */
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
} }
else else if (m > 0 && !(atom->flags & BACKR))
{ {
/* /*
* Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the * If there's no backrefs involved, we can turn x{m,n} into
* second x * x{m-1,n-1}x, with capturing parens in only the second x. This
* is valid because we only care about capturing matches from the
* final iteration of the quantifier. It's a win because we can
* implement the backref-free left side as a plain DFA node, since
* we don't really care where its submatches are.
*/ */
dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin); dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
assert(m >= 1 && m != INFINITY && n >= 1); assert(m >= 1 && m != INFINITY && n >= 1);
...@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v, ...@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
NOERR(); NOERR();
t->right = atom; t->right = atom;
*atomp = t; *atomp = t;
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
}
else
{
/* general case: need an iteration node */
s2 = newstate(v->nfa);
NOERR();
moveouts(v->nfa, atom->end, s2);
NOERR();
dupnfa(v->nfa, atom->begin, atom->end, s, s2);
repeat(v, s, s2, m, n);
f = COMBINE(qprefer, atom->flags);
t = subre(v, '*', f, s, s2);
NOERR();
t->min = (short) m;
t->max = (short) n;
t->left = atom;
*atomp = t;
/* rest of branch is to be strung from iteration's end state */
} }
/* and finally, look after that postponed recursion */ /* and finally, look after that postponed recursion */
t = top->right; t = top->right;
if (!(SEE('|') || SEE(stopper) || SEE(EOS))) if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
t->right = parsebranch(v, stopper, type, atom->end, rp, 1); t->right = parsebranch(v, stopper, type, s2, rp, 1);
else else
{ {
EMPTYARC(atom->end, rp); EMPTYARC(s2, rp);
t->right = subre(v, '=', 0, atom->end, rp); t->right = subre(v, '=', 0, s2, rp);
} }
assert(SEE('|') || SEE(stopper) || SEE(EOS)); assert(SEE('|') || SEE(stopper) || SEE(EOS));
t->flags |= COMBINE(t->flags, t->right->flags); t->flags |= COMBINE(t->flags, t->right->flags);
...@@ -1214,6 +1227,9 @@ scannum(struct vars * v) ...@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
/* /*
* repeat - replicate subNFA for quantifiers * repeat - replicate subNFA for quantifiers
* *
* The sub-NFA strung from lp to rp is modified to represent m to n
* repetitions of its initial contents.
*
* The duplication sequences used here are chosen carefully so that any * The duplication sequences used here are chosen carefully so that any
* pointers starting out pointing into the subexpression end up pointing into * pointers starting out pointing into the subexpression end up pointing into
* the last occurrence. (Note that it may not be strung between the same * the last occurrence. (Note that it may not be strung between the same
...@@ -1229,7 +1245,7 @@ repeat(struct vars * v, ...@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
int n) int n)
{ {
#define SOME 2 #define SOME 2
#define INF 3 #define INF 3
#define PAIR(x, y) ((x)*4 + (y)) #define PAIR(x, y) ((x)*4 + (y))
#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) ) #define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
const int rm = REDUCE(m); const int rm = REDUCE(m);
...@@ -1603,7 +1619,7 @@ subre(struct vars * v, ...@@ -1603,7 +1619,7 @@ subre(struct vars * v,
v->treechain = ret; v->treechain = ret;
} }
assert(strchr("|.b(=", op) != NULL); assert(strchr("=b|.*(", op) != NULL);
ret->op = op; ret->op = op;
ret->flags = flags; ret->flags = flags;
......
This diff is collapsed.
...@@ -372,10 +372,28 @@ struct cnfa ...@@ -372,10 +372,28 @@ struct cnfa
/* /*
* subexpression tree * subexpression tree
*
* "op" is one of:
* '=' plain regex without interesting substructure (implemented as DFA)
* 'b' back-reference (has no substructure either)
* '(' capture node: captures the match of its single child
* '.' concatenation: matches a match for left, then a match for right
* '|' alternation: matches a match for left or a match for right
* '*' iteration: matches some number of matches of its single child
*
* Note: the right child of an alternation must be another alternation or
* NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you
* might expect. This could stand to be changed. Actually I'd rather see
* a single alternation node with N children, but that will take revising
* the representation of struct subre.
*
* Note: when a backref is directly quantified, we stick the min/max counts
* into the backref rather than plastering an iteration node on top. This is
* for efficiency: there is no need to search for possible division points.
*/ */
struct subre struct subre
{ {
char op; /* '|', '.' (concat), 'b' (backref), '(', '=' */ char op; /* see type codes above */
char flags; char flags;
#define LONGER 01 /* prefers longer match */ #define LONGER 01 /* prefers longer match */
#define SHORTER 02 /* prefers shorter match */ #define SHORTER 02 /* prefers shorter match */
...@@ -393,8 +411,8 @@ struct subre ...@@ -393,8 +411,8 @@ struct subre
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
short retry; /* index into retry memory */ short retry; /* index into retry memory */
int subno; /* subexpression number (for 'b' and '(') */ int subno; /* subexpression number (for 'b' and '(') */
short min; /* min repetitions, for backref only */ short min; /* min repetitions for iteration or backref */
short max; /* max repetitions, for backref only */ short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */ struct subre *left; /* left child, if any (also freelist chain) */
struct subre *right; /* right child, if any */ struct subre *right; /* right child, if any */
struct state *begin; /* outarcs from here... */ struct state *begin; /* outarcs from here... */
......
...@@ -34,3 +34,40 @@ select 'b' ~ '^([bc])\1*$' as t; ...@@ -34,3 +34,40 @@ select 'b' ~ '^([bc])\1*$' as t;
t t
(1 row) (1 row)
-- Test quantified backref within a larger expression
select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
t
---
t
(1 row)
select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
t
---
t
(1 row)
select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
f
---
f
(1 row)
...@@ -11,3 +11,11 @@ select 'ccc' ~ '^([bc])\1*$' as t; ...@@ -11,3 +11,11 @@ select 'ccc' ~ '^([bc])\1*$' as t;
select 'xxx' ~ '^([bc])\1*$' as f; select 'xxx' ~ '^([bc])\1*$' as f;
select 'bbc' ~ '^([bc])\1*$' as f; select 'bbc' ~ '^([bc])\1*$' as f;
select 'b' ~ '^([bc])\1*$' as t; select 'b' ~ '^([bc])\1*$' as t;
-- Test quantified backref within a larger expression
select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment