Commit 173e29aa authored by Tom Lane

Fix the general case of quantified regex back-references.

Cases where a back-reference is part of a larger subexpression that
is quantified have never worked in Spencer's regex engine, because
he used a compile-time transformation that neglected the need to
check the back-reference match in iterations before the last one.
(That was okay for capturing parens, and we still do it if the
regex has *only* capturing parens ... but it's not okay for backrefs.)

To make this work properly, we have to add an "iteration" node type
to the regex engine's vocabulary of sub-regex nodes.  Since this is a
moderately large change with a fair risk of introducing new bugs of its
own, apply to HEAD only, even though it's a fix for a longstanding bug.
parent 0c9e5d5e
...@@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are ...@@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are
either plain regular expressions (which are executed as DFAs in the manner either plain regular expressions (which are executed as DFAs in the manner
described above) or back-references (which try to match the input to some described above) or back-references (which try to match the input to some
previous substring). Non-leaf nodes are capture nodes (which save the previous substring). Non-leaf nodes are capture nodes (which save the
location of the substring currently matching their child node) or location of the substring currently matching their child node),
concatenation or alternation nodes. At execution time, the executor concatenation, alternation, or iteration nodes. At execution time, the
recursively scans the tree. At concatenation or alternation nodes, executor recursively scans the tree. At concatenation, alternation, or
it considers each possible alternative way of matching the input string, iteration nodes, it considers each possible alternative way of matching the
ie each place where the string could be split for a concatenation, or each input string, that is each place where the string could be split for a
child node for an alternation. It tries the next alternative if the match concatenation or iteration, or each child node for an alternation. It
fails according to the child nodes. This is exactly the sort of tries the next alternative if the match fails according to the child nodes.
backtracking search done by a traditional NFA regex engine. If there are This is exactly the sort of backtracking search done by a traditional NFA
many tree levels it can get very slow. regex engine. If there are many tree levels it can get very slow.
But all is not lost: we can still be smarter than the average pure NFA But all is not lost: we can still be smarter than the average pure NFA
engine. To do this, each subre node has an associated DFA, which engine. To do this, each subre node has an associated DFA, which
......
...@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v, ...@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
/*---------- /*----------
* Prepare a general-purpose state skeleton. * Prepare a general-purpose state skeleton.
* *
* ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] * In the no-backrefs case, we want this:
* / /
* [lp] ----> [s2] ----bypass---------------------
* *
* where bypass is an empty, and prefix is some repetitions of atom * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
*
* where prefix is some repetitions of atom. In the general case we need
*
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
*
* where the iterator wraps around [begin] ---atom---> [end]
*
* We make the s state here for both cases; s2 is made below if needed
*---------- *----------
*/ */
s = newstate(v->nfa); /* first, new endpoints for the atom */ s = newstate(v->nfa); /* first, new endpoints for the atom */
...@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v, ...@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
NOERR(); NOERR();
atom->begin = s; atom->begin = s;
atom->end = s2; atom->end = s2;
s = newstate(v->nfa); /* and spots for prefix and bypass */ s = newstate(v->nfa); /* set up starting state */
s2 = newstate(v->nfa);
NOERR(); NOERR();
EMPTYARC(lp, s); EMPTYARC(lp, s);
EMPTYARC(lp, s2);
NOERR(); NOERR();
/* break remaining subRE into x{...} and what follows */ /* break remaining subRE into x{...} and what follows */
...@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v, ...@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
} }
/* /*
* It's quantifier time. If the atom is just a BACKREF, we'll let it deal * It's quantifier time. If the atom is just a backref, we'll let it deal
* with quantifiers internally. Otherwise, the first step is to turn * with quantifiers internally.
* x{0,...} into x{1,...}|empty
*/ */
if (m == 0 && atomtype != BACKREF)
{
EMPTYARC(s2, atom->end); /* the bypass */
assert(PREF(qprefer) != 0);
f = COMBINE(qprefer, atom->flags);
t = subre(v, '|', f, lp, atom->end);
NOERR();
t->left = atom;
t->right = subre(v, '|', PREF(f), s2, atom->end);
NOERR();
t->right->left = subre(v, '=', 0, s2, atom->end);
NOERR();
*atomp = t;
atomp = &t->left;
m = 1;
}
/* deal with the rest of the quantifier */
if (atomtype == BACKREF) if (atomtype == BACKREF)
{ {
/* special case: backrefs have internal quantifiers */ /* special case: backrefs have internal quantifiers */
...@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v, ...@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
atom->min = (short) m; atom->min = (short) m;
atom->max = (short) n; atom->max = (short) n;
atom->flags |= COMBINE(qprefer, atom->flags); atom->flags |= COMBINE(qprefer, atom->flags);
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
} }
else if (m == 1 && n == 1) else if (m == 1 && n == 1)
{ {
/* no/vacuous quantifier: done */ /* no/vacuous quantifier: done */
EMPTYARC(s, atom->begin); /* empty prefix */ EMPTYARC(s, atom->begin); /* empty prefix */
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
} }
else else if (m > 0 && !(atom->flags & BACKR))
{ {
/* /*
* Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the * If there's no backrefs involved, we can turn x{m,n} into
* second x * x{m-1,n-1}x, with capturing parens in only the second x. This
* is valid because we only care about capturing matches from the
* final iteration of the quantifier. It's a win because we can
* implement the backref-free left side as a plain DFA node, since
* we don't really care where its submatches are.
*/ */
dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin); dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
assert(m >= 1 && m != INFINITY && n >= 1); assert(m >= 1 && m != INFINITY && n >= 1);
...@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v, ...@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
NOERR(); NOERR();
t->right = atom; t->right = atom;
*atomp = t; *atomp = t;
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
}
else
{
/* general case: need an iteration node */
s2 = newstate(v->nfa);
NOERR();
moveouts(v->nfa, atom->end, s2);
NOERR();
dupnfa(v->nfa, atom->begin, atom->end, s, s2);
repeat(v, s, s2, m, n);
f = COMBINE(qprefer, atom->flags);
t = subre(v, '*', f, s, s2);
NOERR();
t->min = (short) m;
t->max = (short) n;
t->left = atom;
*atomp = t;
/* rest of branch is to be strung from iteration's end state */
} }
/* and finally, look after that postponed recursion */ /* and finally, look after that postponed recursion */
t = top->right; t = top->right;
if (!(SEE('|') || SEE(stopper) || SEE(EOS))) if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
t->right = parsebranch(v, stopper, type, atom->end, rp, 1); t->right = parsebranch(v, stopper, type, s2, rp, 1);
else else
{ {
EMPTYARC(atom->end, rp); EMPTYARC(s2, rp);
t->right = subre(v, '=', 0, atom->end, rp); t->right = subre(v, '=', 0, s2, rp);
} }
assert(SEE('|') || SEE(stopper) || SEE(EOS)); assert(SEE('|') || SEE(stopper) || SEE(EOS));
t->flags |= COMBINE(t->flags, t->right->flags); t->flags |= COMBINE(t->flags, t->right->flags);
...@@ -1214,6 +1227,9 @@ scannum(struct vars * v) ...@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
/* /*
* repeat - replicate subNFA for quantifiers * repeat - replicate subNFA for quantifiers
* *
* The sub-NFA strung from lp to rp is modified to represent m to n
* repetitions of its initial contents.
*
* The duplication sequences used here are chosen carefully so that any * The duplication sequences used here are chosen carefully so that any
* pointers starting out pointing into the subexpression end up pointing into * pointers starting out pointing into the subexpression end up pointing into
* the last occurrence. (Note that it may not be strung between the same * the last occurrence. (Note that it may not be strung between the same
...@@ -1229,7 +1245,7 @@ repeat(struct vars * v, ...@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
int n) int n)
{ {
#define SOME 2 #define SOME 2
#define INF 3 #define INF 3
#define PAIR(x, y) ((x)*4 + (y)) #define PAIR(x, y) ((x)*4 + (y))
#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) ) #define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
const int rm = REDUCE(m); const int rm = REDUCE(m);
...@@ -1603,7 +1619,7 @@ subre(struct vars * v, ...@@ -1603,7 +1619,7 @@ subre(struct vars * v,
v->treechain = ret; v->treechain = ret;
} }
assert(strchr("|.b(=", op) != NULL); assert(strchr("=b|.*(", op) != NULL);
ret->op = op; ret->op = op;
ret->flags = flags; ret->flags = flags;
......
...@@ -140,11 +140,15 @@ static void subset(struct vars *, struct subre *, chr *, chr *); ...@@ -140,11 +140,15 @@ static void subset(struct vars *, struct subre *, chr *, chr *);
static int dissect(struct vars *, struct subre *, chr *, chr *); static int dissect(struct vars *, struct subre *, chr *, chr *);
static int condissect(struct vars *, struct subre *, chr *, chr *); static int condissect(struct vars *, struct subre *, chr *, chr *);
static int altdissect(struct vars *, struct subre *, chr *, chr *); static int altdissect(struct vars *, struct subre *, chr *, chr *);
static int iterdissect(struct vars *, struct subre *, chr *, chr *);
static int reviterdissect(struct vars *, struct subre *, chr *, chr *);
static int cdissect(struct vars *, struct subre *, chr *, chr *); static int cdissect(struct vars *, struct subre *, chr *, chr *);
static int ccondissect(struct vars *, struct subre *, chr *, chr *); static int ccondissect(struct vars *, struct subre *, chr *, chr *);
static int crevdissect(struct vars *, struct subre *, chr *, chr *); static int crevdissect(struct vars *, struct subre *, chr *, chr *);
static int cbrdissect(struct vars *, struct subre *, chr *, chr *); static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
static int caltdissect(struct vars *, struct subre *, chr *, chr *); static int caltdissect(struct vars *, struct subre *, chr *, chr *);
static int citerdissect(struct vars *, struct subre *, chr *, chr *);
static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
/* === rege_dfa.c === */ /* === rege_dfa.c === */
static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
...@@ -563,14 +567,17 @@ dissect(struct vars * v, ...@@ -563,14 +567,17 @@ dissect(struct vars * v,
case '=': /* terminal node */ case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL); assert(t->left == NULL && t->right == NULL);
return REG_OKAY; /* no action, parent did the work */ return REG_OKAY; /* no action, parent did the work */
case '|': /* alternation */
assert(t->left != NULL);
return altdissect(v, t, begin, end);
case 'b': /* back ref -- shouldn't be calling us! */ case 'b': /* back ref -- shouldn't be calling us! */
return REG_ASSERT; return REG_ASSERT;
case '.': /* concatenation */ case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL); assert(t->left != NULL && t->right != NULL);
return condissect(v, t, begin, end); return condissect(v, t, begin, end);
case '|': /* alternation */
assert(t->left != NULL);
return altdissect(v, t, begin, end);
case '*': /* iteration */
assert(t->left != NULL);
return iterdissect(v, t, begin, end);
case '(': /* capturing */ case '(': /* capturing */
assert(t->left != NULL && t->right == NULL); assert(t->left != NULL && t->right == NULL);
assert(t->subno > 0); assert(t->subno > 0);
...@@ -696,6 +703,375 @@ altdissect(struct vars * v, ...@@ -696,6 +703,375 @@ altdissect(struct vars * v,
return REG_ASSERT; /* none of them matched?!? */ return REG_ASSERT; /* none of them matched?!? */
} }
/*
 * iterdissect - iteration subexpression matches (uncomplicated)
 *
 * Splits the substring begin..end into consecutive sub-matches of the
 * iteration node's single child (t->left), using between t->min and t->max
 * sub-matches, and recursively dissects each sub-match with dissect().
 * Candidate endpoints are found with longest() on a DFA built from the
 * child's cnfa.  If the child is flagged SHORTER, the whole job is handed
 * to reviterdissect() instead.
 *
 * Returns REG_OKAY on success; REG_ESPACE if the endpoint workspace cannot
 * be allocated; v->err if newdfa() reports an error; any code other than
 * REG_OKAY/REG_NOMATCH returned by dissect(); or REG_ASSERT if every
 * division has been tried without success.
 *
 * The workspace array and the DFA are released on every exit path that is
 * reached after they have been created.
 */
static int /* regexec return code */
iterdissect(struct vars * v,
struct subre * t,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
struct dfa *d; /* DFA for the child node, from newdfa() */
chr **endpts; /* endpts[0] = begin, endpts[i] = end of i'th sub-match */
chr *limit; /* bound handed to longest() for the k'th sub-match's end */
int min_matches; /* effective minimum count (t->min, raised to 1 if <= 0) */
size_t max_matches; /* effective maximum count; also sizes endpts[] */
int nverified; /* sub-matches 1..nverified have passed dissect() */
int k; /* index of the sub-match currently being placed */
int i; /* loop index over sub-matches being verified */
int er; /* result of the recursive dissect() call */
assert(t->op == '*');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(begin <= end);
if (t->left->flags & SHORTER) /* reverse scan */
return reviterdissect(v, t, begin, end);
/*
 * If zero matches are allowed, and target string is empty, just declare
 * victory. OTOH, if target string isn't empty, zero matches can't work
 * so we pretend the min is 1.
 */
min_matches = t->min;
if (min_matches <= 0)
{
if (begin == end)
return REG_OKAY;
min_matches = 1;
}
/*
 * We need workspace to track the endpoints of each sub-match. Normally
 * we consider only nonzero-length sub-matches, so there can be at most
 * end-begin of them. However, if min is larger than that, we will also
 * consider zero-length sub-matches in order to find enough matches.
 *
 * For convenience, endpts[0] contains the "begin" pointer and we store
 * sub-match endpoints in endpts[1..max_matches].
 *
 * NOTE(review): the comparisons below mix size_t (max_matches) with a
 * short (t->max) and an int (min_matches).  min_matches is >= 1 at this
 * point; this presumably also relies on t->max never being negative --
 * confirm against the code that fills in t->max.
 */
max_matches = end - begin;
if (max_matches > t->max && t->max != INFINITY)
max_matches = t->max;
if (max_matches < min_matches)
max_matches = min_matches;
/* one extra slot because endpts[0] holds "begin" */
endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
if (endpts == NULL)
return REG_ESPACE;
endpts[0] = begin;
d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
if (ISERR())
{
/* DFA construction reported an error: release workspace and bail */
FREE(endpts);
return v->err;
}
MDEBUG(("iter %d\n", t->retry));
/*
 * Our strategy is to first find a set of sub-match endpoints that are
 * valid according to the child node's DFA, and then recursively dissect
 * each sub-match to confirm validity. If any validity check fails,
 * backtrack the last sub-match and try again. And, when we next try for
 * a validity check, we need not recheck any successfully verified
 * sub-matches that we didn't move the endpoints of. nverified remembers
 * how many sub-matches are currently known okay.
 */
/* initialize to consider first sub-match */
nverified = 0;
k = 1;
limit = end;
/* iterate until satisfaction or failure; k drops to 0 when exhausted */
while (k > 0)
{
/* try to find an endpoint for the k'th sub-match */
endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
if (endpts[k] == NULL)
{
/* no match possible, so see if we can shorten previous one */
k--;
goto backtrack;
}
MDEBUG(("%d: working endpoint %d: %ld\n",
t->retry, k, LOFF(endpts[k])));
/* k'th sub-match can no longer be considered verified */
if (nverified >= k)
nverified = k - 1;
if (endpts[k] != end)
{
/* haven't reached end yet, try another iteration if allowed */
if (k >= max_matches)
{
/* must try to shorten some previous match */
k--;
goto backtrack;
}
/*
 * reject zero-length match unless necessary to achieve min: an empty
 * sub-match is kept only when k < min_matches and
 * min_matches - k >= end - endpts[k]
 */
if (endpts[k] == endpts[k - 1] &&
(k >= min_matches || min_matches - k < end - endpts[k]))
goto backtrack;
/* advance to the next sub-match, initially allowed to run to "end" */
k++;
limit = end;
continue;
}
/*
 * We've identified a way to divide the string into k sub-matches
 * that works so far as the child DFA can tell. If k is an allowed
 * number of matches, start the slow part: recurse to verify each
 * sub-match. We always have k <= max_matches, needn't check that.
 */
if (k < min_matches)
goto backtrack;
MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
/* only sub-matches beyond the already-verified prefix are rechecked */
for (i = nverified + 1; i <= k; i++)
{
er = dissect(v, t->left, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;
continue;
}
/* REG_NOMATCH leaves the loop with i <= k, which triggers backtracking */
if (er == REG_NOMATCH)
break;
/* oops, something failed */
freedfa(d);
FREE(endpts);
return er;
}
/* the loop ran to completion only if every sub-match verified */
if (i > k)
{
/* satisfaction */
MDEBUG(("%d successful\n", t->retry));
freedfa(d);
FREE(endpts);
return REG_OKAY;
}
/* match failed to verify, so backtrack */
backtrack:
/*
 * Must consider shorter versions of the current sub-match. However,
 * we'll only ask for a zero-length match if necessary.
 *
 * Jumps that found no endpoint or hit max_matches arrive here with k
 * already decremented; the other jumps arrive with k unchanged.
 */
while (k > 0)
{
chr *prev_end = endpts[k - 1];
if (endpts[k] > prev_end)
{
/* ask for an endpoint at least one position earlier than before */
limit = endpts[k] - 1;
if (limit > prev_end ||
(k < min_matches && min_matches - k >= end - prev_end))
{
/* break out of backtrack loop, continue the outer one */
break;
}
}
/* can't shorten k'th sub-match any more, consider previous one */
k--;
}
}
/* all possibilities exhausted - shouldn't happen in uncomplicated mode */
MDEBUG(("%d failed\n", t->retry));
freedfa(d);
FREE(endpts);
return REG_ASSERT;
}
/*
 * reviterdissect - shortest-first iteration subexpression matches
 *
 * Counterpart of iterdissect() for an iteration node whose child is flagged
 * SHORTER (asserted below).  It splits begin..end into between t->min and
 * t->max consecutive sub-matches of t->left, but locates candidate
 * endpoints with shortest() and backtracks by lengthening sub-matches
 * rather than shortening them.  Each sub-match is verified with dissect().
 *
 * Returns REG_OKAY on success; REG_ESPACE if the endpoint workspace cannot
 * be allocated; v->err if newdfa() reports an error; any code other than
 * REG_OKAY/REG_NOMATCH returned by dissect(); or REG_ASSERT if every
 * division has been tried without success.
 */
static int /* regexec return code */
reviterdissect(struct vars * v,
struct subre * t,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
struct dfa *d; /* DFA for the child node, from newdfa() */
chr **endpts; /* endpts[0] = begin, endpts[i] = end of i'th sub-match */
chr *limit; /* bound handed to shortest() for the k'th sub-match's end */
int min_matches; /* effective minimum count (t->min, raised to 1 if <= 0) */
size_t max_matches; /* effective maximum count; also sizes endpts[] */
int nverified; /* sub-matches 1..nverified have passed dissect() */
int k; /* index of the sub-match currently being placed */
int i; /* loop index over sub-matches being verified */
int er; /* result of the recursive dissect() call */
assert(t->op == '*');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(t->left->flags & SHORTER);
assert(begin <= end);
/*
 * If zero matches are allowed, and target string is empty, just declare
 * victory. OTOH, if target string isn't empty, zero matches can't work
 * so we pretend the min is 1.
 */
min_matches = t->min;
if (min_matches <= 0)
{
if (begin == end)
return REG_OKAY;
min_matches = 1;
}
/*
 * We need workspace to track the endpoints of each sub-match. Normally
 * we consider only nonzero-length sub-matches, so there can be at most
 * end-begin of them. However, if min is larger than that, we will also
 * consider zero-length sub-matches in order to find enough matches.
 *
 * For convenience, endpts[0] contains the "begin" pointer and we store
 * sub-match endpoints in endpts[1..max_matches].
 *
 * NOTE(review): the comparisons below mix size_t (max_matches) with a
 * short (t->max) and an int (min_matches).  min_matches is >= 1 at this
 * point; this presumably also relies on t->max never being negative --
 * confirm against the code that fills in t->max.
 */
max_matches = end - begin;
if (max_matches > t->max && t->max != INFINITY)
max_matches = t->max;
if (max_matches < min_matches)
max_matches = min_matches;
/* one extra slot because endpts[0] holds "begin" */
endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
if (endpts == NULL)
return REG_ESPACE;
endpts[0] = begin;
d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
if (ISERR())
{
/* DFA construction reported an error: release workspace and bail */
FREE(endpts);
return v->err;
}
MDEBUG(("reviter %d\n", t->retry));
/*
 * Our strategy is to first find a set of sub-match endpoints that are
 * valid according to the child node's DFA, and then recursively dissect
 * each sub-match to confirm validity. If any validity check fails,
 * backtrack the last sub-match and try again. And, when we next try for
 * a validity check, we need not recheck any successfully verified
 * sub-matches that we didn't move the endpoints of. nverified remembers
 * how many sub-matches are currently known okay.
 */
/* initialize to consider first sub-match */
nverified = 0;
k = 1;
limit = begin;
/* iterate until satisfaction or failure; k drops to 0 when exhausted */
while (k > 0)
{
/*
 * disallow zero-length match unless necessary to achieve min: when the
 * bound sits exactly at the previous endpoint (and not at "end"), push
 * it forward by one unless k < min_matches and
 * min_matches - k >= end - limit
 */
if (limit == endpts[k - 1] &&
limit != end &&
(k >= min_matches || min_matches - k < end - limit))
limit++;
/* try to find an endpoint for the k'th sub-match */
endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
(chr **) NULL, (int *) NULL);
if (endpts[k] == NULL)
{
/* no match possible, so see if we can lengthen previous one */
k--;
goto backtrack;
}
MDEBUG(("%d: working endpoint %d: %ld\n",
t->retry, k, LOFF(endpts[k])));
/* k'th sub-match can no longer be considered verified */
if (nverified >= k)
nverified = k - 1;
if (endpts[k] != end)
{
/* haven't reached end yet, try another iteration if allowed */
if (k >= max_matches)
{
/* must try to lengthen some previous match */
k--;
goto backtrack;
}
/* advance; the next sub-match's bound starts at this one's endpoint */
k++;
limit = endpts[k - 1];
continue;
}
/*
 * We've identified a way to divide the string into k sub-matches
 * that works so far as the child DFA can tell. If k is an allowed
 * number of matches, start the slow part: recurse to verify each
 * sub-match. We always have k <= max_matches, needn't check that.
 */
if (k < min_matches)
goto backtrack;
MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
/* only sub-matches beyond the already-verified prefix are rechecked */
for (i = nverified + 1; i <= k; i++)
{
er = dissect(v, t->left, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;
continue;
}
/* REG_NOMATCH leaves the loop with i <= k, which triggers backtracking */
if (er == REG_NOMATCH)
break;
/* oops, something failed */
freedfa(d);
FREE(endpts);
return er;
}
/* the loop ran to completion only if every sub-match verified */
if (i > k)
{
/* satisfaction */
MDEBUG(("%d successful\n", t->retry));
freedfa(d);
FREE(endpts);
return REG_OKAY;
}
/* match failed to verify, so backtrack */
backtrack:
/*
 * Must consider longer versions of the current sub-match.
 *
 * Jumps that found no endpoint or hit max_matches arrive here with k
 * already decremented; the other jumps arrive with k unchanged.
 */
while (k > 0)
{
if (endpts[k] < end)
{
/* ask for an endpoint at least one position later than before */
limit = endpts[k] + 1;
/* break out of backtrack loop, continue the outer one */
break;
}
/* can't lengthen k'th sub-match any more, consider previous one */
k--;
}
}
/* all possibilities exhausted - shouldn't happen in uncomplicated mode */
MDEBUG(("%d failed\n", t->retry));
freedfa(d);
FREE(endpts);
return REG_ASSERT;
}
/* /*
* cdissect - determine subexpression matches (with complications) * cdissect - determine subexpression matches (with complications)
* The retry memory stores the offset of the trial midpoint from begin, * The retry memory stores the offset of the trial midpoint from begin,
...@@ -717,15 +1093,18 @@ cdissect(struct vars * v, ...@@ -717,15 +1093,18 @@ cdissect(struct vars * v,
case '=': /* terminal node */ case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL); assert(t->left == NULL && t->right == NULL);
return REG_OKAY; /* no action, parent did the work */ return REG_OKAY; /* no action, parent did the work */
case '|': /* alternation */
assert(t->left != NULL);
return caltdissect(v, t, begin, end);
case 'b': /* back reference */ case 'b': /* back reference */
assert(t->left == NULL && t->right == NULL); assert(t->left == NULL && t->right == NULL);
return cbrdissect(v, t, begin, end); return cbrdissect(v, t, begin, end);
case '.': /* concatenation */ case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL); assert(t->left != NULL && t->right != NULL);
return ccondissect(v, t, begin, end); return ccondissect(v, t, begin, end);
case '|': /* alternation */
assert(t->left != NULL);
return caltdissect(v, t, begin, end);
case '*': /* iteration */
assert(t->left != NULL);
return citerdissect(v, t, begin, end);
case '(': /* capturing */ case '(': /* capturing */
assert(t->left != NULL && t->right == NULL); assert(t->left != NULL && t->right == NULL);
assert(t->subno > 0); assert(t->subno > 0);
...@@ -847,7 +1226,7 @@ ccondissect(struct vars * v, ...@@ -847,7 +1226,7 @@ ccondissect(struct vars * v,
} }
/* /*
* crevdissect - determine backref shortest-first subexpression matches * crevdissect - shortest-first concatenation subexpression matches
* The retry memory stores the offset of the trial midpoint from begin, * The retry memory stores the offset of the trial midpoint from begin,
* plus 1 so that 0 uniquely means "clean slate". * plus 1 so that 0 uniquely means "clean slate".
*/ */
...@@ -1088,6 +1467,377 @@ caltdissect(struct vars * v, ...@@ -1088,6 +1467,377 @@ caltdissect(struct vars * v,
return caltdissect(v, t->right, begin, end); return caltdissect(v, t->right, begin, end);
} }
/*
 * citerdissect - iteration subexpression matches (with complications)
 *
 * Same search as iterdissect(), but each candidate sub-match is verified
 * with cdissect() after calling zapmem() on the child, and exhausting all
 * divisions is an ordinary failure (REG_NOMATCH) rather than an assertion
 * failure.  If the child is flagged SHORTER, the whole job is handed to
 * creviterdissect() instead.
 *
 * Returns REG_OKAY on success; REG_ESPACE if the endpoint workspace cannot
 * be allocated; v->err if newdfa() reports an error; any code other than
 * REG_OKAY/REG_NOMATCH returned by cdissect(); or REG_NOMATCH if no
 * division of begin..end into an allowed number of sub-matches verifies.
 */
static int /* regexec return code */
citerdissect(struct vars * v,
struct subre * t,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
struct dfa *d; /* DFA for the child node, from newdfa() */
chr **endpts; /* endpts[0] = begin, endpts[i] = end of i'th sub-match */
chr *limit; /* bound handed to longest() for the k'th sub-match's end */
int min_matches; /* effective minimum count (t->min, raised to 1 if <= 0) */
size_t max_matches; /* effective maximum count; also sizes endpts[] */
int nverified; /* sub-matches 1..nverified have passed cdissect() */
int k; /* index of the sub-match currently being placed */
int i; /* loop index over sub-matches being verified */
int er; /* result of the recursive cdissect() call */
assert(t->op == '*');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(begin <= end);
if (t->left->flags & SHORTER) /* reverse scan */
return creviterdissect(v, t, begin, end);
/*
 * If zero matches are allowed, and target string is empty, just declare
 * victory. OTOH, if target string isn't empty, zero matches can't work
 * so we pretend the min is 1.
 */
min_matches = t->min;
if (min_matches <= 0)
{
if (begin == end)
return REG_OKAY;
min_matches = 1;
}
/*
 * We need workspace to track the endpoints of each sub-match. Normally
 * we consider only nonzero-length sub-matches, so there can be at most
 * end-begin of them. However, if min is larger than that, we will also
 * consider zero-length sub-matches in order to find enough matches.
 *
 * For convenience, endpts[0] contains the "begin" pointer and we store
 * sub-match endpoints in endpts[1..max_matches].
 *
 * NOTE(review): the comparisons below mix size_t (max_matches) with a
 * short (t->max) and an int (min_matches).  min_matches is >= 1 at this
 * point; this presumably also relies on t->max never being negative --
 * confirm against the code that fills in t->max.
 */
max_matches = end - begin;
if (max_matches > t->max && t->max != INFINITY)
max_matches = t->max;
if (max_matches < min_matches)
max_matches = min_matches;
/* one extra slot because endpts[0] holds "begin" */
endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
if (endpts == NULL)
return REG_ESPACE;
endpts[0] = begin;
d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
if (ISERR())
{
/* DFA construction reported an error: release workspace and bail */
FREE(endpts);
return v->err;
}
MDEBUG(("citer %d\n", t->retry));
/*
 * Our strategy is to first find a set of sub-match endpoints that are
 * valid according to the child node's DFA, and then recursively dissect
 * each sub-match to confirm validity. If any validity check fails,
 * backtrack the last sub-match and try again. And, when we next try for
 * a validity check, we need not recheck any successfully verified
 * sub-matches that we didn't move the endpoints of. nverified remembers
 * how many sub-matches are currently known okay.
 */
/* initialize to consider first sub-match */
nverified = 0;
k = 1;
limit = end;
/* iterate until satisfaction or failure; k drops to 0 when exhausted */
while (k > 0)
{
/* try to find an endpoint for the k'th sub-match */
endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
if (endpts[k] == NULL)
{
/* no match possible, so see if we can shorten previous one */
k--;
goto backtrack;
}
MDEBUG(("%d: working endpoint %d: %ld\n",
t->retry, k, LOFF(endpts[k])));
/* k'th sub-match can no longer be considered verified */
if (nverified >= k)
nverified = k - 1;
if (endpts[k] != end)
{
/* haven't reached end yet, try another iteration if allowed */
if (k >= max_matches)
{
/* must try to shorten some previous match */
k--;
goto backtrack;
}
/*
 * reject zero-length match unless necessary to achieve min: an empty
 * sub-match is kept only when k < min_matches and
 * min_matches - k >= end - endpts[k]
 */
if (endpts[k] == endpts[k - 1] &&
(k >= min_matches || min_matches - k < end - endpts[k]))
goto backtrack;
/* advance to the next sub-match, initially allowed to run to "end" */
k++;
limit = end;
continue;
}
/*
 * We've identified a way to divide the string into k sub-matches
 * that works so far as the child DFA can tell. If k is an allowed
 * number of matches, start the slow part: recurse to verify each
 * sub-match. We always have k <= max_matches, needn't check that.
 */
if (k < min_matches)
goto backtrack;
MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
/* only sub-matches beyond the already-verified prefix are rechecked */
for (i = nverified + 1; i <= k; i++)
{
/*
 * NOTE(review): zapmem() presumably clears the child subtree's saved
 * retry/match state so each sub-match is dissected from a clean slate
 * -- confirm against zapmem()'s definition.
 */
zapmem(v, t->left);
er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;
continue;
}
/* REG_NOMATCH leaves the loop with i <= k, which triggers backtracking */
if (er == REG_NOMATCH)
break;
/* oops, something failed */
freedfa(d);
FREE(endpts);
return er;
}
/* the loop ran to completion only if every sub-match verified */
if (i > k)
{
/* satisfaction */
MDEBUG(("%d successful\n", t->retry));
freedfa(d);
FREE(endpts);
return REG_OKAY;
}
/* match failed to verify, so backtrack */
backtrack:
/*
 * Must consider shorter versions of the current sub-match. However,
 * we'll only ask for a zero-length match if necessary.
 *
 * Jumps that found no endpoint or hit max_matches arrive here with k
 * already decremented; the other jumps arrive with k unchanged.
 */
while (k > 0)
{
chr *prev_end = endpts[k - 1];
if (endpts[k] > prev_end)
{
/* ask for an endpoint at least one position earlier than before */
limit = endpts[k] - 1;
if (limit > prev_end ||
(k < min_matches && min_matches - k >= end - prev_end))
{
/* break out of backtrack loop, continue the outer one */
break;
}
}
/* can't shorten k'th sub-match any more, consider previous one */
k--;
}
}
/* all possibilities exhausted: unlike iterdissect(), a plain no-match */
MDEBUG(("%d failed\n", t->retry));
freedfa(d);
FREE(endpts);
return REG_NOMATCH;
}
/*
 * creviterdissect - shortest-first iteration subexpression matches
 *
 * Counterpart of citerdissect() for an iteration node whose child is
 * flagged SHORTER (asserted below).  Candidate endpoints are located with
 * shortest() and backtracking lengthens sub-matches rather than shortening
 * them.  Each sub-match is verified with cdissect() after calling zapmem()
 * on the child.
 *
 * Returns REG_OKAY on success; REG_ESPACE if the endpoint workspace cannot
 * be allocated; v->err if newdfa() reports an error; any code other than
 * REG_OKAY/REG_NOMATCH returned by cdissect(); or REG_NOMATCH if no
 * division of begin..end into an allowed number of sub-matches verifies.
 */
static int /* regexec return code */
creviterdissect(struct vars * v,
struct subre * t,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
struct dfa *d; /* DFA for the child node, from newdfa() */
chr **endpts; /* endpts[0] = begin, endpts[i] = end of i'th sub-match */
chr *limit; /* bound handed to shortest() for the k'th sub-match's end */
int min_matches; /* effective minimum count (t->min, raised to 1 if <= 0) */
size_t max_matches; /* effective maximum count; also sizes endpts[] */
int nverified; /* sub-matches 1..nverified have passed cdissect() */
int k; /* index of the sub-match currently being placed */
int i; /* loop index over sub-matches being verified */
int er; /* result of the recursive cdissect() call */
assert(t->op == '*');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(t->left->flags & SHORTER);
assert(begin <= end);
/*
 * If zero matches are allowed, and target string is empty, just declare
 * victory. OTOH, if target string isn't empty, zero matches can't work
 * so we pretend the min is 1.
 */
min_matches = t->min;
if (min_matches <= 0)
{
if (begin == end)
return REG_OKAY;
min_matches = 1;
}
/*
 * We need workspace to track the endpoints of each sub-match. Normally
 * we consider only nonzero-length sub-matches, so there can be at most
 * end-begin of them. However, if min is larger than that, we will also
 * consider zero-length sub-matches in order to find enough matches.
 *
 * For convenience, endpts[0] contains the "begin" pointer and we store
 * sub-match endpoints in endpts[1..max_matches].
 *
 * NOTE(review): the comparisons below mix size_t (max_matches) with a
 * short (t->max) and an int (min_matches).  min_matches is >= 1 at this
 * point; this presumably also relies on t->max never being negative --
 * confirm against the code that fills in t->max.
 */
max_matches = end - begin;
if (max_matches > t->max && t->max != INFINITY)
max_matches = t->max;
if (max_matches < min_matches)
max_matches = min_matches;
/* one extra slot because endpts[0] holds "begin" */
endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
if (endpts == NULL)
return REG_ESPACE;
endpts[0] = begin;
d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
if (ISERR())
{
/* DFA construction reported an error: release workspace and bail */
FREE(endpts);
return v->err;
}
MDEBUG(("creviter %d\n", t->retry));
/*
 * Our strategy is to first find a set of sub-match endpoints that are
 * valid according to the child node's DFA, and then recursively dissect
 * each sub-match to confirm validity. If any validity check fails,
 * backtrack the last sub-match and try again. And, when we next try for
 * a validity check, we need not recheck any successfully verified
 * sub-matches that we didn't move the endpoints of. nverified remembers
 * how many sub-matches are currently known okay.
 */
/* initialize to consider first sub-match */
nverified = 0;
k = 1;
limit = begin;
/* iterate until satisfaction or failure; k drops to 0 when exhausted */
while (k > 0)
{
/*
 * disallow zero-length match unless necessary to achieve min: when the
 * bound sits exactly at the previous endpoint (and not at "end"), push
 * it forward by one unless k < min_matches and
 * min_matches - k >= end - limit
 */
if (limit == endpts[k - 1] &&
limit != end &&
(k >= min_matches || min_matches - k < end - limit))
limit++;
/* try to find an endpoint for the k'th sub-match */
endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
(chr **) NULL, (int *) NULL);
if (endpts[k] == NULL)
{
/* no match possible, so see if we can lengthen previous one */
k--;
goto backtrack;
}
MDEBUG(("%d: working endpoint %d: %ld\n",
t->retry, k, LOFF(endpts[k])));
/* k'th sub-match can no longer be considered verified */
if (nverified >= k)
nverified = k - 1;
if (endpts[k] != end)
{
/* haven't reached end yet, try another iteration if allowed */
if (k >= max_matches)
{
/* must try to lengthen some previous match */
k--;
goto backtrack;
}
/* advance; the next sub-match's bound starts at this one's endpoint */
k++;
limit = endpts[k - 1];
continue;
}
/*
 * We've identified a way to divide the string into k sub-matches
 * that works so far as the child DFA can tell. If k is an allowed
 * number of matches, start the slow part: recurse to verify each
 * sub-match. We always have k <= max_matches, needn't check that.
 */
if (k < min_matches)
goto backtrack;
MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));
/* only sub-matches beyond the already-verified prefix are rechecked */
for (i = nverified + 1; i <= k; i++)
{
/*
 * NOTE(review): zapmem() presumably clears the child subtree's saved
 * retry/match state so each sub-match is dissected from a clean slate
 * -- confirm against zapmem()'s definition.
 */
zapmem(v, t->left);
er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;
continue;
}
/* REG_NOMATCH leaves the loop with i <= k, which triggers backtracking */
if (er == REG_NOMATCH)
break;
/* oops, something failed */
freedfa(d);
FREE(endpts);
return er;
}
/* the loop ran to completion only if every sub-match verified */
if (i > k)
{
/* satisfaction */
MDEBUG(("%d successful\n", t->retry));
freedfa(d);
FREE(endpts);
return REG_OKAY;
}
/* match failed to verify, so backtrack */
backtrack:
/*
 * Must consider longer versions of the current sub-match.
 *
 * Jumps that found no endpoint or hit max_matches arrive here with k
 * already decremented; the other jumps arrive with k unchanged.
 */
while (k > 0)
{
if (endpts[k] < end)
{
/* ask for an endpoint at least one position later than before */
limit = endpts[k] + 1;
/* break out of backtrack loop, continue the outer one */
break;
}
/* can't lengthen k'th sub-match any more, consider previous one */
k--;
}
}
/* all possibilities exhausted: unlike reviterdissect(), a plain no-match */
MDEBUG(("%d failed\n", t->retry));
freedfa(d);
FREE(endpts);
return REG_NOMATCH;
}
#include "rege_dfa.c" #include "rege_dfa.c"
...@@ -372,10 +372,28 @@ struct cnfa ...@@ -372,10 +372,28 @@ struct cnfa
/* /*
* subexpression tree * subexpression tree
*
* "op" is one of:
* '=' plain regex without interesting substructure (implemented as DFA)
* 'b' back-reference (has no substructure either)
* '(' capture node: captures the match of its single child
* '.' concatenation: matches a match for left, then a match for right
* '|' alternation: matches a match for left or a match for right
* '*' iteration: matches some number of matches of its single child
*
* Note: the right child of an alternation must be another alternation or
* NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you
* might expect. This could stand to be changed. Actually I'd rather see
* a single alternation node with N children, but that will take revising
* the representation of struct subre.
*
* Note: when a backref is directly quantified, we stick the min/max counts
* into the backref rather than plastering an iteration node on top. This is
* for efficiency: there is no need to search for possible division points.
*/ */
struct subre struct subre
{ {
char op; /* '|', '.' (concat), 'b' (backref), '(', '=' */ char op; /* see type codes above */
char flags; char flags;
#define LONGER 01 /* prefers longer match */ #define LONGER 01 /* prefers longer match */
#define SHORTER 02 /* prefers shorter match */ #define SHORTER 02 /* prefers shorter match */
...@@ -393,8 +411,8 @@ struct subre ...@@ -393,8 +411,8 @@ struct subre
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
short retry; /* index into retry memory */ short retry; /* index into retry memory */
int subno; /* subexpression number (for 'b' and '(') */ int subno; /* subexpression number (for 'b' and '(') */
short min; /* min repetitions, for backref only */ short min; /* min repetitions for iteration or backref */
short max; /* max repetitions, for backref only */ short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */ struct subre *left; /* left child, if any (also freelist chain) */
struct subre *right; /* right child, if any */ struct subre *right; /* right child, if any */
struct state *begin; /* outarcs from here... */ struct state *begin; /* outarcs from here... */
......
...@@ -34,3 +34,40 @@ select 'b' ~ '^([bc])\1*$' as t; ...@@ -34,3 +34,40 @@ select 'b' ~ '^([bc])\1*$' as t;
t t
(1 row) (1 row)
-- Test quantified backref within a larger expression
select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
t
---
t
(1 row)
select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
t
---
t
(1 row)
select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
f
---
f
(1 row)
...@@ -11,3 +11,11 @@ select 'ccc' ~ '^([bc])\1*$' as t; ...@@ -11,3 +11,11 @@ select 'ccc' ~ '^([bc])\1*$' as t;
select 'xxx' ~ '^([bc])\1*$' as f; select 'xxx' ~ '^([bc])\1*$' as f;
select 'bbc' ~ '^([bc])\1*$' as f; select 'bbc' ~ '^([bc])\1*$' as f;
select 'b' ~ '^([bc])\1*$' as t; select 'b' ~ '^([bc])\1*$' as t;
-- Test quantified backref within a larger expression
select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment