Commit 173e29aa authored by Tom Lane's avatar Tom Lane

Fix the general case of quantified regex back-references.

Cases where a back-reference is part of a larger subexpression that
is quantified have never worked in Spencer's regex engine, because
he used a compile-time transformation that neglected the need to
check the back-reference match in iterations before the last one.
(That was okay for capturing parens, and we still do it if the
regex has *only* capturing parens ... but it's not okay for backrefs.)

To make this work properly, we have to add an "iteration" node type
to the regex engine's vocabulary of sub-regex nodes.  Since this is a
moderately large change with a fair risk of introducing new bugs of its
own, apply to HEAD only, even though it's a fix for a longstanding bug.
parent 0c9e5d5e
......@@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are
either plain regular expressions (which are executed as DFAs in the manner
described above) or back-references (which try to match the input to some
previous substring). Non-leaf nodes are capture nodes (which save the
location of the substring currently matching their child node) or
concatenation or alternation nodes. At execution time, the executor
recursively scans the tree. At concatenation or alternation nodes,
it considers each possible alternative way of matching the input string,
ie each place where the string could be split for a concatenation, or each
child node for an alternation. It tries the next alternative if the match
fails according to the child nodes. This is exactly the sort of
backtracking search done by a traditional NFA regex engine. If there are
many tree levels it can get very slow.
location of the substring currently matching their child node),
concatenation, alternation, or iteration nodes. At execution time, the
executor recursively scans the tree. At concatenation, alternation, or
iteration nodes, it considers each possible alternative way of matching the
input string, that is, each place where the string could be split for a
concatenation or iteration, or each child node for an alternation. It
tries the next alternative if the match fails according to the child nodes.
This is exactly the sort of backtracking search done by a traditional NFA
regex engine. If there are many tree levels it can get very slow.
But all is not lost: we can still be smarter than the average pure NFA
engine. To do this, each subre node has an associated DFA, which
......
......@@ -1036,11 +1036,17 @@ parseqatom(struct vars * v,
/*----------
* Prepare a general-purpose state skeleton.
*
* ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
* / /
* [lp] ----> [s2] ----bypass---------------------
* In the no-backrefs case, we want this:
*
* where bypass is an empty, and prefix is some repetitions of atom
* [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
*
* where prefix is some repetitions of atom. In the general case we need
*
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
*
* where the iterator wraps around [begin] ---atom---> [end]
*
* We make the s state here for both cases; s2 is made below if needed
*----------
*/
s = newstate(v->nfa); /* first, new endpoints for the atom */
......@@ -1051,11 +1057,9 @@ parseqatom(struct vars * v,
NOERR();
atom->begin = s;
atom->end = s2;
s = newstate(v->nfa); /* and spots for prefix and bypass */
s2 = newstate(v->nfa);
s = newstate(v->nfa); /* set up starting state */
NOERR();
EMPTYARC(lp, s);
EMPTYARC(lp, s2);
NOERR();
/* break remaining subRE into x{...} and what follows */
......@@ -1089,28 +1093,9 @@ parseqatom(struct vars * v,
}
/*
* It's quantifier time. If the atom is just a BACKREF, we'll let it deal
* with quantifiers internally. Otherwise, the first step is to turn
* x{0,...} into x{1,...}|empty
* It's quantifier time. If the atom is just a backref, we'll let it deal
* with quantifiers internally.
*/
if (m == 0 && atomtype != BACKREF)
{
EMPTYARC(s2, atom->end); /* the bypass */
assert(PREF(qprefer) != 0);
f = COMBINE(qprefer, atom->flags);
t = subre(v, '|', f, lp, atom->end);
NOERR();
t->left = atom;
t->right = subre(v, '|', PREF(f), s2, atom->end);
NOERR();
t->right->left = subre(v, '=', 0, s2, atom->end);
NOERR();
*atomp = t;
atomp = &t->left;
m = 1;
}
/* deal with the rest of the quantifier */
if (atomtype == BACKREF)
{
/* special case: backrefs have internal quantifiers */
......@@ -1120,17 +1105,25 @@ parseqatom(struct vars * v,
atom->min = (short) m;
atom->max = (short) n;
atom->flags |= COMBINE(qprefer, atom->flags);
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
}
else if (m == 1 && n == 1)
{
/* no/vacuous quantifier: done */
EMPTYARC(s, atom->begin); /* empty prefix */
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
}
else
else if (m > 0 && !(atom->flags & BACKR))
{
/*
* Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the
* second x
* If there are no backrefs involved, we can turn x{m,n} into
* x{m-1,n-1}x, with capturing parens in only the second x. This
* is valid because we only care about capturing matches from the
* final iteration of the quantifier. It's a win because we can
* implement the backref-free left side as a plain DFA node, since
* we don't really care where its submatches are.
*/
dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
assert(m >= 1 && m != INFINITY && n >= 1);
......@@ -1142,16 +1135,36 @@ parseqatom(struct vars * v,
NOERR();
t->right = atom;
*atomp = t;
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
}
else
{
/* general case: need an iteration node */
s2 = newstate(v->nfa);
NOERR();
moveouts(v->nfa, atom->end, s2);
NOERR();
dupnfa(v->nfa, atom->begin, atom->end, s, s2);
repeat(v, s, s2, m, n);
f = COMBINE(qprefer, atom->flags);
t = subre(v, '*', f, s, s2);
NOERR();
t->min = (short) m;
t->max = (short) n;
t->left = atom;
*atomp = t;
/* rest of branch is to be strung from iteration's end state */
}
/* and finally, look after that postponed recursion */
t = top->right;
if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
t->right = parsebranch(v, stopper, type, s2, rp, 1);
else
{
EMPTYARC(atom->end, rp);
t->right = subre(v, '=', 0, atom->end, rp);
EMPTYARC(s2, rp);
t->right = subre(v, '=', 0, s2, rp);
}
assert(SEE('|') || SEE(stopper) || SEE(EOS));
t->flags |= COMBINE(t->flags, t->right->flags);
......@@ -1214,6 +1227,9 @@ scannum(struct vars * v)
/*
* repeat - replicate subNFA for quantifiers
*
* The sub-NFA strung from lp to rp is modified to represent m to n
* repetitions of its initial contents.
*
* The duplication sequences used here are chosen carefully so that any
* pointers starting out pointing into the subexpression end up pointing into
* the last occurrence. (Note that it may not be strung between the same
......@@ -1229,7 +1245,7 @@ repeat(struct vars * v,
int n)
{
#define SOME 2
#define INF 3
#define INF 3
#define PAIR(x, y) ((x)*4 + (y))
#define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
const int rm = REDUCE(m);
......@@ -1603,7 +1619,7 @@ subre(struct vars * v,
v->treechain = ret;
}
assert(strchr("|.b(=", op) != NULL);
assert(strchr("=b|.*(", op) != NULL);
ret->op = op;
ret->flags = flags;
......
......@@ -140,11 +140,15 @@ static void subset(struct vars *, struct subre *, chr *, chr *);
static int dissect(struct vars *, struct subre *, chr *, chr *);
static int condissect(struct vars *, struct subre *, chr *, chr *);
static int altdissect(struct vars *, struct subre *, chr *, chr *);
static int iterdissect(struct vars *, struct subre *, chr *, chr *);
static int reviterdissect(struct vars *, struct subre *, chr *, chr *);
static int cdissect(struct vars *, struct subre *, chr *, chr *);
static int ccondissect(struct vars *, struct subre *, chr *, chr *);
static int crevdissect(struct vars *, struct subre *, chr *, chr *);
static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
static int caltdissect(struct vars *, struct subre *, chr *, chr *);
static int citerdissect(struct vars *, struct subre *, chr *, chr *);
static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
/* === rege_dfa.c === */
static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
......@@ -563,14 +567,17 @@ dissect(struct vars * v,
case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL);
return REG_OKAY; /* no action, parent did the work */
case '|': /* alternation */
assert(t->left != NULL);
return altdissect(v, t, begin, end);
case 'b': /* back ref -- shouldn't be calling us! */
return REG_ASSERT;
case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL);
return condissect(v, t, begin, end);
case '|': /* alternation */
assert(t->left != NULL);
return altdissect(v, t, begin, end);
case '*': /* iteration */
assert(t->left != NULL);
return iterdissect(v, t, begin, end);
case '(': /* capturing */
assert(t->left != NULL && t->right == NULL);
assert(t->subno > 0);
......@@ -696,6 +703,375 @@ altdissect(struct vars * v,
return REG_ASSERT; /* none of them matched?!? */
}
/*
 * iterdissect - iteration subexpression matches (uncomplicated)
 *
 * Divides the substring begin..end into consecutive sub-matches of the
 * iteration node's single child (t->left), with the number of sub-matches
 * constrained by t->min and t->max.  Candidate endpoints are proposed by
 * running the child's DFA with longest(), and each candidate piece is then
 * confirmed by calling dissect() on the child.  If a shorter-preferring
 * child is seen, the work is handed to reviterdissect() instead.
 *
 * Returns REG_OKAY when a verified division is found, REG_ESPACE if the
 * endpoint workspace cannot be allocated, any error recorded in v by
 * newdfa() or the DFA scan, any non-NOMATCH error returned by dissect(),
 * and REG_ASSERT if every possible division has been tried without success
 * (which is not expected to happen in uncomplicated mode).
 */
static int						/* regexec return code */
iterdissect(struct vars * v,
			struct subre * t,
			chr *begin,			/* beginning of relevant substring */
			chr *end)			/* end of same */
{
	struct dfa *d;
	chr		  **endpts;
	chr		   *limit;
	int			min_matches;
	size_t		max_matches;
	int			nverified;
	int			k;
	int			i;
	int			er;

	assert(t->op == '*');
	assert(t->left != NULL && t->left->cnfa.nstates > 0);
	assert(begin <= end);

	if (t->left->flags & SHORTER)		/* reverse scan */
		return reviterdissect(v, t, begin, end);

	/*
	 * If zero matches are allowed, and target string is empty, just declare
	 * victory.  OTOH, if target string isn't empty, zero matches can't work
	 * so we pretend the min is 1.
	 */
	min_matches = t->min;
	if (min_matches <= 0)
	{
		if (begin == end)
			return REG_OKAY;
		min_matches = 1;
	}

	/*
	 * We need workspace to track the endpoints of each sub-match.  Normally
	 * we consider only nonzero-length sub-matches, so there can be at most
	 * end-begin of them.  However, if min is larger than that, we will also
	 * consider zero-length sub-matches in order to find enough matches.
	 *
	 * For convenience, endpts[0] contains the "begin" pointer and we store
	 * sub-match endpoints in endpts[1..max_matches].
	 */
	max_matches = end - begin;
	if (max_matches > t->max && t->max != INFINITY)
		max_matches = t->max;
	if (max_matches < min_matches)
		max_matches = min_matches;
	endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
	if (endpts == NULL)
		return REG_ESPACE;
	endpts[0] = begin;

	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
	if (ISERR())
	{
		FREE(endpts);
		return v->err;
	}
	MDEBUG(("iter %d\n", t->retry));

	/*
	 * Our strategy is to first find a set of sub-match endpoints that are
	 * valid according to the child node's DFA, and then recursively dissect
	 * each sub-match to confirm validity.  If any validity check fails,
	 * backtrack the last sub-match and try again.  And, when we next try for
	 * a validity check, we need not recheck any successfully verified
	 * sub-matches that we didn't move the endpoints of.  nverified remembers
	 * how many sub-matches are currently known okay.
	 */

	/* initialize to consider first sub-match */
	nverified = 0;
	k = 1;
	limit = end;

	/* iterate until satisfaction or failure */
	while (k > 0)
	{
		/* try to find an endpoint for the k'th sub-match */
		endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
		if (ISERR())
		{
			/*
			 * The scan recorded an error in v.  Report it now rather than
			 * treating the result as an ordinary mismatch and continuing
			 * to search with a failed DFA.
			 */
			freedfa(d);
			FREE(endpts);
			return v->err;
		}
		if (endpts[k] == NULL)
		{
			/* no match possible, so see if we can shorten previous one */
			k--;
			goto backtrack;
		}
		MDEBUG(("%d: working endpoint %d: %ld\n",
				t->retry, k, LOFF(endpts[k])));

		/* k'th sub-match can no longer be considered verified */
		if (nverified >= k)
			nverified = k - 1;

		if (endpts[k] != end)
		{
			/* haven't reached end yet, try another iteration if allowed */
			if (k >= max_matches)
			{
				/* must try to shorten some previous match */
				k--;
				goto backtrack;
			}

			/* reject zero-length match unless necessary to achieve min */
			if (endpts[k] == endpts[k - 1] &&
				(k >= min_matches || min_matches - k < end - endpts[k]))
				goto backtrack;

			k++;
			limit = end;
			continue;
		}

		/*
		 * We've identified a way to divide the string into k sub-matches
		 * that works so far as the child DFA can tell.  If k is an allowed
		 * number of matches, start the slow part: recurse to verify each
		 * sub-match.  We always have k <= max_matches, needn't check that.
		 */
		if (k < min_matches)
			goto backtrack;

		MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));

		/* sub-matches 1..nverified are already confirmed; resume after them */
		for (i = nverified + 1; i <= k; i++)
		{
			er = dissect(v, t->left, endpts[i - 1], endpts[i]);
			if (er == REG_OKAY)
			{
				nverified = i;
				continue;
			}
			if (er == REG_NOMATCH)
				break;
			/* oops, something failed */
			freedfa(d);
			FREE(endpts);
			return er;
		}

		if (i > k)
		{
			/* satisfaction */
			MDEBUG(("%d successful\n", t->retry));
			freedfa(d);
			FREE(endpts);
			return REG_OKAY;
		}

		/* match failed to verify, so backtrack */

backtrack:

		/*
		 * Must consider shorter versions of the current sub-match.  However,
		 * we'll only ask for a zero-length match if necessary.
		 */
		while (k > 0)
		{
			chr		   *prev_end = endpts[k - 1];

			if (endpts[k] > prev_end)
			{
				/* next attempt for sub-match k must end at least one chr earlier */
				limit = endpts[k] - 1;
				if (limit > prev_end ||
					(k < min_matches && min_matches - k >= end - prev_end))
				{
					/* break out of backtrack loop, continue the outer one */
					break;
				}
			}
			/* can't shorten k'th sub-match any more, consider previous one */
			k--;
		}
	}

	/* all possibilities exhausted - shouldn't happen in uncomplicated mode */
	MDEBUG(("%d failed\n", t->retry));
	freedfa(d);
	FREE(endpts);
	return REG_ASSERT;
}
/*
 * reviterdissect - shortest-first iteration subexpression matches
 *
 * Counterpart of iterdissect() for an iteration node whose child (t->left)
 * carries the SHORTER flag.  The substring begin..end is divided into
 * consecutive sub-matches of the child, with the count constrained by
 * t->min and t->max.  Candidate endpoints come from shortest() on the
 * child's DFA, and backtracking lengthens a sub-match instead of shortening
 * it.  Each candidate piece is confirmed with dissect().
 *
 * Returns REG_OKAY when a verified division is found, REG_ESPACE if the
 * endpoint workspace cannot be allocated, any error recorded in v by
 * newdfa() or the DFA scan, any non-NOMATCH error returned by dissect(),
 * and REG_ASSERT if every possible division has been tried without success
 * (which is not expected to happen in uncomplicated mode).
 */
static int						/* regexec return code */
reviterdissect(struct vars * v,
			   struct subre * t,
			   chr *begin,		/* beginning of relevant substring */
			   chr *end)		/* end of same */
{
	struct dfa *d;
	chr		  **endpts;
	chr		   *limit;
	int			min_matches;
	size_t		max_matches;
	int			nverified;
	int			k;
	int			i;
	int			er;

	assert(t->op == '*');
	assert(t->left != NULL && t->left->cnfa.nstates > 0);
	assert(t->left->flags & SHORTER);
	assert(begin <= end);

	/*
	 * If zero matches are allowed, and target string is empty, just declare
	 * victory.  OTOH, if target string isn't empty, zero matches can't work
	 * so we pretend the min is 1.
	 */
	min_matches = t->min;
	if (min_matches <= 0)
	{
		if (begin == end)
			return REG_OKAY;
		min_matches = 1;
	}

	/*
	 * We need workspace to track the endpoints of each sub-match.  Normally
	 * we consider only nonzero-length sub-matches, so there can be at most
	 * end-begin of them.  However, if min is larger than that, we will also
	 * consider zero-length sub-matches in order to find enough matches.
	 *
	 * For convenience, endpts[0] contains the "begin" pointer and we store
	 * sub-match endpoints in endpts[1..max_matches].
	 */
	max_matches = end - begin;
	if (max_matches > t->max && t->max != INFINITY)
		max_matches = t->max;
	if (max_matches < min_matches)
		max_matches = min_matches;
	endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
	if (endpts == NULL)
		return REG_ESPACE;
	endpts[0] = begin;

	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
	if (ISERR())
	{
		FREE(endpts);
		return v->err;
	}
	MDEBUG(("reviter %d\n", t->retry));

	/*
	 * Our strategy is to first find a set of sub-match endpoints that are
	 * valid according to the child node's DFA, and then recursively dissect
	 * each sub-match to confirm validity.  If any validity check fails,
	 * backtrack the last sub-match and try again.  And, when we next try for
	 * a validity check, we need not recheck any successfully verified
	 * sub-matches that we didn't move the endpoints of.  nverified remembers
	 * how many sub-matches are currently known okay.
	 */

	/* initialize to consider first sub-match */
	nverified = 0;
	k = 1;
	limit = begin;

	/* iterate until satisfaction or failure */
	while (k > 0)
	{
		/* disallow zero-length match unless necessary to achieve min */
		if (limit == endpts[k - 1] &&
			limit != end &&
			(k >= min_matches || min_matches - k < end - limit))
			limit++;

		/*
		 * Try to find an endpoint for the k'th sub-match.  limit is passed
		 * as the earliest acceptable endpoint (NOTE(review): inferred from
		 * how limit is advanced here; confirm against shortest()).
		 */
		endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
							 (chr **) NULL, (int *) NULL);
		if (ISERR())
		{
			/*
			 * The scan recorded an error in v.  Report it now rather than
			 * treating the result as an ordinary mismatch and continuing
			 * to search with a failed DFA.
			 */
			freedfa(d);
			FREE(endpts);
			return v->err;
		}
		if (endpts[k] == NULL)
		{
			/* no match possible, so see if we can lengthen previous one */
			k--;
			goto backtrack;
		}
		MDEBUG(("%d: working endpoint %d: %ld\n",
				t->retry, k, LOFF(endpts[k])));

		/* k'th sub-match can no longer be considered verified */
		if (nverified >= k)
			nverified = k - 1;

		if (endpts[k] != end)
		{
			/* haven't reached end yet, try another iteration if allowed */
			if (k >= max_matches)
			{
				/* must try to lengthen some previous match */
				k--;
				goto backtrack;
			}

			k++;
			/* the new sub-match starts where the previous one ended */
			limit = endpts[k - 1];
			continue;
		}

		/*
		 * We've identified a way to divide the string into k sub-matches
		 * that works so far as the child DFA can tell.  If k is an allowed
		 * number of matches, start the slow part: recurse to verify each
		 * sub-match.  We always have k <= max_matches, needn't check that.
		 */
		if (k < min_matches)
			goto backtrack;

		MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));

		/* sub-matches 1..nverified are already confirmed; resume after them */
		for (i = nverified + 1; i <= k; i++)
		{
			er = dissect(v, t->left, endpts[i - 1], endpts[i]);
			if (er == REG_OKAY)
			{
				nverified = i;
				continue;
			}
			if (er == REG_NOMATCH)
				break;
			/* oops, something failed */
			freedfa(d);
			FREE(endpts);
			return er;
		}

		if (i > k)
		{
			/* satisfaction */
			MDEBUG(("%d successful\n", t->retry));
			freedfa(d);
			FREE(endpts);
			return REG_OKAY;
		}

		/* match failed to verify, so backtrack */

backtrack:

		/*
		 * Must consider longer versions of the current sub-match.
		 */
		while (k > 0)
		{
			if (endpts[k] < end)
			{
				/* next attempt for sub-match k must end at least one chr later */
				limit = endpts[k] + 1;
				/* break out of backtrack loop, continue the outer one */
				break;
			}
			/* can't lengthen k'th sub-match any more, consider previous one */
			k--;
		}
	}

	/* all possibilities exhausted - shouldn't happen in uncomplicated mode */
	MDEBUG(("%d failed\n", t->retry));
	freedfa(d);
	FREE(endpts);
	return REG_ASSERT;
}
/*
* cdissect - determine subexpression matches (with complications)
* The retry memory stores the offset of the trial midpoint from begin,
......@@ -717,15 +1093,18 @@ cdissect(struct vars * v,
case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL);
return REG_OKAY; /* no action, parent did the work */
case '|': /* alternation */
assert(t->left != NULL);
return caltdissect(v, t, begin, end);
case 'b': /* back reference */
assert(t->left == NULL && t->right == NULL);
return cbrdissect(v, t, begin, end);
case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL);
return ccondissect(v, t, begin, end);
case '|': /* alternation */
assert(t->left != NULL);
return caltdissect(v, t, begin, end);
case '*': /* iteration */
assert(t->left != NULL);
return citerdissect(v, t, begin, end);
case '(': /* capturing */
assert(t->left != NULL && t->right == NULL);
assert(t->subno > 0);
......@@ -847,7 +1226,7 @@ ccondissect(struct vars * v,
}
/*
* crevdissect - determine backref shortest-first subexpression matches
* crevdissect - shortest-first concatenation subexpression matches
* The retry memory stores the offset of the trial midpoint from begin,
* plus 1 so that 0 uniquely means "clean slate".
*/
......@@ -1088,6 +1467,377 @@ caltdissect(struct vars * v,
return caltdissect(v, t->right, begin, end);
}
/*
 * citerdissect - iteration subexpression matches (with complications)
 *
 * Divides the substring begin..end into consecutive sub-matches of the
 * iteration node's single child (t->left), with the number of sub-matches
 * constrained by t->min and t->max.  Candidate endpoints are proposed by
 * running the child's DFA with longest(), and each candidate piece is then
 * confirmed by calling cdissect() on the child, after zapmem() has been
 * applied to the child.  If a shorter-preferring child is seen, the work
 * is handed to creviterdissect() instead.
 *
 * Unlike the uncomplicated variant, failing to find any workable division
 * is an ordinary outcome here and is reported as REG_NOMATCH.
 *
 * Returns REG_OKAY when a verified division is found, REG_NOMATCH when all
 * possibilities are exhausted, REG_ESPACE if the endpoint workspace cannot
 * be allocated, any error recorded in v by newdfa() or the DFA scan, and
 * any other error code returned by cdissect().
 */
static int						/* regexec return code */
citerdissect(struct vars * v,
			 struct subre * t,
			 chr *begin,		/* beginning of relevant substring */
			 chr *end)			/* end of same */
{
	struct dfa *d;
	chr		  **endpts;
	chr		   *limit;
	int			min_matches;
	size_t		max_matches;
	int			nverified;
	int			k;
	int			i;
	int			er;

	assert(t->op == '*');
	assert(t->left != NULL && t->left->cnfa.nstates > 0);
	assert(begin <= end);

	if (t->left->flags & SHORTER)		/* reverse scan */
		return creviterdissect(v, t, begin, end);

	/*
	 * If zero matches are allowed, and target string is empty, just declare
	 * victory.  OTOH, if target string isn't empty, zero matches can't work
	 * so we pretend the min is 1.
	 */
	min_matches = t->min;
	if (min_matches <= 0)
	{
		if (begin == end)
			return REG_OKAY;
		min_matches = 1;
	}

	/*
	 * We need workspace to track the endpoints of each sub-match.  Normally
	 * we consider only nonzero-length sub-matches, so there can be at most
	 * end-begin of them.  However, if min is larger than that, we will also
	 * consider zero-length sub-matches in order to find enough matches.
	 *
	 * For convenience, endpts[0] contains the "begin" pointer and we store
	 * sub-match endpoints in endpts[1..max_matches].
	 */
	max_matches = end - begin;
	if (max_matches > t->max && t->max != INFINITY)
		max_matches = t->max;
	if (max_matches < min_matches)
		max_matches = min_matches;
	endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
	if (endpts == NULL)
		return REG_ESPACE;
	endpts[0] = begin;

	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
	if (ISERR())
	{
		FREE(endpts);
		return v->err;
	}
	MDEBUG(("citer %d\n", t->retry));

	/*
	 * Our strategy is to first find a set of sub-match endpoints that are
	 * valid according to the child node's DFA, and then recursively dissect
	 * each sub-match to confirm validity.  If any validity check fails,
	 * backtrack the last sub-match and try again.  And, when we next try for
	 * a validity check, we need not recheck any successfully verified
	 * sub-matches that we didn't move the endpoints of.  nverified remembers
	 * how many sub-matches are currently known okay.
	 */

	/* initialize to consider first sub-match */
	nverified = 0;
	k = 1;
	limit = end;

	/* iterate until satisfaction or failure */
	while (k > 0)
	{
		/* try to find an endpoint for the k'th sub-match */
		endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
		if (ISERR())
		{
			/*
			 * The scan recorded an error in v.  Report it now rather than
			 * treating the result as an ordinary mismatch, which could end
			 * with a misleading REG_NOMATCH.
			 */
			freedfa(d);
			FREE(endpts);
			return v->err;
		}
		if (endpts[k] == NULL)
		{
			/* no match possible, so see if we can shorten previous one */
			k--;
			goto backtrack;
		}
		MDEBUG(("%d: working endpoint %d: %ld\n",
				t->retry, k, LOFF(endpts[k])));

		/* k'th sub-match can no longer be considered verified */
		if (nverified >= k)
			nverified = k - 1;

		if (endpts[k] != end)
		{
			/* haven't reached end yet, try another iteration if allowed */
			if (k >= max_matches)
			{
				/* must try to shorten some previous match */
				k--;
				goto backtrack;
			}

			/* reject zero-length match unless necessary to achieve min */
			if (endpts[k] == endpts[k - 1] &&
				(k >= min_matches || min_matches - k < end - endpts[k]))
				goto backtrack;

			k++;
			limit = end;
			continue;
		}

		/*
		 * We've identified a way to divide the string into k sub-matches
		 * that works so far as the child DFA can tell.  If k is an allowed
		 * number of matches, start the slow part: recurse to verify each
		 * sub-match.  We always have k <= max_matches, needn't check that.
		 */
		if (k < min_matches)
			goto backtrack;

		MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));

		/* sub-matches 1..nverified are already confirmed; resume after them */
		for (i = nverified + 1; i <= k; i++)
		{
			/*
			 * NOTE(review): zapmem() presumably clears the child's retry
			 * memory so each sub-match is dissected from a clean slate;
			 * confirm against its definition.
			 */
			zapmem(v, t->left);
			er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
			if (er == REG_OKAY)
			{
				nverified = i;
				continue;
			}
			if (er == REG_NOMATCH)
				break;
			/* oops, something failed */
			freedfa(d);
			FREE(endpts);
			return er;
		}

		if (i > k)
		{
			/* satisfaction */
			MDEBUG(("%d successful\n", t->retry));
			freedfa(d);
			FREE(endpts);
			return REG_OKAY;
		}

		/* match failed to verify, so backtrack */

backtrack:

		/*
		 * Must consider shorter versions of the current sub-match.  However,
		 * we'll only ask for a zero-length match if necessary.
		 */
		while (k > 0)
		{
			chr		   *prev_end = endpts[k - 1];

			if (endpts[k] > prev_end)
			{
				/* next attempt for sub-match k must end at least one chr earlier */
				limit = endpts[k] - 1;
				if (limit > prev_end ||
					(k < min_matches && min_matches - k >= end - prev_end))
				{
					/* break out of backtrack loop, continue the outer one */
					break;
				}
			}
			/* can't shorten k'th sub-match any more, consider previous one */
			k--;
		}
	}

	/* all possibilities exhausted */
	MDEBUG(("%d failed\n", t->retry));
	freedfa(d);
	FREE(endpts);
	return REG_NOMATCH;
}
/*
 * creviterdissect - shortest-first iteration subexpression matches
 *
 * Counterpart of citerdissect() for an iteration node whose child (t->left)
 * carries the SHORTER flag.  The substring begin..end is divided into
 * consecutive sub-matches of the child, with the count constrained by
 * t->min and t->max.  Candidate endpoints come from shortest() on the
 * child's DFA, and backtracking lengthens a sub-match instead of shortening
 * it.  Each candidate piece is confirmed with cdissect(), after zapmem()
 * has been applied to the child.
 *
 * Returns REG_OKAY when a verified division is found, REG_NOMATCH when all
 * possibilities are exhausted, REG_ESPACE if the endpoint workspace cannot
 * be allocated, any error recorded in v by newdfa() or the DFA scan, and
 * any other error code returned by cdissect().
 */
static int						/* regexec return code */
creviterdissect(struct vars * v,
				struct subre * t,
				chr *begin,		/* beginning of relevant substring */
				chr *end)		/* end of same */
{
	struct dfa *d;
	chr		  **endpts;
	chr		   *limit;
	int			min_matches;
	size_t		max_matches;
	int			nverified;
	int			k;
	int			i;
	int			er;

	assert(t->op == '*');
	assert(t->left != NULL && t->left->cnfa.nstates > 0);
	assert(t->left->flags & SHORTER);
	assert(begin <= end);

	/*
	 * If zero matches are allowed, and target string is empty, just declare
	 * victory.  OTOH, if target string isn't empty, zero matches can't work
	 * so we pretend the min is 1.
	 */
	min_matches = t->min;
	if (min_matches <= 0)
	{
		if (begin == end)
			return REG_OKAY;
		min_matches = 1;
	}

	/*
	 * We need workspace to track the endpoints of each sub-match.  Normally
	 * we consider only nonzero-length sub-matches, so there can be at most
	 * end-begin of them.  However, if min is larger than that, we will also
	 * consider zero-length sub-matches in order to find enough matches.
	 *
	 * For convenience, endpts[0] contains the "begin" pointer and we store
	 * sub-match endpoints in endpts[1..max_matches].
	 */
	max_matches = end - begin;
	if (max_matches > t->max && t->max != INFINITY)
		max_matches = t->max;
	if (max_matches < min_matches)
		max_matches = min_matches;
	endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
	if (endpts == NULL)
		return REG_ESPACE;
	endpts[0] = begin;

	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
	if (ISERR())
	{
		FREE(endpts);
		return v->err;
	}
	MDEBUG(("creviter %d\n", t->retry));

	/*
	 * Our strategy is to first find a set of sub-match endpoints that are
	 * valid according to the child node's DFA, and then recursively dissect
	 * each sub-match to confirm validity.  If any validity check fails,
	 * backtrack the last sub-match and try again.  And, when we next try for
	 * a validity check, we need not recheck any successfully verified
	 * sub-matches that we didn't move the endpoints of.  nverified remembers
	 * how many sub-matches are currently known okay.
	 */

	/* initialize to consider first sub-match */
	nverified = 0;
	k = 1;
	limit = begin;

	/* iterate until satisfaction or failure */
	while (k > 0)
	{
		/* disallow zero-length match unless necessary to achieve min */
		if (limit == endpts[k - 1] &&
			limit != end &&
			(k >= min_matches || min_matches - k < end - limit))
			limit++;

		/*
		 * Try to find an endpoint for the k'th sub-match.  limit is passed
		 * as the earliest acceptable endpoint (NOTE(review): inferred from
		 * how limit is advanced here; confirm against shortest()).
		 */
		endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
							 (chr **) NULL, (int *) NULL);
		if (ISERR())
		{
			/*
			 * The scan recorded an error in v.  Report it now rather than
			 * treating the result as an ordinary mismatch, which could end
			 * with a misleading REG_NOMATCH.
			 */
			freedfa(d);
			FREE(endpts);
			return v->err;
		}
		if (endpts[k] == NULL)
		{
			/* no match possible, so see if we can lengthen previous one */
			k--;
			goto backtrack;
		}
		MDEBUG(("%d: working endpoint %d: %ld\n",
				t->retry, k, LOFF(endpts[k])));

		/* k'th sub-match can no longer be considered verified */
		if (nverified >= k)
			nverified = k - 1;

		if (endpts[k] != end)
		{
			/* haven't reached end yet, try another iteration if allowed */
			if (k >= max_matches)
			{
				/* must try to lengthen some previous match */
				k--;
				goto backtrack;
			}

			k++;
			/* the new sub-match starts where the previous one ended */
			limit = endpts[k - 1];
			continue;
		}

		/*
		 * We've identified a way to divide the string into k sub-matches
		 * that works so far as the child DFA can tell.  If k is an allowed
		 * number of matches, start the slow part: recurse to verify each
		 * sub-match.  We always have k <= max_matches, needn't check that.
		 */
		if (k < min_matches)
			goto backtrack;

		MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k));

		/* sub-matches 1..nverified are already confirmed; resume after them */
		for (i = nverified + 1; i <= k; i++)
		{
			/*
			 * NOTE(review): zapmem() presumably clears the child's retry
			 * memory so each sub-match is dissected from a clean slate;
			 * confirm against its definition.
			 */
			zapmem(v, t->left);
			er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
			if (er == REG_OKAY)
			{
				nverified = i;
				continue;
			}
			if (er == REG_NOMATCH)
				break;
			/* oops, something failed */
			freedfa(d);
			FREE(endpts);
			return er;
		}

		if (i > k)
		{
			/* satisfaction */
			MDEBUG(("%d successful\n", t->retry));
			freedfa(d);
			FREE(endpts);
			return REG_OKAY;
		}

		/* match failed to verify, so backtrack */

backtrack:

		/*
		 * Must consider longer versions of the current sub-match.
		 */
		while (k > 0)
		{
			if (endpts[k] < end)
			{
				/* next attempt for sub-match k must end at least one chr later */
				limit = endpts[k] + 1;
				/* break out of backtrack loop, continue the outer one */
				break;
			}
			/* can't lengthen k'th sub-match any more, consider previous one */
			k--;
		}
	}

	/* all possibilities exhausted */
	MDEBUG(("%d failed\n", t->retry));
	freedfa(d);
	FREE(endpts);
	return REG_NOMATCH;
}
#include "rege_dfa.c"
......@@ -372,10 +372,28 @@ struct cnfa
/*
* subexpression tree
*
* "op" is one of:
* '=' plain regex without interesting substructure (implemented as DFA)
* 'b' back-reference (has no substructure either)
* '(' capture node: captures the match of its single child
* '.' concatenation: matches a match for left, then a match for right
* '|' alternation: matches a match for left or a match for right
* '*' iteration: matches some number of matches of its single child
*
* Note: the right child of an alternation must be another alternation or
* NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you
* might expect. This could stand to be changed. Actually I'd rather see
* a single alternation node with N children, but that will take revising
* the representation of struct subre.
*
* Note: when a backref is directly quantified, we stick the min/max counts
* into the backref rather than plastering an iteration node on top. This is
* for efficiency: there is no need to search for possible division points.
*/
struct subre
{
char op; /* '|', '.' (concat), 'b' (backref), '(', '=' */
char op; /* see type codes above */
char flags;
#define LONGER 01 /* prefers longer match */
#define SHORTER 02 /* prefers shorter match */
......@@ -393,8 +411,8 @@ struct subre
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
short retry; /* index into retry memory */
int subno; /* subexpression number (for 'b' and '(') */
short min; /* min repetitions, for backref only */
short max; /* max repetitions, for backref only */
short min; /* min repetitions for iteration or backref */
short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */
struct subre *right; /* right child, if any */
struct state *begin; /* outarcs from here... */
......
......@@ -34,3 +34,40 @@ select 'b' ~ '^([bc])\1*$' as t;
t
(1 row)
-- Test quantified backref within a larger expression
select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
t
---
t
(1 row)
select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
t
---
t
(1 row)
select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
f
---
f
(1 row)
select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
f
---
f
(1 row)
......@@ -11,3 +11,11 @@ select 'ccc' ~ '^([bc])\1*$' as t;
select 'xxx' ~ '^([bc])\1*$' as f;
select 'bbc' ~ '^([bc])\1*$' as f;
select 'b' ~ '^([bc])\1*$' as t;
-- Test quantified backref within a larger expression
select 'abc abc abc' ~ '^(\w+)( \1)+$' as t;
select 'abc abd abc' ~ '^(\w+)( \1)+$' as f;
select 'abc abc abd' ~ '^(\w+)( \1)+$' as f;
select 'abc abc abc' ~ '^(.+)( \1)+$' as t;
select 'abc abd abc' ~ '^(.+)( \1)+$' as f;
select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment