Commit 4dd78bf3 authored by Tom Lane's avatar Tom Lane

Merge dissect() into cdissect() to remove a pile of near-duplicate code.

The "uncomplicated" case isn't materially less complicated than the full
case, certainly not enough so to justify duplicating nearly 500 lines
of code.  The only extra work being done in the full path is zaptreesubs,
which is very cheap compared to everything else being done here, and
besides that I'm less than convinced that it's not needed in some cases
even without backrefs.
parent 58735947
......@@ -138,14 +138,9 @@ static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa
static void zapallsubs(regmatch_t *, size_t);
static void zaptreesubs(struct vars *, struct subre *);
static void subset(struct vars *, struct subre *, chr *, chr *);
static int dissect(struct vars *, struct subre *, chr *, chr *);
static int condissect(struct vars *, struct subre *, chr *, chr *);
static int altdissect(struct vars *, struct subre *, chr *, chr *);
static int iterdissect(struct vars *, struct subre *, chr *, chr *);
static int reviterdissect(struct vars *, struct subre *, chr *, chr *);
static int cdissect(struct vars *, struct subre *, chr *, chr *);
static int ccondissect(struct vars *, struct subre *, chr *, chr *);
static int crevdissect(struct vars *, struct subre *, chr *, chr *);
static int crevcondissect(struct vars *, struct subre *, chr *, chr *);
static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
static int caltdissect(struct vars *, struct subre *, chr *, chr *);
static int citerdissect(struct vars *, struct subre *, chr *, chr *);
......@@ -376,9 +371,9 @@ find(struct vars * v,
if (v->nmatch == 1) /* no need for submatches */
return REG_OKAY;
/* submatches */
/* find submatches */
zapallsubs(v->pmatch, v->nmatch);
return dissect(v, v->g->tree, begin, end);
return cdissect(v, v->g->tree, begin, end);
}
/*
......@@ -568,505 +563,19 @@ subset(struct vars * v,
}
/*
* dissect - determine subexpression matches (uncomplicated case)
*/
static int /* regexec return code */
dissect(struct vars * v,
struct subre * t,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
assert(t != NULL);
MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end)));
switch (t->op)
{
case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL);
return REG_OKAY; /* no action, parent did the work */
case 'b': /* back ref -- shouldn't be calling us! */
return REG_ASSERT;
case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL);
return condissect(v, t, begin, end);
case '|': /* alternation */
assert(t->left != NULL);
return altdissect(v, t, begin, end);
case '*': /* iteration */
assert(t->left != NULL);
return iterdissect(v, t, begin, end);
case '(': /* capturing */
assert(t->left != NULL && t->right == NULL);
assert(t->subno > 0);
subset(v, t, begin, end);
return dissect(v, t->left, begin, end);
default:
return REG_ASSERT;
}
}
/*
* condissect - determine concatenation subexpression matches (uncomplicated)
*/
static int /* regexec return code */
condissect(struct vars * v,
struct subre * t,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
struct dfa *d;
struct dfa *d2;
chr *mid;
int i;
int shorter = (t->left->flags & SHORTER) ? 1 : 0;
chr *stop = (shorter) ? end : begin;
assert(t->op == '.');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(t->right != NULL && t->right->cnfa.nstates > 0);
d = getsubdfa(v, t->left);
NOERR();
d2 = getsubdfa(v, t->right);
NOERR();
/* pick a tentative midpoint */
if (shorter)
mid = shortest(v, d, begin, begin, end, (chr **) NULL,
(int *) NULL);
else
mid = longest(v, d, begin, end, (int *) NULL);
if (mid == NULL)
return REG_ASSERT;
MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
/* iterate until satisfaction or failure */
while (longest(v, d2, mid, end, (int *) NULL) != end)
{
/* that midpoint didn't work, find a new one */
if (mid == stop)
{
/* all possibilities exhausted! */
MDEBUG(("no midpoint!\n"));
return REG_ASSERT;
}
if (shorter)
mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL,
(int *) NULL);
else
mid = longest(v, d, begin, mid - 1, (int *) NULL);
if (mid == NULL)
{
/* failed to find a new one! */
MDEBUG(("failed midpoint!\n"));
return REG_ASSERT;
}
MDEBUG(("new midpoint %ld\n", LOFF(mid)));
}
/* satisfaction */
MDEBUG(("successful\n"));
i = dissect(v, t->left, begin, mid);
if (i != REG_OKAY)
return i;
return dissect(v, t->right, mid, end);
}
/*
* altdissect - determine alternative subexpression matches (uncomplicated)
*/
static int /* regexec return code */
altdissect(struct vars * v,
struct subre * t,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
struct dfa *d;
int i;
assert(t != NULL);
assert(t->op == '|');
for (i = 0; t != NULL; t = t->right, i++)
{
MDEBUG(("trying %dth\n", i));
assert(t->left != NULL && t->left->cnfa.nstates > 0);
d = getsubdfa(v, t->left);
NOERR();
if (longest(v, d, begin, end, (int *) NULL) == end)
{
MDEBUG(("success\n"));
return dissect(v, t->left, begin, end);
}
}
return REG_ASSERT; /* none of them matched?!? */
}
/*
* iterdissect - iteration subexpression matches (uncomplicated)
*/
static int /* regexec return code */
iterdissect(struct vars * v,
struct subre * t,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
struct dfa *d;
chr **endpts;
chr *limit;
int min_matches;
size_t max_matches;
int nverified;
int k;
int i;
int er;
assert(t->op == '*');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(begin <= end);
if (t->left->flags & SHORTER) /* reverse scan */
return reviterdissect(v, t, begin, end);
/*
* If zero matches are allowed, and target string is empty, just declare
* victory. OTOH, if target string isn't empty, zero matches can't work
* so we pretend the min is 1.
*/
min_matches = t->min;
if (min_matches <= 0)
{
if (begin == end)
return REG_OKAY;
min_matches = 1;
}
/*
* We need workspace to track the endpoints of each sub-match. Normally
* we consider only nonzero-length sub-matches, so there can be at most
* end-begin of them. However, if min is larger than that, we will also
* consider zero-length sub-matches in order to find enough matches.
* cdissect - check backrefs and determine subexpression matches
*
* For convenience, endpts[0] contains the "begin" pointer and we store
* sub-match endpoints in endpts[1..max_matches].
*/
max_matches = end - begin;
if (max_matches > t->max && t->max != INFINITY)
max_matches = t->max;
if (max_matches < min_matches)
max_matches = min_matches;
endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
if (endpts == NULL)
return REG_ESPACE;
endpts[0] = begin;
d = getsubdfa(v, t->left);
if (ISERR())
{
FREE(endpts);
return v->err;
}
MDEBUG(("iter %d\n", t->id));
/*
* Our strategy is to first find a set of sub-match endpoints that are
* valid according to the child node's DFA, and then recursively dissect
* each sub-match to confirm validity. If any validity check fails,
* backtrack the last sub-match and try again. And, when we next try for
* a validity check, we need not recheck any successfully verified
* sub-matches that we didn't move the endpoints of. nverified remembers
* how many sub-matches are currently known okay.
*/
/* initialize to consider first sub-match */
nverified = 0;
k = 1;
limit = end;
/* iterate until satisfaction or failure */
while (k > 0)
{
/* try to find an endpoint for the k'th sub-match */
endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
if (endpts[k] == NULL)
{
/* no match possible, so see if we can shorten previous one */
k--;
goto backtrack;
}
MDEBUG(("%d: working endpoint %d: %ld\n",
t->id, k, LOFF(endpts[k])));
/* k'th sub-match can no longer be considered verified */
if (nverified >= k)
nverified = k - 1;
if (endpts[k] != end)
{
/* haven't reached end yet, try another iteration if allowed */
if (k >= max_matches)
{
/* must try to shorten some previous match */
k--;
goto backtrack;
}
/* reject zero-length match unless necessary to achieve min */
if (endpts[k] == endpts[k - 1] &&
(k >= min_matches || min_matches - k < end - endpts[k]))
goto backtrack;
k++;
limit = end;
continue;
}
/*
* We've identified a way to divide the string into k sub-matches
* that works so far as the child DFA can tell. If k is an allowed
* number of matches, start the slow part: recurse to verify each
* sub-match. We always have k <= max_matches, needn't check that.
*/
if (k < min_matches)
goto backtrack;
MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
for (i = nverified + 1; i <= k; i++)
{
er = dissect(v, t->left, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;
continue;
}
if (er == REG_NOMATCH)
break;
/* oops, something failed */
FREE(endpts);
return er;
}
if (i > k)
{
/* satisfaction */
MDEBUG(("%d successful\n", t->id));
FREE(endpts);
return REG_OKAY;
}
/* match failed to verify, so backtrack */
backtrack:
/*
* Must consider shorter versions of the current sub-match. However,
* we'll only ask for a zero-length match if necessary.
*/
while (k > 0)
{
chr *prev_end = endpts[k - 1];
if (endpts[k] > prev_end)
{
limit = endpts[k] - 1;
if (limit > prev_end ||
(k < min_matches && min_matches - k >= end - prev_end))
{
/* break out of backtrack loop, continue the outer one */
break;
}
}
/* can't shorten k'th sub-match any more, consider previous one */
k--;
}
}
/* all possibilities exhausted - shouldn't happen in uncomplicated mode */
MDEBUG(("%d failed\n", t->id));
FREE(endpts);
return REG_ASSERT;
}
/*
* reviterdissect - shortest-first iteration subexpression matches
*/
static int /* regexec return code */
reviterdissect(struct vars * v,
struct subre * t,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
struct dfa *d;
chr **endpts;
chr *limit;
int min_matches;
size_t max_matches;
int nverified;
int k;
int i;
int er;
assert(t->op == '*');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(t->left->flags & SHORTER);
assert(begin <= end);
/*
* If zero matches are allowed, and target string is empty, just declare
* victory. OTOH, if target string isn't empty, zero matches can't work
* so we pretend the min is 1.
*/
min_matches = t->min;
if (min_matches <= 0)
{
if (begin == end)
return REG_OKAY;
min_matches = 1;
}
/*
* We need workspace to track the endpoints of each sub-match. Normally
* we consider only nonzero-length sub-matches, so there can be at most
* end-begin of them. However, if min is larger than that, we will also
* consider zero-length sub-matches in order to find enough matches.
* cdissect recursively processes a subre tree to check matching of backrefs
* and/or identify submatch boundaries for capture nodes. The proposed match
* runs from "begin" to "end" (not including "end"), and we are basically
* "dissecting" it to see where the submatches are.
*
* For convenience, endpts[0] contains the "begin" pointer and we store
* sub-match endpoints in endpts[1..max_matches].
*/
max_matches = end - begin;
if (max_matches > t->max && t->max != INFINITY)
max_matches = t->max;
if (max_matches < min_matches)
max_matches = min_matches;
endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
if (endpts == NULL)
return REG_ESPACE;
endpts[0] = begin;
d = getsubdfa(v, t->left);
if (ISERR())
{
FREE(endpts);
return v->err;
}
MDEBUG(("reviter %d\n", t->id));
/*
* Our strategy is to first find a set of sub-match endpoints that are
* valid according to the child node's DFA, and then recursively dissect
* each sub-match to confirm validity. If any validity check fails,
* backtrack the last sub-match and try again. And, when we next try for
* a validity check, we need not recheck any successfully verified
* sub-matches that we didn't move the endpoints of. nverified remembers
* how many sub-matches are currently known okay.
*/
/* initialize to consider first sub-match */
nverified = 0;
k = 1;
limit = begin;
/* iterate until satisfaction or failure */
while (k > 0)
{
/* disallow zero-length match unless necessary to achieve min */
if (limit == endpts[k - 1] &&
limit != end &&
(k >= min_matches || min_matches - k < end - limit))
limit++;
/* try to find an endpoint for the k'th sub-match */
endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
(chr **) NULL, (int *) NULL);
if (endpts[k] == NULL)
{
/* no match possible, so see if we can lengthen previous one */
k--;
goto backtrack;
}
MDEBUG(("%d: working endpoint %d: %ld\n",
t->id, k, LOFF(endpts[k])));
/* k'th sub-match can no longer be considered verified */
if (nverified >= k)
nverified = k - 1;
if (endpts[k] != end)
{
/* haven't reached end yet, try another iteration if allowed */
if (k >= max_matches)
{
/* must try to lengthen some previous match */
k--;
goto backtrack;
}
k++;
limit = endpts[k - 1];
continue;
}
/*
* We've identified a way to divide the string into k sub-matches
* that works so far as the child DFA can tell. If k is an allowed
* number of matches, start the slow part: recurse to verify each
* sub-match. We always have k <= max_matches, needn't check that.
*/
if (k < min_matches)
goto backtrack;
MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
for (i = nverified + 1; i <= k; i++)
{
er = dissect(v, t->left, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;
continue;
}
if (er == REG_NOMATCH)
break;
/* oops, something failed */
FREE(endpts);
return er;
}
if (i > k)
{
/* satisfaction */
MDEBUG(("%d successful\n", t->id));
FREE(endpts);
return REG_OKAY;
}
/* match failed to verify, so backtrack */
backtrack:
/*
* Must consider longer versions of the current sub-match.
*/
while (k > 0)
{
if (endpts[k] < end)
{
limit = endpts[k] + 1;
/* break out of backtrack loop, continue the outer one */
break;
}
/* can't lengthen k'th sub-match any more, consider previous one */
k--;
}
}
/* all possibilities exhausted - shouldn't happen in uncomplicated mode */
MDEBUG(("%d failed\n", t->id));
FREE(endpts);
return REG_ASSERT;
}
/*
* cdissect - determine subexpression matches (with complications)
* Before calling any level of cdissect, the caller must have run the node's
* DFA and found that the proposed substring satisfies the DFA. (We make
* the caller do that because in concatenation and iteration nodes, it's
* much faster to check all the substrings against the child DFAs before we
* recurse.) Also, caller must have cleared subexpression match data via
* zaptreesubs (or zapallsubs at the top level).
*/
static int /* regexec return code */
cdissect(struct vars * v,
......@@ -1083,33 +592,54 @@ cdissect(struct vars * v,
{
case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL);
return REG_OKAY; /* no action, parent did the work */
er = REG_OKAY; /* no action, parent did the work */
break;
case 'b': /* back reference */
assert(t->left == NULL && t->right == NULL);
return cbrdissect(v, t, begin, end);
er = cbrdissect(v, t, begin, end);
break;
case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL);
return ccondissect(v, t, begin, end);
if (t->left->flags & SHORTER) /* reverse scan */
er = crevcondissect(v, t, begin, end);
else
er = ccondissect(v, t, begin, end);
break;
case '|': /* alternation */
assert(t->left != NULL);
return caltdissect(v, t, begin, end);
er = caltdissect(v, t, begin, end);
break;
case '*': /* iteration */
assert(t->left != NULL);
return citerdissect(v, t, begin, end);
if (t->left->flags & SHORTER) /* reverse scan */
er = creviterdissect(v, t, begin, end);
else
er = citerdissect(v, t, begin, end);
break;
case '(': /* capturing */
assert(t->left != NULL && t->right == NULL);
assert(t->subno > 0);
er = cdissect(v, t->left, begin, end);
if (er == REG_OKAY)
subset(v, t, begin, end);
return er;
break;
default:
return REG_ASSERT;
er = REG_ASSERT;
break;
}
/*
* We should never have a match failure unless backrefs lurk below;
* otherwise, either caller failed to check the DFA, or there's some
* inconsistency between the DFA and the node's innards.
*/
assert(er != REG_NOMATCH || (t->flags & BACKR));
return er;
}
/*
* ccondissect - concatenation subexpression matches (with complications)
* ccondissect - dissect match for concatenation node
*/
static int /* regexec return code */
ccondissect(struct vars * v,
......@@ -1125,9 +655,7 @@ ccondissect(struct vars * v,
assert(t->op == '.');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(t->right != NULL && t->right->cnfa.nstates > 0);
if (t->left->flags & SHORTER) /* reverse scan */
return crevdissect(v, t, begin, end);
assert(!(t->left->flags & SHORTER));
d = getsubdfa(v, t->left);
NOERR();
......@@ -1158,7 +686,7 @@ ccondissect(struct vars * v,
return REG_OKAY;
}
}
if (er != REG_OKAY && er != REG_NOMATCH)
if (er != REG_NOMATCH)
return er;
}
......@@ -1186,10 +714,10 @@ ccondissect(struct vars * v,
}
/*
* crevdissect - shortest-first concatenation subexpression matches
* crevcondissect - dissect match for concatenation node, shortest-first
*/
static int /* regexec return code */
crevdissect(struct vars * v,
crevcondissect(struct vars * v,
struct subre * t,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
......@@ -1204,12 +732,11 @@ crevdissect(struct vars * v,
assert(t->right != NULL && t->right->cnfa.nstates > 0);
assert(t->left->flags & SHORTER);
/* concatenation -- need to split the substring between parts */
d = getsubdfa(v, t->left);
NOERR();
d2 = getsubdfa(v, t->right);
NOERR();
MDEBUG(("crev %d\n", t->id));
MDEBUG(("crevcon %d\n", t->id));
/* pick a tentative midpoint */
mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
......@@ -1234,7 +761,7 @@ crevdissect(struct vars * v,
return REG_OKAY;
}
}
if (er != REG_OKAY && er != REG_NOMATCH)
if (er != REG_NOMATCH)
return er;
}
......@@ -1262,7 +789,7 @@ crevdissect(struct vars * v,
}
/*
* cbrdissect - determine backref subexpression matches
* cbrdissect - dissect match for backref node
*/
static int /* regexec return code */
cbrdissect(struct vars * v,
......@@ -1343,7 +870,7 @@ cbrdissect(struct vars * v,
}
/*
* caltdissect - determine alternative subexpression matches (w. complications)
* caltdissect - dissect match for alternation node
*/
static int /* regexec return code */
caltdissect(struct vars * v,
......@@ -1354,29 +881,32 @@ caltdissect(struct vars * v,
struct dfa *d;
int er;
if (t == NULL)
return REG_NOMATCH;
/* We loop, rather than tail-recurse, to handle a chain of alternatives */
while (t != NULL)
{
assert(t->op == '|');
assert(t->left != NULL);
assert(t->left != NULL && t->left->cnfa.nstates > 0);
MDEBUG(("calt n%d\n", t->id));
d = getsubdfa(v, t->left);
NOERR();
if (longest(v, d, begin, end, (int *) NULL) != end)
return caltdissect(v, t->right, begin, end);
if (longest(v, d, begin, end, (int *) NULL) == end)
{
MDEBUG(("calt matched\n"));
er = cdissect(v, t->left, begin, end);
if (er != REG_NOMATCH)
return er;
}
t = t->right;
}
return caltdissect(v, t->right, begin, end);
return REG_NOMATCH;
}
/*
* citerdissect - iteration subexpression matches (with complications)
* citerdissect - dissect match for iteration node
*/
static int /* regexec return code */
citerdissect(struct vars * v,
......@@ -1396,11 +926,9 @@ citerdissect(struct vars * v,
assert(t->op == '*');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(!(t->left->flags & SHORTER));
assert(begin <= end);
if (t->left->flags & SHORTER) /* reverse scan */
return creviterdissect(v, t, begin, end);
/*
* If zero matches are allowed, and target string is empty, just declare
* victory. OTOH, if target string isn't empty, zero matches can't work
......@@ -1562,7 +1090,7 @@ backtrack:
}
/*
* creviterdissect - shortest-first iteration subexpression matches
* creviterdissect - dissect match for iteration node, shortest-first
*/
static int /* regexec return code */
creviterdissect(struct vars * v,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment