Commit 58104308 authored by Tom Lane

Convert regex engine's subre tree from binary to N-ary style.

Instead of having left and right child links in subre structs,
have a single child link plus a sibling link.  Multiple children
of a tree node are now reached by chasing the sibling chain.

The beneficiary of this is alternation tree nodes.  A regular
expression with N (>1) branches is now represented by one alternation
node with N children, rather than a tree that includes N alternation
nodes as well as N children.  While the old representation didn't
really cost anything extra at execution time, it was pretty horrid
for compilation purposes, because each of the alternation nodes had
its own NFA, which we were too stupid not to separately optimize.
(To make matters worse, all of those NFAs described the entire
alternation pattern, not just the portion of it that one might
expect from the tree structure.)

We continue to require concatenation nodes to have exactly two
children.  This data structure is now prepared to support more,
but the executor's logic would need some careful redesign, and
it's not clear that a lot of benefit could be had.

This is part of a patch series that in total reduces the regex engine's
runtime by about a factor of four on a large corpus of real-world regexes.

Patch by me, reviewed by Joel Jacobson

Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
parent cebc1d34
...@@ -129,9 +129,9 @@ If not, we can reject the match immediately without iterating through many ...@@ -129,9 +129,9 @@ If not, we can reject the match immediately without iterating through many
possibilities. possibilities.
As an example, consider the regex "(a[bc]+)\1". The compiled As an example, consider the regex "(a[bc]+)\1". The compiled
representation will have a top-level concatenation subre node. Its left representation will have a top-level concatenation subre node. Its first
child is a capture node, and the child of that is a plain DFA node for child is a capture node, and the child of that is a plain DFA node for
"a[bc]+". The concatenation's right child is a backref node for \1. "a[bc]+". The concatenation's second child is a backref node for \1.
The DFA associated with the concatenation node will be "a[bc]+a[bc]+", The DFA associated with the concatenation node will be "a[bc]+a[bc]+",
where the backref has been replaced by a copy of the DFA for its referent where the backref has been replaced by a copy of the DFA for its referent
expression. When executed, the concatenation node will have to search for expression. When executed, the concatenation node will have to search for
......
...@@ -58,6 +58,7 @@ static void processlacon(struct vars *, struct state *, struct state *, int, ...@@ -58,6 +58,7 @@ static void processlacon(struct vars *, struct state *, struct state *, int,
struct state *, struct state *); struct state *, struct state *);
static struct subre *subre(struct vars *, int, int, struct state *, struct state *); static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
static void freesubre(struct vars *, struct subre *); static void freesubre(struct vars *, struct subre *);
static void freesubreandsiblings(struct vars *, struct subre *);
static void freesrnode(struct vars *, struct subre *); static void freesrnode(struct vars *, struct subre *);
static void optst(struct vars *, struct subre *); static void optst(struct vars *, struct subre *);
static int numst(struct subre *, int); static int numst(struct subre *, int);
...@@ -652,8 +653,8 @@ makesearch(struct vars *v, ...@@ -652,8 +653,8 @@ makesearch(struct vars *v,
* parse - parse an RE * parse - parse an RE
* *
* This is actually just the top level, which parses a bunch of branches * This is actually just the top level, which parses a bunch of branches
* tied together with '|'. They appear in the tree as the left children * tied together with '|'. If there's more than one, they appear in the
* of a chain of '|' subres. * tree as the children of a '|' subre.
*/ */
static struct subre * static struct subre *
parse(struct vars *v, parse(struct vars *v,
...@@ -662,41 +663,34 @@ parse(struct vars *v, ...@@ -662,41 +663,34 @@ parse(struct vars *v,
struct state *init, /* initial state */ struct state *init, /* initial state */
struct state *final) /* final state */ struct state *final) /* final state */
{ {
struct state *left; /* scaffolding for branch */
struct state *right;
struct subre *branches; /* top level */ struct subre *branches; /* top level */
struct subre *branch; /* current branch */ struct subre *lastbranch; /* latest branch */
struct subre *t; /* temporary */
int firstbranch; /* is this the first branch? */
assert(stopper == ')' || stopper == EOS); assert(stopper == ')' || stopper == EOS);
branches = subre(v, '|', LONGER, init, final); branches = subre(v, '|', LONGER, init, final);
NOERRN(); NOERRN();
branch = branches; lastbranch = NULL;
firstbranch = 1;
do do
{ /* a branch */ { /* a branch */
if (!firstbranch) struct subre *branch;
{ struct state *left; /* scaffolding for branch */
/* need a place to hang it */ struct state *right;
branch->right = subre(v, '|', LONGER, init, final);
NOERRN();
branch = branch->right;
}
firstbranch = 0;
left = newstate(v->nfa); left = newstate(v->nfa);
right = newstate(v->nfa); right = newstate(v->nfa);
NOERRN(); NOERRN();
EMPTYARC(init, left); EMPTYARC(init, left);
EMPTYARC(right, final); EMPTYARC(right, final);
NOERRN(); NOERRN();
branch->left = parsebranch(v, stopper, type, left, right, 0); branch = parsebranch(v, stopper, type, left, right, 0);
NOERRN(); NOERRN();
branch->flags |= UP(branch->flags | branch->left->flags); if (lastbranch)
if ((branch->flags & ~branches->flags) != 0) /* new flags */ lastbranch->sibling = branch;
for (t = branches; t != branch; t = t->right) else
t->flags |= branch->flags; branches->child = branch;
branches->flags |= UP(branches->flags | branch->flags);
lastbranch = branch;
} while (EAT('|')); } while (EAT('|'));
assert(SEE(stopper) || SEE(EOS)); assert(SEE(stopper) || SEE(EOS));
...@@ -707,20 +701,16 @@ parse(struct vars *v, ...@@ -707,20 +701,16 @@ parse(struct vars *v,
} }
/* optimize out simple cases */ /* optimize out simple cases */
if (branch == branches) if (lastbranch == branches->child)
{ /* only one branch */ { /* only one branch */
assert(branch->right == NULL); assert(lastbranch->sibling == NULL);
t = branch->left; freesrnode(v, branches);
branch->left = NULL; branches = lastbranch;
freesubre(v, branches);
branches = t;
} }
else if (!MESSY(branches->flags)) else if (!MESSY(branches->flags))
{ /* no interesting innards */ { /* no interesting innards */
freesubre(v, branches->left); freesubreandsiblings(v, branches->child);
branches->left = NULL; branches->child = NULL;
freesubre(v, branches->right);
branches->right = NULL;
branches->op = '='; branches->op = '=';
} }
...@@ -972,7 +962,7 @@ parseqatom(struct vars *v, ...@@ -972,7 +962,7 @@ parseqatom(struct vars *v,
t = subre(v, '(', atom->flags | CAP, lp, rp); t = subre(v, '(', atom->flags | CAP, lp, rp);
NOERR(); NOERR();
t->subno = subno; t->subno = subno;
t->left = atom; t->child = atom;
atom = t; atom = t;
} }
/* postpone everything else pending possible {0} */ /* postpone everything else pending possible {0} */
...@@ -1120,26 +1110,27 @@ parseqatom(struct vars *v, ...@@ -1120,26 +1110,27 @@ parseqatom(struct vars *v,
/* break remaining subRE into x{...} and what follows */ /* break remaining subRE into x{...} and what follows */
t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp); t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
NOERR(); NOERR();
t->left = atom; t->child = atom;
atomp = &t->left; atomp = &t->child;
/* /*
* Here we should recurse to fill t->right ... but we must postpone that * Here we should recurse to fill t->child->sibling ... but we must
* to the end. * postpone that to the end. One reason is that t->child may be replaced
* below, and we don't want to worry about its sibling link.
*/ */
/* /*
* Convert top node to a concatenation of the prefix (top->left, covering * Convert top node to a concatenation of the prefix (top->child, covering
* whatever we parsed previously) and remaining (t). Note that the prefix * whatever we parsed previously) and remaining (t). Note that the prefix
* could be empty, in which case this concatenation node is unnecessary. * could be empty, in which case this concatenation node is unnecessary.
* To keep things simple, we operate in a general way for now, and get rid * To keep things simple, we operate in a general way for now, and get rid
* of unnecessary subres below. * of unnecessary subres below.
*/ */
assert(top->op == '=' && top->left == NULL && top->right == NULL); assert(top->op == '=' && top->child == NULL);
top->left = subre(v, '=', top->flags, top->begin, lp); top->child = subre(v, '=', top->flags, top->begin, lp);
NOERR(); NOERR();
top->op = '.'; top->op = '.';
top->right = t; top->child->sibling = t;
/* top->flags will get updated later */ /* top->flags will get updated later */
/* if it's a backref, now is the time to replicate the subNFA */ /* if it's a backref, now is the time to replicate the subNFA */
...@@ -1201,9 +1192,9 @@ parseqatom(struct vars *v, ...@@ -1201,9 +1192,9 @@ parseqatom(struct vars *v,
f = COMBINE(qprefer, atom->flags); f = COMBINE(qprefer, atom->flags);
t = subre(v, '.', f, s, atom->end); /* prefix and atom */ t = subre(v, '.', f, s, atom->end); /* prefix and atom */
NOERR(); NOERR();
t->left = subre(v, '=', PREF(f), s, atom->begin); t->child = subre(v, '=', PREF(f), s, atom->begin);
NOERR(); NOERR();
t->right = atom; t->child->sibling = atom;
*atomp = t; *atomp = t;
/* rest of branch can be strung starting from atom->end */ /* rest of branch can be strung starting from atom->end */
s2 = atom->end; s2 = atom->end;
...@@ -1222,44 +1213,43 @@ parseqatom(struct vars *v, ...@@ -1222,44 +1213,43 @@ parseqatom(struct vars *v,
NOERR(); NOERR();
t->min = (short) m; t->min = (short) m;
t->max = (short) n; t->max = (short) n;
t->left = atom; t->child = atom;
*atomp = t; *atomp = t;
/* rest of branch is to be strung from iteration's end state */ /* rest of branch is to be strung from iteration's end state */
} }
/* and finally, look after that postponed recursion */ /* and finally, look after that postponed recursion */
t = top->right; t = top->child->sibling;
if (!(SEE('|') || SEE(stopper) || SEE(EOS))) if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
{ {
/* parse all the rest of the branch, and insert in t->right */ /* parse all the rest of the branch, and insert in t->child->sibling */
t->right = parsebranch(v, stopper, type, s2, rp, 1); t->child->sibling = parsebranch(v, stopper, type, s2, rp, 1);
NOERR(); NOERR();
assert(SEE('|') || SEE(stopper) || SEE(EOS)); assert(SEE('|') || SEE(stopper) || SEE(EOS));
/* here's the promised update of the flags */ /* here's the promised update of the flags */
t->flags |= COMBINE(t->flags, t->right->flags); t->flags |= COMBINE(t->flags, t->child->sibling->flags);
top->flags |= COMBINE(top->flags, t->flags); top->flags |= COMBINE(top->flags, t->flags);
/* /*
* At this point both top and t are concatenation (op == '.') subres, * At this point both top and t are concatenation (op == '.') subres,
* and we have top->left = prefix of branch, top->right = t, t->left = * and we have top->child = prefix of branch, top->child->sibling = t,
* messy atom (with quantification superstructure if needed), t->right * t->child = messy atom (with quantification superstructure if
* = rest of branch. * needed), t->child->sibling = rest of branch.
* *
* If the messy atom was the first thing in the branch, then top->left * If the messy atom was the first thing in the branch, then
* is vacuous and we can get rid of one level of concatenation. Since * top->child is vacuous and we can get rid of one level of
* the caller is holding a pointer to the top node, we can't remove * concatenation. Since the caller is holding a pointer to the top
* that node; but we're allowed to change its properties. * node, we can't remove that node; but we're allowed to change its
* properties.
*/ */
assert(top->left->op == '='); assert(top->child->op == '=');
if (top->left->begin == top->left->end) if (top->child->begin == top->child->end)
{ {
assert(!MESSY(top->left->flags)); assert(!MESSY(top->child->flags));
freesubre(v, top->left); freesubre(v, top->child);
top->left = t->left; top->child = t->child;
top->right = t->right; freesrnode(v, t);
t->left = t->right = NULL;
freesubre(v, t);
} }
} }
else else
...@@ -1269,34 +1259,31 @@ parseqatom(struct vars *v, ...@@ -1269,34 +1259,31 @@ parseqatom(struct vars *v,
* concatenation node 't'. Just link s2 straight to rp. * concatenation node 't'. Just link s2 straight to rp.
*/ */
EMPTYARC(s2, rp); EMPTYARC(s2, rp);
top->right = t->left; top->child->sibling = t->child;
top->flags |= COMBINE(top->flags, top->right->flags); top->flags |= COMBINE(top->flags, top->child->sibling->flags);
t->left = t->right = NULL; freesrnode(v, t);
freesubre(v, t);
/* /*
* Again, it could be that top->left is vacuous (if the messy atom was * Again, it could be that top->child is vacuous (if the messy atom
* in fact the only thing in the branch). In that case we need no * was in fact the only thing in the branch). In that case we need no
* concatenation at all; just replace top with top->right. * concatenation at all; just replace top with top->child->sibling.
*/ */
assert(top->left->op == '='); assert(top->child->op == '=');
if (top->left->begin == top->left->end) if (top->child->begin == top->child->end)
{ {
assert(!MESSY(top->left->flags)); assert(!MESSY(top->child->flags));
freesubre(v, top->left); t = top->child->sibling;
t = top->right; freesubre(v, top->child);
top->op = t->op; top->op = t->op;
top->flags = t->flags; top->flags = t->flags;
top->id = t->id; top->id = t->id;
top->subno = t->subno; top->subno = t->subno;
top->min = t->min; top->min = t->min;
top->max = t->max; top->max = t->max;
top->left = t->left; top->child = t->child;
top->right = t->right;
top->begin = t->begin; top->begin = t->begin;
top->end = t->end; top->end = t->end;
t->left = t->right = NULL; freesrnode(v, t);
freesubre(v, t);
} }
} }
} }
...@@ -1786,7 +1773,7 @@ subre(struct vars *v, ...@@ -1786,7 +1773,7 @@ subre(struct vars *v,
} }
if (ret != NULL) if (ret != NULL)
v->treefree = ret->left; v->treefree = ret->child;
else else
{ {
ret = (struct subre *) MALLOC(sizeof(struct subre)); ret = (struct subre *) MALLOC(sizeof(struct subre));
...@@ -1806,8 +1793,8 @@ subre(struct vars *v, ...@@ -1806,8 +1793,8 @@ subre(struct vars *v,
ret->id = 0; /* will be assigned later */ ret->id = 0; /* will be assigned later */
ret->subno = 0; ret->subno = 0;
ret->min = ret->max = 1; ret->min = ret->max = 1;
ret->left = NULL; ret->child = NULL;
ret->right = NULL; ret->sibling = NULL;
ret->begin = begin; ret->begin = begin;
ret->end = end; ret->end = end;
ZAPCNFA(ret->cnfa); ZAPCNFA(ret->cnfa);
...@@ -1817,6 +1804,9 @@ subre(struct vars *v, ...@@ -1817,6 +1804,9 @@ subre(struct vars *v,
/* /*
* freesubre - free a subRE subtree * freesubre - free a subRE subtree
*
* This frees child node(s) of the given subRE too,
* but not its siblings.
*/ */
static void static void
freesubre(struct vars *v, /* might be NULL */ freesubre(struct vars *v, /* might be NULL */
...@@ -1825,14 +1815,31 @@ freesubre(struct vars *v, /* might be NULL */ ...@@ -1825,14 +1815,31 @@ freesubre(struct vars *v, /* might be NULL */
if (sr == NULL) if (sr == NULL)
return; return;
if (sr->left != NULL) if (sr->child != NULL)
freesubre(v, sr->left); freesubreandsiblings(v, sr->child);
if (sr->right != NULL)
freesubre(v, sr->right);
freesrnode(v, sr); freesrnode(v, sr);
} }
/*
 * freesubreandsiblings - release a subRE node plus the rest of its sibling chain
 *
 * Starting at sr, every node reachable through the sibling links is handed
 * to freesubre(), which in turn disposes of that node's own children.
 * Passing sr == NULL is a no-op.
 */
static void
freesubreandsiblings(struct vars *v,	/* might be NULL */
					 struct subre *sr)
{
	struct subre *following;

	for (; sr != NULL; sr = following)
	{
		/* fetch the link before the node itself is released */
		following = sr->sibling;
		freesubre(v, sr);
	}
}
/* /*
* freesrnode - free one node in a subRE subtree * freesrnode - free one node in a subRE subtree
*/ */
...@@ -1850,7 +1857,7 @@ freesrnode(struct vars *v, /* might be NULL */ ...@@ -1850,7 +1857,7 @@ freesrnode(struct vars *v, /* might be NULL */
if (v != NULL && v->treechain != NULL) if (v != NULL && v->treechain != NULL)
{ {
/* we're still parsing, maybe we can reuse the subre */ /* we're still parsing, maybe we can reuse the subre */
sr->left = v->treefree; sr->child = v->treefree;
v->treefree = sr; v->treefree = sr;
} }
else else
...@@ -1881,15 +1888,14 @@ numst(struct subre *t, ...@@ -1881,15 +1888,14 @@ numst(struct subre *t,
int start) /* starting point for subtree numbers */ int start) /* starting point for subtree numbers */
{ {
int i; int i;
struct subre *t2;
assert(t != NULL); assert(t != NULL);
i = start; i = start;
t->id = (short) i++; t->id = (short) i++;
if (t->left != NULL) for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
i = numst(t->left, i); i = numst(t2, i);
if (t->right != NULL)
i = numst(t->right, i);
return i; return i;
} }
...@@ -1913,13 +1919,13 @@ numst(struct subre *t, ...@@ -1913,13 +1919,13 @@ numst(struct subre *t,
static void static void
markst(struct subre *t) markst(struct subre *t)
{ {
struct subre *t2;
assert(t != NULL); assert(t != NULL);
t->flags |= INUSE; t->flags |= INUSE;
if (t->left != NULL) for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
markst(t->left); markst(t2);
if (t->right != NULL)
markst(t->right);
} }
/* /*
...@@ -1949,12 +1955,12 @@ nfatree(struct vars *v, ...@@ -1949,12 +1955,12 @@ nfatree(struct vars *v,
struct subre *t, struct subre *t,
FILE *f) /* for debug output */ FILE *f) /* for debug output */
{ {
struct subre *t2;
assert(t != NULL && t->begin != NULL); assert(t != NULL && t->begin != NULL);
if (t->left != NULL) for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
(DISCARD) nfatree(v, t->left, f); (DISCARD) nfatree(v, t2, f);
if (t->right != NULL)
(DISCARD) nfatree(v, t->right, f);
return nfanode(v, t, 0, f); return nfanode(v, t, 0, f);
} }
...@@ -2206,6 +2212,7 @@ stdump(struct subre *t, ...@@ -2206,6 +2212,7 @@ stdump(struct subre *t,
int nfapresent) /* is the original NFA still around? */ int nfapresent) /* is the original NFA still around? */
{ {
char idbuf[50]; char idbuf[50];
struct subre *t2;
fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op); fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
if (t->flags & LONGER) if (t->flags & LONGER)
...@@ -2231,20 +2238,21 @@ stdump(struct subre *t, ...@@ -2231,20 +2238,21 @@ stdump(struct subre *t,
} }
if (nfapresent) if (nfapresent)
fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no); fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
if (t->left != NULL) if (t->child != NULL)
fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf))); fprintf(f, " C:%s", stid(t->child, idbuf, sizeof(idbuf)));
if (t->right != NULL) /* printing second child isn't necessary, but it is often helpful */
fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf))); if (t->child != NULL && t->child->sibling != NULL)
fprintf(f, " C2:%s", stid(t->child->sibling, idbuf, sizeof(idbuf)));
if (t->sibling != NULL)
fprintf(f, " S:%s", stid(t->sibling, idbuf, sizeof(idbuf)));
if (!NULLCNFA(t->cnfa)) if (!NULLCNFA(t->cnfa))
{ {
fprintf(f, "\n"); fprintf(f, "\n");
dumpcnfa(&t->cnfa, f); dumpcnfa(&t->cnfa, f);
} }
fprintf(f, "\n"); fprintf(f, "\n");
if (t->left != NULL) for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
stdump(t->left, f, nfapresent); stdump(t2, f, nfapresent);
if (t->right != NULL)
stdump(t->right, f, nfapresent);
} }
/* /*
......
...@@ -640,6 +640,8 @@ static void ...@@ -640,6 +640,8 @@ static void
zaptreesubs(struct vars *v, zaptreesubs(struct vars *v,
struct subre *t) struct subre *t)
{ {
struct subre *t2;
if (t->op == '(') if (t->op == '(')
{ {
int n = t->subno; int n = t->subno;
...@@ -652,10 +654,8 @@ zaptreesubs(struct vars *v, ...@@ -652,10 +654,8 @@ zaptreesubs(struct vars *v,
} }
} }
if (t->left != NULL) for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
zaptreesubs(v, t->left); zaptreesubs(v, t2);
if (t->right != NULL)
zaptreesubs(v, t->right);
} }
/* /*
...@@ -714,35 +714,35 @@ cdissect(struct vars *v, ...@@ -714,35 +714,35 @@ cdissect(struct vars *v,
switch (t->op) switch (t->op)
{ {
case '=': /* terminal node */ case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL); assert(t->child == NULL);
er = REG_OKAY; /* no action, parent did the work */ er = REG_OKAY; /* no action, parent did the work */
break; break;
case 'b': /* back reference */ case 'b': /* back reference */
assert(t->left == NULL && t->right == NULL); assert(t->child == NULL);
er = cbrdissect(v, t, begin, end); er = cbrdissect(v, t, begin, end);
break; break;
case '.': /* concatenation */ case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL); assert(t->child != NULL);
if (t->left->flags & SHORTER) /* reverse scan */ if (t->child->flags & SHORTER) /* reverse scan */
er = crevcondissect(v, t, begin, end); er = crevcondissect(v, t, begin, end);
else else
er = ccondissect(v, t, begin, end); er = ccondissect(v, t, begin, end);
break; break;
case '|': /* alternation */ case '|': /* alternation */
assert(t->left != NULL); assert(t->child != NULL);
er = caltdissect(v, t, begin, end); er = caltdissect(v, t, begin, end);
break; break;
case '*': /* iteration */ case '*': /* iteration */
assert(t->left != NULL); assert(t->child != NULL);
if (t->left->flags & SHORTER) /* reverse scan */ if (t->child->flags & SHORTER) /* reverse scan */
er = creviterdissect(v, t, begin, end); er = creviterdissect(v, t, begin, end);
else else
er = citerdissect(v, t, begin, end); er = citerdissect(v, t, begin, end);
break; break;
case '(': /* capturing */ case '(': /* capturing */
assert(t->left != NULL && t->right == NULL); assert(t->child != NULL);
assert(t->subno > 0); assert(t->subno > 0);
er = cdissect(v, t->left, begin, end); er = cdissect(v, t->child, begin, end);
if (er == REG_OKAY) if (er == REG_OKAY)
subset(v, t, begin, end); subset(v, t, begin, end);
break; break;
...@@ -770,19 +770,22 @@ ccondissect(struct vars *v, ...@@ -770,19 +770,22 @@ ccondissect(struct vars *v,
chr *begin, /* beginning of relevant substring */ chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */ chr *end) /* end of same */
{ {
struct subre *left = t->child;
struct subre *right = left->sibling;
struct dfa *d; struct dfa *d;
struct dfa *d2; struct dfa *d2;
chr *mid; chr *mid;
int er; int er;
assert(t->op == '.'); assert(t->op == '.');
assert(t->left != NULL && t->left->cnfa.nstates > 0); assert(left != NULL && left->cnfa.nstates > 0);
assert(t->right != NULL && t->right->cnfa.nstates > 0); assert(right != NULL && right->cnfa.nstates > 0);
assert(!(t->left->flags & SHORTER)); assert(right->sibling == NULL);
assert(!(left->flags & SHORTER));
d = getsubdfa(v, t->left); d = getsubdfa(v, left);
NOERR(); NOERR();
d2 = getsubdfa(v, t->right); d2 = getsubdfa(v, right);
NOERR(); NOERR();
MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
...@@ -799,10 +802,10 @@ ccondissect(struct vars *v, ...@@ -799,10 +802,10 @@ ccondissect(struct vars *v,
/* try this midpoint on for size */ /* try this midpoint on for size */
if (longest(v, d2, mid, end, (int *) NULL) == end) if (longest(v, d2, mid, end, (int *) NULL) == end)
{ {
er = cdissect(v, t->left, begin, mid); er = cdissect(v, left, begin, mid);
if (er == REG_OKAY) if (er == REG_OKAY)
{ {
er = cdissect(v, t->right, mid, end); er = cdissect(v, right, mid, end);
if (er == REG_OKAY) if (er == REG_OKAY)
{ {
/* satisfaction */ /* satisfaction */
...@@ -831,8 +834,8 @@ ccondissect(struct vars *v, ...@@ -831,8 +834,8 @@ ccondissect(struct vars *v,
return REG_NOMATCH; return REG_NOMATCH;
} }
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
zaptreesubs(v, t->left); zaptreesubs(v, left);
zaptreesubs(v, t->right); zaptreesubs(v, right);
} }
/* can't get here */ /* can't get here */
...@@ -848,19 +851,22 @@ crevcondissect(struct vars *v, ...@@ -848,19 +851,22 @@ crevcondissect(struct vars *v,
chr *begin, /* beginning of relevant substring */ chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */ chr *end) /* end of same */
{ {
struct subre *left = t->child;
struct subre *right = left->sibling;
struct dfa *d; struct dfa *d;
struct dfa *d2; struct dfa *d2;
chr *mid; chr *mid;
int er; int er;
assert(t->op == '.'); assert(t->op == '.');
assert(t->left != NULL && t->left->cnfa.nstates > 0); assert(left != NULL && left->cnfa.nstates > 0);
assert(t->right != NULL && t->right->cnfa.nstates > 0); assert(right != NULL && right->cnfa.nstates > 0);
assert(t->left->flags & SHORTER); assert(right->sibling == NULL);
assert(left->flags & SHORTER);
d = getsubdfa(v, t->left); d = getsubdfa(v, left);
NOERR(); NOERR();
d2 = getsubdfa(v, t->right); d2 = getsubdfa(v, right);
NOERR(); NOERR();
MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
...@@ -877,10 +883,10 @@ crevcondissect(struct vars *v, ...@@ -877,10 +883,10 @@ crevcondissect(struct vars *v,
/* try this midpoint on for size */ /* try this midpoint on for size */
if (longest(v, d2, mid, end, (int *) NULL) == end) if (longest(v, d2, mid, end, (int *) NULL) == end)
{ {
er = cdissect(v, t->left, begin, mid); er = cdissect(v, left, begin, mid);
if (er == REG_OKAY) if (er == REG_OKAY)
{ {
er = cdissect(v, t->right, mid, end); er = cdissect(v, right, mid, end);
if (er == REG_OKAY) if (er == REG_OKAY)
{ {
/* satisfaction */ /* satisfaction */
...@@ -909,8 +915,8 @@ crevcondissect(struct vars *v, ...@@ -909,8 +915,8 @@ crevcondissect(struct vars *v,
return REG_NOMATCH; return REG_NOMATCH;
} }
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
zaptreesubs(v, t->left); zaptreesubs(v, left);
zaptreesubs(v, t->right); zaptreesubs(v, right);
} }
/* can't get here */ /* can't get here */
...@@ -1011,26 +1017,30 @@ caltdissect(struct vars *v, ...@@ -1011,26 +1017,30 @@ caltdissect(struct vars *v,
struct dfa *d; struct dfa *d;
int er; int er;
/* We loop, rather than tail-recurse, to handle a chain of alternatives */ assert(t->op == '|');
t = t->child;
/* there should be at least 2 alternatives */
assert(t != NULL && t->sibling != NULL);
while (t != NULL) while (t != NULL)
{ {
assert(t->op == '|'); assert(t->cnfa.nstates > 0);
assert(t->left != NULL && t->left->cnfa.nstates > 0);
MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
d = getsubdfa(v, t->left); d = getsubdfa(v, t);
NOERR(); NOERR();
if (longest(v, d, begin, end, (int *) NULL) == end) if (longest(v, d, begin, end, (int *) NULL) == end)
{ {
MDEBUG(("%d: caltdissect matched\n", t->id)); MDEBUG(("%d: caltdissect matched\n", t->id));
er = cdissect(v, t->left, begin, end); er = cdissect(v, t, begin, end);
if (er != REG_NOMATCH) if (er != REG_NOMATCH)
return er; return er;
} }
NOERR(); NOERR();
t = t->right; t = t->sibling;
} }
return REG_NOMATCH; return REG_NOMATCH;
...@@ -1056,8 +1066,8 @@ citerdissect(struct vars *v, ...@@ -1056,8 +1066,8 @@ citerdissect(struct vars *v,
int er; int er;
assert(t->op == '*'); assert(t->op == '*');
assert(t->left != NULL && t->left->cnfa.nstates > 0); assert(t->child != NULL && t->child->cnfa.nstates > 0);
assert(!(t->left->flags & SHORTER)); assert(!(t->child->flags & SHORTER));
assert(begin <= end); assert(begin <= end);
MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
...@@ -1094,7 +1104,7 @@ citerdissect(struct vars *v, ...@@ -1094,7 +1104,7 @@ citerdissect(struct vars *v,
return REG_ESPACE; return REG_ESPACE;
endpts[0] = begin; endpts[0] = begin;
d = getsubdfa(v, t->left); d = getsubdfa(v, t->child);
if (ISERR()) if (ISERR())
{ {
FREE(endpts); FREE(endpts);
...@@ -1172,8 +1182,8 @@ citerdissect(struct vars *v, ...@@ -1172,8 +1182,8 @@ citerdissect(struct vars *v,
for (i = nverified + 1; i <= k; i++) for (i = nverified + 1; i <= k; i++)
{ {
zaptreesubs(v, t->left); zaptreesubs(v, t->child);
er = cdissect(v, t->left, endpts[i - 1], endpts[i]); er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
if (er == REG_OKAY) if (er == REG_OKAY)
{ {
nverified = i; nverified = i;
...@@ -1258,8 +1268,8 @@ creviterdissect(struct vars *v, ...@@ -1258,8 +1268,8 @@ creviterdissect(struct vars *v,
int er; int er;
assert(t->op == '*'); assert(t->op == '*');
assert(t->left != NULL && t->left->cnfa.nstates > 0); assert(t->child != NULL && t->child->cnfa.nstates > 0);
assert(t->left->flags & SHORTER); assert(t->child->flags & SHORTER);
assert(begin <= end); assert(begin <= end);
MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end))); MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
...@@ -1299,7 +1309,7 @@ creviterdissect(struct vars *v, ...@@ -1299,7 +1309,7 @@ creviterdissect(struct vars *v,
return REG_ESPACE; return REG_ESPACE;
endpts[0] = begin; endpts[0] = begin;
d = getsubdfa(v, t->left); d = getsubdfa(v, t->child);
if (ISERR()) if (ISERR())
{ {
FREE(endpts); FREE(endpts);
...@@ -1383,8 +1393,8 @@ creviterdissect(struct vars *v, ...@@ -1383,8 +1393,8 @@ creviterdissect(struct vars *v,
for (i = nverified + 1; i <= k; i++) for (i = nverified + 1; i <= k; i++)
{ {
zaptreesubs(v, t->left); zaptreesubs(v, t->child);
er = cdissect(v, t->left, endpts[i - 1], endpts[i]); er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
if (er == REG_OKAY) if (er == REG_OKAY)
{ {
nverified = i; nverified = i;
......
...@@ -423,15 +423,17 @@ struct cnfa ...@@ -423,15 +423,17 @@ struct cnfa
* '=' plain regex without interesting substructure (implemented as DFA) * '=' plain regex without interesting substructure (implemented as DFA)
* 'b' back-reference (has no substructure either) * 'b' back-reference (has no substructure either)
* '(' capture node: captures the match of its single child * '(' capture node: captures the match of its single child
* '.' concatenation: matches a match for left, then a match for right * '.' concatenation: matches a match for first child, then second child
* '|' alternation: matches a match for left or a match for right * '|' alternation: matches a match for any of its children
* '*' iteration: matches some number of matches of its single child * '*' iteration: matches some number of matches of its single child
* *
* Note: the right child of an alternation must be another alternation or * An alternation node can have any number of children (but at least two),
* NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you * linked through their sibling fields.
* might expect. This could stand to be changed. Actually I'd rather see *
* a single alternation node with N children, but that will take revising * A concatenation node must have exactly two children. It might be useful
* the representation of struct subre. * to support more, but that would complicate the executor. Note that it is
* the first child's greediness that determines the node's preference for
* where to split a match.
* *
* Note: when a backref is directly quantified, we stick the min/max counts * Note: when a backref is directly quantified, we stick the min/max counts
* into the backref rather than plastering an iteration node on top. This is * into the backref rather than plastering an iteration node on top. This is
...@@ -460,8 +462,8 @@ struct subre ...@@ -460,8 +462,8 @@ struct subre
* LATYPE code for lookaround constraint */ * LATYPE code for lookaround constraint */
short min; /* min repetitions for iteration or backref */ short min; /* min repetitions for iteration or backref */
short max; /* max repetitions for iteration or backref */ short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */ struct subre *child; /* first child, if any (also freelist chain) */
struct subre *right; /* right child, if any */ struct subre *sibling; /* next child of same parent, if any */
struct state *begin; /* outarcs from here... */ struct state *begin; /* outarcs from here... */
struct state *end; /* ...ending in inarcs here */ struct state *end; /* ...ending in inarcs here */
struct cnfa cnfa; /* compacted NFA, if any */ struct cnfa cnfa; /* compacted NFA, if any */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment