Commit 58104308 authored by Tom Lane's avatar Tom Lane

Convert regex engine's subre tree from binary to N-ary style.

Instead of having left and right child links in subre structs,
have a single child link plus a sibling link.  Multiple children
of a tree node are now reached by chasing the sibling chain.

The beneficiary of this is alternation tree nodes.  A regular
expression with N (>1) branches is now represented by one alternation
node with N children, rather than a tree that includes N alternation
nodes as well as N children.  While the old representation didn't
really cost anything extra at execution time, it was pretty horrid
for compilation purposes, because each of the alternation nodes had
its own NFA, which we were too stupid not to separately optimize.
(To make matters worse, all of those NFAs described the entire
alternation pattern, not just the portion of it that one might
expect from the tree structure.)

We continue to require concatenation nodes to have exactly two
children.  This data structure is now prepared to support more,
but the executor's logic would need some careful redesign, and
it's not clear that a lot of benefit could be had.

This is part of a patch series that in total reduces the regex engine's
runtime by about a factor of four on a large corpus of real-world regexes.

Patch by me, reviewed by Joel Jacobson

Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
parent cebc1d34
......@@ -129,9 +129,9 @@ If not, we can reject the match immediately without iterating through many
possibilities.
As an example, consider the regex "(a[bc]+)\1". The compiled
representation will have a top-level concatenation subre node. Its left
representation will have a top-level concatenation subre node. Its first
child is a capture node, and the child of that is a plain DFA node for
"a[bc]+". The concatenation's right child is a backref node for \1.
"a[bc]+". The concatenation's second child is a backref node for \1.
The DFA associated with the concatenation node will be "a[bc]+a[bc]+",
where the backref has been replaced by a copy of the DFA for its referent
expression. When executed, the concatenation node will have to search for
......
This diff is collapsed.
......@@ -640,6 +640,8 @@ static void
zaptreesubs(struct vars *v,
struct subre *t)
{
struct subre *t2;
if (t->op == '(')
{
int n = t->subno;
......@@ -652,10 +654,8 @@ zaptreesubs(struct vars *v,
}
}
if (t->left != NULL)
zaptreesubs(v, t->left);
if (t->right != NULL)
zaptreesubs(v, t->right);
for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
zaptreesubs(v, t2);
}
/*
......@@ -714,35 +714,35 @@ cdissect(struct vars *v,
switch (t->op)
{
case '=': /* terminal node */
assert(t->left == NULL && t->right == NULL);
assert(t->child == NULL);
er = REG_OKAY; /* no action, parent did the work */
break;
case 'b': /* back reference */
assert(t->left == NULL && t->right == NULL);
assert(t->child == NULL);
er = cbrdissect(v, t, begin, end);
break;
case '.': /* concatenation */
assert(t->left != NULL && t->right != NULL);
if (t->left->flags & SHORTER) /* reverse scan */
assert(t->child != NULL);
if (t->child->flags & SHORTER) /* reverse scan */
er = crevcondissect(v, t, begin, end);
else
er = ccondissect(v, t, begin, end);
break;
case '|': /* alternation */
assert(t->left != NULL);
assert(t->child != NULL);
er = caltdissect(v, t, begin, end);
break;
case '*': /* iteration */
assert(t->left != NULL);
if (t->left->flags & SHORTER) /* reverse scan */
assert(t->child != NULL);
if (t->child->flags & SHORTER) /* reverse scan */
er = creviterdissect(v, t, begin, end);
else
er = citerdissect(v, t, begin, end);
break;
case '(': /* capturing */
assert(t->left != NULL && t->right == NULL);
assert(t->child != NULL);
assert(t->subno > 0);
er = cdissect(v, t->left, begin, end);
er = cdissect(v, t->child, begin, end);
if (er == REG_OKAY)
subset(v, t, begin, end);
break;
......@@ -770,19 +770,22 @@ ccondissect(struct vars *v,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
struct subre *left = t->child;
struct subre *right = left->sibling;
struct dfa *d;
struct dfa *d2;
chr *mid;
int er;
assert(t->op == '.');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(t->right != NULL && t->right->cnfa.nstates > 0);
assert(!(t->left->flags & SHORTER));
assert(left != NULL && left->cnfa.nstates > 0);
assert(right != NULL && right->cnfa.nstates > 0);
assert(right->sibling == NULL);
assert(!(left->flags & SHORTER));
d = getsubdfa(v, t->left);
d = getsubdfa(v, left);
NOERR();
d2 = getsubdfa(v, t->right);
d2 = getsubdfa(v, right);
NOERR();
MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
......@@ -799,10 +802,10 @@ ccondissect(struct vars *v,
/* try this midpoint on for size */
if (longest(v, d2, mid, end, (int *) NULL) == end)
{
er = cdissect(v, t->left, begin, mid);
er = cdissect(v, left, begin, mid);
if (er == REG_OKAY)
{
er = cdissect(v, t->right, mid, end);
er = cdissect(v, right, mid, end);
if (er == REG_OKAY)
{
/* satisfaction */
......@@ -831,8 +834,8 @@ ccondissect(struct vars *v,
return REG_NOMATCH;
}
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
zaptreesubs(v, t->left);
zaptreesubs(v, t->right);
zaptreesubs(v, left);
zaptreesubs(v, right);
}
/* can't get here */
......@@ -848,19 +851,22 @@ crevcondissect(struct vars *v,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
struct subre *left = t->child;
struct subre *right = left->sibling;
struct dfa *d;
struct dfa *d2;
chr *mid;
int er;
assert(t->op == '.');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(t->right != NULL && t->right->cnfa.nstates > 0);
assert(t->left->flags & SHORTER);
assert(left != NULL && left->cnfa.nstates > 0);
assert(right != NULL && right->cnfa.nstates > 0);
assert(right->sibling == NULL);
assert(left->flags & SHORTER);
d = getsubdfa(v, t->left);
d = getsubdfa(v, left);
NOERR();
d2 = getsubdfa(v, t->right);
d2 = getsubdfa(v, right);
NOERR();
MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
......@@ -877,10 +883,10 @@ crevcondissect(struct vars *v,
/* try this midpoint on for size */
if (longest(v, d2, mid, end, (int *) NULL) == end)
{
er = cdissect(v, t->left, begin, mid);
er = cdissect(v, left, begin, mid);
if (er == REG_OKAY)
{
er = cdissect(v, t->right, mid, end);
er = cdissect(v, right, mid, end);
if (er == REG_OKAY)
{
/* satisfaction */
......@@ -909,8 +915,8 @@ crevcondissect(struct vars *v,
return REG_NOMATCH;
}
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
zaptreesubs(v, t->left);
zaptreesubs(v, t->right);
zaptreesubs(v, left);
zaptreesubs(v, right);
}
/* can't get here */
......@@ -1011,26 +1017,30 @@ caltdissect(struct vars *v,
struct dfa *d;
int er;
/* We loop, rather than tail-recurse, to handle a chain of alternatives */
assert(t->op == '|');
t = t->child;
/* there should be at least 2 alternatives */
assert(t != NULL && t->sibling != NULL);
while (t != NULL)
{
assert(t->op == '|');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(t->cnfa.nstates > 0);
MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
d = getsubdfa(v, t->left);
d = getsubdfa(v, t);
NOERR();
if (longest(v, d, begin, end, (int *) NULL) == end)
{
MDEBUG(("%d: caltdissect matched\n", t->id));
er = cdissect(v, t->left, begin, end);
er = cdissect(v, t, begin, end);
if (er != REG_NOMATCH)
return er;
}
NOERR();
t = t->right;
t = t->sibling;
}
return REG_NOMATCH;
......@@ -1056,8 +1066,8 @@ citerdissect(struct vars *v,
int er;
assert(t->op == '*');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(!(t->left->flags & SHORTER));
assert(t->child != NULL && t->child->cnfa.nstates > 0);
assert(!(t->child->flags & SHORTER));
assert(begin <= end);
MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
......@@ -1094,7 +1104,7 @@ citerdissect(struct vars *v,
return REG_ESPACE;
endpts[0] = begin;
d = getsubdfa(v, t->left);
d = getsubdfa(v, t->child);
if (ISERR())
{
FREE(endpts);
......@@ -1172,8 +1182,8 @@ citerdissect(struct vars *v,
for (i = nverified + 1; i <= k; i++)
{
zaptreesubs(v, t->left);
er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
zaptreesubs(v, t->child);
er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;
......@@ -1258,8 +1268,8 @@ creviterdissect(struct vars *v,
int er;
assert(t->op == '*');
assert(t->left != NULL && t->left->cnfa.nstates > 0);
assert(t->left->flags & SHORTER);
assert(t->child != NULL && t->child->cnfa.nstates > 0);
assert(t->child->flags & SHORTER);
assert(begin <= end);
MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
......@@ -1299,7 +1309,7 @@ creviterdissect(struct vars *v,
return REG_ESPACE;
endpts[0] = begin;
d = getsubdfa(v, t->left);
d = getsubdfa(v, t->child);
if (ISERR())
{
FREE(endpts);
......@@ -1383,8 +1393,8 @@ creviterdissect(struct vars *v,
for (i = nverified + 1; i <= k; i++)
{
zaptreesubs(v, t->left);
er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
zaptreesubs(v, t->child);
er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;
......
......@@ -423,15 +423,17 @@ struct cnfa
* '=' plain regex without interesting substructure (implemented as DFA)
* 'b' back-reference (has no substructure either)
* '(' capture node: captures the match of its single child
* '.' concatenation: matches a match for left, then a match for right
* '|' alternation: matches a match for left or a match for right
* '.' concatenation: matches a match for first child, then second child
* '|' alternation: matches a match for any of its children
* '*' iteration: matches some number of matches of its single child
*
* Note: the right child of an alternation must be another alternation or
* NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you
* might expect. This could stand to be changed. Actually I'd rather see
* a single alternation node with N children, but that will take revising
* the representation of struct subre.
* An alternation node can have any number of children (but at least two),
* linked through their sibling fields.
*
* A concatenation node must have exactly two children. It might be useful
* to support more, but that would complicate the executor. Note that it is
* the first child's greediness that determines the node's preference for
* where to split a match.
*
* Note: when a backref is directly quantified, we stick the min/max counts
* into the backref rather than plastering an iteration node on top. This is
......@@ -460,8 +462,8 @@ struct subre
* LATYPE code for lookaround constraint */
short min; /* min repetitions for iteration or backref */
short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */
struct subre *right; /* right child, if any */
struct subre *child; /* first child, if any (also freelist chain) */
struct subre *sibling; /* next child of same parent, if any */
struct state *begin; /* outarcs from here... */
struct state *end; /* ...ending in inarcs here */
struct cnfa cnfa; /* compacted NFA, if any */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment