Commit 5227d998 authored by Tom Lane's avatar Tom Lane

Rethink regexp engine's backref-related compilation state.

I had committer's remorse almost immediately after pushing cb76fbd7e,
upon finding that removing capturing subexpressions' subREs from the
data structure broke my proposed patch for REG_NOSUB optimization.
Revert that data structure change.  Instead, address the concern
about not changing capturing subREs' endpoints by not changing the
endpoints.  We don't need to, because the point of that bit was just
to ensure that the atom has endpoints distinct from the outer state
pair that we're stringing the branch between.  We already made
suitable states in the parenthesized-subexpression case, so the
additional ones were just useless overhead.  This seems more
understandable than Spencer's original coding, and it ought to be
a shade faster too by saving a few state creations and arc changes.
(I actually see a couple percent improvement on Jacobson's web
corpus, though that's barely above the noise floor so I wouldn't
put much stock in that result.)

Also, fix the logic added by ea1268f6 to ensure that the subRE
recorded in v->subs[subno] is exactly the one with capno == subno.
Spencer's original coding recorded the child subRE of the capture
node, which is okay so far as having the right endpoint states is
concerned, but as of cb76fbd7e the capturing subRE itself always
has those endpoints too.  I think the inconsistency is confusing
for the REG_NOSUB optimization.

As before, backpatch to v14.

Discussion: https://postgr.es/m/0203588E-E609-43AF-9F4F-902854231EE7@enterprisedb.com
parent 5e6ad63c
...@@ -233,13 +233,6 @@ static int cmp(const chr *, const chr *, size_t); ...@@ -233,13 +233,6 @@ static int cmp(const chr *, const chr *, size_t);
static int casecmp(const chr *, const chr *, size_t); static int casecmp(const chr *, const chr *, size_t);
/* info we need during compilation about a known capturing subexpression */
struct subinfo
{
struct state *left; /* left end of its sub-NFA */
struct state *right; /* right end of its sub-NFA */
};
/* internal variables, bundled for easy passing around */ /* internal variables, bundled for easy passing around */
struct vars struct vars
{ {
...@@ -252,10 +245,10 @@ struct vars ...@@ -252,10 +245,10 @@ struct vars
int nexttype; /* type of next token */ int nexttype; /* type of next token */
chr nextvalue; /* value (if any) of next token */ chr nextvalue; /* value (if any) of next token */
int lexcon; /* lexical context type (see regc_lex.c) */ int lexcon; /* lexical context type (see regc_lex.c) */
int nsubexp; /* number of known capturing subexpressions */ int nsubexp; /* subexpression count */
struct subinfo *subs; /* info about known capturing subexpressions */ struct subre **subs; /* subRE pointer vector */
size_t nsubs; /* allocated length of subs[] vector */ size_t nsubs; /* length of vector */
struct subinfo sub10[10]; /* initial vector, enough for most */ struct subre *sub10[10]; /* initial vector, enough for most */
struct nfa *nfa; /* the NFA */ struct nfa *nfa; /* the NFA */
struct colormap *cm; /* character color map */ struct colormap *cm; /* character color map */
color nlcolor; /* color of newline */ color nlcolor; /* color of newline */
...@@ -375,7 +368,7 @@ pg_regcomp(regex_t *re, ...@@ -375,7 +368,7 @@ pg_regcomp(regex_t *re,
v->subs = v->sub10; v->subs = v->sub10;
v->nsubs = 10; v->nsubs = 10;
for (j = 0; j < v->nsubs; j++) for (j = 0; j < v->nsubs; j++)
v->subs[j].left = v->subs[j].right = NULL; v->subs[j] = NULL;
v->nfa = NULL; v->nfa = NULL;
v->cm = NULL; v->cm = NULL;
v->nlcolor = COLORLESS; v->nlcolor = COLORLESS;
...@@ -511,13 +504,13 @@ pg_regcomp(regex_t *re, ...@@ -511,13 +504,13 @@ pg_regcomp(regex_t *re,
} }
/* /*
* moresubs - enlarge capturing-subexpressions vector * moresubs - enlarge subRE vector
*/ */
static void static void
moresubs(struct vars *v, moresubs(struct vars *v,
int wanted) /* want enough room for this one */ int wanted) /* want enough room for this one */
{ {
struct subinfo *p; struct subre **p;
size_t n; size_t n;
assert(wanted > 0 && (size_t) wanted >= v->nsubs); assert(wanted > 0 && (size_t) wanted >= v->nsubs);
...@@ -525,13 +518,13 @@ moresubs(struct vars *v, ...@@ -525,13 +518,13 @@ moresubs(struct vars *v,
if (v->subs == v->sub10) if (v->subs == v->sub10)
{ {
p = (struct subinfo *) MALLOC(n * sizeof(struct subinfo)); p = (struct subre **) MALLOC(n * sizeof(struct subre *));
if (p != NULL) if (p != NULL)
memcpy(VS(p), VS(v->subs), memcpy(VS(p), VS(v->subs),
v->nsubs * sizeof(struct subinfo)); v->nsubs * sizeof(struct subre *));
} }
else else
p = (struct subinfo *) REALLOC(v->subs, n * sizeof(struct subinfo)); p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
if (p == NULL) if (p == NULL)
{ {
ERR(REG_ESPACE); ERR(REG_ESPACE);
...@@ -539,7 +532,7 @@ moresubs(struct vars *v, ...@@ -539,7 +532,7 @@ moresubs(struct vars *v,
} }
v->subs = p; v->subs = p;
for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++) for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
p->left = p->right = NULL; *p = NULL;
assert(v->nsubs == n); assert(v->nsubs == n);
assert((size_t) wanted < v->nsubs); assert((size_t) wanted < v->nsubs);
} }
...@@ -988,6 +981,7 @@ parseqatom(struct vars *v, ...@@ -988,6 +981,7 @@ parseqatom(struct vars *v,
s = newstate(v->nfa); s = newstate(v->nfa);
s2 = newstate(v->nfa); s2 = newstate(v->nfa);
NOERRN(); NOERRN();
/* We may not need these arcs, but keep things connected for now */
EMPTYARC(lp, s); EMPTYARC(lp, s);
EMPTYARC(s2, rp); EMPTYARC(s2, rp);
NOERRN(); NOERRN();
...@@ -997,10 +991,6 @@ parseqatom(struct vars *v, ...@@ -997,10 +991,6 @@ parseqatom(struct vars *v,
NOERRN(); NOERRN();
if (cap) if (cap)
{ {
/* save the sub-NFA's endpoints for future backrefs to use */
assert(v->subs[subno].left == NULL);
v->subs[subno].left = s;
v->subs[subno].right = s2;
if (atom->capno == 0) if (atom->capno == 0)
{ {
/* normal case: just mark the atom as capturing */ /* normal case: just mark the atom as capturing */
...@@ -1016,13 +1006,15 @@ parseqatom(struct vars *v, ...@@ -1016,13 +1006,15 @@ parseqatom(struct vars *v,
t->child = atom; t->child = atom;
atom = t; atom = t;
} }
assert(v->subs[subno] == NULL);
v->subs[subno] = atom;
} }
/* postpone everything else pending possible {0} */ /* postpone everything else pending possible {0} */
break; break;
case BACKREF: /* the Feature From The Black Lagoon */ case BACKREF: /* the Feature From The Black Lagoon */
INSIST(type != LACON, REG_ESUBREG); INSIST(type != LACON, REG_ESUBREG);
INSIST(v->nextvalue < v->nsubs, REG_ESUBREG); INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
INSIST(v->subs[v->nextvalue].left != NULL, REG_ESUBREG); INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
NOERRN(); NOERRN();
assert(v->nextvalue > 0); assert(v->nextvalue > 0);
atom = subre(v, 'b', BACKR, lp, rp); atom = subre(v, 'b', BACKR, lp, rp);
...@@ -1097,7 +1089,7 @@ parseqatom(struct vars *v, ...@@ -1097,7 +1089,7 @@ parseqatom(struct vars *v,
if (atom != NULL) if (atom != NULL)
freesubre(v, atom); freesubre(v, atom);
if (atomtype == '(') if (atomtype == '(')
v->subs[subno].left = v->subs[subno].right = NULL; v->subs[subno] = NULL;
delsub(v->nfa, lp, rp); delsub(v->nfa, lp, rp);
EMPTYARC(lp, rp); EMPTYARC(lp, rp);
return top; return top;
...@@ -1130,30 +1122,48 @@ parseqatom(struct vars *v, ...@@ -1130,30 +1122,48 @@ parseqatom(struct vars *v,
NOERRN(); NOERRN();
} }
/*
* For what follows, we need the atom to have its own begin/end states
* that are distinct from lp/rp, so that we can wrap iteration structure
* around it. The parenthesized-atom case above already made suitable
* states (and we don't want to modify a capturing subre, since it's
* already recorded in v->subs[]). Otherwise, we need more states.
*/
if (atom->begin == lp || atom->end == rp)
{
s = newstate(v->nfa);
s2 = newstate(v->nfa);
NOERRN();
moveouts(v->nfa, lp, s);
moveins(v->nfa, rp, s2);
atom->begin = s;
atom->end = s2;
}
else
{
/* The atom's OK, but we must temporarily disconnect it from lp/rp */
/* (this removes the EMPTY arcs we made above) */
delsub(v->nfa, lp, atom->begin);
delsub(v->nfa, atom->end, rp);
}
/*---------- /*----------
* Prepare a general-purpose state skeleton. * Prepare a general-purpose state skeleton.
* *
* In the no-backrefs case, we want this: * In the no-backrefs case, we want this:
* *
* [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp] * [lp] ---> [s] ---prefix---> ---atom---> ---rest---> [rp]
* *
* where prefix is some repetitions of atom. In the general case we need * where prefix is some repetitions of atom, and "rest" is the remainder
* of the branch. In the general case we need:
* *
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp] * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
* *
* where the iterator wraps around [begin] ---atom---> [end] * where the iterator wraps around the atom.
* *
* We make the s state here for both cases; s2 is made below if needed * We make the s state here for both cases; s2 is made below if needed
*---------- *----------
*/ */
s = newstate(v->nfa); /* first, new endpoints for the atom */
s2 = newstate(v->nfa);
NOERRN();
moveouts(v->nfa, lp, s);
moveins(v->nfa, rp, s2);
NOERRN();
atom->begin = s;
atom->end = s2;
s = newstate(v->nfa); /* set up starting state */ s = newstate(v->nfa); /* set up starting state */
NOERRN(); NOERRN();
EMPTYARC(lp, s); EMPTYARC(lp, s);
...@@ -1190,14 +1200,14 @@ parseqatom(struct vars *v, ...@@ -1190,14 +1200,14 @@ parseqatom(struct vars *v,
{ {
assert(atom->begin->nouts == 1); /* just the EMPTY */ assert(atom->begin->nouts == 1); /* just the EMPTY */
delsub(v->nfa, atom->begin, atom->end); delsub(v->nfa, atom->begin, atom->end);
assert(v->subs[subno].left != NULL); assert(v->subs[subno] != NULL);
/* /*
* And here's why the recursion got postponed: it must wait until the * And here's why the recursion got postponed: it must wait until the
* skeleton is filled in, because it may hit a backref that wants to * skeleton is filled in, because it may hit a backref that wants to
* copy the filled-in skeleton. * copy the filled-in skeleton.
*/ */
dupnfa(v->nfa, v->subs[subno].left, v->subs[subno].right, dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
atom->begin, atom->end); atom->begin, atom->end);
NOERRN(); NOERRN();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment