Commit 5e6ad63c authored by Tom Lane's avatar Tom Lane

Make regexp engine's backref-related compilation state more bulletproof.

Up to now, we remembered the definition of a capturing parenthesis
subexpression by storing a pointer to the associated subRE node.
That was okay before, because that subRE didn't get modified anymore
while parsing the rest of the regexp.  However, in the wake of
commit ea1268f6, that's no longer true: the outer invocation of
parseqatom() feels free to scribble on that subRE.  This seems to
work anyway, because the states we jam into the child atom in the
"prepare a general-purpose state skeleton" stanza aren't really
semantically different from the original endpoints of the child atom.
But that would be mighty easy to break, and it's definitely not how
things worked before.

Between this and the issue fixed in the prior commit, it seems best
to get rid of this dependence on subRE nodes entirely.  We don't need
the whole child subRE for future backrefs, only its starting and ending
NFA states; so let's just store pointers to those.

Also, in the corner case where we make an extra subRE to handle
immediately-nested capturing parentheses, it seems like it'd be smart
to have the extra subRE have the same begin/end states as the original
child subRE does (s/s2 not lp/rp).  I think that linking it from lp to
rp might actually be semantically wrong, though since Spencer's original
code did it that way, I'm not totally certain.  Using s/s2 is certainly
not wrong, in any case.

Per report from Mark Dilger.  Back-patch to v14 where the problematic
patches came in.

Discussion: https://postgr.es/m/0203588E-E609-43AF-9F4F-902854231EE7@enterprisedb.com
parent f42ea835
...@@ -233,6 +233,13 @@ static int cmp(const chr *, const chr *, size_t); ...@@ -233,6 +233,13 @@ static int cmp(const chr *, const chr *, size_t);
static int casecmp(const chr *, const chr *, size_t); static int casecmp(const chr *, const chr *, size_t);
/* info we need during compilation about a known capturing subexpression */
struct subinfo
{
struct state *left; /* left end of its sub-NFA */
struct state *right; /* right end of its sub-NFA */
};
/* internal variables, bundled for easy passing around */ /* internal variables, bundled for easy passing around */
struct vars struct vars
{ {
...@@ -245,10 +252,10 @@ struct vars ...@@ -245,10 +252,10 @@ struct vars
int nexttype; /* type of next token */ int nexttype; /* type of next token */
chr nextvalue; /* value (if any) of next token */ chr nextvalue; /* value (if any) of next token */
int lexcon; /* lexical context type (see regc_lex.c) */ int lexcon; /* lexical context type (see regc_lex.c) */
int nsubexp; /* subexpression count */ int nsubexp; /* number of known capturing subexpressions */
struct subre **subs; /* subRE pointer vector */ struct subinfo *subs; /* info about known capturing subexpressions */
size_t nsubs; /* length of vector */ size_t nsubs; /* allocated length of subs[] vector */
struct subre *sub10[10]; /* initial vector, enough for most */ struct subinfo sub10[10]; /* initial vector, enough for most */
struct nfa *nfa; /* the NFA */ struct nfa *nfa; /* the NFA */
struct colormap *cm; /* character color map */ struct colormap *cm; /* character color map */
color nlcolor; /* color of newline */ color nlcolor; /* color of newline */
...@@ -368,7 +375,7 @@ pg_regcomp(regex_t *re, ...@@ -368,7 +375,7 @@ pg_regcomp(regex_t *re,
v->subs = v->sub10; v->subs = v->sub10;
v->nsubs = 10; v->nsubs = 10;
for (j = 0; j < v->nsubs; j++) for (j = 0; j < v->nsubs; j++)
v->subs[j] = NULL; v->subs[j].left = v->subs[j].right = NULL;
v->nfa = NULL; v->nfa = NULL;
v->cm = NULL; v->cm = NULL;
v->nlcolor = COLORLESS; v->nlcolor = COLORLESS;
...@@ -504,13 +511,13 @@ pg_regcomp(regex_t *re, ...@@ -504,13 +511,13 @@ pg_regcomp(regex_t *re,
} }
/* /*
* moresubs - enlarge subRE vector * moresubs - enlarge capturing-subexpressions vector
*/ */
static void static void
moresubs(struct vars *v, moresubs(struct vars *v,
int wanted) /* want enough room for this one */ int wanted) /* want enough room for this one */
{ {
struct subre **p; struct subinfo *p;
size_t n; size_t n;
assert(wanted > 0 && (size_t) wanted >= v->nsubs); assert(wanted > 0 && (size_t) wanted >= v->nsubs);
...@@ -518,13 +525,13 @@ moresubs(struct vars *v, ...@@ -518,13 +525,13 @@ moresubs(struct vars *v,
if (v->subs == v->sub10) if (v->subs == v->sub10)
{ {
p = (struct subre **) MALLOC(n * sizeof(struct subre *)); p = (struct subinfo *) MALLOC(n * sizeof(struct subinfo));
if (p != NULL) if (p != NULL)
memcpy(VS(p), VS(v->subs), memcpy(VS(p), VS(v->subs),
v->nsubs * sizeof(struct subre *)); v->nsubs * sizeof(struct subinfo));
} }
else else
p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *)); p = (struct subinfo *) REALLOC(v->subs, n * sizeof(struct subinfo));
if (p == NULL) if (p == NULL)
{ {
ERR(REG_ESPACE); ERR(REG_ESPACE);
...@@ -532,7 +539,7 @@ moresubs(struct vars *v, ...@@ -532,7 +539,7 @@ moresubs(struct vars *v,
} }
v->subs = p; v->subs = p;
for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++) for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
*p = NULL; p->left = p->right = NULL;
assert(v->nsubs == n); assert(v->nsubs == n);
assert((size_t) wanted < v->nsubs); assert((size_t) wanted < v->nsubs);
} }
...@@ -969,10 +976,14 @@ parseqatom(struct vars *v, ...@@ -969,10 +976,14 @@ parseqatom(struct vars *v,
NEXT(); NEXT();
/* /*
* Make separate endpoints to ensure we keep this sub-NFA cleanly * Make separate endpoint states to keep this sub-NFA distinct
* separate from what surrounds it. We need to be sure that when * from what surrounds it. We need to be sure that when we
* we duplicate the sub-NFA for a backref, we get the right states * duplicate the sub-NFA for a backref, we get the right
* and no others. * states/arcs and no others. In particular, letting a backref
* duplicate the sub-NFA from lp to rp would be quite wrong,
* because we may add quantification superstructure around this
* atom below. (Perhaps we could skip the extra states for
* non-capturing parens, but it seems not worth the trouble.)
*/ */
s = newstate(v->nfa); s = newstate(v->nfa);
s2 = newstate(v->nfa); s2 = newstate(v->nfa);
...@@ -986,8 +997,10 @@ parseqatom(struct vars *v, ...@@ -986,8 +997,10 @@ parseqatom(struct vars *v,
NOERRN(); NOERRN();
if (cap) if (cap)
{ {
assert(v->subs[subno] == NULL); /* save the sub-NFA's endpoints for future backrefs to use */
v->subs[subno] = atom; assert(v->subs[subno].left == NULL);
v->subs[subno].left = s;
v->subs[subno].right = s2;
if (atom->capno == 0) if (atom->capno == 0)
{ {
/* normal case: just mark the atom as capturing */ /* normal case: just mark the atom as capturing */
...@@ -997,7 +1010,7 @@ parseqatom(struct vars *v, ...@@ -997,7 +1010,7 @@ parseqatom(struct vars *v,
else else
{ {
/* generate no-op wrapper node to handle "((x))" */ /* generate no-op wrapper node to handle "((x))" */
t = subre(v, '(', atom->flags | CAP, lp, rp); t = subre(v, '(', atom->flags | CAP, s, s2);
NOERRN(); NOERRN();
t->capno = subno; t->capno = subno;
t->child = atom; t->child = atom;
...@@ -1009,7 +1022,7 @@ parseqatom(struct vars *v, ...@@ -1009,7 +1022,7 @@ parseqatom(struct vars *v,
case BACKREF: /* the Feature From The Black Lagoon */ case BACKREF: /* the Feature From The Black Lagoon */
INSIST(type != LACON, REG_ESUBREG); INSIST(type != LACON, REG_ESUBREG);
INSIST(v->nextvalue < v->nsubs, REG_ESUBREG); INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG); INSIST(v->subs[v->nextvalue].left != NULL, REG_ESUBREG);
NOERRN(); NOERRN();
assert(v->nextvalue > 0); assert(v->nextvalue > 0);
atom = subre(v, 'b', BACKR, lp, rp); atom = subre(v, 'b', BACKR, lp, rp);
...@@ -1084,7 +1097,7 @@ parseqatom(struct vars *v, ...@@ -1084,7 +1097,7 @@ parseqatom(struct vars *v,
if (atom != NULL) if (atom != NULL)
freesubre(v, atom); freesubre(v, atom);
if (atomtype == '(') if (atomtype == '(')
v->subs[subno] = NULL; v->subs[subno].left = v->subs[subno].right = NULL;
delsub(v->nfa, lp, rp); delsub(v->nfa, lp, rp);
EMPTYARC(lp, rp); EMPTYARC(lp, rp);
return top; return top;
...@@ -1177,14 +1190,14 @@ parseqatom(struct vars *v, ...@@ -1177,14 +1190,14 @@ parseqatom(struct vars *v,
{ {
assert(atom->begin->nouts == 1); /* just the EMPTY */ assert(atom->begin->nouts == 1); /* just the EMPTY */
delsub(v->nfa, atom->begin, atom->end); delsub(v->nfa, atom->begin, atom->end);
assert(v->subs[subno] != NULL); assert(v->subs[subno].left != NULL);
/* /*
* And here's why the recursion got postponed: it must wait until the * And here's why the recursion got postponed: it must wait until the
* skeleton is filled in, because it may hit a backref that wants to * skeleton is filled in, because it may hit a backref that wants to
* copy the filled-in skeleton. * copy the filled-in skeleton.
*/ */
dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end, dupnfa(v->nfa, v->subs[subno].left, v->subs[subno].right,
atom->begin, atom->end); atom->begin, atom->end);
NOERRN(); NOERRN();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment