Commit 779557bd authored by Tom Lane's avatar Tom Lane

Prevent regexp back-refs from sometimes matching when they shouldn't.

The recursion in cdissect() was careless about clearing match data
for capturing parentheses after rejecting a partial match.  This
could allow a later back-reference to succeed when by rights it
should fail for lack of a defined referent.

To fix, think a little more rigorously about what the contract
between different levels of cdissect's recursion needs to be.
With the right spec, we can fix this using fewer rather than more
resets of the match data; the key decision being that a failed
sub-match is now explicitly responsible for clearing any matches
it may have set.

There are enough other cross-checks and optimizations in the code
that it's not especially easy to exhibit this problem; usually, the
match will fail as-expected.  Plus, regexps that are even potentially
vulnerable are most likely user errors, since there's just not much
point in writing a back-ref that doesn't always have a referent.
These facts perhaps explain why the issue hasn't been detected,
even though it's almost certainly a couple of decades old.

Discussion: https://postgr.es/m/151435.1629733387@sss.pgh.pa.us
parent e3fb6170
...@@ -230,6 +230,8 @@ pg_regexec(regex_t *re, ...@@ -230,6 +230,8 @@ pg_regexec(regex_t *re,
} }
else else
v->pmatch = pmatch; v->pmatch = pmatch;
if (v->nmatch > 0)
zapallsubs(v->pmatch, v->nmatch);
v->details = details; v->details = details;
v->start = (chr *) string; v->start = (chr *) string;
v->search_start = (chr *) string + search_start; v->search_start = (chr *) string + search_start;
...@@ -473,7 +475,6 @@ find(struct vars *v, ...@@ -473,7 +475,6 @@ find(struct vars *v,
return REG_OKAY; return REG_OKAY;
/* find submatches */ /* find submatches */
zapallsubs(v->pmatch, v->nmatch);
return cdissect(v, v->g->tree, begin, end); return cdissect(v, v->g->tree, begin, end);
} }
...@@ -584,7 +585,6 @@ cfindloop(struct vars *v, ...@@ -584,7 +585,6 @@ cfindloop(struct vars *v,
break; /* no match with this begin point, try next */ break; /* no match with this begin point, try next */
MDEBUG(("tentative end %ld\n", LOFF(end))); MDEBUG(("tentative end %ld\n", LOFF(end)));
/* Dissect the potential match to see if it really matches */ /* Dissect the potential match to see if it really matches */
zapallsubs(v->pmatch, v->nmatch);
er = cdissect(v, v->g->tree, begin, end); er = cdissect(v, v->g->tree, begin, end);
if (er == REG_OKAY) if (er == REG_OKAY)
{ {
...@@ -632,6 +632,8 @@ cfindloop(struct vars *v, ...@@ -632,6 +632,8 @@ cfindloop(struct vars *v,
/* /*
* zapallsubs - initialize all subexpression matches to "no match" * zapallsubs - initialize all subexpression matches to "no match"
*
* Note that p[0], the overall-match location, is not touched.
*/ */
static void static void
zapallsubs(regmatch_t *p, zapallsubs(regmatch_t *p,
...@@ -701,8 +703,30 @@ subset(struct vars *v, ...@@ -701,8 +703,30 @@ subset(struct vars *v,
* DFA and found that the proposed substring satisfies the DFA. (We make * DFA and found that the proposed substring satisfies the DFA. (We make
* the caller do that because in concatenation and iteration nodes, it's * the caller do that because in concatenation and iteration nodes, it's
* much faster to check all the substrings against the child DFAs before we * much faster to check all the substrings against the child DFAs before we
* recurse.) Also, caller must have cleared subexpression match data via * recurse.)
* zaptreesubs (or zapallsubs at the top level). *
* A side-effect of a successful match is to save match locations for
* capturing subexpressions in v->pmatch[]. This is a little bit tricky,
* so we make the following rules:
* 1. Before initial entry to cdissect, all match data must have been
* cleared (this is seen to by zapallsubs).
* 2. Before any recursive entry to cdissect, the match data for that
* subexpression tree must be guaranteed clear (see zaptreesubs).
* 3. When returning REG_OKAY, each level of cdissect will have saved
* any relevant match locations.
* 4. When returning REG_NOMATCH, each level of cdissect will guarantee
* that its subexpression match locations are again clear.
* 5. No guarantees are made for error cases (i.e., other result codes).
* 6. When a level of cdissect abandons a successful sub-match, it will
* clear that subtree's match locations with zaptreesubs before trying
* any new DFA match or cdissect call for that subtree or any subtree
* to its right (that is, any subtree that could have a backref into the
* abandoned match).
* This may seem overly complicated, but it's difficult to simplify it
* because of the provision that match locations must be reset before
* any fresh DFA match (a rule that is needed to make dfa_backref safe).
* That means it won't work to just reset relevant match locations at the
* start of each cdissect level.
*/ */
static int /* regexec return code */ static int /* regexec return code */
cdissect(struct vars *v, cdissect(struct vars *v,
...@@ -827,6 +851,8 @@ ccondissect(struct vars *v, ...@@ -827,6 +851,8 @@ ccondissect(struct vars *v,
MDEBUG(("%d: successful\n", t->id)); MDEBUG(("%d: successful\n", t->id));
return REG_OKAY; return REG_OKAY;
} }
/* Reset left's matches (right should have done so itself) */
zaptreesubs(v, left);
} }
if (er != REG_NOMATCH) if (er != REG_NOMATCH)
return er; return er;
...@@ -849,8 +875,6 @@ ccondissect(struct vars *v, ...@@ -849,8 +875,6 @@ ccondissect(struct vars *v,
return REG_NOMATCH; return REG_NOMATCH;
} }
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
zaptreesubs(v, left);
zaptreesubs(v, right);
} }
/* can't get here */ /* can't get here */
...@@ -908,6 +932,8 @@ crevcondissect(struct vars *v, ...@@ -908,6 +932,8 @@ crevcondissect(struct vars *v,
MDEBUG(("%d: successful\n", t->id)); MDEBUG(("%d: successful\n", t->id));
return REG_OKAY; return REG_OKAY;
} }
/* Reset left's matches (right should have done so itself) */
zaptreesubs(v, left);
} }
if (er != REG_NOMATCH) if (er != REG_NOMATCH)
return er; return er;
...@@ -930,8 +956,6 @@ crevcondissect(struct vars *v, ...@@ -930,8 +956,6 @@ crevcondissect(struct vars *v,
return REG_NOMATCH; return REG_NOMATCH;
} }
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid))); MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
zaptreesubs(v, left);
zaptreesubs(v, right);
} }
/* can't get here */ /* can't get here */
...@@ -1200,6 +1224,7 @@ citerdissect(struct vars *v, ...@@ -1200,6 +1224,7 @@ citerdissect(struct vars *v,
for (i = nverified + 1; i <= k; i++) for (i = nverified + 1; i <= k; i++)
{ {
/* zap any match data from a non-last iteration */
zaptreesubs(v, t->child); zaptreesubs(v, t->child);
er = cdissect(v, t->child, endpts[i - 1], endpts[i]); er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
if (er == REG_OKAY) if (er == REG_OKAY)
...@@ -1412,6 +1437,7 @@ creviterdissect(struct vars *v, ...@@ -1412,6 +1437,7 @@ creviterdissect(struct vars *v,
for (i = nverified + 1; i <= k; i++) for (i = nverified + 1; i <= k; i++)
{ {
/* zap any match data from a non-last iteration */
zaptreesubs(v, t->child); zaptreesubs(v, t->child);
er = cdissect(v, t->child, endpts[i - 1], endpts[i]); er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
if (er == REG_OKAY) if (er == REG_OKAY)
......
...@@ -2636,6 +2636,20 @@ select * from test_regex('^(.+)( \1)+$', 'abc abc abd', 'RP'); ...@@ -2636,6 +2636,20 @@ select * from test_regex('^(.+)( \1)+$', 'abc abc abd', 'RP');
{2,REG_UBACKREF,REG_UNONPOSIX} {2,REG_UBACKREF,REG_UNONPOSIX}
(1 row) (1 row)
-- expectNomatch 14.30 RP {^(.)\1|\1.} {abcdef}
select * from test_regex('^(.)\1|\1.', 'abcdef', 'RP');
test_regex
--------------------------------
{1,REG_UBACKREF,REG_UNONPOSIX}
(1 row)
-- expectNomatch 14.31 RP {^((.)\2|..)\2} {abadef}
select * from test_regex('^((.)\2|..)\2', 'abadef', 'RP');
test_regex
--------------------------------
{2,REG_UBACKREF,REG_UNONPOSIX}
(1 row)
-- back reference only matches the string, not any constraints -- back reference only matches the string, not any constraints
select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP'); select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
test_regex test_regex
......
...@@ -769,6 +769,10 @@ select * from test_regex('^(.+)( \1)+$', 'abc abc abc', 'RP'); ...@@ -769,6 +769,10 @@ select * from test_regex('^(.+)( \1)+$', 'abc abc abc', 'RP');
select * from test_regex('^(.+)( \1)+$', 'abc abd abc', 'RP'); select * from test_regex('^(.+)( \1)+$', 'abc abd abc', 'RP');
-- expectNomatch 14.29 RP {^(.+)( \1)+$} {abc abc abd} -- expectNomatch 14.29 RP {^(.+)( \1)+$} {abc abc abd}
select * from test_regex('^(.+)( \1)+$', 'abc abc abd', 'RP'); select * from test_regex('^(.+)( \1)+$', 'abc abc abd', 'RP');
-- expectNomatch 14.30 RP {^(.)\1|\1.} {abcdef}
select * from test_regex('^(.)\1|\1.', 'abcdef', 'RP');
-- expectNomatch 14.31 RP {^((.)\2|..)\2} {abadef}
select * from test_regex('^((.)\2|..)\2', 'abadef', 'RP');
-- back reference only matches the string, not any constraints -- back reference only matches the string, not any constraints
select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP'); select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
......
...@@ -567,6 +567,19 @@ select 'a' ~ '()+\1'; ...@@ -567,6 +567,19 @@ select 'a' ~ '()+\1';
t t
(1 row) (1 row)
-- Test ancient oversight in when to apply zaptreesubs
select 'abcdef' ~ '^(.)\1|\1.' as f;
f
---
f
(1 row)
select 'abadef' ~ '^((.)\2|..)\2' as f;
f
---
f
(1 row)
-- Add coverage for some cases in checkmatchall -- Add coverage for some cases in checkmatchall
select regexp_match('xy', '.|...'); select regexp_match('xy', '.|...');
regexp_match regexp_match
......
...@@ -135,6 +135,10 @@ select 'a' ~ '.. ()|\1'; ...@@ -135,6 +135,10 @@ select 'a' ~ '.. ()|\1';
select 'a' ~ '()*\1'; select 'a' ~ '()*\1';
select 'a' ~ '()+\1'; select 'a' ~ '()+\1';
-- Test ancient oversight in when to apply zaptreesubs
select 'abcdef' ~ '^(.)\1|\1.' as f;
select 'abadef' ~ '^((.)\2|..)\2' as f;
-- Add coverage for some cases in checkmatchall -- Add coverage for some cases in checkmatchall
select regexp_match('xy', '.|...'); select regexp_match('xy', '.|...');
select regexp_match('xyz', '.|...'); select regexp_match('xyz', '.|...');
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment