Commit 824bf719 authored by Tom Lane

Recognize "match-all" NFAs within the regex engine.

This builds on the previous "rainbow" patch to detect NFAs that will
match any string, though possibly with constraints on the string length.
This definition is chosen to match constructs such as ".*", ".+", and
".{1,100}".  Recognizing such an NFA after the optimization pass is
fairly cheap, since we basically just have to verify that all arcs
are RAINBOW arcs and count the number of steps to the end state.
(Well, there's a bit of complication with pseudo-color arcs for string
boundary conditions, but not much.)

Once we have these markings, the regex executor functions longest(),
shortest(), and matchuntil() don't have to expend per-character work
to determine whether a given substring satisfies such an NFA; they
just need to check its length against the bounds.  Since some matching
problems require O(N) invocations of these functions, we've reduced
the runtime for an N-character string from O(N^2) to O(N).  Of course,
this is no help for non-matchall sub-patterns, but those usually have
constraints that allow us to avoid needing O(N) substring checks in the
first place.  It's precisely the unconstrained "match-all" cases that
cause the most headaches.

This is part of a patch series that in total reduces the regex engine's
runtime by about a factor of four on a large corpus of real-world regexes.

Patch by me, reviewed by Joel Jacobson

Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
parent 08c0d6ad
This diff is collapsed.
...@@ -175,6 +175,11 @@ static void cleanup(struct nfa *); ...@@ -175,6 +175,11 @@ static void cleanup(struct nfa *);
static void markreachable(struct nfa *, struct state *, struct state *, struct state *); static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
static void markcanreach(struct nfa *, struct state *, struct state *, struct state *); static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
static long analyze(struct nfa *); static long analyze(struct nfa *);
static void checkmatchall(struct nfa *);
static bool checkmatchall_recurse(struct nfa *, struct state *,
bool, int, bool *);
static bool check_out_colors_match(struct state *, color, color);
static bool check_in_colors_match(struct state *, color, color);
static void compact(struct nfa *, struct cnfa *); static void compact(struct nfa *, struct cnfa *);
static void carcsort(struct carc *, size_t); static void carcsort(struct carc *, size_t);
static int carc_cmp(const void *, const void *); static int carc_cmp(const void *, const void *);
......
...@@ -58,6 +58,29 @@ longest(struct vars *v, ...@@ -58,6 +58,29 @@ longest(struct vars *v,
if (hitstopp != NULL) if (hitstopp != NULL)
*hitstopp = 0; *hitstopp = 0;
/* fast path for matchall NFAs */
if (d->cnfa->flags & MATCHALL)
{
size_t nchr = stop - start;
size_t maxmatchall = d->cnfa->maxmatchall;
if (nchr < d->cnfa->minmatchall)
return NULL;
if (maxmatchall == DUPINF)
{
if (stop == v->stop && hitstopp != NULL)
*hitstopp = 1;
}
else
{
if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
*hitstopp = 1;
if (nchr > maxmatchall)
return start + maxmatchall;
}
return stop;
}
/* initialize */ /* initialize */
css = initialize(v, d, start); css = initialize(v, d, start);
if (css == NULL) if (css == NULL)
...@@ -187,6 +210,24 @@ shortest(struct vars *v, ...@@ -187,6 +210,24 @@ shortest(struct vars *v,
if (hitstopp != NULL) if (hitstopp != NULL)
*hitstopp = 0; *hitstopp = 0;
/* fast path for matchall NFAs */
if (d->cnfa->flags & MATCHALL)
{
size_t nchr = min - start;
if (d->cnfa->maxmatchall != DUPINF &&
nchr > d->cnfa->maxmatchall)
return NULL;
if ((max - start) < d->cnfa->minmatchall)
return NULL;
if (nchr < d->cnfa->minmatchall)
min = start + d->cnfa->minmatchall;
if (coldp != NULL)
*coldp = start;
/* there is no case where we should set *hitstopp */
return min;
}
/* initialize */ /* initialize */
css = initialize(v, d, start); css = initialize(v, d, start);
if (css == NULL) if (css == NULL)
...@@ -312,6 +353,22 @@ matchuntil(struct vars *v, ...@@ -312,6 +353,22 @@ matchuntil(struct vars *v,
struct sset *ss; struct sset *ss;
struct colormap *cm = d->cm; struct colormap *cm = d->cm;
/* fast path for matchall NFAs */
if (d->cnfa->flags & MATCHALL)
{
size_t nchr = probe - v->start;
/*
* It might seem that we should check maxmatchall too, but the .* at
* the front of the pattern absorbs any extra characters (and it was
* tacked on *after* computing minmatchall/maxmatchall). Thus, we
* should match if there are at least minmatchall characters.
*/
if (nchr < d->cnfa->minmatchall)
return 0;
return 1;
}
/* initialize and startup, or restart, if necessary */ /* initialize and startup, or restart, if necessary */
if (cp == NULL || cp > probe) if (cp == NULL || cp > probe)
{ {
......
...@@ -77,6 +77,10 @@ pg_regprefix(regex_t *re, ...@@ -77,6 +77,10 @@ pg_regprefix(regex_t *re,
assert(g->tree != NULL); assert(g->tree != NULL);
cnfa = &g->tree->cnfa; cnfa = &g->tree->cnfa;
/* matchall NFAs never have a fixed prefix */
if (cnfa->flags & MATCHALL)
return REG_NOMATCH;
/* /*
* Since a correct NFA should never contain any exit-free loops, it should * Since a correct NFA should never contain any exit-free loops, it should
* not be possible for our traversal to return to a previously visited NFA * not be possible for our traversal to return to a previously visited NFA
......
...@@ -331,6 +331,9 @@ struct nfa ...@@ -331,6 +331,9 @@ struct nfa
struct colormap *cm; /* the color map */ struct colormap *cm; /* the color map */
color bos[2]; /* colors, if any, assigned to BOS and BOL */ color bos[2]; /* colors, if any, assigned to BOS and BOL */
color eos[2]; /* colors, if any, assigned to EOS and EOL */ color eos[2]; /* colors, if any, assigned to EOS and EOL */
int flags; /* flags to pass forward to cNFA */
int minmatchall; /* min number of chrs to match, if matchall */
int maxmatchall; /* max number of chrs to match, or DUPINF */
struct vars *v; /* simplifies compile error reporting */ struct vars *v; /* simplifies compile error reporting */
struct nfa *parent; /* parent NFA, if any */ struct nfa *parent; /* parent NFA, if any */
}; };
...@@ -353,6 +356,14 @@ struct nfa ...@@ -353,6 +356,14 @@ struct nfa
* *
* Note that in a plain arc, "co" can be RAINBOW; since that's negative, * Note that in a plain arc, "co" can be RAINBOW; since that's negative,
* it doesn't break the rule about how to recognize LACON arcs. * it doesn't break the rule about how to recognize LACON arcs.
*
* We have special markings for "trivial" NFAs that can match any string
* (possibly with limits on the number of characters therein). In such a
* case, flags & MATCHALL is set (and HASLACONS can't be set). Then the
* fields minmatchall and maxmatchall give the minimum and maximum numbers
* of characters to match. For example, ".*" produces minmatchall = 0
* and maxmatchall = DUPINF, while ".+" produces minmatchall = 1 and
* maxmatchall = DUPINF.
*/ */
struct carc struct carc
{ {
...@@ -366,6 +377,7 @@ struct cnfa ...@@ -366,6 +377,7 @@ struct cnfa
int ncolors; /* number of colors (max color in use + 1) */ int ncolors; /* number of colors (max color in use + 1) */
int flags; int flags;
#define HASLACONS 01 /* uses lookaround constraints */ #define HASLACONS 01 /* uses lookaround constraints */
#define MATCHALL 02 /* matches all strings of a range of lengths */
int pre; /* setup state number */ int pre; /* setup state number */
int post; /* teardown state number */ int post; /* teardown state number */
color bos[2]; /* colors, if any, assigned to BOS and BOL */ color bos[2]; /* colors, if any, assigned to BOS and BOL */
...@@ -375,6 +387,9 @@ struct cnfa ...@@ -375,6 +387,9 @@ struct cnfa
struct carc **states; /* vector of pointers to outarc lists */ struct carc **states; /* vector of pointers to outarc lists */
/* states[n] are pointers into a single malloc'd array of arcs */ /* states[n] are pointers into a single malloc'd array of arcs */
struct carc *arcs; /* the area for the lists */ struct carc *arcs; /* the area for the lists */
/* these fields are used only in a MATCHALL NFA (else they're -1): */
int minmatchall; /* min number of chrs to match */
int maxmatchall; /* max number of chrs to match, or DUPINF */
}; };
/* /*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment