Recognize "match-all" NFAs within the regex engine.

This builds on the previous "rainbow" patch to detect NFAs that will
match any string, though possibly with constraints on the string length.
This definition is chosen to match constructs such as ".*", ".+", and
".{1,100}".  Recognizing such an NFA after the optimization pass is
fairly cheap, since we basically just have to verify that all arcs
are RAINBOW arcs and count the number of steps to the end state.
(Well, there's a bit of complication with pseudo-color arcs for string
boundary conditions, but not much.)

Once we have these markings, the regex executor functions longest(),
shortest(), and matchuntil() don't have to expend per-character work
to determine whether a given substring satisfies such an NFA; they
just need to check its length against the bounds.  Since some matching
problems require O(N) invocations of these functions, we've reduced
the runtime for an N-character string from O(N^2) to O(N).  Of course,
this is no help for non-matchall sub-patterns, but those usually have
constraints that allow us to avoid needing O(N) substring checks in the
first place.  It's precisely the unconstrained "match-all" cases that
cause the most headaches.

This is part of a patch series that in total reduces the regex engine's
runtime by about a factor of four on a large corpus of real-world regexes.

Patch by me, reviewed by Joel Jacobson

Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
This commit is contained in:
Tom Lane 2021-02-20 18:31:19 -05:00
parent 08c0d6ad65
commit 824bf71902
5 changed files with 376 additions and 1 deletions

View File

@ -65,6 +65,8 @@ newnfa(struct vars *v,
nfa->v = v;
nfa->bos[0] = nfa->bos[1] = COLORLESS;
nfa->eos[0] = nfa->eos[1] = COLORLESS;
nfa->flags = 0;
nfa->minmatchall = nfa->maxmatchall = -1;
nfa->parent = parent; /* Precedes newfstate so parent is valid. */
nfa->post = newfstate(nfa, '@'); /* number 0 */
nfa->pre = newfstate(nfa, '>'); /* number 1 */
@ -2875,8 +2877,14 @@ analyze(struct nfa *nfa)
if (NISERR())
return 0;
/* Detect whether NFA can't match anything */
if (nfa->pre->outs == NULL)
return REG_UIMPOSSIBLE;
/* Detect whether NFA matches all strings (possibly with length bounds) */
checkmatchall(nfa);
/* Detect whether NFA can possibly match a zero-length string */
for (a = nfa->pre->outs; a != NULL; a = a->outchain)
for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
if (aa->to == nfa->post)
@ -2884,6 +2892,282 @@ analyze(struct nfa *nfa)
return 0;
}
/*
* checkmatchall - does the NFA represent no more than a string length test?
*
* If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
* to begin with) and set the MATCHALL bit in nfa->flags.
*
* To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
* for pseudocolors (i.e., BOS/BOL/EOS/EOL). We must be able to reach the
* post state via RAINBOW arcs, and if there are any loops in the graph, they
* must be loop-to-self arcs, ensuring that each loop iteration consumes
* exactly one character. (Longer loops are problematic because they create
* non-consecutive possible match lengths; we have no good way to represent
* that situation for lengths beyond the DUPINF limit.)
*
* Pseudocolor arcs complicate things a little. We know that they can only
* appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
* EOS/EOL). There, they must exactly replicate the parallel RAINBOW arcs,
* e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
* and BOL outarcs to state 2, and no others. Missing or extra pseudocolor
* arcs can occur, meaning that the NFA involves some constraint on the
* adjacent characters, which makes it not a matchall NFA.
*/
static void
checkmatchall(struct nfa *nfa)
{
bool hasmatch[DUPINF + 1];
int minmatch,
maxmatch,
morematch;
/*
* hasmatch[i] will be set true if a match of length i is feasible, for i
* from 0 to DUPINF-1. hasmatch[DUPINF] will be set true if every match
* length of DUPINF or more is feasible.
*/
memset(hasmatch, 0, sizeof(hasmatch));
/*
* Recursively search the graph for all-RAINBOW paths to the "post" state,
* starting at the "pre" state. The -1 initial depth accounts for the
* fact that transitions out of the "pre" state are not part of the
* matched string. We likewise don't count the final transition to the
* "post" state as part of the match length. (But we still insist that
* those transitions have RAINBOW arcs, otherwise there are lookbehind or
* lookahead constraints at the start/end of the pattern.)
*/
if (!checkmatchall_recurse(nfa, nfa->pre, false, -1, hasmatch))
return;
/*
* We found some all-RAINBOW paths, and not anything that we couldn't
* handle. Now verify that pseudocolor arcs adjacent to the pre and post
* states match the RAINBOW arcs there. (We could do this while
* recursing, but it's expensive and unlikely to fail, so do it last.)
*/
if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
!check_out_colors_match(nfa->pre, nfa->bos[0], RAINBOW) ||
!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
!check_out_colors_match(nfa->pre, nfa->bos[1], RAINBOW))
return;
if (!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
!check_in_colors_match(nfa->post, nfa->eos[0], RAINBOW) ||
!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]) ||
!check_in_colors_match(nfa->post, nfa->eos[1], RAINBOW))
return;
/*
* hasmatch[] now represents the set of possible match lengths; but we
* want to reduce that to a min and max value, because it doesn't seem
* worth complicating regexec.c to deal with nonconsecutive possible match
* lengths. Find min and max of first run of lengths, then verify there
* are no nonconsecutive lengths.
*/
for (minmatch = 0; minmatch <= DUPINF; minmatch++)
{
if (hasmatch[minmatch])
break;
}
assert(minmatch <= DUPINF); /* else checkmatchall_recurse lied */
for (maxmatch = minmatch; maxmatch < DUPINF; maxmatch++)
{
if (!hasmatch[maxmatch + 1])
break;
}
for (morematch = maxmatch + 1; morematch <= DUPINF; morematch++)
{
if (hasmatch[morematch])
return; /* fail, there are nonconsecutive lengths */
}
/* Success, so record the info */
nfa->minmatchall = minmatch;
nfa->maxmatchall = maxmatch;
nfa->flags |= MATCHALL;
}
/*
* checkmatchall_recurse - recursive search for checkmatchall
*
* s is the current state
* foundloop is true if any predecessor state has a loop-to-self
* depth is the current recursion depth (starting at -1)
* hasmatch[] is the output area for recording feasible match lengths
*
* We return true if there is at least one all-RAINBOW path to the "post"
* state and no non-matchall paths; otherwise false. Note we assume that
* any dead-end paths have already been removed, else we might return
* false unnecessarily.
*/
static bool
checkmatchall_recurse(struct nfa *nfa, struct state *s,
bool foundloop, int depth,
bool *hasmatch)
{
bool result = false;
struct arc *a;
/*
* Since this is recursive, it could be driven to stack overflow. But we
* need not treat that as a hard failure; just deem the NFA non-matchall.
*/
if (STACK_TOO_DEEP(nfa->v->re))
return false;
/*
* Likewise, if we get to a depth too large to represent correctly in
* maxmatchall, fail quietly.
*/
if (depth >= DUPINF)
return false;
/*
* Scan the outarcs to detect cases we can't handle, and to see if there
* is a loop-to-self here. We need to know about any such loop before we
* recurse, so it's hard to avoid making two passes over the outarcs. In
* any case, checking for showstoppers before we recurse is probably best.
*/
for (a = s->outs; a != NULL; a = a->outchain)
{
if (a->type != PLAIN)
return false; /* any LACONs make it non-matchall */
if (a->co != RAINBOW)
{
if (nfa->cm->cd[a->co].flags & PSEUDO)
{
/*
* Pseudocolor arc: verify it's in a valid place (this seems
* quite unlikely to fail, but let's be sure).
*/
if (s == nfa->pre &&
(a->co == nfa->bos[0] || a->co == nfa->bos[1]))
/* okay BOS/BOL arc */ ;
else if (a->to == nfa->post &&
(a->co == nfa->eos[0] || a->co == nfa->eos[1]))
/* okay EOS/EOL arc */ ;
else
return false; /* unexpected pseudocolor arc */
/* We'll finish checking these arcs after the recursion */
continue;
}
return false; /* any other color makes it non-matchall */
}
if (a->to == s)
{
/*
* We found a cycle of length 1, so remember that to pass down to
* successor states. (It doesn't matter if there was also such a
* loop at a predecessor state.)
*/
foundloop = true;
}
else if (a->to->tmp)
{
/* We found a cycle of length > 1, so fail. */
return false;
}
}
/* We need to recurse, so mark state as under consideration */
assert(s->tmp == NULL);
s->tmp = s;
for (a = s->outs; a != NULL; a = a->outchain)
{
if (a->co != RAINBOW)
continue; /* ignore pseudocolor arcs */
if (a->to == nfa->post)
{
/* We found an all-RAINBOW path to the post state */
result = true;
/* Record potential match lengths */
assert(depth >= 0);
hasmatch[depth] = true;
if (foundloop)
{
/* A predecessor loop makes all larger lengths match, too */
int i;
for (i = depth + 1; i <= DUPINF; i++)
hasmatch[i] = true;
}
}
else if (a->to != s)
{
/* This is a new path forward; recurse to investigate */
result = checkmatchall_recurse(nfa, a->to,
foundloop, depth + 1,
hasmatch);
/* Fail if any recursive path fails */
if (!result)
break;
}
}
s->tmp = NULL;
return result;
}
/*
* check_out_colors_match - subroutine for checkmatchall
*
* Check if every s outarc of color co1 has a matching outarc of color co2.
* (checkmatchall_recurse already verified that all of the outarcs are PLAIN,
* so we need not examine arc types here. Also, since it verified that there
* are only RAINBOW and pseudocolor arcs, there shouldn't be enough arcs for
* this brute-force O(N^2) implementation to cause problems.)
*/
static bool
check_out_colors_match(struct state *s, color co1, color co2)
{
struct arc *a1;
struct arc *a2;
for (a1 = s->outs; a1 != NULL; a1 = a1->outchain)
{
if (a1->co != co1)
continue;
for (a2 = s->outs; a2 != NULL; a2 = a2->outchain)
{
if (a2->co == co2 && a2->to == a1->to)
break;
}
if (a2 == NULL)
return false;
}
return true;
}
/*
* check_in_colors_match - subroutine for checkmatchall
*
* Check if every s inarc of color co1 has a matching inarc of color co2.
* (For paranoia's sake, ignore any non-PLAIN arcs here. But we're still
* not expecting very many arcs.)
*/
static bool
check_in_colors_match(struct state *s, color co1, color co2)
{
struct arc *a1;
struct arc *a2;
for (a1 = s->ins; a1 != NULL; a1 = a1->inchain)
{
if (a1->type != PLAIN || a1->co != co1)
continue;
for (a2 = s->ins; a2 != NULL; a2 = a2->inchain)
{
if (a2->type == PLAIN && a2->co == co2 && a2->from == a1->from)
break;
}
if (a2 == NULL)
return false;
}
return true;
}
/*
* compact - construct the compact representation of an NFA
*/
@ -2930,7 +3214,9 @@ compact(struct nfa *nfa,
cnfa->eos[0] = nfa->eos[0];
cnfa->eos[1] = nfa->eos[1];
cnfa->ncolors = maxcolor(nfa->cm) + 1;
cnfa->flags = 0;
cnfa->flags = nfa->flags;
cnfa->minmatchall = nfa->minmatchall;
cnfa->maxmatchall = nfa->maxmatchall;
ca = cnfa->arcs;
for (s = nfa->states; s != NULL; s = s->next)
@ -3034,6 +3320,11 @@ dumpnfa(struct nfa *nfa,
fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
if (nfa->eos[1] != COLORLESS)
fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
if (nfa->flags & HASLACONS)
fprintf(f, ", haslacons");
if (nfa->flags & MATCHALL)
fprintf(f, ", minmatchall %d, maxmatchall %d",
nfa->minmatchall, nfa->maxmatchall);
fprintf(f, "\n");
for (s = nfa->states; s != NULL; s = s->next)
{
@ -3201,6 +3492,9 @@ dumpcnfa(struct cnfa *cnfa,
fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
if (cnfa->flags & HASLACONS)
fprintf(f, ", haslacons");
if (cnfa->flags & MATCHALL)
fprintf(f, ", minmatchall %d, maxmatchall %d",
cnfa->minmatchall, cnfa->maxmatchall);
fprintf(f, "\n");
for (st = 0; st < cnfa->nstates; st++)
dumpcstate(st, cnfa, f);

View File

@ -175,6 +175,11 @@ static void cleanup(struct nfa *);
static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
static long analyze(struct nfa *);
static void checkmatchall(struct nfa *);
static bool checkmatchall_recurse(struct nfa *, struct state *,
bool, int, bool *);
static bool check_out_colors_match(struct state *, color, color);
static bool check_in_colors_match(struct state *, color, color);
static void compact(struct nfa *, struct cnfa *);
static void carcsort(struct carc *, size_t);
static int carc_cmp(const void *, const void *);

View File

@ -58,6 +58,29 @@ longest(struct vars *v,
if (hitstopp != NULL)
*hitstopp = 0;
/* fast path for matchall NFAs */
if (d->cnfa->flags & MATCHALL)
{
size_t nchr = stop - start;
size_t maxmatchall = d->cnfa->maxmatchall;
if (nchr < d->cnfa->minmatchall)
return NULL;
if (maxmatchall == DUPINF)
{
if (stop == v->stop && hitstopp != NULL)
*hitstopp = 1;
}
else
{
if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
*hitstopp = 1;
if (nchr > maxmatchall)
return start + maxmatchall;
}
return stop;
}
/* initialize */
css = initialize(v, d, start);
if (css == NULL)
@ -187,6 +210,24 @@ shortest(struct vars *v,
if (hitstopp != NULL)
*hitstopp = 0;
/* fast path for matchall NFAs */
if (d->cnfa->flags & MATCHALL)
{
size_t nchr = min - start;
if (d->cnfa->maxmatchall != DUPINF &&
nchr > d->cnfa->maxmatchall)
return NULL;
if ((max - start) < d->cnfa->minmatchall)
return NULL;
if (nchr < d->cnfa->minmatchall)
min = start + d->cnfa->minmatchall;
if (coldp != NULL)
*coldp = start;
/* there is no case where we should set *hitstopp */
return min;
}
/* initialize */
css = initialize(v, d, start);
if (css == NULL)
@ -312,6 +353,22 @@ matchuntil(struct vars *v,
struct sset *ss;
struct colormap *cm = d->cm;
/* fast path for matchall NFAs */
if (d->cnfa->flags & MATCHALL)
{
size_t nchr = probe - v->start;
/*
* It might seem that we should check maxmatchall too, but the .* at
* the front of the pattern absorbs any extra characters (and it was
* tacked on *after* computing minmatchall/maxmatchall). Thus, we
* should match if there are at least minmatchall characters.
*/
if (nchr < d->cnfa->minmatchall)
return 0;
return 1;
}
/* initialize and startup, or restart, if necessary */
if (cp == NULL || cp > probe)
{

View File

@ -77,6 +77,10 @@ pg_regprefix(regex_t *re,
assert(g->tree != NULL);
cnfa = &g->tree->cnfa;
/* matchall NFAs never have a fixed prefix */
if (cnfa->flags & MATCHALL)
return REG_NOMATCH;
/*
* Since a correct NFA should never contain any exit-free loops, it should
* not be possible for our traversal to return to a previously visited NFA

View File

@ -331,6 +331,9 @@ struct nfa
struct colormap *cm; /* the color map */
color bos[2]; /* colors, if any, assigned to BOS and BOL */
color eos[2]; /* colors, if any, assigned to EOS and EOL */
int flags; /* flags to pass forward to cNFA */
int minmatchall; /* min number of chrs to match, if matchall */
int maxmatchall; /* max number of chrs to match, or DUPINF */
struct vars *v; /* simplifies compile error reporting */
struct nfa *parent; /* parent NFA, if any */
};
@ -353,6 +356,14 @@ struct nfa
*
* Note that in a plain arc, "co" can be RAINBOW; since that's negative,
* it doesn't break the rule about how to recognize LACON arcs.
*
* We have special markings for "trivial" NFAs that can match any string
* (possibly with limits on the number of characters therein). In such a
* case, flags & MATCHALL is set (and HASLACONS can't be set). Then the
* fields minmatchall and maxmatchall give the minimum and maximum numbers
* of characters to match. For example, ".*" produces minmatchall = 0
* and maxmatchall = DUPINF, while ".+" produces minmatchall = 1 and
* maxmatchall = DUPINF.
*/
struct carc
{
@ -366,6 +377,7 @@ struct cnfa
int ncolors; /* number of colors (max color in use + 1) */
int flags;
#define HASLACONS 01 /* uses lookaround constraints */
#define MATCHALL 02 /* matches all strings of a range of lengths */
int pre; /* setup state number */
int post; /* teardown state number */
color bos[2]; /* colors, if any, assigned to BOS and BOL */
@ -375,6 +387,9 @@ struct cnfa
struct carc **states; /* vector of pointers to outarc lists */
/* states[n] are pointers into a single malloc'd array of arcs */
struct carc *arcs; /* the area for the lists */
/* these fields are used only in a MATCHALL NFA (else they're -1): */
int minmatchall; /* min number of chrs to match */
int maxmatchall; /* max number of chrs to match, or DUPINF */
};
/*