Rethink regexp engine's backref-related compilation state.

I had committer's remorse almost immediately after pushing cb76fbd7e,
upon finding that removing capturing subexpressions' subREs from the
data structure broke my proposed patch for REG_NOSUB optimization.
Revert that data structure change.  Instead, address the concern
about not changing capturing subREs' endpoints by not changing the
endpoints.  We don't need to, because the point of that bit was just
to ensure that the atom has endpoints distinct from the outer state
pair that we're stringing the branch between.  We already made
suitable states in the parenthesized-subexpression case, so the
additional ones were just useless overhead.  This seems more
understandable than Spencer's original coding, and it ought to be
a shade faster too by saving a few state creations and arc changes.
(I actually see a couple percent improvement on Jacobson's web
corpus, though that's barely above the noise floor so I wouldn't
put much stock in that result.)

Also, fix the logic added by ea1268f63 to ensure that the subRE
recorded in v->subs[subno] is exactly the one with capno == subno.
Spencer's original coding recorded the child subRE of the capture
node, which is okay so far as having the right endpoint states is
concerned, but as of cb76fbd7e the capturing subRE itself always
has those endpoints too.  I think the inconsistency is confusing
for the REG_NOSUB optimization.

As before, backpatch to v14.

Discussion: https://postgr.es/m/0203588E-E609-43AF-9F4F-902854231EE7@enterprisedb.com
This commit is contained in:
Tom Lane 2021-08-08 11:56:29 -04:00
parent 75a2d132ea
commit 00116dee5a
1 changed files with 47 additions and 37 deletions

View File

@ -233,13 +233,6 @@ static int cmp(const chr *, const chr *, size_t);
static int casecmp(const chr *, const chr *, size_t);
/* info we need during compilation about a known capturing subexpression */
struct subinfo
{
struct state *left; /* left end of its sub-NFA */
struct state *right; /* right end of its sub-NFA */
};
/* internal variables, bundled for easy passing around */
struct vars
{
@ -252,10 +245,10 @@ struct vars
int nexttype; /* type of next token */
chr nextvalue; /* value (if any) of next token */
int lexcon; /* lexical context type (see regc_lex.c) */
int nsubexp; /* number of known capturing subexpressions */
struct subinfo *subs; /* info about known capturing subexpressions */
size_t nsubs; /* allocated length of subs[] vector */
struct subinfo sub10[10]; /* initial vector, enough for most */
int nsubexp; /* subexpression count */
struct subre **subs; /* subRE pointer vector */
size_t nsubs; /* length of vector */
struct subre *sub10[10]; /* initial vector, enough for most */
struct nfa *nfa; /* the NFA */
struct colormap *cm; /* character color map */
color nlcolor; /* color of newline */
@ -375,7 +368,7 @@ pg_regcomp(regex_t *re,
v->subs = v->sub10;
v->nsubs = 10;
for (j = 0; j < v->nsubs; j++)
v->subs[j].left = v->subs[j].right = NULL;
v->subs[j] = NULL;
v->nfa = NULL;
v->cm = NULL;
v->nlcolor = COLORLESS;
@ -511,13 +504,13 @@ pg_regcomp(regex_t *re,
}
/*
* moresubs - enlarge capturing-subexpressions vector
* moresubs - enlarge subRE vector
*/
static void
moresubs(struct vars *v,
int wanted) /* want enough room for this one */
{
struct subinfo *p;
struct subre **p;
size_t n;
assert(wanted > 0 && (size_t) wanted >= v->nsubs);
@ -525,13 +518,13 @@ moresubs(struct vars *v,
if (v->subs == v->sub10)
{
p = (struct subinfo *) MALLOC(n * sizeof(struct subinfo));
p = (struct subre **) MALLOC(n * sizeof(struct subre *));
if (p != NULL)
memcpy(VS(p), VS(v->subs),
v->nsubs * sizeof(struct subinfo));
v->nsubs * sizeof(struct subre *));
}
else
p = (struct subinfo *) REALLOC(v->subs, n * sizeof(struct subinfo));
p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
if (p == NULL)
{
ERR(REG_ESPACE);
@ -539,7 +532,7 @@ moresubs(struct vars *v,
}
v->subs = p;
for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
p->left = p->right = NULL;
*p = NULL;
assert(v->nsubs == n);
assert((size_t) wanted < v->nsubs);
}
@ -988,6 +981,7 @@ parseqatom(struct vars *v,
s = newstate(v->nfa);
s2 = newstate(v->nfa);
NOERRN();
/* We may not need these arcs, but keep things connected for now */
EMPTYARC(lp, s);
EMPTYARC(s2, rp);
NOERRN();
@ -997,10 +991,6 @@ parseqatom(struct vars *v,
NOERRN();
if (cap)
{
/* save the sub-NFA's endpoints for future backrefs to use */
assert(v->subs[subno].left == NULL);
v->subs[subno].left = s;
v->subs[subno].right = s2;
if (atom->capno == 0)
{
/* normal case: just mark the atom as capturing */
@ -1016,13 +1006,15 @@ parseqatom(struct vars *v,
t->child = atom;
atom = t;
}
assert(v->subs[subno] == NULL);
v->subs[subno] = atom;
}
/* postpone everything else pending possible {0} */
break;
case BACKREF: /* the Feature From The Black Lagoon */
INSIST(type != LACON, REG_ESUBREG);
INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
INSIST(v->subs[v->nextvalue].left != NULL, REG_ESUBREG);
INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
NOERRN();
assert(v->nextvalue > 0);
atom = subre(v, 'b', BACKR, lp, rp);
@ -1097,7 +1089,7 @@ parseqatom(struct vars *v,
if (atom != NULL)
freesubre(v, atom);
if (atomtype == '(')
v->subs[subno].left = v->subs[subno].right = NULL;
v->subs[subno] = NULL;
delsub(v->nfa, lp, rp);
EMPTYARC(lp, rp);
return top;
@ -1130,30 +1122,48 @@ parseqatom(struct vars *v,
NOERRN();
}
/*
* For what follows, we need the atom to have its own begin/end states
* that are distinct from lp/rp, so that we can wrap iteration structure
* around it. The parenthesized-atom case above already made suitable
* states (and we don't want to modify a capturing subre, since it's
* already recorded in v->subs[]). Otherwise, we need more states.
*/
if (atom->begin == lp || atom->end == rp)
{
s = newstate(v->nfa);
s2 = newstate(v->nfa);
NOERRN();
moveouts(v->nfa, lp, s);
moveins(v->nfa, rp, s2);
atom->begin = s;
atom->end = s2;
}
else
{
/* The atom's OK, but we must temporarily disconnect it from lp/rp */
/* (this removes the EMPTY arcs we made above) */
delsub(v->nfa, lp, atom->begin);
delsub(v->nfa, atom->end, rp);
}
/*----------
* Prepare a general-purpose state skeleton.
*
* In the no-backrefs case, we want this:
*
* [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
* [lp] ---> [s] ---prefix---> ---atom---> ---rest---> [rp]
*
* where prefix is some repetitions of atom. In the general case we need
* where prefix is some repetitions of atom, and "rest" is the remainder
* of the branch. In the general case we need:
*
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
*
* where the iterator wraps around [begin] ---atom---> [end]
* where the iterator wraps around the atom.
*
* We make the s state here for both cases; s2 is made below if needed
*----------
*/
s = newstate(v->nfa); /* first, new endpoints for the atom */
s2 = newstate(v->nfa);
NOERRN();
moveouts(v->nfa, lp, s);
moveins(v->nfa, rp, s2);
NOERRN();
atom->begin = s;
atom->end = s2;
s = newstate(v->nfa); /* set up starting state */
NOERRN();
EMPTYARC(lp, s);
@ -1190,14 +1200,14 @@ parseqatom(struct vars *v,
{
assert(atom->begin->nouts == 1); /* just the EMPTY */
delsub(v->nfa, atom->begin, atom->end);
assert(v->subs[subno].left != NULL);
assert(v->subs[subno] != NULL);
/*
* And here's why the recursion got postponed: it must wait until the
* skeleton is filled in, because it may hit a backref that wants to
* copy the filled-in skeleton.
*/
dupnfa(v->nfa, v->subs[subno].left, v->subs[subno].right,
dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
atom->begin, atom->end);
NOERRN();