From ea1268f6301cc7adce571cc9c5ebe8d9342a2ef4 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 20 Feb 2021 19:26:41 -0500 Subject: [PATCH] Avoid generating extra subre tree nodes for capturing parentheses. Previously, each pair of capturing parentheses gave rise to a separate subre tree node, whose only function was to identify that we ought to capture the match details for this particular sub-expression. In most cases we don't really need that, since we can perfectly well put a "capture this" annotation on the child node that does the real matching work. As with the two preceding commits, the main value of this is to avoid generating and optimizing an NFA for a tree node that's not really pulling its weight. The chosen data representation only allows one capture annotation per subre node. In the legal-per-spec, but seemingly not very useful, case where there are multiple capturing parens around the exact same bit of the regex (i.e. "((xyz))"), wrap the child node in N-1 capture nodes that act the same as before. We could work harder at that but I'll refrain, pending some evidence that such cases are worth troubling over. In passing, improve the comments in regex.h to say what all the different re_info bits mean. Some of them were pretty obvious but others not so much, so reverse-engineer some documentation. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us --- src/backend/regex/README | 15 ++++++++-- src/backend/regex/regcomp.c | 54 ++++++++++++++++++++++++++---------- src/backend/regex/rege_dfa.c | 6 ++-- src/backend/regex/regexec.c | 22 ++++++++------- src/include/regex/regex.h | 32 +++++++++++---------- src/include/regex/regguts.h | 13 +++++---- 6 files changed, 91 insertions(+), 51 deletions(-) diff --git a/src/backend/regex/README b/src/backend/regex/README index cafeb3dffb..e4b083664f 100644 --- a/src/backend/regex/README +++ b/src/backend/regex/README @@ -130,8 +130,8 @@ possibilities. As an example, consider the regex "(a[bc]+)\1". The compiled representation will have a top-level concatenation subre node. Its first -child is a capture node, and the child of that is a plain DFA node for -"a[bc]+". The concatenation's second child is a backref node for \1. +child is a plain DFA node for "a[bc]+" (which is marked as being a capture +node). The concatenation's second child is a backref node for \1. The DFA associated with the concatenation node will be "a[bc]+a[bc]+", where the backref has been replaced by a copy of the DFA for its referent expression. When executed, the concatenation node will have to search for @@ -147,6 +147,17 @@ run much faster than a pure NFA engine could do. It is this behavior that justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's library. +It's perhaps worth noting that separate capture subre nodes are a rarity: +normally, we just mark a subre as capturing and that's it. However, it's +legal to write a regex like "((x))" in which the same substring has to be +captured by multiple sets of parentheses. Since a subre has room for only +one "capno" field, a single subre can't handle that. We handle such cases +by wrapping the base subre (which captures the innermost parens) in a +no-op capture node, or even more than one for "(((x)))" etc. This is a +little bit inefficient because we end up with multiple identical NFAs, +but since the case is pointless and infrequent, it's not worth working +harder. + Colors and colormapping ----------------------- diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index c0680b280b..0cd4b4c4c2 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -452,7 +452,7 @@ pg_regcomp(regex_t *re, #endif /* Prepend .* to pattern if it's a lookbehind LACON */ - nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug); + nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->latype), debug); } CNOERR(); if (v->tree->flags & SHORTER) @@ -944,7 +944,13 @@ parseqatom(struct vars *v, else atomtype = PLAIN; /* something that's not '(' */ NEXT(); - /* need new endpoints because tree will contain pointers */ + + /* + * Make separate endpoints to ensure we keep this sub-NFA cleanly + * separate from what surrounds it. We need to be sure that when + * we duplicate the sub-NFA for a backref, we get the right states + * and no others. + */ s = newstate(v->nfa); s2 = newstate(v->nfa); NOERR(); @@ -959,11 +965,21 @@ parseqatom(struct vars *v, { assert(v->subs[subno] == NULL); v->subs[subno] = atom; - t = subre(v, '(', atom->flags | CAP, lp, rp); - NOERR(); - t->subno = subno; - t->child = atom; - atom = t; + if (atom->capno == 0) + { + /* normal case: just mark the atom as capturing */ + atom->flags |= CAP; + atom->capno = subno; + } + else + { + /* generate no-op wrapper node to handle "((x))" */ + t = subre(v, '(', atom->flags | CAP, lp, rp); + NOERR(); + t->capno = subno; + t->child = atom; + atom = t; + } } /* postpone everything else pending possible {0} */ break; @@ -976,7 +992,7 @@ parseqatom(struct vars *v, atom = subre(v, 'b', BACKR, lp, rp); NOERR(); subno = v->nextvalue; - atom->subno = subno; + atom->backno = subno; EMPTYARC(lp, rp); /* temporarily, so there's something */ NEXT(); break; @@ -1276,8 +1292,10 @@ parseqatom(struct vars *v, freesubre(v, top->child); top->op = t->op; top->flags = t->flags; + top->latype = t->latype; top->id = t->id; - top->subno = t->subno; + top->capno = t->capno; + top->backno = t->backno; top->min = t->min; top->max = t->max; top->child = t->child; @@ -1790,8 +1808,10 @@ subre(struct vars *v, ret->op = op; ret->flags = flags; + ret->latype = (char) -1; ret->id = 0; /* will be assigned later */ - ret->subno = 0; + ret->capno = 0; + ret->backno = 0; ret->min = ret->max = 1; ret->child = NULL; ret->sibling = NULL; @@ -1893,7 +1913,7 @@ numst(struct subre *t, assert(t != NULL); i = start; - t->id = (short) i++; + t->id = i++; for (t2 = t->child; t2 != NULL; t2 = t2->sibling) i = numst(t2, i); return i; @@ -2040,7 +2060,7 @@ newlacon(struct vars *v, sub = &v->lacons[n]; sub->begin = begin; sub->end = end; - sub->subno = latype; + sub->latype = latype; ZAPCNFA(sub->cnfa); return n; } @@ -2163,7 +2183,7 @@ dump(regex_t *re, struct subre *lasub = &g->lacons[i]; const char *latype; - switch (lasub->subno) + switch (lasub->latype) { case LATYPE_AHEAD_POS: latype = "positive lookahead"; @@ -2227,8 +2247,12 @@ stdump(struct subre *t, fprintf(f, " hasbackref"); if (!(t->flags & INUSE)) fprintf(f, " UNUSED"); - if (t->subno != 0) - fprintf(f, " (#%d)", t->subno); + if (t->latype != (char) -1) + fprintf(f, " latype(%d)", t->latype); + if (t->capno != 0) + fprintf(f, " capture(%d)", t->capno); + if (t->backno != 0) + fprintf(f, " backref(%d)", t->backno); if (t->min != 1 || t->max != 1) { fprintf(f, " {%d,", t->min); diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index 89d162ed6a..957ceb8137 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -825,12 +825,12 @@ lacon(struct vars *v, d = getladfa(v, n); if (d == NULL) return 0; - if (LATYPE_IS_AHEAD(sub->subno)) + if (LATYPE_IS_AHEAD(sub->latype)) { /* used to use longest() here, but shortest() could be much cheaper */ end = shortest(v, d, cp, cp, v->stop, (chr **) NULL, (int *) NULL); - satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL); + satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL); } else { @@ -843,7 +843,7 @@ lacon(struct vars *v, * nominal match. */ satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]); - if (!LATYPE_IS_POS(sub->subno)) + if (!LATYPE_IS_POS(sub->latype)) satisfied = !satisfied; } FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied)); diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 4541cf9a7e..2a1ef0176a 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -640,13 +640,11 @@ static void zaptreesubs(struct vars *v, struct subre *t) { + int n = t->capno; struct subre *t2; - if (t->op == '(') + if (n > 0) { - int n = t->subno; - - assert(n > 0); if ((size_t) n < v->nmatch) { v->pmatch[n].rm_so = -1; @@ -667,7 +665,7 @@ subset(struct vars *v, chr *begin, chr *end) { - int n = sub->subno; + int n = sub->capno; assert(n > 0); if ((size_t) n >= v->nmatch) @@ -739,12 +737,10 @@ cdissect(struct vars *v, else er = citerdissect(v, t, begin, end); break; - case '(': /* capturing */ + case '(': /* no-op capture node */ assert(t->child != NULL); - assert(t->subno > 0); + assert(t->capno > 0); er = cdissect(v, t->child, begin, end); - if (er == REG_OKAY) - subset(v, t, begin, end); break; default: er = REG_ASSERT; @@ -758,6 +754,12 @@ cdissect(struct vars *v, */ assert(er != REG_NOMATCH || (t->flags & BACKR)); + /* + * If this node is marked as capturing, save successful match's location. + */ + if (t->capno > 0 && er == REG_OKAY) + subset(v, t, begin, end); + return er; } @@ -932,7 +934,7 @@ cbrdissect(struct vars *v, chr *begin, /* beginning of relevant substring */ chr *end) /* end of same */ { - int n = t->subno; + int n = t->backno; size_t numreps; size_t tlen; size_t brlen; diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index dc31899aa4..2b48a19fb7 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -56,21 +56,23 @@ typedef struct { int re_magic; /* magic number */ size_t re_nsub; /* number of subexpressions */ - long re_info; /* information about RE */ -#define REG_UBACKREF 000001 -#define REG_ULOOKAROUND 000002 -#define REG_UBOUNDS 000004 -#define REG_UBRACES 000010 -#define REG_UBSALNUM 000020 -#define REG_UPBOTCH 000040 -#define REG_UBBS 000100 -#define REG_UNONPOSIX 000200 -#define REG_UUNSPEC 000400 -#define REG_UUNPORT 001000 -#define REG_ULOCALE 002000 -#define REG_UEMPTYMATCH 004000 -#define REG_UIMPOSSIBLE 010000 -#define REG_USHORTEST 020000 + long re_info; /* bitmask of the following flags: */ +#define REG_UBACKREF 000001 /* has back-reference (\n) */ +#define REG_ULOOKAROUND 000002 /* has lookahead/lookbehind constraint */ +#define REG_UBOUNDS 000004 /* has bounded quantifier ({m,n}) */ +#define REG_UBRACES 000010 /* has { that doesn't begin a quantifier */ +#define REG_UBSALNUM 000020 /* has backslash-alphanumeric in non-ARE */ +#define REG_UPBOTCH 000040 /* has unmatched right paren in ERE (legal + * per spec, but that was a mistake) */ +#define REG_UBBS 000100 /* has backslash within bracket expr */ +#define REG_UNONPOSIX 000200 /* has any construct that extends POSIX */ +#define REG_UUNSPEC 000400 /* has any case disallowed by POSIX, e.g. + * an empty branch */ +#define REG_UUNPORT 001000 /* has numeric character code dependency */ +#define REG_ULOCALE 002000 /* has locale dependency */ +#define REG_UEMPTYMATCH 004000 /* can match a zero-length string */ +#define REG_UIMPOSSIBLE 010000 /* provably cannot match anything */ +#define REG_USHORTEST 020000 /* has non-greedy quantifier */ int re_csize; /* sizeof(character) */ char *re_endp; /* backward compatibility kludge */ Oid re_collation; /* Collation that defines LC_CTYPE behavior */ diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 90ee16957a..306525eb5f 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -422,7 +422,7 @@ struct cnfa * "op" is one of: * '=' plain regex without interesting substructure (implemented as DFA) * 'b' back-reference (has no substructure either) - * '(' capture node: captures the match of its single child + * '(' no-op capture node: captures the match of its single child * '.' concatenation: matches a match for first child, then second child * '|' alternation: matches a match for any of its children * '*' iteration: matches some number of matches of its single child @@ -446,8 +446,8 @@ struct subre #define LONGER 01 /* prefers longer match */ #define SHORTER 02 /* prefers shorter match */ #define MIXED 04 /* mixed preference below */ -#define CAP 010 /* capturing parens below */ -#define BACKR 020 /* back reference below */ +#define CAP 010 /* capturing parens here or below */ +#define BACKR 020 /* back reference here or below */ #define INUSE 0100 /* in use in final tree */ #define NOPROP 03 /* bits which may not propagate up */ #define LMIX(f) ((f)<<2) /* LONGER -> MIXED */ @@ -457,9 +457,10 @@ struct subre #define PREF(f) ((f)&NOPROP) #define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) - short id; /* ID of subre (1..ntree-1) */ - int subno; /* subexpression number for 'b' and '(', or - * LATYPE code for lookaround constraint */ + char latype; /* LATYPE code, if lookaround constraint */ + int id; /* ID of subre (1..ntree-1) */ + int capno; /* if capture node, subno to capture into */ + int backno; /* if backref node, subno it refers to */ short min; /* min repetitions for iteration or backref */ short max; /* max repetitions for iteration or backref */ struct subre *child; /* first child, if any (also freelist chain) */