diff --git a/src/backend/regex/README b/src/backend/regex/README index cafeb3dffb..e4b083664f 100644 --- a/src/backend/regex/README +++ b/src/backend/regex/README @@ -130,8 +130,8 @@ possibilities. As an example, consider the regex "(a[bc]+)\1". The compiled representation will have a top-level concatenation subre node. Its first -child is a capture node, and the child of that is a plain DFA node for -"a[bc]+". The concatenation's second child is a backref node for \1. +child is a plain DFA node for "a[bc]+" (which is marked as being a capture +node). The concatenation's second child is a backref node for \1. The DFA associated with the concatenation node will be "a[bc]+a[bc]+", where the backref has been replaced by a copy of the DFA for its referent expression. When executed, the concatenation node will have to search for @@ -147,6 +147,17 @@ run much faster than a pure NFA engine could do. It is this behavior that justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's library. +It's perhaps worth noting that separate capture subre nodes are a rarity: +normally, we just mark a subre as capturing and that's it. However, it's +legal to write a regex like "((x))" in which the same substring has to be +captured by multiple sets of parentheses. Since a subre has room for only +one "capno" field, a single subre can't handle that. We handle such cases +by wrapping the base subre (which captures the innermost parens) in a +no-op capture node, or even more than one for "(((x)))" etc. This is a +little bit inefficient because we end up with multiple identical NFAs, +but since the case is pointless and infrequent, it's not worth working +harder. + Colors and colormapping ----------------------- diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index c0680b280b..0cd4b4c4c2 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -452,7 +452,7 @@ pg_regcomp(regex_t *re, #endif /* Prepend .* to pattern if it's a lookbehind LACON */ - nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug); + nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->latype), debug); } CNOERR(); if (v->tree->flags & SHORTER) @@ -944,7 +944,13 @@ parseqatom(struct vars *v, else atomtype = PLAIN; /* something that's not '(' */ NEXT(); - /* need new endpoints because tree will contain pointers */ + + /* + * Make separate endpoints to ensure we keep this sub-NFA cleanly + * separate from what surrounds it. We need to be sure that when + * we duplicate the sub-NFA for a backref, we get the right states + * and no others. + */ s = newstate(v->nfa); s2 = newstate(v->nfa); NOERR(); @@ -959,11 +965,21 @@ parseqatom(struct vars *v, { assert(v->subs[subno] == NULL); v->subs[subno] = atom; - t = subre(v, '(', atom->flags | CAP, lp, rp); - NOERR(); - t->subno = subno; - t->child = atom; - atom = t; + if (atom->capno == 0) + { + /* normal case: just mark the atom as capturing */ + atom->flags |= CAP; + atom->capno = subno; + } + else + { + /* generate no-op wrapper node to handle "((x))" */ + t = subre(v, '(', atom->flags | CAP, lp, rp); + NOERR(); + t->capno = subno; + t->child = atom; + atom = t; + } } /* postpone everything else pending possible {0} */ break; @@ -976,7 +992,7 @@ parseqatom(struct vars *v, atom = subre(v, 'b', BACKR, lp, rp); NOERR(); subno = v->nextvalue; - atom->subno = subno; + atom->backno = subno; EMPTYARC(lp, rp); /* temporarily, so there's something */ NEXT(); break; @@ -1276,8 +1292,10 @@ parseqatom(struct vars *v, freesubre(v, top->child); top->op = t->op; top->flags = t->flags; + top->latype = t->latype; top->id = t->id; - top->subno = t->subno; + top->capno = t->capno; + top->backno = t->backno; top->min = t->min; top->max = t->max; top->child = t->child; @@ -1790,8 +1808,10 @@ subre(struct vars *v, ret->op = op; ret->flags = flags; + ret->latype = (char) -1; ret->id = 0; /* will be assigned later */ - ret->subno = 0; + ret->capno = 0; + ret->backno = 0; ret->min = ret->max = 1; ret->child = NULL; ret->sibling = NULL; @@ -1893,7 +1913,7 @@ numst(struct subre *t, assert(t != NULL); i = start; - t->id = (short) i++; + t->id = i++; for (t2 = t->child; t2 != NULL; t2 = t2->sibling) i = numst(t2, i); return i; @@ -2040,7 +2060,7 @@ newlacon(struct vars *v, sub = &v->lacons[n]; sub->begin = begin; sub->end = end; - sub->subno = latype; + sub->latype = latype; ZAPCNFA(sub->cnfa); return n; } @@ -2163,7 +2183,7 @@ dump(regex_t *re, struct subre *lasub = &g->lacons[i]; const char *latype; - switch (lasub->subno) + switch (lasub->latype) { case LATYPE_AHEAD_POS: latype = "positive lookahead"; @@ -2227,8 +2247,12 @@ stdump(struct subre *t, fprintf(f, " hasbackref"); if (!(t->flags & INUSE)) fprintf(f, " UNUSED"); - if (t->subno != 0) - fprintf(f, " (#%d)", t->subno); + if (t->latype != (char) -1) + fprintf(f, " latype(%d)", t->latype); + if (t->capno != 0) + fprintf(f, " capture(%d)", t->capno); + if (t->backno != 0) + fprintf(f, " backref(%d)", t->backno); if (t->min != 1 || t->max != 1) { fprintf(f, " {%d,", t->min); diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index 89d162ed6a..957ceb8137 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -825,12 +825,12 @@ lacon(struct vars *v, d = getladfa(v, n); if (d == NULL) return 0; - if (LATYPE_IS_AHEAD(sub->subno)) + if (LATYPE_IS_AHEAD(sub->latype)) { /* used to use longest() here, but shortest() could be much cheaper */ end = shortest(v, d, cp, cp, v->stop, (chr **) NULL, (int *) NULL); - satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL); + satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL); } else { @@ -843,7 +843,7 @@ lacon(struct vars *v, * nominal match. */ satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]); - if (!LATYPE_IS_POS(sub->subno)) + if (!LATYPE_IS_POS(sub->latype)) satisfied = !satisfied; } FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied)); diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 4541cf9a7e..2a1ef0176a 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -640,13 +640,11 @@ static void zaptreesubs(struct vars *v, struct subre *t) { + int n = t->capno; struct subre *t2; - if (t->op == '(') + if (n > 0) { - int n = t->subno; - - assert(n > 0); if ((size_t) n < v->nmatch) { v->pmatch[n].rm_so = -1; @@ -667,7 +665,7 @@ subset(struct vars *v, chr *begin, chr *end) { - int n = sub->subno; + int n = sub->capno; assert(n > 0); if ((size_t) n >= v->nmatch) @@ -739,12 +737,10 @@ cdissect(struct vars *v, else er = citerdissect(v, t, begin, end); break; - case '(': /* capturing */ + case '(': /* no-op capture node */ assert(t->child != NULL); - assert(t->subno > 0); + assert(t->capno > 0); er = cdissect(v, t->child, begin, end); - if (er == REG_OKAY) - subset(v, t, begin, end); break; default: er = REG_ASSERT; @@ -758,6 +754,12 @@ cdissect(struct vars *v, */ assert(er != REG_NOMATCH || (t->flags & BACKR)); + /* + * If this node is marked as capturing, save successful match's location. + */ + if (t->capno > 0 && er == REG_OKAY) + subset(v, t, begin, end); + return er; } @@ -932,7 +934,7 @@ cbrdissect(struct vars *v, chr *begin, /* beginning of relevant substring */ chr *end) /* end of same */ { - int n = t->subno; + int n = t->backno; size_t numreps; size_t tlen; size_t brlen; diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index dc31899aa4..2b48a19fb7 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -56,21 +56,23 @@ typedef struct { int re_magic; /* magic number */ size_t re_nsub; /* number of subexpressions */ - long re_info; /* information about RE */ -#define REG_UBACKREF 000001 -#define REG_ULOOKAROUND 000002 -#define REG_UBOUNDS 000004 -#define REG_UBRACES 000010 -#define REG_UBSALNUM 000020 -#define REG_UPBOTCH 000040 -#define REG_UBBS 000100 -#define REG_UNONPOSIX 000200 -#define REG_UUNSPEC 000400 -#define REG_UUNPORT 001000 -#define REG_ULOCALE 002000 -#define REG_UEMPTYMATCH 004000 -#define REG_UIMPOSSIBLE 010000 -#define REG_USHORTEST 020000 + long re_info; /* bitmask of the following flags: */ +#define REG_UBACKREF 000001 /* has back-reference (\n) */ +#define REG_ULOOKAROUND 000002 /* has lookahead/lookbehind constraint */ +#define REG_UBOUNDS 000004 /* has bounded quantifier ({m,n}) */ +#define REG_UBRACES 000010 /* has { that doesn't begin a quantifier */ +#define REG_UBSALNUM 000020 /* has backslash-alphanumeric in non-ARE */ +#define REG_UPBOTCH 000040 /* has unmatched right paren in ERE (legal + * per spec, but that was a mistake) */ +#define REG_UBBS 000100 /* has backslash within bracket expr */ +#define REG_UNONPOSIX 000200 /* has any construct that extends POSIX */ +#define REG_UUNSPEC 000400 /* has any case disallowed by POSIX, e.g. + * an empty branch */ +#define REG_UUNPORT 001000 /* has numeric character code dependency */ +#define REG_ULOCALE 002000 /* has locale dependency */ +#define REG_UEMPTYMATCH 004000 /* can match a zero-length string */ +#define REG_UIMPOSSIBLE 010000 /* provably cannot match anything */ +#define REG_USHORTEST 020000 /* has non-greedy quantifier */ int re_csize; /* sizeof(character) */ char *re_endp; /* backward compatibility kludge */ Oid re_collation; /* Collation that defines LC_CTYPE behavior */ diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 90ee16957a..306525eb5f 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -422,7 +422,7 @@ struct cnfa * "op" is one of: * '=' plain regex without interesting substructure (implemented as DFA) * 'b' back-reference (has no substructure either) - * '(' capture node: captures the match of its single child + * '(' no-op capture node: captures the match of its single child * '.' concatenation: matches a match for first child, then second child * '|' alternation: matches a match for any of its children * '*' iteration: matches some number of matches of its single child @@ -446,8 +446,8 @@ struct subre #define LONGER 01 /* prefers longer match */ #define SHORTER 02 /* prefers shorter match */ #define MIXED 04 /* mixed preference below */ -#define CAP 010 /* capturing parens below */ -#define BACKR 020 /* back reference below */ +#define CAP 010 /* capturing parens here or below */ +#define BACKR 020 /* back reference here or below */ #define INUSE 0100 /* in use in final tree */ #define NOPROP 03 /* bits which may not propagate up */ #define LMIX(f) ((f)<<2) /* LONGER -> MIXED */ @@ -457,9 +457,10 @@ struct subre #define PREF(f) ((f)&NOPROP) #define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) - short id; /* ID of subre (1..ntree-1) */ - int subno; /* subexpression number for 'b' and '(', or - * LATYPE code for lookaround constraint */ + char latype; /* LATYPE code, if lookaround constraint */ + int id; /* ID of subre (1..ntree-1) */ + int capno; /* if capture node, subno to capture into */ + int backno; /* if backref node, subno it refers to */ short min; /* min repetitions for iteration or backref */ short max; /* max repetitions for iteration or backref */ struct subre *child; /* first child, if any (also freelist chain) */