diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 08f08322ca..6c189bfed2 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -6166,6 +6166,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; The subexpression must entirely precede the back reference in the RE. Subexpressions are numbered in the order of their leading parentheses. Non-capturing parentheses do not define subexpressions. + The back reference considers only the string characters matched by the + referenced subexpression, not any constraints contained in it. For + example, (^\d)\1 will match 22. diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index a10a346e8f..77b860cb0f 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -1382,6 +1382,77 @@ duptraverse(struct nfa *nfa, } } +/* + * removeconstraints - remove any constraints in an NFA + * + * Constraint arcs are replaced by empty arcs, essentially treating all + * constraints as automatically satisfied. 
+ */ +static void +removeconstraints(struct nfa *nfa, + struct state *start, /* process subNFA starting here */ + struct state *stop) /* and stopping here */ +{ + if (start == stop) + return; + + stop->tmp = stop; /* pre-mark stop as visited so the traversal halts there */ + removetraverse(nfa, start); + /* done, except for clearing out the tmp pointers used as visited marks */ + + stop->tmp = NULL; + cleartraverse(nfa, start); +} + +/* + * removetraverse - recursive heart of removeconstraints; visits each not-yet-marked state reachable from s via out-arcs and replaces its constraint out-arcs with EMPTY arcs + */ +static void +removetraverse(struct nfa *nfa, + struct state *s) +{ + struct arc *a; + struct arc *oa; + + /* Since this is recursive, it could be driven to stack overflow */ + if (STACK_TOO_DEEP(nfa->v->re)) + { + NERR(REG_ETOOBIG); + return; + } + + if (s->tmp != NULL) + return; /* already done (or pre-marked by removeconstraints) */ + + s->tmp = s; /* mark as visited before recursing */ + for (a = s->outs; a != NULL && !NISERR(); a = oa) + { + removetraverse(nfa, a->to); + if (NISERR()) + break; + oa = a->outchain; /* save next arc now; a may be freed below */ + switch (a->type) + { + case PLAIN: + case EMPTY: + /* not a constraint arc; nothing to do */ + break; + case AHEAD: + case BEHIND: + case '^': + case '$': + case LACON: + /* constraint arc; replace it with an EMPTY arc from s to a->to */ + newarc(nfa, EMPTY, 0, s, a->to); + freearc(nfa, a); + break; + default: + NERR(REG_ASSERT); /* any other arc type is unexpected here */ + break; + } + } +} + /* * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set */ diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 1f7fa513b2..3c7627a955 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -150,6 +150,8 @@ static void delsub(struct nfa *, struct state *, struct state *); static void deltraverse(struct nfa *, struct state *, struct state *); static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *); static void duptraverse(struct nfa *, struct state *, struct state *); +static void removeconstraints(struct nfa *, struct state *, struct state *); +static void removetraverse(struct nfa *, struct state *); static void cleartraverse(struct nfa *, struct state *); static struct state *single_color_transition(struct state *, struct 
state *); static void specialcolors(struct nfa *); @@ -1182,6 +1184,10 @@ parseqatom(struct vars *v, dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end, atom->begin, atom->end); NOERR(); + + /* The backref node's NFA should not enforce any constraints */ + removeconstraints(v->nfa, atom->begin, atom->end); + NOERR(); } /* diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out index 5d993f40c2..01d50ec1e3 100644 --- a/src/test/modules/test_regex/expected/test_regex.out +++ b/src/test/modules/test_regex/expected/test_regex.out @@ -2636,6 +2636,28 @@ select * from test_regex('^(.+)( \1)+$', 'abc abc abd', 'RP'); {2,REG_UBACKREF,REG_UNONPOSIX} (1 row) +-- back reference only matches the string, not any constraints +select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP'); + test_regex +-------------------------------------------- + {1,REG_UBACKREF,REG_UNONPOSIX,REG_ULOCALE} + {"abc abc abc",abc} +(2 rows) + +select * from test_regex('(^\w+\M).*\1', 'abc abcd abd', 'LRP'); + test_regex +-------------------------------------------- + {1,REG_UBACKREF,REG_UNONPOSIX,REG_ULOCALE} + {"abc abc",abc} +(2 rows) + +select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP'); + test_regex +------------------------------------------------------------ + {1,REG_UBACKREF,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE} + {"abc abc",abc} +(2 rows) + -- doing 15 "octal escapes vs back references" -- # initial zero is always octal -- expectMatch 15.1 MP "a\\010b" "a\bb" "a\bb" diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql index b99329391e..7f5bc6e418 100644 --- a/src/test/modules/test_regex/sql/test_regex.sql +++ b/src/test/modules/test_regex/sql/test_regex.sql @@ -770,6 +770,11 @@ select * from test_regex('^(.+)( \1)+$', 'abc abd abc', 'RP'); -- expectNomatch 14.29 RP {^(.+)( \1)+$} {abc abc abd} select * from test_regex('^(.+)( \1)+$', 'abc abc 
abd', 'RP'); +-- back reference only matches the string, not any constraints +select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP'); +select * from test_regex('(^\w+\M).*\1', 'abc abcd abd', 'LRP'); +select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP'); + -- doing 15 "octal escapes vs back references" -- # initial zero is always octal