diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index 6d77c59e12..059cf64df0 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -777,6 +777,10 @@ sortouts_cmp(const void *a, const void *b) * However, if we have a whole lot of arcs to deal with, retail duplicate * checks become too slow. In that case we proceed by sorting and merging * the arc lists, and then we can indeed just update the arcs in-place. + * + * On the other hand, it's also true that this is frequently called with + * a brand-new newState that has no existing in-arcs. In that case, + * de-duplication is unnecessary, so we can just blindly move all the arcs. */ static void moveins(struct nfa *nfa, @@ -785,7 +789,18 @@ moveins(struct nfa *nfa, { assert(oldState != newState); - if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) + if (newState->nins == 0) + { + /* No need for de-duplication */ + struct arc *a; + + while ((a = oldState->ins) != NULL) + { + createarc(nfa, a->type, a->co, a->from, newState); + freearc(nfa, a); + } + } + else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) { /* With not too many arcs, just do them one at a time */ struct arc *a; @@ -869,6 +884,11 @@ moveins(struct nfa *nfa, /* * copyins - copy in arcs of a state to another state + * + * The comments for moveins() apply here as well. However, in current + * usage, this is *only* called with brand-new target states, so that + * only the "no need for de-duplication" code path is ever reached. + * We keep the rest #ifdef'd out in case it's needed in the future. */ static void copyins(struct nfa *nfa, @@ -876,8 +896,18 @@ copyins(struct nfa *nfa, struct state *newState) { assert(oldState != newState); + assert(newState->nins == 0); /* see comment above */ - if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) + if (newState->nins == 0) + { + /* No need for de-duplication */ + struct arc *a; + + for (a = oldState->ins; a != NULL; a = a->inchain) + createarc(nfa, a->type, a->co, a->from, newState); + } +#ifdef NOT_USED /* see comment above */ + else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) { /* With not too many arcs, just do them one at a time */ struct arc *a; @@ -944,6 +974,7 @@ copyins(struct nfa *nfa, createarc(nfa, a->type, a->co, a->from, newState); } } +#endif /* NOT_USED */ } /* @@ -1058,7 +1089,18 @@ moveouts(struct nfa *nfa, { assert(oldState != newState); - if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) + if (newState->nouts == 0) + { + /* No need for de-duplication */ + struct arc *a; + + while ((a = oldState->outs) != NULL) + { + createarc(nfa, a->type, a->co, newState, a->to); + freearc(nfa, a); + } + } + else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) { /* With not too many arcs, just do them one at a time */ struct arc *a; @@ -1142,6 +1184,8 @@ moveouts(struct nfa *nfa, /* * copyouts - copy out arcs of a state to another state + * + * See comments for copyins() */ static void copyouts(struct nfa *nfa, @@ -1149,8 +1193,18 @@ copyouts(struct nfa *nfa, struct state *newState) { assert(oldState != newState); + assert(newState->nouts == 0); /* see comment above */ - if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) + if (newState->nouts == 0) + { + /* No need for de-duplication */ + struct arc *a; + + for (a = oldState->outs; a != NULL; a = a->outchain) + createarc(nfa, a->type, a->co, newState, a->to); + } +#ifdef NOT_USED /* see comment above */ + else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) { /* With not too many arcs, just do them one at a time */ struct arc *a; @@ -1217,6 +1271,7 @@ copyouts(struct nfa *nfa, createarc(nfa, a->type, a->co, newState, a->to); } } +#endif /* NOT_USED */ } /* @@ -1975,6 +2030,7 @@ combine(struct nfa *nfa, else if (a->co == RAINBOW) { /* con is incompatible if it's for a pseudocolor */ + /* (this is hypothetical; we make no such constraints today) */ if (nfa->cm->cd[con->co].flags & PSEUDO) return INCOMPATIBLE; /* otherwise, constraint constrains arc to be only its color */ @@ -2001,6 +2057,7 @@ combine(struct nfa *nfa, else if (a->co == RAINBOW) { /* con is incompatible if it's for a pseudocolor */ + /* (this is hypothetical; we make no such constraints today) */ if (nfa->cm->cd[con->co].flags & PSEUDO) return INCOMPATIBLE; /* otherwise, constraint constrains arc to be only its color */ @@ -3562,6 +3619,7 @@ carc_cmp(const void *a, const void *b) return -1; if (aa->to > bb->to) return +1; + /* This is unreached, since there should be no duplicate arcs now: */ return 0; } diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out index 5a6cdf47c2..6242d0baa9 100644 --- a/src/test/modules/test_regex/expected/test_regex.out +++ b/src/test/modules/test_regex/expected/test_regex.out @@ -937,6 +937,34 @@ select * from test_regex('a[[=x=]]', 'az', '+Lb'); {0,REG_ULOCALE} (1 row) +-- expectMatch 9.9b &iL {a[[=Y=]]} ay ay +select * from test_regex('a[[=Y=]]', 'ay', 'iL'); + test_regex +----------------- + {0,REG_ULOCALE} + {ay} +(2 rows) + +select * from test_regex('a[[=Y=]]', 'ay', 'iLb'); + test_regex +----------------- + {0,REG_ULOCALE} + {ay} +(2 rows) + +-- expectNomatch 9.9c &L {a[[=Y=]]} ay +select * from test_regex('a[[=Y=]]', 'ay', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} +(1 row) + +select * from test_regex('a[[=Y=]]', 'ay', 'Lb'); + test_regex +----------------- + {0,REG_ULOCALE} +(1 row) + -- expectError 9.10 & {a[0-[=x=]]} ERANGE select * from test_regex('a[0-[=x=]]', '', ''); ERROR: invalid regular expression: invalid character range @@ -2932,6 +2960,34 @@ select * from test_regex('a[^b-d]', 'aC', 'iMb'); {0,REG_UUNPORT} (1 row) +-- expectMatch 19.6 &iM {a[B-Z]} aC aC +select * from test_regex('a[B-Z]', 'aC', 'iM'); + test_regex +----------------- + {0,REG_UUNPORT} + {aC} +(2 rows) + +select * from test_regex('a[B-Z]', 'aC', 'iMb'); + test_regex +----------------- + {0,REG_UUNPORT} + {aC} +(2 rows) + +-- expectNomatch 19.7 &iM {a[^B-Z]} aC +select * from test_regex('a[^B-Z]', 'aC', 'iM'); + test_regex +----------------- + {0,REG_UUNPORT} +(1 row) + +select * from test_regex('a[^B-Z]', 'aC', 'iMb'); + test_regex +----------------- + {0,REG_UUNPORT} +(1 row) + -- doing 20 "directors and embedded options" -- expectError 20.1 & ***? BADPAT select * from test_regex('***?', '', ''); @@ -3850,6 +3906,14 @@ select * from test_regex('^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', 'foo/bar/baz', {foo/bar/baz,foo,bar,baz} (2 rows) +-- expectMatch 24.14 PRT {^(.+?)(?:/(.+?))(?:/(.+?)\3)?$} {foo/bar/baz/quux} {foo/bar/baz/quux} {foo} {bar/baz/quux} {} +select * from test_regex('^(.+?)(?:/(.+?))(?:/(.+?)\3)?$', 'foo/bar/baz/quux', 'PRT'); + test_regex +---------------------------------------------- + {3,REG_UBACKREF,REG_UNONPOSIX,REG_USHORTEST} + {foo/bar/baz/quux,foo,bar/baz/quux,NULL} +(2 rows) + -- doing 25 "mixed quantifiers" -- # this is very incomplete as yet -- # should include | @@ -4926,3 +4990,59 @@ select * from test_regex('(\Y)+', 'foo', 'LNP'); {"",""} (2 rows) +-- and now, tests not from either Spencer or the Tcl project +-- These cases exercise additional code paths in pushfwd()/push()/combine() +select * from test_regex('a\Y(?=45)', 'a45', 'HLP'); + test_regex +----------------------------------------------- + {0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE} + {a} +(2 rows) + +select * from test_regex('a(?=.)c', 'ac', 'HP'); + test_regex +----------------------------------- + {0,REG_ULOOKAROUND,REG_UNONPOSIX} + {ac} +(2 rows) + +select * from test_regex('a(?=.).*(?=3)3*', 'azz33', 'HP'); + test_regex +----------------------------------- + {0,REG_ULOOKAROUND,REG_UNONPOSIX} + {azz33} +(2 rows) + +select * from test_regex('a(?=\w)\w*(?=.).*', 'az3%', 'HLP'); + test_regex +----------------------------------------------- + {0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE} + {az3%} +(2 rows) + +-- These exercise the bulk-arc-movement paths in moveins() and moveouts(); +-- you may need to make them longer if you change BULK_ARC_OP_USE_SORT() +select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ(?:\w|a|b|c|d|e|f|0|1|2|3|4|5|6|Q)', + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ3', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {ABCDEFGHIJKLMNOPQRSTUVWXYZ3} +(2 rows) + +select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(\Y\Y)+', + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789Z', 'LP'); + test_regex +------------------------------------------- + {1,REG_UNONPOSIX,REG_ULOCALE} + {ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,""} +(2 rows) + +select * from test_regex('((x|xabcdefghijklmnopqrstuvwxyz0123456789)x*|[^y]z)$', + 'az', ''); + test_regex +-------------- + {2} + {az,az,NULL} +(2 rows) + diff --git a/src/test/modules/test_regex/expected/test_regex_utf8.out b/src/test/modules/test_regex/expected/test_regex_utf8.out index 112698ac61..3b56f36c07 100644 --- a/src/test/modules/test_regex/expected/test_regex_utf8.out +++ b/src/test/modules/test_regex/expected/test_regex_utf8.out @@ -98,3 +98,109 @@ select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237', {0,REG_UBBS,REG_UNONPOSIX,REG_UUNPORT,REG_ULOCALE} (1 row) +select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237', + E'\u1500\u1237', 'iELMP'); + test_regex +---------------------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_UUNPORT,REG_ULOCALE} + {ᔀሷ} +(2 rows) + +-- systematically test char classes +select * from test_regex('[[:alnum:]]+', E'x\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {xᔀሷ} +(2 rows) + +select * from test_regex('[[:alpha:]]+', E'x\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {xᔀሷ} +(2 rows) + +select * from test_regex('[[:ascii:]]+', E'x\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {x} +(2 rows) + +select * from test_regex('[[:blank:]]+', E'x \t\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {" "} +(2 rows) + +select * from test_regex('[[:cntrl:]]+', E'x\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} +(1 row) + +select * from test_regex('[[:digit:]]+', E'x9\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {9} +(2 rows) + +select * from test_regex('[[:graph:]]+', E'x\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {xᔀሷ} +(2 rows) + +select * from test_regex('[[:lower:]]+', E'x\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {x} +(2 rows) + +select * from test_regex('[[:print:]]+', E'x\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {xᔀሷ} +(2 rows) + +select * from test_regex('[[:punct:]]+', E'x.\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {.} +(2 rows) + +select * from test_regex('[[:space:]]+', E'x \t\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {" "} +(2 rows) + +select * from test_regex('[[:upper:]]+', E'xX\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {X} +(2 rows) + +select * from test_regex('[[:xdigit:]]+', E'xa9\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {a9} +(2 rows) + +select * from test_regex('[[:word:]]+', E'x_\u1500\u1237', 'L'); + test_regex +----------------- + {0,REG_ULOCALE} + {x_ᔀሷ} +(2 rows) + diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql index 3419564203..389b8b61b3 100644 --- a/src/test/modules/test_regex/sql/test_regex.sql +++ b/src/test/modules/test_regex/sql/test_regex.sql @@ -304,6 +304,12 @@ select * from test_regex('a[[=x=]]', 'ay', '+Lb'); -- expectNomatch 9.9 &+L {a[[=x=]]} az select * from test_regex('a[[=x=]]', 'az', '+L'); select * from test_regex('a[[=x=]]', 'az', '+Lb'); +-- expectMatch 9.9b &iL {a[[=Y=]]} ay ay +select * from test_regex('a[[=Y=]]', 'ay', 'iL'); +select * from test_regex('a[[=Y=]]', 'ay', 'iLb'); +-- expectNomatch 9.9c &L {a[[=Y=]]} ay +select * from test_regex('a[[=Y=]]', 'ay', 'L'); +select * from test_regex('a[[=Y=]]', 'ay', 'Lb'); -- expectError 9.10 & {a[0-[=x=]]} ERANGE select * from test_regex('a[0-[=x=]]', '', ''); select * from test_regex('a[0-[=x=]]', '', 'b'); @@ -864,6 +870,12 @@ select * from test_regex('a[b-d]', 'aC', 'iMb'); -- expectNomatch 19.5 &iM {a[^b-d]} aC select * from test_regex('a[^b-d]', 'aC', 'iM'); select * from test_regex('a[^b-d]', 'aC', 'iMb'); +-- expectMatch 19.6 &iM {a[B-Z]} aC aC +select * from test_regex('a[B-Z]', 'aC', 'iM'); +select * from test_regex('a[B-Z]', 'aC', 'iMb'); +-- expectNomatch 19.7 &iM {a[^B-Z]} aC +select * from test_regex('a[^B-Z]', 'aC', 'iM'); +select * from test_regex('a[^B-Z]', 'aC', 'iMb'); -- doing 20 "directors and embedded options" @@ -1171,6 +1183,8 @@ select * from test_regex('z*4', '123zzzz456', '-'); select * from test_regex('z*?4', '123zzzz456', 'PT'); -- expectMatch 24.13 PT {^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$} {foo/bar/baz} {foo/bar/baz} {foo} {bar} {baz} select * from test_regex('^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', 'foo/bar/baz', 'PT'); +-- expectMatch 24.14 PRT {^(.+?)(?:/(.+?))(?:/(.+?)\3)?$} {foo/bar/baz/quux} {foo/bar/baz/quux} {foo} {bar/baz/quux} {} +select * from test_regex('^(.+?)(?:/(.+?))(?:/(.+?)\3)?$', 'foo/bar/baz/quux', 'PRT'); -- doing 25 "mixed quantifiers" -- # this is very incomplete as yet @@ -1741,3 +1755,21 @@ select * from test_regex(repeat('x*y*z*', 200), 'x', 'N'); -- regexp {(\Y)+} foo -- } 1 select * from test_regex('(\Y)+', 'foo', 'LNP'); + + +-- and now, tests not from either Spencer or the Tcl project + +-- These cases exercise additional code paths in pushfwd()/push()/combine() +select * from test_regex('a\Y(?=45)', 'a45', 'HLP'); +select * from test_regex('a(?=.)c', 'ac', 'HP'); +select * from test_regex('a(?=.).*(?=3)3*', 'azz33', 'HP'); +select * from test_regex('a(?=\w)\w*(?=.).*', 'az3%', 'HLP'); + +-- These exercise the bulk-arc-movement paths in moveins() and moveouts(); +-- you may need to make them longer if you change BULK_ARC_OP_USE_SORT() +select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ(?:\w|a|b|c|d|e|f|0|1|2|3|4|5|6|Q)', + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ3', 'LP'); +select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(\Y\Y)+', + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789Z', 'LP'); +select * from test_regex('((x|xabcdefghijklmnopqrstuvwxyz0123456789)x*|[^y]z)$', + 'az', ''); diff --git a/src/test/modules/test_regex/sql/test_regex_utf8.sql b/src/test/modules/test_regex/sql/test_regex_utf8.sql index cfd9396194..f23907162e 100644 --- a/src/test/modules/test_regex/sql/test_regex_utf8.sql +++ b/src/test/modules/test_regex/sql/test_regex_utf8.sql @@ -58,3 +58,21 @@ select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237', E'\u1500\u1237', 'ELMP'); select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237', E'A\u1239', 'ELMP'); +select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237', + E'\u1500\u1237', 'iELMP'); + +-- systematically test char classes +select * from test_regex('[[:alnum:]]+', E'x\u1500\u1237', 'L'); +select * from test_regex('[[:alpha:]]+', E'x\u1500\u1237', 'L'); +select * from test_regex('[[:ascii:]]+', E'x\u1500\u1237', 'L'); +select * from test_regex('[[:blank:]]+', E'x \t\u1500\u1237', 'L'); +select * from test_regex('[[:cntrl:]]+', E'x\u1500\u1237', 'L'); +select * from test_regex('[[:digit:]]+', E'x9\u1500\u1237', 'L'); +select * from test_regex('[[:graph:]]+', E'x\u1500\u1237', 'L'); +select * from test_regex('[[:lower:]]+', E'x\u1500\u1237', 'L'); +select * from test_regex('[[:print:]]+', E'x\u1500\u1237', 'L'); +select * from test_regex('[[:punct:]]+', E'x.\u1500\u1237', 'L'); +select * from test_regex('[[:space:]]+', E'x \t\u1500\u1237', 'L'); +select * from test_regex('[[:upper:]]+', E'xX\u1500\u1237', 'L'); +select * from test_regex('[[:xdigit:]]+', E'xa9\u1500\u1237', 'L'); +select * from test_regex('[[:word:]]+', E'x_\u1500\u1237', 'L');