Improve regex compiler's arc moving/copying logic.

The functions moveins(), copyins(), moveouts(), copyouts() are
required to preserve the invariant that there are no duplicate arcs in
the regex's NFA.  Spencer's original implementation of them was O(N^2)
since it checked separately for a match to each source arc.  In commit
579840ca0 I improved that by adding sort/merge logic to be used if
more than a few arcs are to be moved/copied.  However, I now realize
that that missed a bet.  At many call sites, the target state is newly
made and cannot have any existing in-arcs (respectively out-arcs)
that could be duplicates.  So spending any cycles at all on checking
for duplicates is wasted effort; in these cases we can just blindly
move/copy all the source arcs.  Add code paths to do that.

It turns out that for copyins()/copyouts(), *all* the call sites have
this property, making all the "improved" logic in them flat out
unreachable.  Perhaps we'll need the full capability again someday,
so I just #ifdef'd those paths out rather than removing them entirely.

In passing, add a few test cases to improve code coverage in this
area as well as in regc_locale.c/regc_pg_locale.c.

Discussion: https://postgr.es/m/810272.1629064063@sss.pgh.pa.us
This commit is contained in:
Tom Lane 2021-08-17 12:00:02 -04:00
parent f576de1db1
commit 78a843f119
5 changed files with 338 additions and 4 deletions

View File

@ -777,6 +777,10 @@ sortouts_cmp(const void *a, const void *b)
* However, if we have a whole lot of arcs to deal with, retail duplicate
* checks become too slow. In that case we proceed by sorting and merging
* the arc lists, and then we can indeed just update the arcs in-place.
*
* On the other hand, it's also true that this is frequently called with
* a brand-new newState that has no existing in-arcs. In that case,
* de-duplication is unnecessary, so we can just blindly move all the arcs.
*/
static void
moveins(struct nfa *nfa,
@ -785,7 +789,18 @@ moveins(struct nfa *nfa,
{
assert(oldState != newState);
if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
if (newState->nins == 0)
{
/* No need for de-duplication */
struct arc *a;
while ((a = oldState->ins) != NULL)
{
createarc(nfa, a->type, a->co, a->from, newState);
freearc(nfa, a);
}
}
else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
{
/* With not too many arcs, just do them one at a time */
struct arc *a;
@ -869,6 +884,11 @@ moveins(struct nfa *nfa,
/*
* copyins - copy in arcs of a state to another state
*
* The comments for moveins() apply here as well. However, in current
* usage, this is *only* called with brand-new target states, so that
* only the "no need for de-duplication" code path is ever reached.
* We keep the rest #ifdef'd out in case it's needed in the future.
*/
static void
copyins(struct nfa *nfa,
@ -876,8 +896,18 @@ copyins(struct nfa *nfa,
struct state *newState)
{
assert(oldState != newState);
assert(newState->nins == 0); /* see comment above */
if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
if (newState->nins == 0)
{
/* No need for de-duplication */
struct arc *a;
for (a = oldState->ins; a != NULL; a = a->inchain)
createarc(nfa, a->type, a->co, a->from, newState);
}
#ifdef NOT_USED /* see comment above */
else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
{
/* With not too many arcs, just do them one at a time */
struct arc *a;
@ -944,6 +974,7 @@ copyins(struct nfa *nfa,
createarc(nfa, a->type, a->co, a->from, newState);
}
}
#endif /* NOT_USED */
}
/*
@ -1058,7 +1089,18 @@ moveouts(struct nfa *nfa,
{
assert(oldState != newState);
if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
if (newState->nouts == 0)
{
/* No need for de-duplication */
struct arc *a;
while ((a = oldState->outs) != NULL)
{
createarc(nfa, a->type, a->co, newState, a->to);
freearc(nfa, a);
}
}
else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
{
/* With not too many arcs, just do them one at a time */
struct arc *a;
@ -1142,6 +1184,8 @@ moveouts(struct nfa *nfa,
/*
* copyouts - copy out arcs of a state to another state
*
* See comments for copyins()
*/
static void
copyouts(struct nfa *nfa,
@ -1149,8 +1193,18 @@ copyouts(struct nfa *nfa,
struct state *newState)
{
assert(oldState != newState);
assert(newState->nouts == 0); /* see comment above */
if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
if (newState->nouts == 0)
{
/* No need for de-duplication */
struct arc *a;
for (a = oldState->outs; a != NULL; a = a->outchain)
createarc(nfa, a->type, a->co, newState, a->to);
}
#ifdef NOT_USED /* see comment above */
else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
{
/* With not too many arcs, just do them one at a time */
struct arc *a;
@ -1217,6 +1271,7 @@ copyouts(struct nfa *nfa,
createarc(nfa, a->type, a->co, newState, a->to);
}
}
#endif /* NOT_USED */
}
/*
@ -1975,6 +2030,7 @@ combine(struct nfa *nfa,
else if (a->co == RAINBOW)
{
/* con is incompatible if it's for a pseudocolor */
/* (this is hypothetical; we make no such constraints today) */
if (nfa->cm->cd[con->co].flags & PSEUDO)
return INCOMPATIBLE;
/* otherwise, constraint constrains arc to be only its color */
@ -2001,6 +2057,7 @@ combine(struct nfa *nfa,
else if (a->co == RAINBOW)
{
/* con is incompatible if it's for a pseudocolor */
/* (this is hypothetical; we make no such constraints today) */
if (nfa->cm->cd[con->co].flags & PSEUDO)
return INCOMPATIBLE;
/* otherwise, constraint constrains arc to be only its color */
@ -3562,6 +3619,7 @@ carc_cmp(const void *a, const void *b)
return -1;
if (aa->to > bb->to)
return +1;
/* This is unreached, since there should be no duplicate arcs now: */
return 0;
}

View File

@ -937,6 +937,34 @@ select * from test_regex('a[[=x=]]', 'az', '+Lb');
{0,REG_ULOCALE}
(1 row)
-- expectMatch 9.9b &iL {a[[=Y=]]} ay ay
select * from test_regex('a[[=Y=]]', 'ay', 'iL');
test_regex
-----------------
{0,REG_ULOCALE}
{ay}
(2 rows)
select * from test_regex('a[[=Y=]]', 'ay', 'iLb');
test_regex
-----------------
{0,REG_ULOCALE}
{ay}
(2 rows)
-- expectNomatch 9.9c &L {a[[=Y=]]} ay
select * from test_regex('a[[=Y=]]', 'ay', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
(1 row)
select * from test_regex('a[[=Y=]]', 'ay', 'Lb');
test_regex
-----------------
{0,REG_ULOCALE}
(1 row)
-- expectError 9.10 & {a[0-[=x=]]} ERANGE
select * from test_regex('a[0-[=x=]]', '', '');
ERROR: invalid regular expression: invalid character range
@ -2932,6 +2960,34 @@ select * from test_regex('a[^b-d]', 'aC', 'iMb');
{0,REG_UUNPORT}
(1 row)
-- expectMatch 19.6 &iM {a[B-Z]} aC aC
select * from test_regex('a[B-Z]', 'aC', 'iM');
test_regex
-----------------
{0,REG_UUNPORT}
{aC}
(2 rows)
select * from test_regex('a[B-Z]', 'aC', 'iMb');
test_regex
-----------------
{0,REG_UUNPORT}
{aC}
(2 rows)
-- expectNomatch 19.7 &iM {a[^B-Z]} aC
select * from test_regex('a[^B-Z]', 'aC', 'iM');
test_regex
-----------------
{0,REG_UUNPORT}
(1 row)
select * from test_regex('a[^B-Z]', 'aC', 'iMb');
test_regex
-----------------
{0,REG_UUNPORT}
(1 row)
-- doing 20 "directors and embedded options"
-- expectError 20.1 & ***? BADPAT
select * from test_regex('***?', '', '');
@ -3850,6 +3906,14 @@ select * from test_regex('^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', 'foo/bar/baz',
{foo/bar/baz,foo,bar,baz}
(2 rows)
-- expectMatch 24.14 PRT {^(.+?)(?:/(.+?))(?:/(.+?)\3)?$} {foo/bar/baz/quux} {foo/bar/baz/quux} {foo} {bar/baz/quux} {}
select * from test_regex('^(.+?)(?:/(.+?))(?:/(.+?)\3)?$', 'foo/bar/baz/quux', 'PRT');
test_regex
----------------------------------------------
{3,REG_UBACKREF,REG_UNONPOSIX,REG_USHORTEST}
{foo/bar/baz/quux,foo,bar/baz/quux,NULL}
(2 rows)
-- doing 25 "mixed quantifiers"
-- # this is very incomplete as yet
-- # should include |
@ -4926,3 +4990,59 @@ select * from test_regex('(\Y)+', 'foo', 'LNP');
{"",""}
(2 rows)
-- and now, tests not from either Spencer or the Tcl project
-- These cases exercise additional code paths in pushfwd()/push()/combine()
select * from test_regex('a\Y(?=45)', 'a45', 'HLP');
test_regex
-----------------------------------------------
{0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE}
{a}
(2 rows)
select * from test_regex('a(?=.)c', 'ac', 'HP');
test_regex
-----------------------------------
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
{ac}
(2 rows)
select * from test_regex('a(?=.).*(?=3)3*', 'azz33', 'HP');
test_regex
-----------------------------------
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
{azz33}
(2 rows)
select * from test_regex('a(?=\w)\w*(?=.).*', 'az3%', 'HLP');
test_regex
-----------------------------------------------
{0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE}
{az3%}
(2 rows)
-- These exercise the bulk-arc-movement paths in moveins() and moveouts();
-- you may need to make them longer if you change BULK_ARC_OP_USE_SORT()
select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ(?:\w|a|b|c|d|e|f|0|1|2|3|4|5|6|Q)',
'ABCDEFGHIJKLMNOPQRSTUVWXYZ3', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{ABCDEFGHIJKLMNOPQRSTUVWXYZ3}
(2 rows)
select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(\Y\Y)+',
'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789Z', 'LP');
test_regex
-------------------------------------------
{1,REG_UNONPOSIX,REG_ULOCALE}
{ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,""}
(2 rows)
select * from test_regex('((x|xabcdefghijklmnopqrstuvwxyz0123456789)x*|[^y]z)$',
'az', '');
test_regex
--------------
{2}
{az,az,NULL}
(2 rows)

View File

@ -98,3 +98,109 @@ select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
{0,REG_UBBS,REG_UNONPOSIX,REG_UUNPORT,REG_ULOCALE}
(1 row)
select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
E'\u1500\u1237', 'iELMP');
test_regex
----------------------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_UUNPORT,REG_ULOCALE}
{ᔀሷ}
(2 rows)
-- systematically test char classes
select * from test_regex('[[:alnum:]]+', E'x\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{xᔀሷ}
(2 rows)
select * from test_regex('[[:alpha:]]+', E'x\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{xᔀሷ}
(2 rows)
select * from test_regex('[[:ascii:]]+', E'x\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{x}
(2 rows)
select * from test_regex('[[:blank:]]+', E'x \t\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{" "}
(2 rows)
select * from test_regex('[[:cntrl:]]+', E'x\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
(1 row)
select * from test_regex('[[:digit:]]+', E'x9\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{9}
(2 rows)
select * from test_regex('[[:graph:]]+', E'x\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{xᔀሷ}
(2 rows)
select * from test_regex('[[:lower:]]+', E'x\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{x}
(2 rows)
select * from test_regex('[[:print:]]+', E'x\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{xᔀሷ}
(2 rows)
select * from test_regex('[[:punct:]]+', E'x.\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{.}
(2 rows)
select * from test_regex('[[:space:]]+', E'x \t\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{" "}
(2 rows)
select * from test_regex('[[:upper:]]+', E'xX\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{X}
(2 rows)
select * from test_regex('[[:xdigit:]]+', E'xa9\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{a9}
(2 rows)
select * from test_regex('[[:word:]]+', E'x_\u1500\u1237', 'L');
test_regex
-----------------
{0,REG_ULOCALE}
{x_ᔀሷ}
(2 rows)

View File

@ -304,6 +304,12 @@ select * from test_regex('a[[=x=]]', 'ay', '+Lb');
-- expectNomatch 9.9 &+L {a[[=x=]]} az
select * from test_regex('a[[=x=]]', 'az', '+L');
select * from test_regex('a[[=x=]]', 'az', '+Lb');
-- expectMatch 9.9b &iL {a[[=Y=]]} ay ay
select * from test_regex('a[[=Y=]]', 'ay', 'iL');
select * from test_regex('a[[=Y=]]', 'ay', 'iLb');
-- expectNomatch 9.9c &L {a[[=Y=]]} ay
select * from test_regex('a[[=Y=]]', 'ay', 'L');
select * from test_regex('a[[=Y=]]', 'ay', 'Lb');
-- expectError 9.10 & {a[0-[=x=]]} ERANGE
select * from test_regex('a[0-[=x=]]', '', '');
select * from test_regex('a[0-[=x=]]', '', 'b');
@ -864,6 +870,12 @@ select * from test_regex('a[b-d]', 'aC', 'iMb');
-- expectNomatch 19.5 &iM {a[^b-d]} aC
select * from test_regex('a[^b-d]', 'aC', 'iM');
select * from test_regex('a[^b-d]', 'aC', 'iMb');
-- expectMatch 19.6 &iM {a[B-Z]} aC aC
select * from test_regex('a[B-Z]', 'aC', 'iM');
select * from test_regex('a[B-Z]', 'aC', 'iMb');
-- expectNomatch 19.7 &iM {a[^B-Z]} aC
select * from test_regex('a[^B-Z]', 'aC', 'iM');
select * from test_regex('a[^B-Z]', 'aC', 'iMb');
-- doing 20 "directors and embedded options"
@ -1171,6 +1183,8 @@ select * from test_regex('z*4', '123zzzz456', '-');
select * from test_regex('z*?4', '123zzzz456', 'PT');
-- expectMatch 24.13 PT {^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$} {foo/bar/baz} {foo/bar/baz} {foo} {bar} {baz}
select * from test_regex('^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', 'foo/bar/baz', 'PT');
-- expectMatch 24.14 PRT {^(.+?)(?:/(.+?))(?:/(.+?)\3)?$} {foo/bar/baz/quux} {foo/bar/baz/quux} {foo} {bar/baz/quux} {}
select * from test_regex('^(.+?)(?:/(.+?))(?:/(.+?)\3)?$', 'foo/bar/baz/quux', 'PRT');
-- doing 25 "mixed quantifiers"
-- # this is very incomplete as yet
@ -1741,3 +1755,21 @@ select * from test_regex(repeat('x*y*z*', 200), 'x', 'N');
-- regexp {(\Y)+} foo
-- } 1
select * from test_regex('(\Y)+', 'foo', 'LNP');
-- and now, tests not from either Spencer or the Tcl project
-- These cases exercise additional code paths in pushfwd()/push()/combine()
select * from test_regex('a\Y(?=45)', 'a45', 'HLP');
select * from test_regex('a(?=.)c', 'ac', 'HP');
select * from test_regex('a(?=.).*(?=3)3*', 'azz33', 'HP');
select * from test_regex('a(?=\w)\w*(?=.).*', 'az3%', 'HLP');
-- These exercise the bulk-arc-movement paths in moveins() and moveouts();
-- you may need to make them longer if you change BULK_ARC_OP_USE_SORT()
select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ(?:\w|a|b|c|d|e|f|0|1|2|3|4|5|6|Q)',
'ABCDEFGHIJKLMNOPQRSTUVWXYZ3', 'LP');
select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(\Y\Y)+',
'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789Z', 'LP');
select * from test_regex('((x|xabcdefghijklmnopqrstuvwxyz0123456789)x*|[^y]z)$',
'az', '');

View File

@ -58,3 +58,21 @@ select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
E'\u1500\u1237', 'ELMP');
select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
E'A\u1239', 'ELMP');
select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
E'\u1500\u1237', 'iELMP');
-- systematically test char classes
select * from test_regex('[[:alnum:]]+', E'x\u1500\u1237', 'L');
select * from test_regex('[[:alpha:]]+', E'x\u1500\u1237', 'L');
select * from test_regex('[[:ascii:]]+', E'x\u1500\u1237', 'L');
select * from test_regex('[[:blank:]]+', E'x \t\u1500\u1237', 'L');
select * from test_regex('[[:cntrl:]]+', E'x\u1500\u1237', 'L');
select * from test_regex('[[:digit:]]+', E'x9\u1500\u1237', 'L');
select * from test_regex('[[:graph:]]+', E'x\u1500\u1237', 'L');
select * from test_regex('[[:lower:]]+', E'x\u1500\u1237', 'L');
select * from test_regex('[[:print:]]+', E'x\u1500\u1237', 'L');
select * from test_regex('[[:punct:]]+', E'x.\u1500\u1237', 'L');
select * from test_regex('[[:space:]]+', E'x \t\u1500\u1237', 'L');
select * from test_regex('[[:upper:]]+', E'xX\u1500\u1237', 'L');
select * from test_regex('[[:xdigit:]]+', E'xa9\u1500\u1237', 'L');
select * from test_regex('[[:word:]]+', E'x_\u1500\u1237', 'L');