From 2a0af7fe460eb46f9af996075972bf7c2e3f211d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 25 Feb 2021 13:00:40 -0500 Subject: [PATCH] Allow complemented character class escapes within regex brackets. The complement-class escapes \D, \S, \W are now allowed within bracket expressions. There is no semantic difficulty with doing that, but the rather hokey macro-expansion-based implementation previously used here couldn't cope. Also, invent "word" as an allowed character class name, thus "\w" is now equivalent to "[[:word:]]" outside brackets, or "[:word:]" within brackets. POSIX allows such implementation-specific extensions, and the same name is used in e.g. bash. One surprising compatibility issue this raises is that constructs such as "[\w-_]" are now disallowed, as our documentation has always said they should be: character classes can't be endpoints of a range. Previously, because \w was just a macro for "[:alnum:]_", such a construct was read as "[[:alnum:]_-_]", so it was accepted so long as the character after "-" was numerically greater than or equal to "_". Some implementation cleanup along the way: * Remove the lexnest() hack, and in consequence clean up wordchrs() to not interact with the lexer. * Fix colorcomplement() to not be O(N^2) in the number of colors involved. * Get rid of useless-as-far-as-I-can-see calls of element() on single-character character element names in brackpart(). element() always maps these to the character itself, and things would be quite broken if it didn't --- should "[a]" match something different than "a" does? Besides, the shortcut path in brackpart() wasn't doing this anyway, making it even more inconsistent. Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us --- doc/src/sgml/func.sgml | 25 +- src/backend/regex/re_syntax.n | 13 +- src/backend/regex/regc_color.c | 34 ++- src/backend/regex/regc_lex.c | 166 +--------- src/backend/regex/regc_locale.c | 107 +++---- src/backend/regex/regc_pg_locale.c | 9 + src/backend/regex/regcomp.c | 285 +++++++++++++++--- src/include/regex/regguts.h | 20 +- .../test_regex/expected/test_regex.out | 250 +++++++++++++++ .../modules/test_regex/sql/test_regex.sql | 44 +++ 10 files changed, 677 insertions(+), 276 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index d8224272a5..860ae11826 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -6097,6 +6097,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; non-ASCII characters to belong to any of these classes.) In addition to these standard character classes, PostgreSQL defines + the word character class, which is the same as + alnum plus the underscore (_) + character, and the ascii character class, which contains exactly the 7-bit ASCII set. @@ -6108,9 +6111,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; matching empty strings at the beginning and end of a word respectively. A word is defined as a sequence of word characters that is neither preceded nor followed by word - characters. A word character is an alnum character (as - defined by the POSIX character class described above) - or an underscore. This is an extension, compatible with but not + characters. A word character is any character belonging to the + word character class, that is, any letter, digit, + or underscore. This is an extension, compatible with but not specified by POSIX 1003.2, and should be used with caution in software intended to be portable to other systems. The constraint escapes described below are usually preferable; they @@ -6330,8 +6333,7 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; \w - [[:alnum:]_] - (note underscore is included) + [[:word:]] @@ -6346,21 +6348,18 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo; \W - [^[:alnum:]_] - (note underscore is included) + [^[:word:]] - Within bracket expressions, \d, \s, - and \w lose their outer brackets, - and \D, \S, and \W are illegal. - (So, for example, [a-c\d] is equivalent to + The class-shorthand escapes also work within bracket expressions, + although the definitions shown above are not quite syntactically + valid in that context. + For example, [a-c\d] is equivalent to [a-c[:digit:]]. - Also, [a-c\D], which is equivalent to - [a-c^[:digit:]], is illegal.) diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n index 4621bfc25f..1afaa7cce7 100644 --- a/src/backend/regex/re_syntax.n +++ b/src/backend/regex/re_syntax.n @@ -519,15 +519,10 @@ character classes: (note underscore) .RE .PP -Within bracket expressions, `\fB\ed\fR', `\fB\es\fR', -and `\fB\ew\fR'\& -lose their outer brackets, -and `\fB\eD\fR', `\fB\eS\fR', -and `\fB\eW\fR'\& -are illegal. -.VS 8.2 -(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR. -Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.) +The class-shorthand escapes also work within bracket expressions, +although the definitions shown above are not quite syntactically +valid in that context. +For example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR. .VE 8.2 .PP A constraint escape (AREs only) is a constraint, diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c index 0864011cce..30bda0e5ad 100644 --- a/src/backend/regex/regc_color.c +++ b/src/backend/regex/regc_color.c @@ -936,7 +936,16 @@ okcolors(struct nfa *nfa, } else if (cd->nschrs == 0 && cd->nuchrs == 0) { - /* parent empty, its arcs change color to subcolor */ + /* + * Parent is now empty, so just change all its arcs to the + * subcolor, then free the parent. + * + * It is not obvious that simply relabeling the arcs like this is + * OK; it appears to risk creating duplicate arcs. We are + * basically relying on the assumption that processing of a + * bracket expression can't create arcs of both a color and its + * subcolor between the bracket's endpoints. + */ cd->sub = NOSUB; scd = &cm->cd[sco]; assert(scd->nschrs > 0 || scd->nuchrs > 0); @@ -1062,6 +1071,7 @@ colorcomplement(struct nfa *nfa, struct colordesc *cd; struct colordesc *end = CDEND(cm); color co; + struct arc *a; assert(of != from); @@ -1069,10 +1079,26 @@ colorcomplement(struct nfa *nfa, if (findarc(of, PLAIN, RAINBOW) != NULL) return; + /* Otherwise, transiently mark the colors that appear in of's out-arcs */ + for (a = of->outs; a != NULL; a = a->outchain) + { + if (a->type == PLAIN) + { + assert(a->co >= 0); + cd = &cm->cd[a->co]; + assert(!UNUSEDCOLOR(cd)); + cd->flags |= COLMARK; + } + } + + /* Scan colors, clear transient marks, add arcs for unmarked colors */ for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++) - if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)) - if (findarc(of, PLAIN, co) == NULL) - newarc(nfa, type, co, from, to); + { + if (cd->flags & COLMARK) + cd->flags &= ~COLMARK; + else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)) + newarc(nfa, type, co, from, to); + } } diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c index 1666453164..7673dab76f 100644 --- a/src/backend/regex/regc_lex.c +++ b/src/backend/regex/regc_lex.c @@ -193,83 +193,6 @@ prefixes(struct vars *v) } } -/* - * lexnest - "call a subroutine", interpolating string at the lexical level - * - * Note, this is not a very general facility. There are a number of - * implicit assumptions about what sorts of strings can be subroutines. - */ -static void -lexnest(struct vars *v, - const chr *beginp, /* start of interpolation */ - const chr *endp) /* one past end of interpolation */ -{ - assert(v->savenow == NULL); /* only one level of nesting */ - v->savenow = v->now; - v->savestop = v->stop; - v->now = beginp; - v->stop = endp; -} - -/* - * string constants to interpolate as expansions of things like \d - */ -static const chr backd[] = { /* \d */ - CHR('['), CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr backD[] = { /* \D */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr brbackd[] = { /* \d within brackets */ - CHR('['), CHR(':'), - CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), - CHR(':'), CHR(']') -}; -static const chr backs[] = { /* \s */ - CHR('['), CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr backS[] = { /* \S */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']'), CHR(']') -}; -static const chr brbacks[] = { /* \s within brackets */ - CHR('['), CHR(':'), - CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), - CHR(':'), CHR(']') -}; -static const chr backw[] = { /* \w */ - CHR('['), CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') -}; -static const chr backW[] = { /* \W */ - CHR('['), CHR('^'), CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_'), CHR(']') -}; -static const chr brbackw[] = { /* \w within brackets */ - CHR('['), CHR(':'), - CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), - CHR(':'), CHR(']'), CHR('_') -}; - -/* - * lexword - interpolate a bracket expression for word characters - * Possibly ought to inquire whether there is a "word" character class. - */ -static void -lexword(struct vars *v) -{ - lexnest(v, backw, ENDOF(backw)); -} - /* * next - get next token */ @@ -292,14 +215,6 @@ next(struct vars *v) RETV(SBEGIN, 0); /* same as \A */ } - /* if we're nested and we've hit end, return to outer level */ - if (v->savenow != NULL && ATEOS()) - { - v->now = v->savenow; - v->stop = v->savestop; - v->savenow = v->savestop = NULL; - } - /* skip white space etc. if appropriate (not in literal or []) */ if (v->cflags & REG_EXPANDED) switch (v->lexcon) @@ -420,32 +335,15 @@ next(struct vars *v) NOTE(REG_UNONPOSIX); if (ATEOS()) FAILW(REG_EESCAPE); - (DISCARD) lexescape(v); + if (!lexescape(v)) + return 0; switch (v->nexttype) { /* not all escapes okay here */ case PLAIN: + case CCLASSS: + case CCLASSC: return 1; break; - case CCLASS: - switch (v->nextvalue) - { - case 'd': - lexnest(v, brbackd, ENDOF(brbackd)); - break; - case 's': - lexnest(v, brbacks, ENDOF(brbacks)); - break; - case 'w': - lexnest(v, brbackw, ENDOF(brbackw)); - break; - default: - FAILW(REG_EESCAPE); - break; - } - /* lexnest done, back up and try again */ - v->nexttype = v->lasttype; - return next(v); - break; } /* not one of the acceptable escapes */ FAILW(REG_EESCAPE); @@ -691,49 +589,17 @@ next(struct vars *v) } RETV(PLAIN, *v->now++); } - (DISCARD) lexescape(v); - if (ISERR()) - FAILW(REG_EESCAPE); - if (v->nexttype == CCLASS) - { /* fudge at lexical level */ - switch (v->nextvalue) - { - case 'd': - lexnest(v, backd, ENDOF(backd)); - break; - case 'D': - lexnest(v, backD, ENDOF(backD)); - break; - case 's': - lexnest(v, backs, ENDOF(backs)); - break; - case 'S': - lexnest(v, backS, ENDOF(backS)); - break; - case 'w': - lexnest(v, backw, ENDOF(backw)); - break; - case 'W': - lexnest(v, backW, ENDOF(backW)); - break; - default: - assert(NOTREACHED); - FAILW(REG_ASSERT); - break; - } - /* lexnest done, back up and try again */ - v->nexttype = v->lasttype; - return next(v); - } - /* otherwise, lexescape has already done the work */ - return !ISERR(); + return lexescape(v); } /* * lexescape - parse an ARE backslash escape (backslash already eaten) - * Note slightly nonstandard use of the CCLASS type code. + * + * This is used for ARE backslashes both normally and inside bracket + * expressions. In the latter case, not all escape types are allowed, + * but the caller must reject unwanted ones after we return. */ -static int /* not actually used, but convenient for RETV */ +static int lexescape(struct vars *v) { chr c; @@ -775,11 +641,11 @@ lexescape(struct vars *v) break; case CHR('d'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'd'); + RETV(CCLASSS, CC_DIGIT); break; case CHR('D'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'D'); + RETV(CCLASSC, CC_DIGIT); break; case CHR('e'): NOTE(REG_UUNPORT); @@ -802,11 +668,11 @@ lexescape(struct vars *v) break; case CHR('s'): NOTE(REG_ULOCALE); - RETV(CCLASS, 's'); + RETV(CCLASSS, CC_SPACE); break; case CHR('S'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'S'); + RETV(CCLASSC, CC_SPACE); break; case CHR('t'): RETV(PLAIN, CHR('\t')); @@ -828,11 +694,11 @@ lexescape(struct vars *v) break; case CHR('w'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'w'); + RETV(CCLASSS, CC_WORD); break; case CHR('W'): NOTE(REG_ULOCALE); - RETV(CCLASS, 'W'); + RETV(CCLASSC, CC_WORD); break; case CHR('x'): NOTE(REG_UUNPORT); diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c index 047abc3e1e..b5f3a73b1b 100644 --- a/src/backend/regex/regc_locale.c +++ b/src/backend/regex/regc_locale.c @@ -350,17 +350,13 @@ static const struct cname }; /* - * The following arrays define the valid character class names. + * The following array defines the valid character class names. + * The entries must match enum char_classes in regguts.h. */ static const char *const classNames[NUM_CCLASSES + 1] = { "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", - "lower", "print", "punct", "space", "upper", "xdigit", NULL -}; - -enum classes -{ - CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, - CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT + "lower", "print", "punct", "space", "upper", "xdigit", "word", + NULL }; /* @@ -536,7 +532,36 @@ eclass(struct vars *v, /* context */ } /* - * cclass - supply cvec for a character class + * lookupcclass - lookup a character class identified by name + * + * On failure, sets an error code in *v; the result is then garbage. + */ +static enum char_classes +lookupcclass(struct vars *v, /* context (for returning errors) */ + const chr *startp, /* where the name starts */ + const chr *endp) /* just past the end of the name */ +{ + size_t len; + const char *const *namePtr; + int i; + + /* + * Map the name to the corresponding enumerated value. + */ + len = endp - startp; + for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) + { + if (strlen(*namePtr) == len && + pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) + return (enum char_classes) i; + } + + ERR(REG_ECTYPE); + return (enum char_classes) 0; +} + +/* + * cclasscvec - supply cvec for a character class * * Must include case counterparts if "cases" is true. * @@ -545,45 +570,20 @@ eclass(struct vars *v, /* context */ * because callers are not supposed to explicitly free the result either way. */ static struct cvec * -cclass(struct vars *v, /* context */ - const chr *startp, /* where the name starts */ - const chr *endp, /* just past the end of the name */ - int cases) /* case-independent? */ +cclasscvec(struct vars *v, /* context */ + enum char_classes cclasscode, /* class to build a cvec for */ + int cases) /* case-independent? */ { - size_t len; struct cvec *cv = NULL; - const char *const *namePtr; - int i, - index; - - /* - * Map the name to the corresponding enumerated value. - */ - len = endp - startp; - index = -1; - for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) - { - if (strlen(*namePtr) == len && - pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) - { - index = i; - break; - } - } - if (index == -1) - { - ERR(REG_ECTYPE); - return NULL; - } /* * Remap lower and upper to alpha if the match is case insensitive. */ if (cases && - ((enum classes) index == CC_LOWER || - (enum classes) index == CC_UPPER)) - index = (int) CC_ALPHA; + (cclasscode == CC_LOWER || + cclasscode == CC_UPPER)) + cclasscode = CC_ALPHA; /* * Now compute the character class contents. For classes that are based @@ -595,16 +595,19 @@ cclass(struct vars *v, /* context */ * NB: keep this code in sync with cclass_column_index(), below. */ - switch ((enum classes) index) + switch (cclasscode) { case CC_PRINT: - cv = pg_ctype_get_cache(pg_wc_isprint, index); + cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode); break; case CC_ALNUM: - cv = pg_ctype_get_cache(pg_wc_isalnum, index); + cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode); break; case CC_ALPHA: - cv = pg_ctype_get_cache(pg_wc_isalpha, index); + cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode); + break; + case CC_WORD: + cv = pg_ctype_get_cache(pg_wc_isword, cclasscode); break; case CC_ASCII: /* hard-wired meaning */ @@ -625,10 +628,10 @@ cclass(struct vars *v, /* context */ addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: - cv = pg_ctype_get_cache(pg_wc_isdigit, index); + cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode); break; case CC_PUNCT: - cv = pg_ctype_get_cache(pg_wc_ispunct, index); + cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode); break; case CC_XDIGIT: @@ -646,16 +649,16 @@ cclass(struct vars *v, /* context */ } break; case CC_SPACE: - cv = pg_ctype_get_cache(pg_wc_isspace, index); + cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode); break; case CC_LOWER: - cv = pg_ctype_get_cache(pg_wc_islower, index); + cv = pg_ctype_get_cache(pg_wc_islower, cclasscode); break; case CC_UPPER: - cv = pg_ctype_get_cache(pg_wc_isupper, index); + cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode); break; case CC_GRAPH: - cv = pg_ctype_get_cache(pg_wc_isgraph, index); + cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode); break; } @@ -678,7 +681,7 @@ cclass_column_index(struct colormap *cm, chr c) /* * Note: we should not see requests to consider cclasses that are not - * treated as locale-specific by cclass(), above. + * treated as locale-specific by cclasscvec(), above. */ if (cm->classbits[CC_PRINT] && pg_wc_isprint(c)) colnum |= cm->classbits[CC_PRINT]; @@ -686,6 +689,8 @@ cclass_column_index(struct colormap *cm, chr c) colnum |= cm->classbits[CC_ALNUM]; if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c)) colnum |= cm->classbits[CC_ALPHA]; + if (cm->classbits[CC_WORD] && pg_wc_isword(c)) + colnum |= cm->classbits[CC_WORD]; assert(cm->classbits[CC_ASCII] == 0); assert(cm->classbits[CC_BLANK] == 0); assert(cm->classbits[CC_CNTRL] == 0); diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 1fff3df1da..bbbd61c604 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -400,6 +400,15 @@ pg_wc_isalnum(pg_wchar c) return 0; /* can't get here, but keep compiler quiet */ } +static int +pg_wc_isword(pg_wchar c) +{ + /* We define word characters as alnum class plus underscore */ + if (c == CHR('_')) + return 1; + return pg_wc_isalnum(c); +} + static int pg_wc_isupper(pg_wchar c) { diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 0cd4b4c4c2..7b77a29136 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -46,13 +46,18 @@ static struct subre *parsebranch(struct vars *, int, int, struct state *, struct static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *); static void nonword(struct vars *, int, struct state *, struct state *); static void word(struct vars *, int, struct state *, struct state *); +static void charclass(struct vars *, enum char_classes, + struct state *, struct state *); +static void charclasscomplement(struct vars *, enum char_classes, + struct state *, struct state *); static int scannum(struct vars *); static void repeat(struct vars *, struct state *, struct state *, int, int); static void bracket(struct vars *, struct state *, struct state *); static void cbracket(struct vars *, struct state *, struct state *); -static void brackpart(struct vars *, struct state *, struct state *); +static void brackpart(struct vars *, struct state *, struct state *, bool *); static const chr *scanplain(struct vars *); static void onechr(struct vars *, chr, struct state *, struct state *); +static void optimizebracket(struct vars *, struct state *, struct state *); static void wordchrs(struct vars *); static void processlacon(struct vars *, struct state *, struct state *, int, struct state *, struct state *); @@ -81,8 +86,6 @@ static const char *stid(struct subre *, char *, size_t); /* === regc_lex.c === */ static void lexstart(struct vars *); static void prefixes(struct vars *); -static void lexnest(struct vars *, const chr *, const chr *); -static void lexword(struct vars *); static int next(struct vars *); static int lexescape(struct vars *); static chr lexdigits(struct vars *, int, int, int); @@ -206,6 +209,7 @@ static void freecvec(struct cvec *); static int pg_wc_isdigit(pg_wchar c); static int pg_wc_isalpha(pg_wchar c); static int pg_wc_isalnum(pg_wchar c); +static int pg_wc_isword(pg_wchar c); static int pg_wc_isupper(pg_wchar c); static int pg_wc_islower(pg_wchar c); static int pg_wc_isgraph(pg_wchar c); @@ -220,7 +224,8 @@ static chr element(struct vars *, const chr *, const chr *); static struct cvec *range(struct vars *, chr, chr, int); static int before(chr, chr); static struct cvec *eclass(struct vars *, chr, int); -static struct cvec *cclass(struct vars *, const chr *, const chr *, int); +static enum char_classes lookupcclass(struct vars *, const chr *, const chr *); +static struct cvec *cclasscvec(struct vars *, enum char_classes, int); static int cclass_column_index(struct colormap *, chr); static struct cvec *allcases(struct vars *, chr); static int cmp(const chr *, const chr *, size_t); @@ -233,14 +238,12 @@ struct vars regex_t *re; const chr *now; /* scan pointer into string */ const chr *stop; /* end of string */ - const chr *savenow; /* saved now and stop for "subroutine call" */ - const chr *savestop; int err; /* error code (0 if none) */ int cflags; /* copy of compile flags */ int lasttype; /* type of previous token */ int nexttype; /* type of next token */ chr nextvalue; /* value (if any) of next token */ - int lexcon; /* lexical context type (see lex.c) */ + int lexcon; /* lexical context type (see regc_lex.c) */ int nsubexp; /* subexpression count */ struct subre **subs; /* subRE pointer vector */ size_t nsubs; /* length of vector */ @@ -287,6 +290,8 @@ struct vars #define ECLASS 'E' /* start of [= */ #define CCLASS 'C' /* start of [: */ #define END 'X' /* end of [. [= [: */ +#define CCLASSS 's' /* char class shorthand escape */ +#define CCLASSC 'c' /* complement char class shorthand escape */ #define RANGE 'R' /* - within [] which might be range delim. */ #define LACON 'L' /* lookaround constraint subRE */ #define AHEAD 'a' /* color-lookahead arc */ @@ -356,7 +361,6 @@ pg_regcomp(regex_t *re, v->re = re; v->now = string; v->stop = v->now + len; - v->savenow = v->savestop = NULL; v->err = 0; v->cflags = flags; v->nsubexp = 0; @@ -835,23 +839,25 @@ parseqatom(struct vars *v, return; break; case '<': - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); nonword(v, BEHIND, lp, s); word(v, AHEAD, s, rp); + NEXT(); return; break; case '>': - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); word(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); + NEXT(); return; break; case WBDRY: - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); nonword(v, BEHIND, lp, s); @@ -860,10 +866,11 @@ parseqatom(struct vars *v, NOERR(); word(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); + NEXT(); return; break; case NWBDRY: - wordchrs(v); /* does NEXT() */ + wordchrs(v); s = newstate(v->nfa); NOERR(); word(v, BEHIND, lp, s); @@ -872,6 +879,7 @@ parseqatom(struct vars *v, NOERR(); nonword(v, BEHIND, lp, s); nonword(v, AHEAD, s, rp); + NEXT(); return; break; case LACON: /* lookaround constraint */ @@ -925,6 +933,16 @@ parseqatom(struct vars *v, assert(SEE(']') || ISERR()); NEXT(); break; + case CCLASSS: + charclass(v, (enum char_classes) v->nextvalue, lp, rp); + okcolors(v->nfa, v->cm); + NEXT(); + break; + case CCLASSC: + charclasscomplement(v, (enum char_classes) v->nextvalue, lp, rp); + /* charclasscomplement() did okcolors() internally */ + NEXT(); + break; case '.': rainbow(v->nfa, v->cm, PLAIN, (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS, @@ -1338,6 +1356,75 @@ word(struct vars *v, /* (no need for special attention to \n) */ } +/* + * charclass - generate arcs for a character class + * + * This is used for both atoms (\w and sibling escapes) and for elements + * of bracket expressions. The caller is responsible for calling okcolors() + * at the end of processing the atom or bracket. + */ +static void +charclass(struct vars *v, + enum char_classes cls, + struct state *lp, + struct state *rp) +{ + struct cvec *cv; + + /* obtain possibly-cached cvec for char class */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, cls, (v->cflags & REG_ICASE)); + NOERR(); + + /* build the arcs; this may cause color splitting */ + subcolorcvec(v, cv, lp, rp); +} + +/* + * charclasscomplement - generate arcs for a complemented character class + * + * This is used for both atoms (\W and sibling escapes) and for elements + * of bracket expressions. In bracket expressions, it is the caller's + * responsibility that there not be any open subcolors when this is called. + */ +static void +charclasscomplement(struct vars *v, + enum char_classes cls, + struct state *lp, + struct state *rp) +{ + struct state *cstate; + struct cvec *cv; + + /* make dummy state to hang temporary arcs on */ + cstate = newstate(v->nfa); + NOERR(); + + /* obtain possibly-cached cvec for char class */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, cls, (v->cflags & REG_ICASE)); + NOERR(); + + /* build arcs for char class; this may cause color splitting */ + subcolorcvec(v, cv, cstate, cstate); + + /* in NLSTOP mode, ensure newline is not part of the result set */ + if (v->cflags & REG_NLSTOP) + newarc(v->nfa, PLAIN, v->nlcolor, cstate, cstate); + NOERR(); + + /* clean up any subcolors in the arc set */ + okcolors(v->nfa, v->cm); + NOERR(); + + /* now build output arcs for the complement of the char class */ + colorcomplement(v->nfa, v->cm, PLAIN, cstate, lp, rp); + NOERR(); + + /* clean up dummy state */ + dropstate(v->nfa, cstate); +} + /* * scannum - scan a number */ @@ -1456,6 +1543,7 @@ repeat(struct vars *v, /* * bracket - handle non-complemented bracket expression + * * Also called from cbracket for complemented bracket expressions. */ static void @@ -1463,16 +1551,52 @@ bracket(struct vars *v, struct state *lp, struct state *rp) { + /* + * We can't process complemented char classes (e.g. \W) immediately while + * scanning the bracket expression, else color bookkeeping gets confused. + * Instead, remember whether we saw any in have_cclassc[], and process + * them at the end. + */ + bool have_cclassc[NUM_CCLASSES]; + bool any_cclassc; + int i; + + memset(have_cclassc, false, sizeof(have_cclassc)); + assert(SEE('[')); NEXT(); while (!SEE(']') && !SEE(EOS)) - brackpart(v, lp, rp); + brackpart(v, lp, rp, have_cclassc); assert(SEE(']') || ISERR()); + + /* close up open subcolors from the positive bracket elements */ okcolors(v->nfa, v->cm); + NOERR(); + + /* now handle any complemented elements */ + any_cclassc = false; + for (i = 0; i < NUM_CCLASSES; i++) + { + if (have_cclassc[i]) + { + charclasscomplement(v, (enum char_classes) i, lp, rp); + NOERR(); + any_cclassc = true; + } + } + + /* + * If we had any complemented elements, see if we can optimize the bracket + * into a rainbow. Since a complemented element is the only way a WHITE + * arc could get into the result, there's no point in checking otherwise. + */ + if (any_cclassc) + optimizebracket(v, lp, rp); } /* * cbracket - handle complemented bracket expression + * * We do it by calling bracket() with dummy endpoints, and then complementing * the result. The alternative would be to invoke rainbow(), and then delete * arcs as the b.e. is seen... but that gets messy, and is really quite @@ -1496,7 +1620,9 @@ cbracket(struct vars *v, /* * Easy part of complementing, and all there is to do since the MCCE code - * was removed. + * was removed. Note that the result of colorcomplement() cannot be a + * rainbow, since we don't allow empty brackets; so there's no point in + * calling optimizebracket() again. */ colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp); NOERR(); @@ -1511,14 +1637,15 @@ cbracket(struct vars *v, static void brackpart(struct vars *v, struct state *lp, - struct state *rp) + struct state *rp, + bool *have_cclassc) { chr startc; chr endc; struct cvec *cv; + enum char_classes cls; const chr *startp; const chr *endp; - chr c[1]; /* parse something, get rid of special cases, take shortcuts */ switch (v->nexttype) @@ -1528,15 +1655,14 @@ brackpart(struct vars *v, return; break; case PLAIN: - c[0] = v->nextvalue; + startc = v->nextvalue; NEXT(); /* shortcut for ordinary chr (not range) */ if (!SEE(RANGE)) { - onechr(v, c[0], lp, rp); + onechr(v, startc, lp, rp); return; } - startc = element(v, c, c + 1); NOERR(); break; case COLLEL: @@ -1564,9 +1690,20 @@ brackpart(struct vars *v, endp = scanplain(v); INSIST(startp < endp, REG_ECTYPE); NOERR(); - cv = cclass(v, startp, endp, (v->cflags & REG_ICASE)); + cls = lookupcclass(v, startp, endp); NOERR(); - subcolorcvec(v, cv, lp, rp); + charclass(v, cls, lp, rp); + return; + break; + case CCLASSS: + charclass(v, (enum char_classes) v->nextvalue, lp, rp); + NEXT(); + return; + break; + case CCLASSC: + /* we cannot call charclasscomplement() immediately */ + have_cclassc[v->nextvalue] = true; + NEXT(); return; break; default: @@ -1582,9 +1719,8 @@ brackpart(struct vars *v, { case PLAIN: case RANGE: - c[0] = v->nextvalue; + endc = v->nextvalue; NEXT(); - endc = element(v, c, c + 1); NOERR(); break; case COLLEL: @@ -1618,7 +1754,7 @@ brackpart(struct vars *v, /* * scanplain - scan PLAIN contents of [. etc. * - * Certain bits of trickery in lex.c know that this code does not try + * Certain bits of trickery in regc_lex.c know that this code does not try * to look past the final bracket of the [. etc. */ static const chr * /* just after end of sequence */ @@ -1664,39 +1800,98 @@ onechr(struct vars *v, subcolorcvec(v, allcases(v, c), lp, rp); } +/* + * optimizebracket - see if bracket expression can be converted to RAINBOW + * + * Cases such as "[\s\S]" can produce a set of arcs of all colors, which we + * can replace by a single RAINBOW arc for efficiency. (This might seem + * like a silly way to write ".", but it's seemingly a common locution in + * some other flavors of regex, so take the trouble to support it well.) + */ +static void +optimizebracket(struct vars *v, + struct state *lp, + struct state *rp) +{ + struct colordesc *cd; + struct colordesc *end = CDEND(v->cm); + struct arc *a; + bool israinbow; + + /* + * Scan lp's out-arcs and transiently mark the mentioned colors. We + * expect that all of lp's out-arcs are plain, non-RAINBOW arcs to rp. + * (Note: there shouldn't be any pseudocolors yet, but check anyway.) + */ + for (a = lp->outs; a != NULL; a = a->outchain) + { + assert(a->type == PLAIN); + assert(a->co >= 0); /* i.e. not RAINBOW */ + assert(a->to == rp); + cd = &v->cm->cd[a->co]; + assert(!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)); + cd->flags |= COLMARK; + } + + /* Scan colors, clear transient marks, check for unmarked live colors */ + israinbow = true; + for (cd = v->cm->cd; cd < end; cd++) + { + if (cd->flags & COLMARK) + cd->flags &= ~COLMARK; + else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO)) + israinbow = false; + } + + /* Can't do anything if not all colors have arcs */ + if (!israinbow) + return; + + /* OK, drop existing arcs and replace with a rainbow */ + while ((a = lp->outs) != NULL) + freearc(v->nfa, a); + newarc(v->nfa, PLAIN, RAINBOW, lp, rp); +} + /* * wordchrs - set up word-chr list for word-boundary stuff, if needed * - * The list is kept as a bunch of arcs between two dummy states; it's - * disposed of by the unreachable-states sweep in NFA optimization. - * Does NEXT(). Must not be called from any unusual lexical context. - * This should be reconciled with the \w etc. handling in lex.c, and - * should be cleaned up to reduce dependencies on input scanning. + * The list is kept as a bunch of circular arcs on an otherwise-unused state. + * + * Note that this must not be called while we have any open subcolors, + * else construction of the list would confuse color bookkeeping. + * Hence, we can't currently apply a similar optimization in + * charclass[complement](), as those need to be usable within bracket + * expressions. */ static void wordchrs(struct vars *v) { - struct state *left; - struct state *right; + struct state *cstate; + struct cvec *cv; if (v->wordchrs != NULL) - { - NEXT(); /* for consistency */ - return; - } + return; /* done already */ - left = newstate(v->nfa); - right = newstate(v->nfa); + /* make dummy state to hang the cache arcs on */ + cstate = newstate(v->nfa); NOERR(); - /* fine point: implemented with [::], and lexer will set REG_ULOCALE */ - lexword(v); - NEXT(); - assert(v->savenow != NULL && SEE('[')); - bracket(v, left, right); - assert((v->savenow != NULL && SEE(']')) || ISERR()); - NEXT(); + + /* obtain possibly-cached cvec for \w characters */ + NOTE(REG_ULOCALE); + cv = cclasscvec(v, CC_WORD, (v->cflags & REG_ICASE)); NOERR(); - v->wordchrs = left; + + /* build the arcs; this may cause color splitting */ + subcolorcvec(v, cv, cstate, cstate); + NOERR(); + + /* close new open subcolors to ensure the cache entry is self-contained */ + okcolors(v->nfa, v->cm); + NOERR(); + + /* success! save the cache pointer */ + v->wordchrs = cstate; } /* diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 306525eb5f..0e76a828f8 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -127,6 +127,18 @@ #define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS))) +/* + * known character classes + */ +enum char_classes +{ + CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, + CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT, CC_WORD +}; + +#define NUM_CCLASSES 14 + + /* * As soon as possible, we map chrs into equivalence classes -- "colors" -- * which are of much more manageable number. @@ -164,12 +176,14 @@ struct colordesc #define NOSUB COLORLESS /* value of "sub" when no open subcolor */ struct arc *arcs; /* chain of all arcs of this color */ chr firstchr; /* simple char first assigned to this color */ - int flags; /* bit values defined next */ + int flags; /* bitmask of the following flags: */ #define FREECOL 01 /* currently free */ #define PSEUDO 02 /* pseudocolor, no real chars */ -#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL) +#define COLMARK 04 /* temporary marker used in some functions */ }; +#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL) + /* * The color map itself * @@ -199,8 +213,6 @@ struct colordesc * appear in increasing chr-value order. */ -#define NUM_CCLASSES 13 /* must match data in regc_locale.c */ - typedef struct colormaprange { chr cmin; /* range represents cmin..cmax inclusive */ diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out index 21282789c2..92154b6d28 100644 --- a/src/test/modules/test_regex/expected/test_regex.out +++ b/src/test/modules/test_regex/expected/test_regex.out @@ -1970,6 +1970,256 @@ select * from test_regex('a[\w]b', 'axb', 'LPE'); {axb} (2 rows) +-- these should be invalid +select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE'); +ERROR: invalid regular expression: invalid character range +select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE'); +ERROR: invalid regular expression: invalid character range +select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS'); +ERROR: invalid regular expression: invalid character range +select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS'); +ERROR: invalid regular expression: invalid character range +-- test complemented char classes within brackets +select * from test_regex('[\D]', '0123456789abc*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {a} +(2 rows) + +select * from test_regex('[^\D]', 'abc0123456789*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {0} +(2 rows) + +select * from test_regex('[1\D7]', '0123456789abc*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {1} +(2 rows) + +select * from test_regex('[7\D1]', '0123456789abc*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {1} +(2 rows) + +select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {2} +(2 rows) + +select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {2} +(2 rows) + +select * from test_regex('\W', '0123456789abc_*', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {*} +(2 rows) + +select * from test_regex('[\W]', '0123456789abc_*', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {*} +(2 rows) + +select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LNPE'); + test_regex +-------------------------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE,REG_UEMPTYMATCH} + {"012 3456789abc_*"} +(2 rows) + +-- check char classes' handling of newlines +select * from test_regex('\s+', E'abc \n def', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {" + + "} +(2 rows) + +select * from test_regex('\s+', E'abc \n def', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {" + + "} +(2 rows) + +select * from test_regex('[\s]+', E'abc \n def', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {" + + "} +(2 rows) + +select * from test_regex('[\s]+', E'abc \n def', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {" + + "} +(2 rows) + +select * from test_regex('\S+', E'abc\ndef', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('\S+', E'abc\ndef', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('[\S]+', E'abc\ndef', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('[\S]+', E'abc\ndef', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('\d+', E'012\n345', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {012} +(2 rows) + +select * from test_regex('\d+', E'012\n345', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {012} +(2 rows) + +select * from test_regex('[\d]+', E'012\n345', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {012} +(2 rows) + +select * from test_regex('[\d]+', E'012\n345', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {012} +(2 rows) + +select * from test_regex('\D+', E'abc\ndef345', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {"abc + + def"} +(2 rows) + +select * from test_regex('\D+', E'abc\ndef345', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('[\D]+', E'abc\ndef345', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {"abc + + def"} +(2 rows) + +select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {abc} +(2 rows) + +select * from test_regex('\w+', E'abc_012\ndef', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {abc_012} +(2 rows) + +select * from test_regex('\w+', E'abc_012\ndef', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {abc_012} +(2 rows) + +select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {abc_012} +(2 rows) + +select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {abc_012} +(2 rows) + +select * from test_regex('\W+', E'***\n@@@___', 'LP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {"*** + + @@@"} +(2 rows) + +select * from test_regex('\W+', E'***\n@@@___', 'nLP'); + test_regex +------------------------------- + {0,REG_UNONPOSIX,REG_ULOCALE} + {***} +(2 rows) + +select * from test_regex('[\W]+', E'***\n@@@___', 'LPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {"*** + + @@@"} +(2 rows) + +select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE'); + test_regex +---------------------------------------- + {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE} + {***} +(2 rows) + -- doing 13 "escapes" -- expectError 13.1 & "a\\" EESCAPE select * from test_regex('a\', '', ''); diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql index 31e947ee9c..b99329391e 100644 --- a/src/test/modules/test_regex/sql/test_regex.sql +++ b/src/test/modules/test_regex/sql/test_regex.sql @@ -597,6 +597,50 @@ select * from test_regex('a[\s]b', 'a b', 'LPE'); -- expectMatch 12.18 LPE {a[\w]b} axb axb select * from test_regex('a[\w]b', 'axb', 'LPE'); +-- these should be invalid +select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE'); +select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE'); +select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS'); +select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS'); + +-- test complemented char classes within brackets +select * from test_regex('[\D]', '0123456789abc*', 'LPE'); +select * from test_regex('[^\D]', 'abc0123456789*', 'LPE'); +select * from test_regex('[1\D7]', '0123456789abc*', 'LPE'); +select * from test_regex('[7\D1]', '0123456789abc*', 'LPE'); +select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE'); +select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE'); +select * from test_regex('\W', '0123456789abc_*', 'LP'); +select * from test_regex('[\W]', '0123456789abc_*', 'LPE'); +select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LNPE'); + +-- check char classes' handling of newlines +select * from test_regex('\s+', E'abc \n def', 'LP'); +select * from test_regex('\s+', E'abc \n def', 'nLP'); +select * from test_regex('[\s]+', E'abc \n def', 'LPE'); +select * from test_regex('[\s]+', E'abc \n def', 'nLPE'); +select * from test_regex('\S+', E'abc\ndef', 'LP'); +select * from test_regex('\S+', E'abc\ndef', 'nLP'); +select * from test_regex('[\S]+', E'abc\ndef', 'LPE'); +select * from test_regex('[\S]+', E'abc\ndef', 'nLPE'); +select * from test_regex('\d+', E'012\n345', 'LP'); +select * from test_regex('\d+', E'012\n345', 'nLP'); +select * from test_regex('[\d]+', E'012\n345', 'LPE'); +select * from test_regex('[\d]+', E'012\n345', 'nLPE'); +select * from test_regex('\D+', E'abc\ndef345', 'LP'); +select * from test_regex('\D+', E'abc\ndef345', 'nLP'); +select * from test_regex('[\D]+', E'abc\ndef345', 'LPE'); +select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE'); +select * from test_regex('\w+', E'abc_012\ndef', 'LP'); +select * from test_regex('\w+', E'abc_012\ndef', 'nLP'); +select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE'); +select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE'); +select * from test_regex('\W+', E'***\n@@@___', 'LP'); +select * from test_regex('\W+', E'***\n@@@___', 'nLP'); +select * from test_regex('[\W]+', E'***\n@@@___', 'LPE'); +select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE'); + + -- doing 13 "escapes" -- expectError 13.1 & "a\\" EESCAPE