Allow complemented character class escapes within regex brackets.

The complement-class escapes \D, \S, \W are now allowed within bracket expressions. There is no semantic difficulty with doing that, but the rather hokey macro-expansion-based implementation previously used here couldn't cope.

Also, invent "word" as an allowed character class name, thus "\w" is now equivalent to "[[:word:]]" outside brackets, or "[:word:]" within brackets. POSIX allows such implementation-specific extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs such as "[\w-_]" are now disallowed, as our documentation has always said they should be: character classes can't be endpoints of a range. Previously, because \w was just a macro for "[:alnum:]_", such a construct was read as "[[:alnum:]_-_]", so it was accepted so long as the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs() to not interact with the lexer.

* Fix colorcomplement() to not be O(N^2) in the number of colors involved.

* Get rid of useless-as-far-as-I-can-see calls of element() on single-character character element names in brackpart(). element() always maps these to the character itself, and things would be quite broken if it didn't --- should "[a]" match something different than "a" does? Besides, the shortcut path in brackpart() wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
2021-02-25 13:00:40 -05:00 · 2021-02-25 13:00:40 -05:00 · 2a0af7fe46
parent 6b40d9bdbd
commit 2a0af7fe46
10 changed files with 677 additions and 276 deletions
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@ -6097,6 +6097,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
    non-ASCII characters to belong to any of these classes.)
    In addition to these standard character
    classes, <productname>PostgreSQL</productname> defines
    the <literal>word</literal> character class, which is the same as
    <literal>alnum</literal> plus the underscore (<literal>_</literal>)
    character, and
    the <literal>ascii</literal> character class, which contains exactly
    the 7-bit ASCII set.
   </para>
@ -6108,9 +6111,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
    matching empty strings at the beginning
    and end of a word respectively.  A word is defined as a sequence
    of word characters that is neither preceded nor followed by word
-    characters.  A word character is an <literal>alnum</literal> character (as
+    characters.  A word character is any character belonging to the
-    defined by the <acronym>POSIX</acronym> character class described above)
+    <literal>word</literal> character class, that is, any letter, digit,
-    or an underscore.  This is an extension, compatible with but not
+    or underscore.  This is an extension, compatible with but not
    specified by <acronym>POSIX</acronym> 1003.2, and should be used with
    caution in software intended to be portable to other systems.
    The constraint escapes described below are usually preferable; they
@ -6330,8 +6333,7 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
       <row>
       <entry> <literal>\w</literal> </entry>
-       <entry> <literal>[[:alnum:]_]</literal>
+       <entry> <literal>[[:word:]]</literal> </entry>
-       (note underscore is included) </entry>
       </row>
       <row>
@ -6346,21 +6348,18 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
       <row>
       <entry> <literal>\W</literal> </entry>
-       <entry> <literal>[^[:alnum:]_]</literal>
+       <entry> <literal>[^[:word:]]</literal> </entry>
-       (note underscore is included) </entry>
       </row>
      </tbody>
     </tgroup>
    </table>
   <para>
-    Within bracket expressions, <literal>\d</literal>, <literal>\s</literal>,
+    The class-shorthand escapes also work within bracket expressions,
-    and <literal>\w</literal> lose their outer brackets,
+    although the definitions shown above are not quite syntactically
-    and <literal>\D</literal>, <literal>\S</literal>, and <literal>\W</literal> are illegal.
+    valid in that context.
-    (So, for example, <literal>[a-c\d]</literal> is equivalent to
+    For example, <literal>[a-c\d]</literal> is equivalent to
    <literal>[a-c[:digit:]]</literal>.
-    Also, <literal>[a-c\D]</literal>, which is equivalent to
-    <literal>[a-c^[:digit:]]</literal>, is illegal.)
   </para>
   <table id="posix-constraint-escapes-table">
--- a/src/backend/regex/re_syntax.n
+++ b/src/backend/regex/re_syntax.n
@ -519,15 +519,10 @@ character classes:
 (note underscore)
 .RE
 .PP
-Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
+The class-shorthand escapes also work within bracket expressions,
-and `\fB\ew\fR'\&
+although the definitions shown above are not quite syntactically
-lose their outer brackets,
+valid in that context.
-and `\fB\eD\fR', `\fB\eS\fR',
+For example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
-and `\fB\eW\fR'\&
-are illegal.
-.VS 8.2
-(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
-Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.)
-.VE 8.2
 .PP
 A constraint escape (AREs only) is a constraint,
--- a/src/backend/regex/regc_color.c
+++ b/src/backend/regex/regc_color.c
@ -936,7 +936,16 @@ okcolors(struct nfa *nfa,
 		}
 		else if (cd->nschrs == 0 && cd->nuchrs == 0)
 		{
-			/* parent empty, its arcs change color to subcolor */
+			/*
 			 * Parent is now empty, so just change all its arcs to the
 			 * subcolor, then free the parent.
 			 *
 			 * It is not obvious that simply relabeling the arcs like this is
 			 * OK; it appears to risk creating duplicate arcs.  We are
 			 * basically relying on the assumption that processing of a
 			 * bracket expression can't create arcs of both a color and its
 			 * subcolor between the bracket's endpoints.
 			 */
 			cd->sub = NOSUB;
 			scd = &cm->cd[sco];
 			assert(scd->nschrs > 0 || scd->nuchrs > 0);
@ -1062,6 +1071,7 @@ colorcomplement(struct nfa *nfa,
 	struct colordesc *cd;
 	struct colordesc *end = CDEND(cm);
 	color		co;
 	struct arc *a;
 	assert(of != from);
@ -1069,10 +1079,26 @@ colorcomplement(struct nfa *nfa,
 	if (findarc(of, PLAIN, RAINBOW) != NULL)
 		return;
 	/* Otherwise, transiently mark the colors that appear in of's out-arcs */
 	for (a = of->outs; a != NULL; a = a->outchain)
 	{
 		if (a->type == PLAIN)
 		{
 			assert(a->co >= 0);
 			cd = &cm->cd[a->co];
 			assert(!UNUSEDCOLOR(cd));
 			cd->flags |= COLMARK;
 		}
 	}
 	/* Scan colors, clear transient marks, add arcs for unmarked colors */
 	for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
-		if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
+	{
-			if (findarc(of, PLAIN, co) == NULL)
+		if (cd->flags & COLMARK)
-				newarc(nfa, type, co, from, to);
+			cd->flags &= ~COLMARK;
 		else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
 			newarc(nfa, type, co, from, to);
 	}
 }
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@ -193,83 +193,6 @@ prefixes(struct vars *v)
 	}
 }
 /*
 * lexnest - "call a subroutine", interpolating string at the lexical level
 *
 * Note, this is not a very general facility.  There are a number of
 * implicit assumptions about what sorts of strings can be subroutines.
 */
 static void
 lexnest(struct vars *v,
 		const chr *beginp,		/* start of interpolation */
 		const chr *endp)		/* one past end of interpolation */
 {
 	assert(v->savenow == NULL); /* only one level of nesting */
 	v->savenow = v->now;
 	v->savestop = v->stop;
 	v->now = beginp;
 	v->stop = endp;
 }
 /*
 * string constants to interpolate as expansions of things like \d
 */
 static const chr backd[] = {	/* \d */
 	CHR('['), CHR('['), CHR(':'),
 	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
 	CHR(':'), CHR(']'), CHR(']')
 };
 static const chr backD[] = {	/* \D */
 	CHR('['), CHR('^'), CHR('['), CHR(':'),
 	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
 	CHR(':'), CHR(']'), CHR(']')
 };
 static const chr brbackd[] = {	/* \d within brackets */
 	CHR('['), CHR(':'),
 	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
 	CHR(':'), CHR(']')
 };
 static const chr backs[] = {	/* \s */
 	CHR('['), CHR('['), CHR(':'),
 	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
 	CHR(':'), CHR(']'), CHR(']')
 };
 static const chr backS[] = {	/* \S */
 	CHR('['), CHR('^'), CHR('['), CHR(':'),
 	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
 	CHR(':'), CHR(']'), CHR(']')
 };
 static const chr brbacks[] = {	/* \s within brackets */
 	CHR('['), CHR(':'),
 	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
 	CHR(':'), CHR(']')
 };
 static const chr backw[] = {	/* \w */
 	CHR('['), CHR('['), CHR(':'),
 	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
 	CHR(':'), CHR(']'), CHR('_'), CHR(']')
 };
 static const chr backW[] = {	/* \W */
 	CHR('['), CHR('^'), CHR('['), CHR(':'),
 	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
 	CHR(':'), CHR(']'), CHR('_'), CHR(']')
 };
 static const chr brbackw[] = {	/* \w within brackets */
 	CHR('['), CHR(':'),
 	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
 	CHR(':'), CHR(']'), CHR('_')
 };
 /*
 * lexword - interpolate a bracket expression for word characters
 * Possibly ought to inquire whether there is a "word" character class.
 */
 static void
 lexword(struct vars *v)
 {
 	lexnest(v, backw, ENDOF(backw));
 }
 /*
 * next - get next token
 */
@ -292,14 +215,6 @@ next(struct vars *v)
 		RETV(SBEGIN, 0);		/* same as \A */
 	}
 	/* if we're nested and we've hit end, return to outer level */
 	if (v->savenow != NULL && ATEOS())
 	{
 		v->now = v->savenow;
 		v->stop = v->savestop;
 		v->savenow = v->savestop = NULL;
 	}
 	/* skip white space etc. if appropriate (not in literal or []) */
 	if (v->cflags & REG_EXPANDED)
 		switch (v->lexcon)
@ -420,32 +335,15 @@ next(struct vars *v)
 					NOTE(REG_UNONPOSIX);
 					if (ATEOS())
 						FAILW(REG_EESCAPE);
-					(DISCARD) lexescape(v);
+					if (!lexescape(v))
 						return 0;
 					switch (v->nexttype)
 					{			/* not all escapes okay here */
 						case PLAIN:
 						case CCLASSS:
 						case CCLASSC:
 							return 1;
 							break;
 						case CCLASS:
 							switch (v->nextvalue)
 							{
 								case 'd':
 									lexnest(v, brbackd, ENDOF(brbackd));
 									break;
 								case 's':
 									lexnest(v, brbacks, ENDOF(brbacks));
 									break;
 								case 'w':
 									lexnest(v, brbackw, ENDOF(brbackw));
 									break;
 								default:
 									FAILW(REG_EESCAPE);
 									break;
 							}
 							/* lexnest done, back up and try again */
 							v->nexttype = v->lasttype;
 							return next(v);
 							break;
 					}
 					/* not one of the acceptable escapes */
 					FAILW(REG_EESCAPE);
@ -691,49 +589,17 @@ next(struct vars *v)
 		}
 		RETV(PLAIN, *v->now++);
 	}
-	(DISCARD) lexescape(v);
+	return lexescape(v);
 	if (ISERR())
 		FAILW(REG_EESCAPE);
 	if (v->nexttype == CCLASS)
 	{							/* fudge at lexical level */
 		switch (v->nextvalue)
 		{
 			case 'd':
 				lexnest(v, backd, ENDOF(backd));
 				break;
 			case 'D':
 				lexnest(v, backD, ENDOF(backD));
 				break;
 			case 's':
 				lexnest(v, backs, ENDOF(backs));
 				break;
 			case 'S':
 				lexnest(v, backS, ENDOF(backS));
 				break;
 			case 'w':
 				lexnest(v, backw, ENDOF(backw));
 				break;
 			case 'W':
 				lexnest(v, backW, ENDOF(backW));
 				break;
 			default:
 				assert(NOTREACHED);
 				FAILW(REG_ASSERT);
 				break;
 		}
 		/* lexnest done, back up and try again */
 		v->nexttype = v->lasttype;
 		return next(v);
 	}
 	/* otherwise, lexescape has already done the work */
 	return !ISERR();
 }
 /*
 * lexescape - parse an ARE backslash escape (backslash already eaten)
- * Note slightly nonstandard use of the CCLASS type code.
+ *
 * This is used for ARE backslashes both normally and inside bracket
 * expressions.  In the latter case, not all escape types are allowed,
 * but the caller must reject unwanted ones after we return.
 */
-static int						/* not actually used, but convenient for RETV */
+static int
 lexescape(struct vars *v)
 {
 	chr			c;
@ -775,11 +641,11 @@ lexescape(struct vars *v)
 			break;
 		case CHR('d'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'd');
+			RETV(CCLASSS, CC_DIGIT);
 			break;
 		case CHR('D'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'D');
+			RETV(CCLASSC, CC_DIGIT);
 			break;
 		case CHR('e'):
 			NOTE(REG_UUNPORT);
@ -802,11 +668,11 @@ lexescape(struct vars *v)
 			break;
 		case CHR('s'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 's');
+			RETV(CCLASSS, CC_SPACE);
 			break;
 		case CHR('S'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'S');
+			RETV(CCLASSC, CC_SPACE);
 			break;
 		case CHR('t'):
 			RETV(PLAIN, CHR('\t'));
@ -828,11 +694,11 @@ lexescape(struct vars *v)
 			break;
 		case CHR('w'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'w');
+			RETV(CCLASSS, CC_WORD);
 			break;
 		case CHR('W'):
 			NOTE(REG_ULOCALE);
-			RETV(CCLASS, 'W');
+			RETV(CCLASSC, CC_WORD);
 			break;
 		case CHR('x'):
 			NOTE(REG_UUNPORT);
--- a/src/backend/regex/regc_locale.c
+++ b/src/backend/regex/regc_locale.c
@ -350,17 +350,13 @@ static const struct cname
 };
 /*
- * The following arrays define the valid character class names.
+ * The following array defines the valid character class names.
 * The entries must match enum char_classes in regguts.h.
 */
 static const char *const classNames[NUM_CCLASSES + 1] = {
 	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
-	"lower", "print", "punct", "space", "upper", "xdigit", NULL
+	"lower", "print", "punct", "space", "upper", "xdigit", "word",
-};
+	NULL
 enum classes
 {
 	CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
 	CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
 };
 /*
@ -536,7 +532,36 @@ eclass(struct vars *v,			/* context */
 }
 /*
- * cclass - supply cvec for a character class
+ * lookupcclass - lookup a character class identified by name
 *
 * On failure, sets an error code in *v; the result is then garbage.
 */
 static enum char_classes
 lookupcclass(struct vars *v,	/* context (for returning errors) */
 			 const chr *startp, /* where the name starts */
 			 const chr *endp)	/* just past the end of the name */
 {
 	size_t		namelen = endp - startp;
 	int			idx;
 	/*
 	 * Walk the NULL-terminated classNames[] table.  A name's position in
 	 * the table is the enum value that gets returned for it.
 	 */
 	for (idx = 0; classNames[idx] != NULL; idx++)
 	{
 		const char *candidate = classNames[idx];
 		/* cheap length test first, then the char-vs-chr comparison */
 		if (strlen(candidate) != namelen)
 			continue;
 		if (pg_char_and_wchar_strncmp(candidate, startp, namelen) == 0)
 			return (enum char_classes) idx;
 	}
 	/* unknown class name: record the error; the result is then garbage */
 	ERR(REG_ECTYPE);
 	return (enum char_classes) 0;
 }
 /*
 * cclasscvec - supply cvec for a character class
 *
 * Must include case counterparts if "cases" is true.
 *
@ -545,45 +570,20 @@ eclass(struct vars *v,			/* context */
 * because callers are not supposed to explicitly free the result either way.
 */
 static struct cvec *
-cclass(struct vars *v,			/* context */
+cclasscvec(struct vars *v,		/* context */
-	   const chr *startp,		/* where the name starts */
+		   enum char_classes cclasscode,	/* class to build a cvec for */
-	   const chr *endp,			/* just past the end of the name */
+		   int cases)			/* case-independent? */
 	   int cases)				/* case-independent? */
 {
 	size_t		len;
 	struct cvec *cv = NULL;
 	const char *const *namePtr;
 	int			i,
 				index;
 	/*
 	 * Map the name to the corresponding enumerated value.
 	 */
 	len = endp - startp;
 	index = -1;
 	for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
 	{
 		if (strlen(*namePtr) == len &&
 			pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
 		{
 			index = i;
 			break;
 		}
 	}
 	if (index == -1)
 	{
 		ERR(REG_ECTYPE);
 		return NULL;
 	}
 	/*
 	 * Remap lower and upper to alpha if the match is case insensitive.
 	 */
 	if (cases &&
-		((enum classes) index == CC_LOWER ||
+		(cclasscode == CC_LOWER ||
-		 (enum classes) index == CC_UPPER))
+		 cclasscode == CC_UPPER))
-		index = (int) CC_ALPHA;
+		cclasscode = CC_ALPHA;
 	/*
 	 * Now compute the character class contents.  For classes that are based
@ -595,16 +595,19 @@ cclass(struct vars *v,			/* context */
 	 * NB: keep this code in sync with cclass_column_index(), below.
 	 */
-	switch ((enum classes) index)
+	switch (cclasscode)
 	{
 		case CC_PRINT:
-			cv = pg_ctype_get_cache(pg_wc_isprint, index);
+			cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
 			break;
 		case CC_ALNUM:
-			cv = pg_ctype_get_cache(pg_wc_isalnum, index);
+			cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
 			break;
 		case CC_ALPHA:
-			cv = pg_ctype_get_cache(pg_wc_isalpha, index);
+			cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
 			break;
 		case CC_WORD:
 			cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
 			break;
 		case CC_ASCII:
 			/* hard-wired meaning */
@ -625,10 +628,10 @@ cclass(struct vars *v,			/* context */
 			addrange(cv, 0x7f, 0x9f);
 			break;
 		case CC_DIGIT:
-			cv = pg_ctype_get_cache(pg_wc_isdigit, index);
+			cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
 			break;
 		case CC_PUNCT:
-			cv = pg_ctype_get_cache(pg_wc_ispunct, index);
+			cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
 			break;
 		case CC_XDIGIT:
@ -646,16 +649,16 @@ cclass(struct vars *v,			/* context */
 			}
 			break;
 		case CC_SPACE:
-			cv = pg_ctype_get_cache(pg_wc_isspace, index);
+			cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
 			break;
 		case CC_LOWER:
-			cv = pg_ctype_get_cache(pg_wc_islower, index);
+			cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
 			break;
 		case CC_UPPER:
-			cv = pg_ctype_get_cache(pg_wc_isupper, index);
+			cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
 			break;
 		case CC_GRAPH:
-			cv = pg_ctype_get_cache(pg_wc_isgraph, index);
+			cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
 			break;
 	}
@ -678,7 +681,7 @@ cclass_column_index(struct colormap *cm, chr c)
 	/*
 	 * Note: we should not see requests to consider cclasses that are not
-	 * treated as locale-specific by cclass(), above.
+	 * treated as locale-specific by cclasscvec(), above.
 	 */
 	if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
 		colnum |= cm->classbits[CC_PRINT];
@ -686,6 +689,8 @@ cclass_column_index(struct colormap *cm, chr c)
 		colnum |= cm->classbits[CC_ALNUM];
 	if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
 		colnum |= cm->classbits[CC_ALPHA];
 	if (cm->classbits[CC_WORD] && pg_wc_isword(c))
 		colnum |= cm->classbits[CC_WORD];
 	assert(cm->classbits[CC_ASCII] == 0);
 	assert(cm->classbits[CC_BLANK] == 0);
 	assert(cm->classbits[CC_CNTRL] == 0);
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@ -400,6 +400,15 @@ pg_wc_isalnum(pg_wchar c)
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 static int
 pg_wc_isword(pg_wchar c)
 {
 	/*
 	 * "word" is the alnum class extended with underscore: accept '_'
 	 * outright, and otherwise let pg_wc_isalnum() decide.
 	 */
 	return (c == CHR('_')) ? 1 : pg_wc_isalnum(c);
 }
 static int
 pg_wc_isupper(pg_wchar c)
 {
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@ -46,13 +46,18 @@ static struct subre *parsebranch(struct vars *, int, int, struct state *, struct
 static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
 static void nonword(struct vars *, int, struct state *, struct state *);
 static void word(struct vars *, int, struct state *, struct state *);
 static void charclass(struct vars *, enum char_classes,
 					  struct state *, struct state *);
 static void charclasscomplement(struct vars *, enum char_classes,
 								struct state *, struct state *);
 static int	scannum(struct vars *);
 static void repeat(struct vars *, struct state *, struct state *, int, int);
 static void bracket(struct vars *, struct state *, struct state *);
 static void cbracket(struct vars *, struct state *, struct state *);
-static void brackpart(struct vars *, struct state *, struct state *);
+static void brackpart(struct vars *, struct state *, struct state *, bool *);
 static const chr *scanplain(struct vars *);
 static void onechr(struct vars *, chr, struct state *, struct state *);
 static void optimizebracket(struct vars *, struct state *, struct state *);
 static void wordchrs(struct vars *);
 static void processlacon(struct vars *, struct state *, struct state *, int,
 						 struct state *, struct state *);
@ -81,8 +86,6 @@ static const char *stid(struct subre *, char *, size_t);
 /* === regc_lex.c === */
 static void lexstart(struct vars *);
 static void prefixes(struct vars *);
 static void lexnest(struct vars *, const chr *, const chr *);
 static void lexword(struct vars *);
 static int	next(struct vars *);
 static int	lexescape(struct vars *);
 static chr	lexdigits(struct vars *, int, int, int);
@ -206,6 +209,7 @@ static void freecvec(struct cvec *);
 static int	pg_wc_isdigit(pg_wchar c);
 static int	pg_wc_isalpha(pg_wchar c);
 static int	pg_wc_isalnum(pg_wchar c);
 static int	pg_wc_isword(pg_wchar c);
 static int	pg_wc_isupper(pg_wchar c);
 static int	pg_wc_islower(pg_wchar c);
 static int	pg_wc_isgraph(pg_wchar c);
@ -220,7 +224,8 @@ static chr	element(struct vars *, const chr *, const chr *);
 static struct cvec *range(struct vars *, chr, chr, int);
 static int	before(chr, chr);
 static struct cvec *eclass(struct vars *, chr, int);
-static struct cvec *cclass(struct vars *, const chr *, const chr *, int);
+static enum char_classes lookupcclass(struct vars *, const chr *, const chr *);
 static struct cvec *cclasscvec(struct vars *, enum char_classes, int);
 static int	cclass_column_index(struct colormap *, chr);
 static struct cvec *allcases(struct vars *, chr);
 static int	cmp(const chr *, const chr *, size_t);
@ -233,14 +238,12 @@ struct vars
 	regex_t    *re;
 	const chr  *now;			/* scan pointer into string */
 	const chr  *stop;			/* end of string */
 	const chr  *savenow;		/* saved now and stop for "subroutine call" */
 	const chr  *savestop;
 	int			err;			/* error code (0 if none) */
 	int			cflags;			/* copy of compile flags */
 	int			lasttype;		/* type of previous token */
 	int			nexttype;		/* type of next token */
 	chr			nextvalue;		/* value (if any) of next token */
-	int			lexcon;			/* lexical context type (see lex.c) */
+	int			lexcon;			/* lexical context type (see regc_lex.c) */
 	int			nsubexp;		/* subexpression count */
 	struct subre **subs;		/* subRE pointer vector */
 	size_t		nsubs;			/* length of vector */
@ -287,6 +290,8 @@ struct vars
 #define ECLASS	'E'				/* start of [= */
 #define CCLASS	'C'				/* start of [: */
 #define END 'X'					/* end of [. [= [: */
 #define CCLASSS	's'				/* char class shorthand escape */
 #define CCLASSC	'c'				/* complement char class shorthand escape */
 #define RANGE	'R'				/* - within [] which might be range delim. */
 #define LACON	'L'				/* lookaround constraint subRE */
 #define AHEAD	'a'				/* color-lookahead arc */
@ -356,7 +361,6 @@ pg_regcomp(regex_t *re,
 	v->re = re;
 	v->now = string;
 	v->stop = v->now + len;
 	v->savenow = v->savestop = NULL;
 	v->err = 0;
 	v->cflags = flags;
 	v->nsubexp = 0;
@ -835,23 +839,25 @@ parseqatom(struct vars *v,
 			return;
 			break;
 		case '<':
-			wordchrs(v);		/* does NEXT() */
+			wordchrs(v);
 			s = newstate(v->nfa);
 			NOERR();
 			nonword(v, BEHIND, lp, s);
 			word(v, AHEAD, s, rp);
 			NEXT();
 			return;
 			break;
 		case '>':
-			wordchrs(v);		/* does NEXT() */
+			wordchrs(v);
 			s = newstate(v->nfa);
 			NOERR();
 			word(v, BEHIND, lp, s);
 			nonword(v, AHEAD, s, rp);
 			NEXT();
 			return;
 			break;
 		case WBDRY:
-			wordchrs(v);		/* does NEXT() */
+			wordchrs(v);
 			s = newstate(v->nfa);
 			NOERR();
 			nonword(v, BEHIND, lp, s);
@ -860,10 +866,11 @@ parseqatom(struct vars *v,
 			NOERR();
 			word(v, BEHIND, lp, s);
 			nonword(v, AHEAD, s, rp);
 			NEXT();
 			return;
 			break;
 		case NWBDRY:
-			wordchrs(v);		/* does NEXT() */
+			wordchrs(v);
 			s = newstate(v->nfa);
 			NOERR();
 			word(v, BEHIND, lp, s);
@ -872,6 +879,7 @@ parseqatom(struct vars *v,
 			NOERR();
 			nonword(v, BEHIND, lp, s);
 			nonword(v, AHEAD, s, rp);
 			NEXT();
 			return;
 			break;
 		case LACON:				/* lookaround constraint */
@ -925,6 +933,16 @@ parseqatom(struct vars *v,
 			assert(SEE(']') || ISERR());
 			NEXT();
 			break;
 		case CCLASSS:
 			charclass(v, (enum char_classes) v->nextvalue, lp, rp);
 			okcolors(v->nfa, v->cm);
 			NEXT();
 			break;
 		case CCLASSC:
 			charclasscomplement(v, (enum char_classes) v->nextvalue, lp, rp);
 			/* charclasscomplement() did okcolors() internally */
 			NEXT();
 			break;
 		case '.':
 			rainbow(v->nfa, v->cm, PLAIN,
 					(v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
@ -1338,6 +1356,75 @@ word(struct vars *v,
 	/* (no need for special attention to \n) */
 }
 /*
 * charclass - generate arcs for a character class
 *
 * v is the compile context and cls selects the class.  lp and rp are
 * handed to subcolorcvec() as the endpoints for the arcs being built.
 *
 * This is used for both atoms (\w and sibling escapes) and for elements
 * of bracket expressions.  The caller is responsible for calling okcolors()
 * at the end of processing the atom or bracket.
 */
 static void
 charclass(struct vars *v,
 		  enum char_classes cls,
 		  struct state *lp,
 		  struct state *rp)
 {
 	struct cvec *cv;
 	/*
 	 * obtain possibly-cached cvec for char class; the last argument is
 	 * cclasscvec()'s "cases" flag, so case counterparts are requested
 	 * exactly when the regex was compiled with REG_ICASE
 	 */
 	NOTE(REG_ULOCALE);
 	cv = cclasscvec(v, cls, (v->cflags & REG_ICASE));
 	/* NOTE(review): NOERR() presumably returns here on error -- confirm */
 	NOERR();
 	/* build the arcs; this may cause color splitting */
 	subcolorcvec(v, cv, lp, rp);
 }
 /*
 * charclasscomplement - generate arcs for a complemented character class
 *
 * v is the compile context and cls selects the class to be complemented.
 * lp and rp are passed to colorcomplement() as the "from" and "to" states
 * of the output arcs.
 *
 * This is used for both atoms (\W and sibling escapes) and for elements
 * of bracket expressions.  In bracket expressions, it is the caller's
 * responsibility that there not be any open subcolors when this is called.
 *
 * Unlike charclass(), this routine calls okcolors() itself, because the
 * class's arc set must be settled before it can be complemented.
 */
 static void
 charclasscomplement(struct vars *v,
 					enum char_classes cls,
 					struct state *lp,
 					struct state *rp)
 {
 	struct state *cstate;
 	struct cvec *cv;
 	/* make dummy state to hang temporary arcs on */
 	cstate = newstate(v->nfa);
 	/*
 	 * NOTE(review): NOERR() presumably returns on error; the returns below
 	 * would then skip dropstate(), leaving cstate for whoever tears down
 	 * the NFA after a failed compile -- confirm.
 	 */
 	NOERR();
 	/* obtain possibly-cached cvec for char class */
 	NOTE(REG_ULOCALE);
 	cv = cclasscvec(v, cls, (v->cflags & REG_ICASE));
 	NOERR();
 	/*
 	 * build arcs for char class; this may cause color splitting.  The arcs
 	 * loop from cstate back to itself: they only record which colors belong
 	 * to the class, for colorcomplement() to inspect below.
 	 */
 	subcolorcvec(v, cv, cstate, cstate);
 	/*
 	 * in NLSTOP mode, ensure newline is not part of the result set: listing
 	 * the newline color among cstate's arcs means the complement step will
 	 * not emit an arc for it
 	 */
 	if (v->cflags & REG_NLSTOP)
 		newarc(v->nfa, PLAIN, v->nlcolor, cstate, cstate);
 	NOERR();
 	/* clean up any subcolors in the arc set */
 	okcolors(v->nfa, v->cm);
 	NOERR();
 	/*
 	 * now build output arcs for the complement of the char class: one arc
 	 * from lp to rp for each color that has no arc out of cstate
 	 */
 	colorcomplement(v->nfa, v->cm, PLAIN, cstate, lp, rp);
 	NOERR();
 	/* clean up dummy state */
 	dropstate(v->nfa, cstate);
 }
 /*
 * scannum - scan a number
 */
@ -1456,6 +1543,7 @@ repeat(struct vars *v,
 /*
 * bracket - handle non-complemented bracket expression
 *
 * Also called from cbracket for complemented bracket expressions.
 */
 static void
@ -1463,16 +1551,52 @@ bracket(struct vars *v,
 		struct state *lp,
 		struct state *rp)
 {
 	/*
 	 * We can't process complemented char classes (e.g. \W) immediately while
 	 * scanning the bracket expression, else color bookkeeping gets confused.
 	 * Instead, remember whether we saw any in have_cclassc[], and process
 	 * them at the end.
 	 */
 	bool		have_cclassc[NUM_CCLASSES];
 	bool		any_cclassc;
 	int			i;
 	memset(have_cclassc, false, sizeof(have_cclassc));
 	assert(SEE('['));
 	NEXT();
 	while (!SEE(']') && !SEE(EOS))
-		brackpart(v, lp, rp);
+		brackpart(v, lp, rp, have_cclassc);
 	assert(SEE(']') || ISERR());
 	/* close up open subcolors from the positive bracket elements */
 	okcolors(v->nfa, v->cm);
 	NOERR();
 	/* now handle any complemented elements */
 	any_cclassc = false;
 	for (i = 0; i < NUM_CCLASSES; i++)
 	{
 		if (have_cclassc[i])
 		{
 			charclasscomplement(v, (enum char_classes) i, lp, rp);
 			NOERR();
 			any_cclassc = true;
 		}
 	}
 	/*
 	 * If we had any complemented elements, see if we can optimize the bracket
 	 * into a rainbow.  Since a complemented element is the only way a WHITE
 	 * arc could get into the result, there's no point in checking otherwise.
 	 */
 	if (any_cclassc)
 		optimizebracket(v, lp, rp);
 }
 /*
 * cbracket - handle complemented bracket expression
 *
 * We do it by calling bracket() with dummy endpoints, and then complementing
 * the result.  The alternative would be to invoke rainbow(), and then delete
 * arcs as the b.e. is seen... but that gets messy, and is really quite
@ -1496,7 +1620,9 @@ cbracket(struct vars *v,
 	/*
 	 * Easy part of complementing, and all there is to do since the MCCE code
-	 * was removed.
+	 * was removed.  Note that the result of colorcomplement() cannot be a
 	 * rainbow, since we don't allow empty brackets; so there's no point in
 	 * calling optimizebracket() again.
 	 */
 	colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
 	NOERR();
@ -1511,14 +1637,15 @@ cbracket(struct vars *v,
 static void
 brackpart(struct vars *v,
 		  struct state *lp,
-		  struct state *rp)
+		  struct state *rp,
 		  bool *have_cclassc)
 {
 	chr			startc;
 	chr			endc;
 	struct cvec *cv;
 	enum char_classes cls;
 	const chr  *startp;
 	const chr  *endp;
 	chr			c[1];
 	/* parse something, get rid of special cases, take shortcuts */
 	switch (v->nexttype)
@ -1528,15 +1655,14 @@ brackpart(struct vars *v,
 			return;
 			break;
 		case PLAIN:
-			c[0] = v->nextvalue;
+			startc = v->nextvalue;
 			NEXT();
 			/* shortcut for ordinary chr (not range) */
 			if (!SEE(RANGE))
 			{
-				onechr(v, c[0], lp, rp);
+				onechr(v, startc, lp, rp);
 				return;
 			}
 			startc = element(v, c, c + 1);
 			NOERR();
 			break;
 		case COLLEL:
@ -1564,9 +1690,20 @@ brackpart(struct vars *v,
 			endp = scanplain(v);
 			INSIST(startp < endp, REG_ECTYPE);
 			NOERR();
-			cv = cclass(v, startp, endp, (v->cflags & REG_ICASE));
+			cls = lookupcclass(v, startp, endp);
 			NOERR();
-			subcolorcvec(v, cv, lp, rp);
+			charclass(v, cls, lp, rp);
 			return;
 			break;
 		case CCLASSS:
 			charclass(v, (enum char_classes) v->nextvalue, lp, rp);
 			NEXT();
 			return;
 			break;
 		case CCLASSC:
 			/* we cannot call charclasscomplement() immediately */
 			have_cclassc[v->nextvalue] = true;
 			NEXT();
 			return;
 			break;
 		default:
@ -1582,9 +1719,8 @@ brackpart(struct vars *v,
 		{
 			case PLAIN:
 			case RANGE:
-				c[0] = v->nextvalue;
+				endc = v->nextvalue;
 				NEXT();
 				endc = element(v, c, c + 1);
 				NOERR();
 				break;
 			case COLLEL:
@ -1618,7 +1754,7 @@ brackpart(struct vars *v,
 /*
 * scanplain - scan PLAIN contents of [. etc.
 *
- * Certain bits of trickery in lex.c know that this code does not try
+ * Certain bits of trickery in regc_lex.c know that this code does not try
 * to look past the final bracket of the [. etc.
 */
 static const chr *				/* just after end of sequence */
@ -1664,39 +1800,98 @@ onechr(struct vars *v,
 	subcolorcvec(v, allcases(v, c), lp, rp);
 }
 /*
 * optimizebracket - collapse a bracket expression into RAINBOW if possible
 *
 * When the arcs built for a bracket expression mention every live,
 * non-pseudo color (as happens for "[\s\S]", a spelling of "." that is
 * popular in other regex flavors), the whole bundle of arcs from lp to rp
 * is replaced by one RAINBOW arc.  Otherwise the arcs are left untouched.
 *
 * v:  compilation state; its color map (v->cm) and NFA (v->nfa) are used
 * lp: state whose out-arcs represent the bracket expression
 * rp: state that all of those arcs are expected to lead to
 */
 static void
 optimizebracket(struct vars *v,
 				struct state *lp,
 				struct state *rp)
 {
 	struct colordesc *limit = CDEND(v->cm);
 	struct colordesc *desc;
 	struct arc *outarc;
 	bool		allcovered = true;

 	/*
 	 * Pass 1: walk lp's out-arc chain and flag each referenced color with
 	 * the transient COLMARK bit.  Every arc is expected to be a PLAIN arc
 	 * of a real (non-RAINBOW, in-use, non-pseudo) color ending at rp; the
 	 * assertions verify that.
 	 */
 	outarc = lp->outs;
 	while (outarc != NULL)
 	{
 		assert(outarc->type == PLAIN);
 		assert(outarc->co >= 0);	/* negative would mean RAINBOW */
 		assert(outarc->to == rp);
 		desc = &v->cm->cd[outarc->co];
 		assert(!UNUSEDCOLOR(desc) && !(desc->flags & PSEUDO));
 		desc->flags |= COLMARK;
 		outarc = outarc->outchain;
 	}

 	/*
 	 * Pass 2: visit every color descriptor.  Marked ones just get their
 	 * mark removed; an unmarked color that is in use and not a pseudocolor
 	 * means the arcs do not cover everything.  We must not stop early,
 	 * because all the COLMARK bits have to be cleared before returning.
 	 */
 	for (desc = v->cm->cd; desc < limit; desc++)
 	{
 		if (desc->flags & COLMARK)
 		{
 			desc->flags &= ~COLMARK;
 			continue;
 		}
 		if (UNUSEDCOLOR(desc) || (desc->flags & PSEUDO))
 			continue;
 		allcovered = false;
 	}

 	/* Some live color has no arc: leave the arc set as it is */
 	if (!allcovered)
 		return;

 	/* Discard every out-arc of lp, then install the single RAINBOW arc */
 	for (outarc = lp->outs; outarc != NULL; outarc = lp->outs)
 		freearc(v->nfa, outarc);
 	newarc(v->nfa, PLAIN, RAINBOW, lp, rp);
 }
 /*
 * wordchrs - set up word-chr list for word-boundary stuff, if needed
 *
- * The list is kept as a bunch of arcs between two dummy states; it's
+ * The list is kept as a bunch of circular arcs on an otherwise-unused state.
- * disposed of by the unreachable-states sweep in NFA optimization.
+ *
- * Does NEXT().  Must not be called from any unusual lexical context.
+ * Note that this must not be called while we have any open subcolors,
- * This should be reconciled with the \w etc. handling in lex.c, and
+ * else construction of the list would confuse color bookkeeping.
- * should be cleaned up to reduce dependencies on input scanning.
+ * Hence, we can't currently apply a similar optimization in
 * charclass[complement](), as those need to be usable within bracket
 * expressions.
 */
 static void
 wordchrs(struct vars *v)
 {
-	struct state *left;
+	struct state *cstate;
-	struct state *right;
+	struct cvec *cv;
 	if (v->wordchrs != NULL)
-	{
+		return;					/* done already */
 		NEXT();					/* for consistency */
 		return;
 	}
-	left = newstate(v->nfa);
+	/* make dummy state to hang the cache arcs on */
-	right = newstate(v->nfa);
+	cstate = newstate(v->nfa);
 	NOERR();
-	/* fine point:	implemented with [::], and lexer will set REG_ULOCALE */
+
-	lexword(v);
+	/* obtain possibly-cached cvec for \w characters */
-	NEXT();
+	NOTE(REG_ULOCALE);
-	assert(v->savenow != NULL && SEE('['));
+	cv = cclasscvec(v, CC_WORD, (v->cflags & REG_ICASE));
 	bracket(v, left, right);
 	assert((v->savenow != NULL && SEE(']')) || ISERR());
 	NEXT();
 	NOERR();
-	v->wordchrs = left;
+
 	/* build the arcs; this may cause color splitting */
 	subcolorcvec(v, cv, cstate, cstate);
 	NOERR();
 	/* close new open subcolors to ensure the cache entry is self-contained */
 	okcolors(v->nfa, v->cm);
 	NOERR();
 	/* success! save the cache pointer */
 	v->wordchrs = cstate;
 }
 /*
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@ -127,6 +127,18 @@
 #define ISBSET(uv, sn)	((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS)))
 /*
 * known character classes
 *
 * The enumerators are numbered consecutively from zero, so a value of this
 * type can be used directly as an array index.
 */
 enum char_classes
 {
 	CC_ALNUM = 0,
 	CC_ALPHA,
 	CC_ASCII,
 	CC_BLANK,
 	CC_CNTRL,
 	CC_DIGIT,
 	CC_GRAPH,
 	CC_LOWER,
 	CC_PRINT,
 	CC_PUNCT,
 	CC_SPACE,
 	CC_UPPER,
 	CC_XDIGIT,
 	CC_WORD						/* the class used for \w */
 };
 /* number of enumerators in enum char_classes; update when adding a class */
 #define NUM_CCLASSES 14
 /*
 * As soon as possible, we map chrs into equivalence classes -- "colors" --
 * which are of much more manageable number.
@ -164,12 +176,14 @@ struct colordesc
 #define  NOSUB	 COLORLESS		/* value of "sub" when no open subcolor */
 	struct arc *arcs;			/* chain of all arcs of this color */
 	chr			firstchr;		/* simple char first assigned to this color */
-	int			flags;			/* bit values defined next */
+	int			flags;			/* bitmask of the following flags: */
 #define  FREECOL 01				/* currently free */
 #define  PSEUDO  02				/* pseudocolor, no real chars */
-#define  UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
+#define  COLMARK 04				/* temporary marker used in some functions */
 };
 #define  UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
 /*
 * The color map itself
 *
@ -199,8 +213,6 @@ struct colordesc
 * appear in increasing chr-value order.
 */
 #define NUM_CCLASSES 13			/* must match data in regc_locale.c */
 typedef struct colormaprange
 {
 	chr			cmin;			/* range represents cmin..cmax inclusive */
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@ -1970,6 +1970,256 @@ select * from test_regex('a[\w]b', 'axb', 'LPE');
 {axb}
 (2 rows)
 -- these should be invalid
 select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE');
 ERROR:  invalid regular expression: invalid character range
 select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE');
 ERROR:  invalid regular expression: invalid character range
 select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS');
 ERROR:  invalid regular expression: invalid character range
 select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS');
 ERROR:  invalid regular expression: invalid character range
 -- test complemented char classes within brackets
 select * from test_regex('[\D]', '0123456789abc*', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {a}
 (2 rows)
 select * from test_regex('[^\D]', 'abc0123456789*', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {0}
 (2 rows)
 select * from test_regex('[1\D7]', '0123456789abc*', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {1}
 (2 rows)
 select * from test_regex('[7\D1]', '0123456789abc*', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {1}
 (2 rows)
 select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {2}
 (2 rows)
 select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {2}
 (2 rows)
 select * from test_regex('\W', '0123456789abc_*', 'LP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {*}
 (2 rows)
 select * from test_regex('[\W]', '0123456789abc_*', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {*}
 (2 rows)
 select * from test_regex('[\s\S]*', '012  3456789abc_*', 'LNPE');
                       test_regex                       
 --------------------------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE,REG_UEMPTYMATCH}
 {"012  3456789abc_*"}
 (2 rows)
 -- check char classes' handling of newlines
 select * from test_regex('\s+', E'abc  \n  def', 'LP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {"                           +
   "}
 (2 rows)
 select * from test_regex('\s+', E'abc  \n  def', 'nLP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {"                           +
   "}
 (2 rows)
 select * from test_regex('[\s]+', E'abc  \n  def', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {"                                    +
   "}
 (2 rows)
 select * from test_regex('[\s]+', E'abc  \n  def', 'nLPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {"                                    +
   "}
 (2 rows)
 select * from test_regex('\S+', E'abc\ndef', 'LP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {abc}
 (2 rows)
 select * from test_regex('\S+', E'abc\ndef', 'nLP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {abc}
 (2 rows)
 select * from test_regex('[\S]+', E'abc\ndef', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {abc}
 (2 rows)
 select * from test_regex('[\S]+', E'abc\ndef', 'nLPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {abc}
 (2 rows)
 select * from test_regex('\d+', E'012\n345', 'LP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {012}
 (2 rows)
 select * from test_regex('\d+', E'012\n345', 'nLP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {012}
 (2 rows)
 select * from test_regex('[\d]+', E'012\n345', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {012}
 (2 rows)
 select * from test_regex('[\d]+', E'012\n345', 'nLPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {012}
 (2 rows)
 select * from test_regex('\D+', E'abc\ndef345', 'LP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {"abc                        +
 def"}
 (2 rows)
 select * from test_regex('\D+', E'abc\ndef345', 'nLP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {abc}
 (2 rows)
 select * from test_regex('[\D]+', E'abc\ndef345', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {"abc                                 +
 def"}
 (2 rows)
 select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {abc}
 (2 rows)
 select * from test_regex('\w+', E'abc_012\ndef', 'LP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {abc_012}
 (2 rows)
 select * from test_regex('\w+', E'abc_012\ndef', 'nLP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {abc_012}
 (2 rows)
 select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {abc_012}
 (2 rows)
 select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {abc_012}
 (2 rows)
 select * from test_regex('\W+', E'***\n@@@___', 'LP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {"***                        +
 @@@"}
 (2 rows)
 select * from test_regex('\W+', E'***\n@@@___', 'nLP');
          test_regex           
 -------------------------------
 {0,REG_UNONPOSIX,REG_ULOCALE}
 {***}
 (2 rows)
 select * from test_regex('[\W]+', E'***\n@@@___', 'LPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {"***                                 +
 @@@"}
 (2 rows)
 select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE');
               test_regex               
 ----------------------------------------
 {0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
 {***}
 (2 rows)
 -- doing 13 "escapes"
 -- expectError	13.1  &		"a\\"		EESCAPE
 select * from test_regex('a\', '', '');
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@ -597,6 +597,50 @@ select * from test_regex('a[\s]b', 'a b', 'LPE');
 -- expectMatch	12.18 LPE	{a[\w]b}	axb	axb
 select * from test_regex('a[\w]b', 'axb', 'LPE');
 -- these should be invalid
 select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE');
 select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE');
 select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS');
 select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS');
 -- test complemented char classes within brackets
 select * from test_regex('[\D]', '0123456789abc*', 'LPE');
 select * from test_regex('[^\D]', 'abc0123456789*', 'LPE');
 select * from test_regex('[1\D7]', '0123456789abc*', 'LPE');
 select * from test_regex('[7\D1]', '0123456789abc*', 'LPE');
 select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE');
 select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE');
 select * from test_regex('\W', '0123456789abc_*', 'LP');
 select * from test_regex('[\W]', '0123456789abc_*', 'LPE');
 select * from test_regex('[\s\S]*', '012  3456789abc_*', 'LNPE');
 -- check char classes' handling of newlines
 select * from test_regex('\s+', E'abc  \n  def', 'LP');
 select * from test_regex('\s+', E'abc  \n  def', 'nLP');
 select * from test_regex('[\s]+', E'abc  \n  def', 'LPE');
 select * from test_regex('[\s]+', E'abc  \n  def', 'nLPE');
 select * from test_regex('\S+', E'abc\ndef', 'LP');
 select * from test_regex('\S+', E'abc\ndef', 'nLP');
 select * from test_regex('[\S]+', E'abc\ndef', 'LPE');
 select * from test_regex('[\S]+', E'abc\ndef', 'nLPE');
 select * from test_regex('\d+', E'012\n345', 'LP');
 select * from test_regex('\d+', E'012\n345', 'nLP');
 select * from test_regex('[\d]+', E'012\n345', 'LPE');
 select * from test_regex('[\d]+', E'012\n345', 'nLPE');
 select * from test_regex('\D+', E'abc\ndef345', 'LP');
 select * from test_regex('\D+', E'abc\ndef345', 'nLP');
 select * from test_regex('[\D]+', E'abc\ndef345', 'LPE');
 select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE');
 select * from test_regex('\w+', E'abc_012\ndef', 'LP');
 select * from test_regex('\w+', E'abc_012\ndef', 'nLP');
 select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE');
 select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE');
 select * from test_regex('\W+', E'***\n@@@___', 'LP');
 select * from test_regex('\W+', E'***\n@@@___', 'nLP');
 select * from test_regex('[\W]+', E'***\n@@@___', 'LPE');
 select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE');
 -- doing 13 "escapes"
 -- expectError	13.1  &		"a\\"		EESCAPE