/* * regc_locale.c -- * * This file contains locale-specific regexp routines. * This file is #included by regcomp.c. * * Copyright (c) 1998 by Scriptics Corporation. * * This software is copyrighted by the Regents of the University of * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState * Corporation and other parties. The following terms apply to all files * associated with the software unless explicitly disclaimed in * individual files. * * The authors hereby grant permission to use, copy, modify, distribute, * and license this software and its documentation for any purpose, provided * that existing copyright notices are retained in all copies and that this * notice is included verbatim in any distributions. No written agreement, * license, or royalty fee is required for any of the authorized uses. * Modifications to this software may be copyrighted by their authors * and need not follow the licensing terms described here, provided that * the new terms are clearly indicated on the first page of each file where * they apply. * * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR * MODIFICATIONS. * * GOVERNMENT USE: If you are acquiring this software on behalf of the * U.S. government, the Government shall have only "Restricted Rights" * in the software and related documentation as defined in the Federal * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you * are acquiring the software on behalf of the Department of Defense, the * software shall be classified as "Commercial Computer Software" and the * Government shall have only "Restricted Rights" as defined in Clause * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the * authors grant the U.S. Government and others acting in its behalf * permission to use and distribute the software in accordance with the * terms specified in this license. * * src/backend/regex/regc_locale.c */ /* ASCII character-name table */ static const struct cname { const char *name; const char code; } cnames[] = { { "NUL", '\0' }, { "SOH", '\001' }, { "STX", '\002' }, { "ETX", '\003' }, { "EOT", '\004' }, { "ENQ", '\005' }, { "ACK", '\006' }, { "BEL", '\007' }, { "alert", '\007' }, { "BS", '\010' }, { "backspace", '\b' }, { "HT", '\011' }, { "tab", '\t' }, { "LF", '\012' }, { "newline", '\n' }, { "VT", '\013' }, { "vertical-tab", '\v' }, { "FF", '\014' }, { "form-feed", '\f' }, { "CR", '\015' }, { "carriage-return", '\r' }, { "SO", '\016' }, { "SI", '\017' }, { "DLE", '\020' }, { "DC1", '\021' }, { "DC2", '\022' }, { "DC3", '\023' }, { "DC4", '\024' }, { "NAK", '\025' }, { "SYN", '\026' }, { "ETB", '\027' }, { "CAN", '\030' }, { "EM", '\031' }, { "SUB", '\032' }, { "ESC", '\033' }, { "IS4", '\034' }, { "FS", '\034' }, { "IS3", '\035' }, { "GS", '\035' }, { "IS2", '\036' }, { "RS", '\036' }, { "IS1", '\037' }, { "US", '\037' }, { "space", ' ' }, { "exclamation-mark", '!' }, { "quotation-mark", '"' }, { "number-sign", '#' }, { "dollar-sign", '$' }, { "percent-sign", '%' }, { "ampersand", '&' }, { "apostrophe", '\'' }, { "left-parenthesis", '(' }, { "right-parenthesis", ')' }, { "asterisk", '*' }, { "plus-sign", '+' }, { "comma", ',' }, { "hyphen", '-' }, { "hyphen-minus", '-' }, { "period", '.' }, { "full-stop", '.' }, { "slash", '/' }, { "solidus", '/' }, { "zero", '0' }, { "one", '1' }, { "two", '2' }, { "three", '3' }, { "four", '4' }, { "five", '5' }, { "six", '6' }, { "seven", '7' }, { "eight", '8' }, { "nine", '9' }, { "colon", ':' }, { "semicolon", ';' }, { "less-than-sign", '<' }, { "equals-sign", '=' }, { "greater-than-sign", '>' }, { "question-mark", '?' }, { "commercial-at", '@' }, { "left-square-bracket", '[' }, { "backslash", '\\' }, { "reverse-solidus", '\\' }, { "right-square-bracket", ']' }, { "circumflex", '^' }, { "circumflex-accent", '^' }, { "underscore", '_' }, { "low-line", '_' }, { "grave-accent", '`' }, { "left-brace", '{' }, { "left-curly-bracket", '{' }, { "vertical-line", '|' }, { "right-brace", '}' }, { "right-curly-bracket", '}' }, { "tilde", '~' }, { "DEL", '\177' }, { NULL, 0 } }; /* * The following array defines the valid character class names. * The entries must match enum char_classes in regguts.h. */ static const char *const classNames[NUM_CCLASSES + 1] = { "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", "lower", "print", "punct", "space", "upper", "xdigit", "word", NULL }; /* * We do not use the hard-wired Unicode classification tables that Tcl does. * This is because (a) we need to deal with other encodings besides Unicode, * and (b) we want to track the behavior of the libc locale routines as * closely as possible. For example, it wouldn't be unreasonable for a * locale to not consider every Unicode letter as a letter. So we build * character classification cvecs by asking libc, even for Unicode. */ /* * element - map collating-element name to chr */ static chr element(struct vars *v, /* context */ const chr *startp, /* points to start of name */ const chr *endp) /* points just past end of name */ { const struct cname *cn; size_t len; /* generic: one-chr names stand for themselves */ assert(startp < endp); len = endp - startp; if (len == 1) return *startp; NOTE(REG_ULOCALE); /* search table */ for (cn = cnames; cn->name != NULL; cn++) { if (strlen(cn->name) == len && pg_char_and_wchar_strncmp(cn->name, startp, len) == 0) { break; /* NOTE BREAK OUT */ } } if (cn->name != NULL) return CHR(cn->code); /* couldn't find it */ ERR(REG_ECOLLATE); return 0; } /* * range - supply cvec for a range, including legality check */ static struct cvec * range(struct vars *v, /* context */ chr a, /* range start */ chr b, /* range end, might equal a */ int cases) /* case-independent? */ { int nchrs; struct cvec *cv; chr c, cc; if (a != b && !before(a, b)) { ERR(REG_ERANGE); return NULL; } if (!cases) { /* easy version */ cv = getcvec(v, 0, 1); NOERRN(); addrange(cv, a, b); return cv; } /* * When case-independent, it's hard to decide when cvec ranges are usable, * so for now at least, we won't try. We use a range for the originally * specified chrs and then add on any case-equivalents that are outside * that range as individual chrs. * * To ensure sane behavior if someone specifies a very large range, limit * the allocation size to 100000 chrs (arbitrary) and check for overrun * inside the loop below. */ nchrs = b - a + 1; if (nchrs <= 0 || nchrs > 100000) nchrs = 100000; cv = getcvec(v, nchrs, 1); NOERRN(); addrange(cv, a, b); for (c = a; c <= b; c++) { cc = pg_wc_tolower(c); if (cc != c && (before(cc, a) || before(b, cc))) { if (cv->nchrs >= cv->chrspace) { ERR(REG_ETOOBIG); return NULL; } addchr(cv, cc); } cc = pg_wc_toupper(c); if (cc != c && (before(cc, a) || before(b, cc))) { if (cv->nchrs >= cv->chrspace) { ERR(REG_ETOOBIG); return NULL; } addchr(cv, cc); } INTERRUPT(v->re); } return cv; } /* * before - is chr x before chr y, for purposes of range legality? */ static int /* predicate */ before(chr x, chr y) { if (x < y) return 1; return 0; } /* * eclass - supply cvec for an equivalence class * Must include case counterparts on request. */ static struct cvec * eclass(struct vars *v, /* context */ chr c, /* Collating element representing the * equivalence class. */ int cases) /* all cases? */ { struct cvec *cv; /* crude fake equivalence class for testing */ if ((v->cflags & REG_FAKE) && c == 'x') { cv = getcvec(v, 4, 0); addchr(cv, CHR('x')); addchr(cv, CHR('y')); if (cases) { addchr(cv, CHR('X')); addchr(cv, CHR('Y')); } return cv; } /* otherwise, none */ if (cases) return allcases(v, c); cv = getcvec(v, 1, 0); assert(cv != NULL); addchr(cv, c); return cv; } /* * lookupcclass - lookup a character class identified by name * * On failure, sets an error code in *v; the result is then garbage. */ static enum char_classes lookupcclass(struct vars *v, /* context (for returning errors) */ const chr *startp, /* where the name starts */ const chr *endp) /* just past the end of the name */ { size_t len; const char *const *namePtr; int i; /* * Map the name to the corresponding enumerated value. */ len = endp - startp; for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) { if (strlen(*namePtr) == len && pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) return (enum char_classes) i; } ERR(REG_ECTYPE); return (enum char_classes) 0; } /* * cclasscvec - supply cvec for a character class * * Must include case counterparts if "cases" is true. * * The returned cvec might be either a transient cvec gotten from getcvec(), * or a permanently cached one from pg_ctype_get_cache(). This is okay * because callers are not supposed to explicitly free the result either way. */ static struct cvec * cclasscvec(struct vars *v, /* context */ enum char_classes cclasscode, /* class to build a cvec for */ int cases) /* case-independent? */ { struct cvec *cv = NULL; /* * Remap lower and upper to alpha if the match is case insensitive. */ if (cases && (cclasscode == CC_LOWER || cclasscode == CC_UPPER)) cclasscode = CC_ALPHA; /* * Now compute the character class contents. For classes that are based * on the behavior of a or function, we use * pg_ctype_get_cache so that we can cache the results. Other classes * have definitions that are hard-wired here, and for those we just * construct a transient cvec on the fly. * * NB: keep this code in sync with cclass_column_index(), below. */ switch (cclasscode) { case CC_PRINT: cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode); break; case CC_ALNUM: cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode); break; case CC_ALPHA: cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode); break; case CC_WORD: cv = pg_ctype_get_cache(pg_wc_isword, cclasscode); break; case CC_ASCII: /* hard-wired meaning */ cv = getcvec(v, 0, 1); if (cv) addrange(cv, 0, 0x7f); break; case CC_BLANK: /* hard-wired meaning */ cv = getcvec(v, 2, 0); addchr(cv, '\t'); addchr(cv, ' '); break; case CC_CNTRL: /* hard-wired meaning */ cv = getcvec(v, 0, 2); addrange(cv, 0x0, 0x1f); addrange(cv, 0x7f, 0x9f); break; case CC_DIGIT: cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode); break; case CC_PUNCT: cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode); break; case CC_XDIGIT: /* * It's not clear how to define this in non-western locales, and * even less clear that there's any particular use in trying. So * just hard-wire the meaning. */ cv = getcvec(v, 0, 3); if (cv) { addrange(cv, '0', '9'); addrange(cv, 'a', 'f'); addrange(cv, 'A', 'F'); } break; case CC_SPACE: cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode); break; case CC_LOWER: cv = pg_ctype_get_cache(pg_wc_islower, cclasscode); break; case CC_UPPER: cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode); break; case CC_GRAPH: cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode); break; } /* If cv is NULL now, the reason must be "out of memory" */ if (cv == NULL) ERR(REG_ESPACE); return cv; } /* * cclass_column_index - get appropriate high colormap column index for chr */ static int cclass_column_index(struct colormap *cm, chr c) { int colnum = 0; /* Shouldn't go through all these pushups for simple chrs */ assert(c > MAX_SIMPLE_CHR); /* * Note: we should not see requests to consider cclasses that are not * treated as locale-specific by cclasscvec(), above. */ if (cm->classbits[CC_PRINT] && pg_wc_isprint(c)) colnum |= cm->classbits[CC_PRINT]; if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c)) colnum |= cm->classbits[CC_ALNUM]; if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c)) colnum |= cm->classbits[CC_ALPHA]; if (cm->classbits[CC_WORD] && pg_wc_isword(c)) colnum |= cm->classbits[CC_WORD]; assert(cm->classbits[CC_ASCII] == 0); assert(cm->classbits[CC_BLANK] == 0); assert(cm->classbits[CC_CNTRL] == 0); if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c)) colnum |= cm->classbits[CC_DIGIT]; if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c)) colnum |= cm->classbits[CC_PUNCT]; assert(cm->classbits[CC_XDIGIT] == 0); if (cm->classbits[CC_SPACE] && pg_wc_isspace(c)) colnum |= cm->classbits[CC_SPACE]; if (cm->classbits[CC_LOWER] && pg_wc_islower(c)) colnum |= cm->classbits[CC_LOWER]; if (cm->classbits[CC_UPPER] && pg_wc_isupper(c)) colnum |= cm->classbits[CC_UPPER]; if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c)) colnum |= cm->classbits[CC_GRAPH]; return colnum; } /* * allcases - supply cvec for all case counterparts of a chr (including itself) * * This is a shortcut, preferably an efficient one, for simple characters; * messy cases are done via range(). */ static struct cvec * allcases(struct vars *v, /* context */ chr c) /* character to get case equivs of */ { struct cvec *cv; chr lc, uc; lc = pg_wc_tolower(c); uc = pg_wc_toupper(c); cv = getcvec(v, 2, 0); addchr(cv, lc); if (lc != uc) addchr(cv, uc); return cv; } /* * cmp - chr-substring compare * * Backrefs need this. It should preferably be efficient. * Note that it does not need to report anything except equal/unequal. * Note also that the length is exact, and the comparison should not * stop at embedded NULs! */ static int /* 0 for equal, nonzero for unequal */ cmp(const chr *x, const chr *y, /* strings to compare */ size_t len) /* exact length of comparison */ { return memcmp(VS(x), VS(y), len * sizeof(chr)); } /* * casecmp - case-independent chr-substring compare * * REG_ICASE backrefs need this. It should preferably be efficient. * Note that it does not need to report anything except equal/unequal. * Note also that the length is exact, and the comparison should not * stop at embedded NULs! */ static int /* 0 for equal, nonzero for unequal */ casecmp(const chr *x, const chr *y, /* strings to compare */ size_t len) /* exact length of comparison */ { for (; len > 0; len--, x++, y++) { if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y))) return 1; } return 0; }