767 lines
15 KiB
C
767 lines
15 KiB
C
/*
 * regc_locale.c --
 *
 * This file contains locale-specific regexp routines.
 * This file is #included by regcomp.c.
 *
 * Copyright (c) 1998 by Scriptics Corporation.
 *
 * This software is copyrighted by the Regents of the University of
 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
 * Corporation and other parties.  The following terms apply to all files
 * associated with the software unless explicitly disclaimed in
 * individual files.
 *
 * The authors hereby grant permission to use, copy, modify, distribute,
 * and license this software and its documentation for any purpose, provided
 * that existing copyright notices are retained in all copies and that this
 * notice is included verbatim in any distributions. No written agreement,
 * license, or royalty fee is required for any of the authorized uses.
 * Modifications to this software may be copyrighted by their authors
 * and need not follow the licensing terms described here, provided that
 * the new terms are clearly indicated on the first page of each file where
 * they apply.
 *
 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
 * MODIFICATIONS.
 *
 * GOVERNMENT USE: If you are acquiring this software on behalf of the
 * U.S. government, the Government shall have only "Restricted Rights"
 * in the software and related documentation as defined in the Federal
 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
 * are acquiring the software on behalf of the Department of Defense, the
 * software shall be classified as "Commercial Computer Software" and the
 * Government shall have only "Restricted Rights" as defined in Clause
 * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
 * authors grant the U.S. Government and others acting in its behalf
 * permission to use and distribute the software in accordance with the
 * terms specified in this license.
 *
 * src/backend/regex/regc_locale.c
 */
|
|
|
|
/*
 * ASCII character-name table.
 *
 * Each row pairs a symbolic name with the character it stands for.  Several
 * characters have more than one name.  The table is scanned linearly by
 * element() and is terminated by a row whose name is NULL.
 */
static const struct cname
{
	const char *name;			/* symbolic name */
	const char	code;			/* character the name denotes */
}			cnames[] =
{
	/* control characters */
	{"NUL", '\0'},
	{"SOH", '\001'},
	{"STX", '\002'},
	{"ETX", '\003'},
	{"EOT", '\004'},
	{"ENQ", '\005'},
	{"ACK", '\006'},
	{"BEL", '\007'},
	{"alert", '\007'},
	{"BS", '\010'},
	{"backspace", '\b'},
	{"HT", '\011'},
	{"tab", '\t'},
	{"LF", '\012'},
	{"newline", '\n'},
	{"VT", '\013'},
	{"vertical-tab", '\v'},
	{"FF", '\014'},
	{"form-feed", '\f'},
	{"CR", '\015'},
	{"carriage-return", '\r'},
	{"SO", '\016'},
	{"SI", '\017'},
	{"DLE", '\020'},
	{"DC1", '\021'},
	{"DC2", '\022'},
	{"DC3", '\023'},
	{"DC4", '\024'},
	{"NAK", '\025'},
	{"SYN", '\026'},
	{"ETB", '\027'},
	{"CAN", '\030'},
	{"EM", '\031'},
	{"SUB", '\032'},
	{"ESC", '\033'},
	{"IS4", '\034'},
	{"FS", '\034'},
	{"IS3", '\035'},
	{"GS", '\035'},
	{"IS2", '\036'},
	{"RS", '\036'},
	{"IS1", '\037'},
	{"US", '\037'},

	/* space and punctuation */
	{"space", ' '},
	{"exclamation-mark", '!'},
	{"quotation-mark", '"'},
	{"number-sign", '#'},
	{"dollar-sign", '$'},
	{"percent-sign", '%'},
	{"ampersand", '&'},
	{"apostrophe", '\''},
	{"left-parenthesis", '('},
	{"right-parenthesis", ')'},
	{"asterisk", '*'},
	{"plus-sign", '+'},
	{"comma", ','},
	{"hyphen", '-'},
	{"hyphen-minus", '-'},
	{"period", '.'},
	{"full-stop", '.'},
	{"slash", '/'},
	{"solidus", '/'},

	/* digits */
	{"zero", '0'},
	{"one", '1'},
	{"two", '2'},
	{"three", '3'},
	{"four", '4'},
	{"five", '5'},
	{"six", '6'},
	{"seven", '7'},
	{"eight", '8'},
	{"nine", '9'},

	/* remaining punctuation */
	{"colon", ':'},
	{"semicolon", ';'},
	{"less-than-sign", '<'},
	{"equals-sign", '='},
	{"greater-than-sign", '>'},
	{"question-mark", '?'},
	{"commercial-at", '@'},
	{"left-square-bracket", '['},
	{"backslash", '\\'},
	{"reverse-solidus", '\\'},
	{"right-square-bracket", ']'},
	{"circumflex", '^'},
	{"circumflex-accent", '^'},
	{"underscore", '_'},
	{"low-line", '_'},
	{"grave-accent", '`'},
	{"left-brace", '{'},
	{"left-curly-bracket", '{'},
	{"vertical-line", '|'},
	{"right-brace", '}'},
	{"right-curly-bracket", '}'},
	{"tilde", '~'},
	{"DEL", '\177'},

	/* end-of-table sentinel */
	{NULL, 0}
};
|
|
|
|
/*
|
|
* The following arrays define the valid character class names.
|
|
*/
|
|
static const char *const classNames[NUM_CCLASSES + 1] = {
|
|
"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
|
|
"lower", "print", "punct", "space", "upper", "xdigit", NULL
|
|
};
|
|
|
|
/* Character class codes; values are positions in classNames[]. */
enum classes
{
	CC_ALNUM,
	CC_ALPHA,
	CC_ASCII,
	CC_BLANK,
	CC_CNTRL,
	CC_DIGIT,
	CC_GRAPH,
	CC_LOWER,
	CC_PRINT,
	CC_PUNCT,
	CC_SPACE,
	CC_UPPER,
	CC_XDIGIT
};
|
|
|
|
/*
 * We do not use the hard-wired Unicode classification tables that Tcl does.
 * This is because (a) we need to deal with other encodings besides Unicode,
 * and (b) we want to track the behavior of the libc locale routines as
 * closely as possible.  For example, it wouldn't be unreasonable for a
 * locale to not consider every Unicode letter as a letter.  So we build
 * character classification cvecs by asking libc, even for Unicode.
 */
|
|
|
|
|
|
/*
|
|
* element - map collating-element name to chr
|
|
*/
|
|
static chr
|
|
element(struct vars *v, /* context */
|
|
const chr *startp, /* points to start of name */
|
|
const chr *endp) /* points just past end of name */
|
|
{
|
|
const struct cname *cn;
|
|
size_t len;
|
|
|
|
/* generic: one-chr names stand for themselves */
|
|
assert(startp < endp);
|
|
len = endp - startp;
|
|
if (len == 1)
|
|
return *startp;
|
|
|
|
NOTE(REG_ULOCALE);
|
|
|
|
/* search table */
|
|
for (cn = cnames; cn->name != NULL; cn++)
|
|
{
|
|
if (strlen(cn->name) == len &&
|
|
pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
|
|
{
|
|
break; /* NOTE BREAK OUT */
|
|
}
|
|
}
|
|
if (cn->name != NULL)
|
|
return CHR(cn->code);
|
|
|
|
/* couldn't find it */
|
|
ERR(REG_ECOLLATE);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* range - supply cvec for a range, including legality check
|
|
*/
|
|
static struct cvec *
|
|
range(struct vars *v, /* context */
|
|
chr a, /* range start */
|
|
chr b, /* range end, might equal a */
|
|
int cases) /* case-independent? */
|
|
{
|
|
int nchrs;
|
|
struct cvec *cv;
|
|
chr c,
|
|
cc;
|
|
|
|
if (a != b && !before(a, b))
|
|
{
|
|
ERR(REG_ERANGE);
|
|
return NULL;
|
|
}
|
|
|
|
if (!cases)
|
|
{ /* easy version */
|
|
cv = getcvec(v, 0, 1);
|
|
NOERRN();
|
|
addrange(cv, a, b);
|
|
return cv;
|
|
}
|
|
|
|
/*
|
|
* When case-independent, it's hard to decide when cvec ranges are usable,
|
|
* so for now at least, we won't try. We use a range for the originally
|
|
* specified chrs and then add on any case-equivalents that are outside
|
|
* that range as individual chrs.
|
|
*
|
|
* To ensure sane behavior if someone specifies a very large range, limit
|
|
* the allocation size to 100000 chrs (arbitrary) and check for overrun
|
|
* inside the loop below.
|
|
*/
|
|
nchrs = b - a + 1;
|
|
if (nchrs <= 0 || nchrs > 100000)
|
|
nchrs = 100000;
|
|
|
|
cv = getcvec(v, nchrs, 1);
|
|
NOERRN();
|
|
addrange(cv, a, b);
|
|
|
|
for (c = a; c <= b; c++)
|
|
{
|
|
cc = pg_wc_tolower(c);
|
|
if (cc != c &&
|
|
(before(cc, a) || before(b, cc)))
|
|
{
|
|
if (cv->nchrs >= cv->chrspace)
|
|
{
|
|
ERR(REG_ETOOBIG);
|
|
return NULL;
|
|
}
|
|
addchr(cv, cc);
|
|
}
|
|
cc = pg_wc_toupper(c);
|
|
if (cc != c &&
|
|
(before(cc, a) || before(b, cc)))
|
|
{
|
|
if (cv->nchrs >= cv->chrspace)
|
|
{
|
|
ERR(REG_ETOOBIG);
|
|
return NULL;
|
|
}
|
|
addchr(cv, cc);
|
|
}
|
|
if (CANCEL_REQUESTED(v->re))
|
|
{
|
|
ERR(REG_CANCEL);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
return cv;
|
|
}
|
|
|
|
/*
|
|
* before - is chr x before chr y, for purposes of range legality?
|
|
*/
|
|
static int /* predicate */
|
|
before(chr x, chr y)
|
|
{
|
|
if (x < y)
|
|
return 1;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* eclass - supply cvec for an equivalence class
|
|
* Must include case counterparts on request.
|
|
*/
|
|
static struct cvec *
|
|
eclass(struct vars *v, /* context */
|
|
chr c, /* Collating element representing the
|
|
* equivalence class. */
|
|
int cases) /* all cases? */
|
|
{
|
|
struct cvec *cv;
|
|
|
|
/* crude fake equivalence class for testing */
|
|
if ((v->cflags & REG_FAKE) && c == 'x')
|
|
{
|
|
cv = getcvec(v, 4, 0);
|
|
addchr(cv, CHR('x'));
|
|
addchr(cv, CHR('y'));
|
|
if (cases)
|
|
{
|
|
addchr(cv, CHR('X'));
|
|
addchr(cv, CHR('Y'));
|
|
}
|
|
return cv;
|
|
}
|
|
|
|
/* otherwise, none */
|
|
if (cases)
|
|
return allcases(v, c);
|
|
cv = getcvec(v, 1, 0);
|
|
assert(cv != NULL);
|
|
addchr(cv, c);
|
|
return cv;
|
|
}
|
|
|
|
/*
|
|
* cclass - supply cvec for a character class
|
|
*
|
|
* Must include case counterparts if "cases" is true.
|
|
*
|
|
* The returned cvec might be either a transient cvec gotten from getcvec(),
|
|
* or a permanently cached one from pg_ctype_get_cache(). This is okay
|
|
* because callers are not supposed to explicitly free the result either way.
|
|
*/
|
|
static struct cvec *
|
|
cclass(struct vars *v, /* context */
|
|
const chr *startp, /* where the name starts */
|
|
const chr *endp, /* just past the end of the name */
|
|
int cases) /* case-independent? */
|
|
{
|
|
size_t len;
|
|
struct cvec *cv = NULL;
|
|
const char *const *namePtr;
|
|
int i,
|
|
index;
|
|
|
|
/*
|
|
* Map the name to the corresponding enumerated value.
|
|
*/
|
|
len = endp - startp;
|
|
index = -1;
|
|
for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
|
|
{
|
|
if (strlen(*namePtr) == len &&
|
|
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
|
|
{
|
|
index = i;
|
|
break;
|
|
}
|
|
}
|
|
if (index == -1)
|
|
{
|
|
ERR(REG_ECTYPE);
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* Remap lower and upper to alpha if the match is case insensitive.
|
|
*/
|
|
|
|
if (cases &&
|
|
((enum classes) index == CC_LOWER ||
|
|
(enum classes) index == CC_UPPER))
|
|
index = (int) CC_ALPHA;
|
|
|
|
/*
|
|
* Now compute the character class contents. For classes that are based
|
|
* on the behavior of a <wctype.h> or <ctype.h> function, we use
|
|
* pg_ctype_get_cache so that we can cache the results. Other classes
|
|
* have definitions that are hard-wired here, and for those we just
|
|
* construct a transient cvec on the fly.
|
|
*
|
|
* NB: keep this code in sync with cclass_column_index(), below.
|
|
*/
|
|
|
|
switch ((enum classes) index)
|
|
{
|
|
case CC_PRINT:
|
|
cv = pg_ctype_get_cache(pg_wc_isprint, index);
|
|
break;
|
|
case CC_ALNUM:
|
|
cv = pg_ctype_get_cache(pg_wc_isalnum, index);
|
|
break;
|
|
case CC_ALPHA:
|
|
cv = pg_ctype_get_cache(pg_wc_isalpha, index);
|
|
break;
|
|
case CC_ASCII:
|
|
/* hard-wired meaning */
|
|
cv = getcvec(v, 0, 1);
|
|
if (cv)
|
|
addrange(cv, 0, 0x7f);
|
|
break;
|
|
case CC_BLANK:
|
|
/* hard-wired meaning */
|
|
cv = getcvec(v, 2, 0);
|
|
addchr(cv, '\t');
|
|
addchr(cv, ' ');
|
|
break;
|
|
case CC_CNTRL:
|
|
/* hard-wired meaning */
|
|
cv = getcvec(v, 0, 2);
|
|
addrange(cv, 0x0, 0x1f);
|
|
addrange(cv, 0x7f, 0x9f);
|
|
break;
|
|
case CC_DIGIT:
|
|
cv = pg_ctype_get_cache(pg_wc_isdigit, index);
|
|
break;
|
|
case CC_PUNCT:
|
|
cv = pg_ctype_get_cache(pg_wc_ispunct, index);
|
|
break;
|
|
case CC_XDIGIT:
|
|
|
|
/*
|
|
* It's not clear how to define this in non-western locales, and
|
|
* even less clear that there's any particular use in trying. So
|
|
* just hard-wire the meaning.
|
|
*/
|
|
cv = getcvec(v, 0, 3);
|
|
if (cv)
|
|
{
|
|
addrange(cv, '0', '9');
|
|
addrange(cv, 'a', 'f');
|
|
addrange(cv, 'A', 'F');
|
|
}
|
|
break;
|
|
case CC_SPACE:
|
|
cv = pg_ctype_get_cache(pg_wc_isspace, index);
|
|
break;
|
|
case CC_LOWER:
|
|
cv = pg_ctype_get_cache(pg_wc_islower, index);
|
|
break;
|
|
case CC_UPPER:
|
|
cv = pg_ctype_get_cache(pg_wc_isupper, index);
|
|
break;
|
|
case CC_GRAPH:
|
|
cv = pg_ctype_get_cache(pg_wc_isgraph, index);
|
|
break;
|
|
}
|
|
|
|
/* If cv is NULL now, the reason must be "out of memory" */
|
|
if (cv == NULL)
|
|
ERR(REG_ESPACE);
|
|
return cv;
|
|
}
|
|
|
|
/*
|
|
* cclass_column_index - get appropriate high colormap column index for chr
|
|
*/
|
|
static int
|
|
cclass_column_index(struct colormap *cm, chr c)
|
|
{
|
|
int colnum = 0;
|
|
|
|
/* Shouldn't go through all these pushups for simple chrs */
|
|
assert(c > MAX_SIMPLE_CHR);
|
|
|
|
/*
|
|
* Note: we should not see requests to consider cclasses that are not
|
|
* treated as locale-specific by cclass(), above.
|
|
*/
|
|
if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
|
|
colnum |= cm->classbits[CC_PRINT];
|
|
if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c))
|
|
colnum |= cm->classbits[CC_ALNUM];
|
|
if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
|
|
colnum |= cm->classbits[CC_ALPHA];
|
|
assert(cm->classbits[CC_ASCII] == 0);
|
|
assert(cm->classbits[CC_BLANK] == 0);
|
|
assert(cm->classbits[CC_CNTRL] == 0);
|
|
if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c))
|
|
colnum |= cm->classbits[CC_DIGIT];
|
|
if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c))
|
|
colnum |= cm->classbits[CC_PUNCT];
|
|
assert(cm->classbits[CC_XDIGIT] == 0);
|
|
if (cm->classbits[CC_SPACE] && pg_wc_isspace(c))
|
|
colnum |= cm->classbits[CC_SPACE];
|
|
if (cm->classbits[CC_LOWER] && pg_wc_islower(c))
|
|
colnum |= cm->classbits[CC_LOWER];
|
|
if (cm->classbits[CC_UPPER] && pg_wc_isupper(c))
|
|
colnum |= cm->classbits[CC_UPPER];
|
|
if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c))
|
|
colnum |= cm->classbits[CC_GRAPH];
|
|
|
|
return colnum;
|
|
}
|
|
|
|
/*
|
|
* allcases - supply cvec for all case counterparts of a chr (including itself)
|
|
*
|
|
* This is a shortcut, preferably an efficient one, for simple characters;
|
|
* messy cases are done via range().
|
|
*/
|
|
static struct cvec *
|
|
allcases(struct vars *v, /* context */
|
|
chr c) /* character to get case equivs of */
|
|
{
|
|
struct cvec *cv;
|
|
chr lc,
|
|
uc;
|
|
|
|
lc = pg_wc_tolower(c);
|
|
uc = pg_wc_toupper(c);
|
|
|
|
cv = getcvec(v, 2, 0);
|
|
addchr(cv, lc);
|
|
if (lc != uc)
|
|
addchr(cv, uc);
|
|
return cv;
|
|
}
|
|
|
|
/*
|
|
* cmp - chr-substring compare
|
|
*
|
|
* Backrefs need this. It should preferably be efficient.
|
|
* Note that it does not need to report anything except equal/unequal.
|
|
* Note also that the length is exact, and the comparison should not
|
|
* stop at embedded NULs!
|
|
*/
|
|
static int /* 0 for equal, nonzero for unequal */
|
|
cmp(const chr *x, const chr *y, /* strings to compare */
|
|
size_t len) /* exact length of comparison */
|
|
{
|
|
return memcmp(VS(x), VS(y), len * sizeof(chr));
|
|
}
|
|
|
|
/*
|
|
* casecmp - case-independent chr-substring compare
|
|
*
|
|
* REG_ICASE backrefs need this. It should preferably be efficient.
|
|
* Note that it does not need to report anything except equal/unequal.
|
|
* Note also that the length is exact, and the comparison should not
|
|
* stop at embedded NULs!
|
|
*/
|
|
static int /* 0 for equal, nonzero for unequal */
|
|
casecmp(const chr *x, const chr *y, /* strings to compare */
|
|
size_t len) /* exact length of comparison */
|
|
{
|
|
for (; len > 0; len--, x++, y++)
|
|
{
|
|
if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|