postgresql/src/backend/regex/regc_locale.c

616 lines
14 KiB
C
Raw Normal View History

/*
* regc_locale.c --
*
* This file contains locale-specific regexp routines.
* This file is #included by regcomp.c.
*
* Copyright (c) 1998 by Scriptics Corporation.
*
* This software is copyrighted by the Regents of the University of
* California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
* Corporation and other parties. The following terms apply to all files
* associated with the software unless explicitly disclaimed in
* individual files.
*
* The authors hereby grant permission to use, copy, modify, distribute,
* and license this software and its documentation for any purpose, provided
* that existing copyright notices are retained in all copies and that this
* notice is included verbatim in any distributions. No written agreement,
* license, or royalty fee is required for any of the authorized uses.
* Modifications to this software may be copyrighted by their authors
* and need not follow the licensing terms described here, provided that
* the new terms are clearly indicated on the first page of each file where
* they apply.
*
* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
* FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
* ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
* DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
* IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
* NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
* MODIFICATIONS.
*
* GOVERNMENT USE: If you are acquiring this software on behalf of the
* U.S. government, the Government shall have only "Restricted Rights"
* in the software and related documentation as defined in the Federal
* Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
* are acquiring the software on behalf of the Department of Defense, the
* software shall be classified as "Commercial Computer Software" and the
* Government shall have only "Restricted Rights" as defined in Clause
* 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
* authors grant the U.S. Government and others acting in its behalf
* permission to use and distribute the software in accordance with the
* terms specified in this license.
*
* $Header: /cvsroot/pgsql/src/backend/regex/regc_locale.c,v 1.1 2003/02/05 17:41:32 tgl Exp $
*/
/* ASCII character-name table */
static struct cname {
char *name;
char code;
} cnames[] = {
{"NUL", '\0'},
{"SOH", '\001'},
{"STX", '\002'},
{"ETX", '\003'},
{"EOT", '\004'},
{"ENQ", '\005'},
{"ACK", '\006'},
{"BEL", '\007'},
{"alert", '\007'},
{"BS", '\010'},
{"backspace", '\b'},
{"HT", '\011'},
{"tab", '\t'},
{"LF", '\012'},
{"newline", '\n'},
{"VT", '\013'},
{"vertical-tab", '\v'},
{"FF", '\014'},
{"form-feed", '\f'},
{"CR", '\015'},
{"carriage-return", '\r'},
{"SO", '\016'},
{"SI", '\017'},
{"DLE", '\020'},
{"DC1", '\021'},
{"DC2", '\022'},
{"DC3", '\023'},
{"DC4", '\024'},
{"NAK", '\025'},
{"SYN", '\026'},
{"ETB", '\027'},
{"CAN", '\030'},
{"EM", '\031'},
{"SUB", '\032'},
{"ESC", '\033'},
{"IS4", '\034'},
{"FS", '\034'},
{"IS3", '\035'},
{"GS", '\035'},
{"IS2", '\036'},
{"RS", '\036'},
{"IS1", '\037'},
{"US", '\037'},
{"space", ' '},
{"exclamation-mark",'!'},
{"quotation-mark", '"'},
{"number-sign", '#'},
{"dollar-sign", '$'},
{"percent-sign", '%'},
{"ampersand", '&'},
{"apostrophe", '\''},
{"left-parenthesis",'('},
{"right-parenthesis", ')'},
{"asterisk", '*'},
{"plus-sign", '+'},
{"comma", ','},
{"hyphen", '-'},
{"hyphen-minus", '-'},
{"period", '.'},
{"full-stop", '.'},
{"slash", '/'},
{"solidus", '/'},
{"zero", '0'},
{"one", '1'},
{"two", '2'},
{"three", '3'},
{"four", '4'},
{"five", '5'},
{"six", '6'},
{"seven", '7'},
{"eight", '8'},
{"nine", '9'},
{"colon", ':'},
{"semicolon", ';'},
{"less-than-sign", '<'},
{"equals-sign", '='},
{"greater-than-sign", '>'},
{"question-mark", '?'},
{"commercial-at", '@'},
{"left-square-bracket", '['},
{"backslash", '\\'},
{"reverse-solidus", '\\'},
{"right-square-bracket", ']'},
{"circumflex", '^'},
{"circumflex-accent", '^'},
{"underscore", '_'},
{"low-line", '_'},
{"grave-accent", '`'},
{"left-brace", '{'},
{"left-curly-bracket", '{'},
{"vertical-line", '|'},
{"right-brace", '}'},
{"right-curly-bracket", '}'},
{"tilde", '~'},
{"DEL", '\177'},
{NULL, 0}
};
/*
* some ctype functions with non-ascii-char guard
*/
static int
pg_isdigit(pg_wchar c)
{
return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
}
static int
pg_isalpha(pg_wchar c)
{
return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
}
static int
pg_isalnum(pg_wchar c)
{
return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
}
static int
pg_isupper(pg_wchar c)
{
return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
}
static int
pg_islower(pg_wchar c)
{
return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
}
static int
pg_isgraph(pg_wchar c)
{
return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
}
static int
pg_ispunct(pg_wchar c)
{
return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
}
static int
pg_isspace(pg_wchar c)
{
return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c));
}
static pg_wchar
pg_toupper(pg_wchar c)
{
if (c >= 0 && c <= UCHAR_MAX)
return toupper((unsigned char) c);
return c;
}
static pg_wchar
pg_tolower(pg_wchar c)
{
if (c >= 0 && c <= UCHAR_MAX)
return tolower((unsigned char) c);
return c;
}
/*
* nmcces - how many distinct MCCEs are there?
*/
static int
nmcces(struct vars *v)
{
/*
* No multi-character collating elements defined at the moment.
*/
return 0;
}
/*
* nleaders - how many chrs can be first chrs of MCCEs?
*/
static int
nleaders(struct vars *v)
{
return 0;
}
/*
* allmcces - return a cvec with all the MCCEs of the locale
*/
static struct cvec *
allmcces(struct vars *v, /* context */
struct cvec *cv) /* this is supposed to have enough room */
{
return clearcvec(cv);
}
/*
* element - map collating-element name to celt
*/
static celt
element(struct vars *v, /* context */
chr *startp, /* points to start of name */
chr *endp) /* points just past end of name */
{
struct cname *cn;
size_t len;
/* generic: one-chr names stand for themselves */
assert(startp < endp);
len = endp - startp;
if (len == 1) {
return *startp;
}
NOTE(REG_ULOCALE);
/* search table */
for (cn=cnames; cn->name!=NULL; cn++) {
if (strlen(cn->name)==len &&
pg_char_and_wchar_strncmp(cn->name, startp, len)==0) {
break; /* NOTE BREAK OUT */
}
}
if (cn->name != NULL) {
return CHR(cn->code);
}
/* couldn't find it */
ERR(REG_ECOLLATE);
return 0;
}
/*
* range - supply cvec for a range, including legality check
*/
static struct cvec *
range(struct vars *v, /* context */
celt a, /* range start */
celt b, /* range end, might equal a */
int cases) /* case-independent? */
{
int nchrs;
struct cvec *cv;
celt c, lc, uc;
if (a != b && !before(a, b)) {
ERR(REG_ERANGE);
return NULL;
}
if (!cases) { /* easy version */
cv = getcvec(v, 0, 1, 0);
NOERRN();
addrange(cv, a, b);
return cv;
}
/*
* When case-independent, it's hard to decide when cvec ranges are
* usable, so for now at least, we won't try. We allocate enough
* space for two case variants plus a little extra for the two
* title case variants.
*/
nchrs = (b - a + 1)*2 + 4;
cv = getcvec(v, nchrs, 0, 0);
NOERRN();
for (c=a; c<=b; c++) {
addchr(cv, c);
lc = pg_tolower((chr)c);
if (c != lc) {
addchr(cv, lc);
}
uc = pg_toupper((chr)c);
if (c != uc) {
addchr(cv, uc);
}
}
return cv;
}
/*
* before - is celt x before celt y, for purposes of range legality?
*/
static int /* predicate */
before(celt x, celt y)
{
/* trivial because no MCCEs */
if (x < y) {
return 1;
}
return 0;
}
/*
* eclass - supply cvec for an equivalence class
* Must include case counterparts on request.
*/
static struct cvec *
eclass(struct vars *v, /* context */
celt c, /* Collating element representing
* the equivalence class. */
int cases) /* all cases? */
{
struct cvec *cv;
/* crude fake equivalence class for testing */
if ((v->cflags&REG_FAKE) && c == 'x') {
cv = getcvec(v, 4, 0, 0);
addchr(cv, (chr)'x');
addchr(cv, (chr)'y');
if (cases) {
addchr(cv, (chr)'X');
addchr(cv, (chr)'Y');
}
return cv;
}
/* otherwise, none */
if (cases) {
return allcases(v, c);
}
cv = getcvec(v, 1, 0, 0);
assert(cv != NULL);
addchr(cv, (chr)c);
return cv;
}
/*
* cclass - supply cvec for a character class
*
* Must include case counterparts on request.
*/
static struct cvec *
cclass(struct vars *v, /* context */
chr *startp, /* where the name starts */
chr *endp, /* just past the end of the name */
int cases) /* case-independent? */
{
size_t len;
struct cvec *cv = NULL;
char **namePtr;
int i, index;
/*
* The following arrays define the valid character class names.
*/
static char *classNames[] = {
"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
"lower", "print", "punct", "space", "upper", "xdigit", NULL
};
enum classes {
CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
};
/*
* Map the name to the corresponding enumerated value.
*/
len = endp - startp;
index = -1;
for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) {
if (strlen(*namePtr) == len &&
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) {
index = i;
break;
}
}
if (index == -1) {
ERR(REG_ECTYPE);
return NULL;
}
/*
* Remap lower and upper to alpha if the match is case insensitive.
*/
if (cases &&
((enum classes) index == CC_LOWER ||
(enum classes) index == CC_UPPER))
index = (int) CC_ALPHA;
/*
* Now compute the character class contents.
*
* For the moment, assume that only char codes < 256 can be in these
* classes.
*/
switch((enum classes) index) {
case CC_PRINT:
case CC_ALNUM:
cv = getcvec(v, UCHAR_MAX, 1, 0);
if (cv) {
for (i=0 ; i<= UCHAR_MAX ; i++) {
if (pg_isalpha((chr) i))
addchr(cv, (chr) i);
}
addrange(cv, (chr) '0', (chr) '9');
}
break;
case CC_ALPHA:
cv = getcvec(v, UCHAR_MAX, 0, 0);
if (cv) {
for (i=0 ; i<= UCHAR_MAX ; i++) {
if (pg_isalpha((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_ASCII:
cv = getcvec(v, 0, 1, 0);
if (cv) {
addrange(cv, 0, 0x7f);
}
break;
case CC_BLANK:
cv = getcvec(v, 2, 0, 0);
addchr(cv, '\t');
addchr(cv, ' ');
break;
case CC_CNTRL:
cv = getcvec(v, 0, 2, 0);
addrange(cv, 0x0, 0x1f);
addrange(cv, 0x7f, 0x9f);
break;
case CC_DIGIT:
cv = getcvec(v, 0, 1, 0);
if (cv) {
addrange(cv, (chr) '0', (chr) '9');
}
break;
case CC_PUNCT:
cv = getcvec(v, UCHAR_MAX, 0, 0);
if (cv) {
for (i=0 ; i<= UCHAR_MAX ; i++) {
if (pg_ispunct((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_XDIGIT:
cv = getcvec(v, 0, 3, 0);
if (cv) {
addrange(cv, '0', '9');
addrange(cv, 'a', 'f');
addrange(cv, 'A', 'F');
}
break;
case CC_SPACE:
cv = getcvec(v, UCHAR_MAX, 0, 0);
if (cv) {
for (i=0 ; i<= UCHAR_MAX ; i++) {
if (pg_isspace((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_LOWER:
cv = getcvec(v, UCHAR_MAX, 0, 0);
if (cv) {
for (i=0 ; i<= UCHAR_MAX ; i++) {
if (pg_islower((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_UPPER:
cv = getcvec(v, UCHAR_MAX, 0, 0);
if (cv) {
for (i=0 ; i<= UCHAR_MAX ; i++) {
if (pg_isupper((chr) i))
addchr(cv, (chr) i);
}
}
break;
case CC_GRAPH:
cv = getcvec(v, UCHAR_MAX, 0, 0);
if (cv) {
for (i=0 ; i<= UCHAR_MAX ; i++) {
if (pg_isgraph((chr) i))
addchr(cv, (chr) i);
}
}
break;
}
if (cv == NULL) {
ERR(REG_ESPACE);
}
return cv;
}
/*
* allcases - supply cvec for all case counterparts of a chr (including itself)
*
* This is a shortcut, preferably an efficient one, for simple characters;
* messy cases are done via range().
*/
static struct cvec *
allcases(struct vars *v, /* context */
chr pc) /* character to get case equivs of */
{
struct cvec *cv;
chr c = (chr)pc;
chr lc, uc;
lc = pg_tolower((chr)c);
uc = pg_toupper((chr)c);
cv = getcvec(v, 2, 0, 0);
addchr(cv, lc);
if (lc != uc) {
addchr(cv, uc);
}
return cv;
}
/*
* cmp - chr-substring compare
*
* Backrefs need this. It should preferably be efficient.
* Note that it does not need to report anything except equal/unequal.
* Note also that the length is exact, and the comparison should not
* stop at embedded NULs!
*/
static int /* 0 for equal, nonzero for unequal */
cmp(const chr *x, const chr *y, /* strings to compare */
size_t len) /* exact length of comparison */
{
return memcmp(VS(x), VS(y), len*sizeof(chr));
}
/*
* casecmp - case-independent chr-substring compare
*
* REG_ICASE backrefs need this. It should preferably be efficient.
* Note that it does not need to report anything except equal/unequal.
* Note also that the length is exact, and the comparison should not
* stop at embedded NULs!
*/
static int /* 0 for equal, nonzero for unequal */
casecmp(const chr *x, const chr *y, /* strings to compare */
size_t len) /* exact length of comparison */
{
for (; len > 0; len--, x++, y++) {
if ((*x!=*y) && (pg_tolower(*x) != pg_tolower(*y))) {
return 1;
}
}
return 0;
}