postgresql/src/backend/regex/regc_locale.c

768 lines
15 KiB
C

/*
* regc_locale.c --
*
* This file contains locale-specific regexp routines.
* This file is #included by regcomp.c.
*
* Copyright (c) 1998 by Scriptics Corporation.
*
* This software is copyrighted by the Regents of the University of
* California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
* Corporation and other parties. The following terms apply to all files
* associated with the software unless explicitly disclaimed in
* individual files.
*
* The authors hereby grant permission to use, copy, modify, distribute,
* and license this software and its documentation for any purpose, provided
* that existing copyright notices are retained in all copies and that this
* notice is included verbatim in any distributions. No written agreement,
* license, or royalty fee is required for any of the authorized uses.
* Modifications to this software may be copyrighted by their authors
* and need not follow the licensing terms described here, provided that
* the new terms are clearly indicated on the first page of each file where
* they apply.
*
* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
* FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
* ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
* DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
* IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
* NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
* MODIFICATIONS.
*
* GOVERNMENT USE: If you are acquiring this software on behalf of the
* U.S. government, the Government shall have only "Restricted Rights"
* in the software and related documentation as defined in the Federal
* Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
* are acquiring the software on behalf of the Department of Defense, the
* software shall be classified as "Commercial Computer Software" and the
* Government shall have only "Restricted Rights" as defined in Clause
* 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
* authors grant the U.S. Government and others acting in its behalf
* permission to use and distribute the software in accordance with the
* terms specified in this license.
*
* src/backend/regex/regc_locale.c
*/
/*
 * ASCII character-name table
 *
 * Maps collating-element names to the single ASCII character each one
 * denotes.  Several characters are reachable under more than one name.
 * The list is terminated by an entry whose name is NULL.
 */
static const struct cname
{
	const char *name;			/* collating-element name */
	const char	code;			/* character the name stands for */
}			cnames[] =
{
	/* control characters */
	{"NUL", '\0'},
	{"SOH", '\001'},
	{"STX", '\002'},
	{"ETX", '\003'},
	{"EOT", '\004'},
	{"ENQ", '\005'},
	{"ACK", '\006'},
	{"BEL", '\007'},
	{"alert", '\007'},
	{"BS", '\010'},
	{"backspace", '\b'},
	{"HT", '\011'},
	{"tab", '\t'},
	{"LF", '\012'},
	{"newline", '\n'},
	{"VT", '\013'},
	{"vertical-tab", '\v'},
	{"FF", '\014'},
	{"form-feed", '\f'},
	{"CR", '\015'},
	{"carriage-return", '\r'},
	{"SO", '\016'},
	{"SI", '\017'},
	{"DLE", '\020'},
	{"DC1", '\021'},
	{"DC2", '\022'},
	{"DC3", '\023'},
	{"DC4", '\024'},
	{"NAK", '\025'},
	{"SYN", '\026'},
	{"ETB", '\027'},
	{"CAN", '\030'},
	{"EM", '\031'},
	{"SUB", '\032'},
	{"ESC", '\033'},
	{"IS4", '\034'},
	{"FS", '\034'},
	{"IS3", '\035'},
	{"GS", '\035'},
	{"IS2", '\036'},
	{"RS", '\036'},
	{"IS1", '\037'},
	{"US", '\037'},

	/* space and punctuation below the digits */
	{"space", ' '},
	{"exclamation-mark", '!'},
	{"quotation-mark", '"'},
	{"number-sign", '#'},
	{"dollar-sign", '$'},
	{"percent-sign", '%'},
	{"ampersand", '&'},
	{"apostrophe", '\''},
	{"left-parenthesis", '('},
	{"right-parenthesis", ')'},
	{"asterisk", '*'},
	{"plus-sign", '+'},
	{"comma", ','},
	{"hyphen", '-'},
	{"hyphen-minus", '-'},
	{"period", '.'},
	{"full-stop", '.'},
	{"slash", '/'},
	{"solidus", '/'},

	/* digits */
	{"zero", '0'},
	{"one", '1'},
	{"two", '2'},
	{"three", '3'},
	{"four", '4'},
	{"five", '5'},
	{"six", '6'},
	{"seven", '7'},
	{"eight", '8'},
	{"nine", '9'},

	/* remaining punctuation */
	{"colon", ':'},
	{"semicolon", ';'},
	{"less-than-sign", '<'},
	{"equals-sign", '='},
	{"greater-than-sign", '>'},
	{"question-mark", '?'},
	{"commercial-at", '@'},
	{"left-square-bracket", '['},
	{"backslash", '\\'},
	{"reverse-solidus", '\\'},
	{"right-square-bracket", ']'},
	{"circumflex", '^'},
	{"circumflex-accent", '^'},
	{"underscore", '_'},
	{"low-line", '_'},
	{"grave-accent", '`'},
	{"left-brace", '{'},
	{"left-curly-bracket", '{'},
	{"vertical-line", '|'},
	{"right-brace", '}'},
	{"right-curly-bracket", '}'},
	{"tilde", '~'},
	{"DEL", '\177'},

	/* end-of-table sentinel */
	{NULL, 0}
};
/*
 * Names of the recognized character classes, in the order required by
 * enum char_classes in regguts.h (the array index is used as the enum
 * value).  A NULL entry marks the end of the list.
 */
static const char *const classNames[NUM_CCLASSES + 1] = {
	"alnum",
	"alpha",
	"ascii",
	"blank",
	"cntrl",
	"digit",
	"graph",
	"lower",
	"print",
	"punct",
	"space",
	"upper",
	"xdigit",
	"word",
	NULL						/* end-of-list marker */
};
/*
* We do not use the hard-wired Unicode classification tables that Tcl does.
* This is because (a) we need to deal with other encodings besides Unicode,
* and (b) we want to track the behavior of the libc locale routines as
* closely as possible. For example, it wouldn't be unreasonable for a
* locale to not consider every Unicode letter as a letter. So we build
* character classification cvecs by asking libc, even for Unicode.
*/
/*
 * element - translate a collating-element name into the chr it denotes
 *
 * The name occupies [startp, endp) and must not be empty.  A name that is
 * exactly one chr long denotes that chr.  Any longer name is looked up in
 * cnames[]; if nothing matches, REG_ECOLLATE is reported and 0 is returned.
 */
static chr
element(struct vars *v,			/* context */
		const chr *startp,		/* points to start of name */
		const chr *endp)		/* points just past end of name */
{
	const struct cname *entry;
	size_t		namelen;

	assert(startp < endp);
	namelen = endp - startp;

	/* a lone chr is its own name */
	if (namelen == 1)
		return *startp;

	NOTE(REG_ULOCALE);

	/* scan the table up to its NULL-name sentinel */
	entry = cnames;
	while (entry->name != NULL)
	{
		if (strlen(entry->name) == namelen &&
			pg_char_and_wchar_strncmp(entry->name, startp, namelen) == 0)
			return CHR(entry->code);
		entry++;
	}

	/* no table entry carries this name */
	ERR(REG_ECOLLATE);
	return 0;
}
/*
 * range - build the cvec for the bracket-expression range a-b
 *
 * Reports REG_ERANGE and returns NULL if b precedes a.  When "cases" is
 * nonzero the result also holds the lower- and upper-case forms of every
 * chr in the range that fall outside the range itself.
 */
static struct cvec *
range(struct vars *v,			/* context */
	  chr a,					/* range start */
	  chr b,					/* range end, might equal a */
	  int cases)				/* case-independent? */
{
	struct cvec *cv;
	int			nchrs;
	chr			c;
	chr			cc;

	/* legality check: a must equal b or come before it */
	if (a != b && !before(a, b))
	{
		ERR(REG_ERANGE);
		return NULL;
	}

	/* case-sensitive: a single range entry says it all */
	if (!cases)
	{
		cv = getcvec(v, 0, 1);
		NOERRN();
		addrange(cv, a, b);
		return cv;
	}

	/*
	 * Case-independent.  Deciding when the case counterparts could themselves
	 * be expressed as cvec ranges is hard, so we don't attempt it: the chrs
	 * as written go in as one range, and each case-equivalent lying outside
	 * that range is appended as an individual chr.
	 *
	 * Somebody might specify an enormous range, so cap the requested chr
	 * space at 100000 (an arbitrary figure) and watch for overrun while
	 * filling the cvec below.
	 */
	nchrs = b - a + 1;
	if (!(nchrs > 0 && nchrs <= 100000))
		nchrs = 100000;

	cv = getcvec(v, nchrs, 1);
	NOERRN();
	addrange(cv, a, b);

	c = a;
	while (c <= b)
	{
		/* lower-case counterpart, if distinct and outside a..b */
		cc = pg_wc_tolower(c);
		if (cc != c)
		{
			if (before(cc, a) || before(b, cc))
			{
				if (cv->nchrs >= cv->chrspace)
				{
					ERR(REG_ETOOBIG);
					return NULL;
				}
				addchr(cv, cc);
			}
		}

		/* upper-case counterpart, same treatment */
		cc = pg_wc_toupper(c);
		if (cc != c)
		{
			if (before(cc, a) || before(b, cc))
			{
				if (cv->nchrs >= cv->chrspace)
				{
					ERR(REG_ETOOBIG);
					return NULL;
				}
				addchr(cv, cc);
			}
		}

		INTERRUPT(v->re);
		c++;
	}

	return cv;
}
/*
 * before - does chr x sort ahead of chr y, as far as range legality goes?
 *
 * The decision is a plain comparison of the two chr values.
 */
static int						/* predicate: 1 if x precedes y, else 0 */
before(chr x, chr y)
{
	return (x < y) ? 1 : 0;
}
/*
 * eclass - supply cvec for an equivalence class
 *		Must include case counterparts on request.
 *
 * Apart from a fake class used for testing (enabled by REG_FAKE, for the
 * element 'x'), an equivalence class consists of just the given chr, plus
 * its case counterparts when "cases" is nonzero.
 *
 * Returns NULL if no cvec could be obtained.
 */
static struct cvec *
eclass(struct vars *v,			/* context */
	   chr c,					/* Collating element representing the
								 * equivalence class. */
	   int cases)				/* all cases? */
{
	struct cvec *cv;

	/* crude fake equivalence class for testing */
	if ((v->cflags & REG_FAKE) && c == 'x')
	{
		cv = getcvec(v, 4, 0);

		/*
		 * Don't write through a NULL cvec.  NOTE(review): this assumes
		 * getcvec() has already recorded the failure in *v, which is what
		 * range() relies on when it follows getcvec() with NOERRN() --
		 * confirm.
		 */
		if (cv == NULL)
			return NULL;
		addchr(cv, CHR('x'));
		addchr(cv, CHR('y'));
		if (cases)
		{
			addchr(cv, CHR('X'));
			addchr(cv, CHR('Y'));
		}
		return cv;
	}

	/* otherwise, none */
	if (cases)
		return allcases(v, c);

	cv = getcvec(v, 1, 0);
	/* checked at run time: an assert vanishes from non-assert builds */
	if (cv == NULL)
		return NULL;
	addchr(cv, c);
	return cv;
}
/*
 * lookupcclass - find the character class whose name is [startp, endp)
 *
 * The name is compared against each entry of classNames[]; the index of
 * the matching entry is returned as the enum value.
 *
 * If no entry matches, REG_ECTYPE is reported in *v and the return value
 * carries no meaning.
 */
static enum char_classes
lookupcclass(struct vars *v,	/* context (for returning errors) */
			 const chr *startp, /* where the name starts */
			 const chr *endp)	/* just past the end of the name */
{
	const size_t namelen = endp - startp;
	int			idx;

	for (idx = 0; classNames[idx] != NULL; idx++)
	{
		const char *candidate = classNames[idx];

		/* cheap length test first, then the actual comparison */
		if (strlen(candidate) != namelen)
			continue;
		if (pg_char_and_wchar_strncmp(candidate, startp, namelen) == 0)
			return (enum char_classes) idx;
	}

	/* ran off the end of the list without a match */
	ERR(REG_ECTYPE);
	return (enum char_classes) 0;
}
/*
 * cclasscvec - supply cvec for a character class
 *
 * Must include case counterparts if "cases" is true.
 *
 * The returned cvec might be either a transient cvec gotten from getcvec(),
 * or a permanently cached one from pg_ctype_get_cache().  This is okay
 * because callers are not supposed to explicitly free the result either way.
 *
 * If no cvec could be obtained, REG_ESPACE is reported and NULL is returned.
 */
static struct cvec *
cclasscvec(struct vars *v,		/* context */
		   enum char_classes cclasscode,	/* class to build a cvec for */
		   int cases)			/* case-independent? */
{
	struct cvec *cv = NULL;

	/*
	 * Remap lower and upper to alpha if the match is case insensitive.
	 */
	if (cases &&
		(cclasscode == CC_LOWER ||
		 cclasscode == CC_UPPER))
		cclasscode = CC_ALPHA;

	/*
	 * Now compute the character class contents.  For classes that are based
	 * on the behavior of a <wctype.h> or <ctype.h> function, we use
	 * pg_ctype_get_cache so that we can cache the results.  Other classes
	 * have definitions that are hard-wired here, and for those we just
	 * construct a transient cvec on the fly.
	 *
	 * Every hard-wired case must test getcvec's result before filling it in,
	 * so that a NULL result falls through to the REG_ESPACE report at the
	 * bottom instead of being dereferenced.
	 *
	 * NB: keep this code in sync with cclass_column_index(), below.
	 */
	switch (cclasscode)
	{
		case CC_PRINT:
			cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
			break;
		case CC_ALNUM:
			cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
			break;
		case CC_ALPHA:
			cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
			break;
		case CC_WORD:
			cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
			break;
		case CC_ASCII:
			/* hard-wired meaning */
			cv = getcvec(v, 0, 1);
			if (cv)
				addrange(cv, 0, 0x7f);
			break;
		case CC_BLANK:
			/* hard-wired meaning */
			cv = getcvec(v, 2, 0);
			if (cv)
			{
				addchr(cv, '\t');
				addchr(cv, ' ');
			}
			break;
		case CC_CNTRL:
			/* hard-wired meaning */
			cv = getcvec(v, 0, 2);
			if (cv)
			{
				addrange(cv, 0x0, 0x1f);
				addrange(cv, 0x7f, 0x9f);
			}
			break;
		case CC_DIGIT:
			cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
			break;
		case CC_PUNCT:
			cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
			break;
		case CC_XDIGIT:

			/*
			 * It's not clear how to define this in non-western locales, and
			 * even less clear that there's any particular use in trying.  So
			 * just hard-wire the meaning.
			 */
			cv = getcvec(v, 0, 3);
			if (cv)
			{
				addrange(cv, '0', '9');
				addrange(cv, 'a', 'f');
				addrange(cv, 'A', 'F');
			}
			break;
		case CC_SPACE:
			cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
			break;
		case CC_LOWER:
			cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
			break;
		case CC_UPPER:
			cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
			break;
		case CC_GRAPH:
			cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
			break;
	}

	/* If cv is NULL now, the reason must be "out of memory" */
	if (cv == NULL)
		ERR(REG_ESPACE);
	return cv;
}
/*
 * cclass_column_index - get appropriate high colormap column index for chr
 *
 * The result is the OR of cm->classbits[] for every character class that
 * has a nonzero bit assigned and whose libc-based test accepts c.
 */

/* OR in the bit for class "cls" if one is assigned and "probe" accepts c */
#define MERGE_CLASS_BIT(cls, probe) \
	do { \
		if (cm->classbits[cls] && probe(c)) \
			colnum |= cm->classbits[cls]; \
	} while (0)

static int
cclass_column_index(struct colormap *cm, chr c)
{
	int			colnum = 0;

	/* Shouldn't go through all these pushups for simple chrs */
	assert(c > MAX_SIMPLE_CHR);

	/*
	 * Note: we should not see requests to consider cclasses that are not
	 * treated as locale-specific by cclasscvec(), above.  The asserts below
	 * check that for the classes cclasscvec() hard-wires.
	 */
	MERGE_CLASS_BIT(CC_PRINT, pg_wc_isprint);
	MERGE_CLASS_BIT(CC_ALNUM, pg_wc_isalnum);
	MERGE_CLASS_BIT(CC_ALPHA, pg_wc_isalpha);
	MERGE_CLASS_BIT(CC_WORD, pg_wc_isword);
	assert(cm->classbits[CC_ASCII] == 0);
	assert(cm->classbits[CC_BLANK] == 0);
	assert(cm->classbits[CC_CNTRL] == 0);
	MERGE_CLASS_BIT(CC_DIGIT, pg_wc_isdigit);
	MERGE_CLASS_BIT(CC_PUNCT, pg_wc_ispunct);
	assert(cm->classbits[CC_XDIGIT] == 0);
	MERGE_CLASS_BIT(CC_SPACE, pg_wc_isspace);
	MERGE_CLASS_BIT(CC_LOWER, pg_wc_islower);
	MERGE_CLASS_BIT(CC_UPPER, pg_wc_isupper);
	MERGE_CLASS_BIT(CC_GRAPH, pg_wc_isgraph);

	return colnum;
}

#undef MERGE_CLASS_BIT
/*
 * allcases - supply cvec for all case counterparts of a chr (including itself)
 *
 * This is a shortcut, preferably an efficient one, for simple characters;
 * messy cases are done via range().
 *
 * The result holds the lower-case and upper-case forms of c, and also c
 * itself whenever it is neither of those (as could happen for a chr that
 * both case mappings change, e.g. a title-case letter), so that "including
 * itself" holds for every input.
 *
 * Returns NULL if no cvec could be obtained.
 */
static struct cvec *
allcases(struct vars *v,		/* context */
		 chr c)					/* character to get case equivs of */
{
	struct cvec *cv;
	chr			lc,
				uc;

	lc = pg_wc_tolower(c);
	uc = pg_wc_toupper(c);

	/* room for lc, uc, and c itself should it differ from both */
	cv = getcvec(v, 3, 0);

	/*
	 * Don't write through a NULL cvec.  NOTE(review): this assumes getcvec()
	 * has already recorded the failure in *v, which is what range() relies
	 * on when it follows getcvec() with NOERRN() -- confirm.
	 */
	if (cv == NULL)
		return NULL;

	addchr(cv, lc);
	if (lc != uc)
		addchr(cv, uc);
	if (c != lc && c != uc)
		addchr(cv, c);
	return cv;
}
/*
 * cmp - compare two chr substrings for exact equality
 *
 * Used for backrefs, so it ought to be fast.  Callers only care whether
 * the result is zero.  Exactly "len" chrs of each string are compared;
 * an embedded NUL does not end the comparison.
 */
static int						/* 0 for equal, nonzero for unequal */
cmp(const chr *x, const chr *y, /* strings to compare */
	size_t len)					/* exact length of comparison */
{
	const size_t nbytes = len * sizeof(chr);

	return memcmp(VS(x), VS(y), nbytes);
}
/*
 * casecmp - compare two chr substrings, ignoring case
 *
 * Used for REG_ICASE backrefs, so it ought to be fast.  Callers only care
 * whether the result is zero.  Exactly "len" chrs of each string are
 * compared; an embedded NUL does not end the comparison.
 *
 * Two chrs are taken as equal if they are identical or if their
 * pg_wc_tolower() images are.
 */
static int						/* 0 for equal, nonzero for unequal */
casecmp(const chr *x, const chr *y, /* strings to compare */
		size_t len)				/* exact length of comparison */
{
	size_t		i;

	for (i = 0; i < len; i++)
	{
		/* identical chrs need no case folding */
		if (x[i] == y[i])
			continue;
		if (pg_wc_tolower(x[i]) != pg_wc_tolower(y[i]))
			return 1;
	}
	return 0;
}