From 0d32342501f2a562bc57156dc92d59a0624be4a6 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 1 Dec 2009 21:00:24 +0000 Subject: [PATCH] Teach the regular expression functions to do case-insensitive matching and locale-dependent character classification properly when the database encoding is UTF8. The previous coding worked okay in single-byte encodings, or in any case for ASCII characters, but failed entirely on multibyte characters. The fix assumes that the <wctype.h> functions use Unicode code points as the wchar representation for Unicode, ie, wchar matches pg_wchar. This is only a partial solution, since we're still stupid about non-ASCII characters in multibyte encodings other than UTF8. The practical effect of that is limited, however, since those cases are generally Far Eastern glyphs for which concepts like case-folding don't apply anyway. Certainly all or nearly all of the field reports of problems have been about UTF8. A more general solution would require switching to the platform's wchar representation for all regex operations; which is possible but would have substantial disadvantages. Let's try this and see if it's sufficient in practice. --- src/backend/regex/regc_locale.c | 118 ++++++++++++++++++++++++++++---- src/include/regex/regcustom.h | 13 +++- 2 files changed, 117 insertions(+), 14 deletions(-) diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c index 28f5e7ca12..8952c3cde0 100644 --- a/src/backend/regex/regc_locale.c +++ b/src/backend/regex/regc_locale.c @@ -47,7 +47,7 @@ * permission to use and distribute the software in accordance with the * terms specified in this license. 
* - * $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.9 2008/02/14 17:33:37 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.10 2009/12/01 21:00:24 tgl Exp $ */ /* ASCII character-name table */ @@ -349,67 +349,152 @@ static const struct cname } }; + /* - * some ctype functions with non-ascii-char guard + * ctype functions adapted to work on pg_wchar (a/k/a chr) + * + * When working in UTF8 encoding, we use the <wctype.h> functions if + * available. This assumes that every platform uses Unicode codepoints + * directly as the wchar_t representation of Unicode. On some platforms + * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. + * + * In all other encodings, we use the <ctype.h> functions for pg_wchar + * values up to 255, and punt for values above that. This is only 100% + * correct in single-byte encodings such as LATINn. However, non-Unicode + * multibyte encodings are mostly Far Eastern character sets for which the + * properties being tested here aren't relevant for higher code values anyway. + * + * NB: the coding here assumes pg_wchar is an unsigned type. 
*/ + static int pg_wc_isdigit(pg_wchar c) { - return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c)); +#ifdef USE_WIDE_UPPER_LOWER + if (GetDatabaseEncoding() == PG_UTF8) + { + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswdigit((wint_t) c); + } +#endif + return (c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c)); } static int pg_wc_isalpha(pg_wchar c) { - return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c)); +#ifdef USE_WIDE_UPPER_LOWER + if (GetDatabaseEncoding() == PG_UTF8) + { + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswalpha((wint_t) c); + } +#endif + return (c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c)); } static int pg_wc_isalnum(pg_wchar c) { - return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c)); +#ifdef USE_WIDE_UPPER_LOWER + if (GetDatabaseEncoding() == PG_UTF8) + { + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswalnum((wint_t) c); + } +#endif + return (c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c)); } static int pg_wc_isupper(pg_wchar c) { - return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c)); +#ifdef USE_WIDE_UPPER_LOWER + if (GetDatabaseEncoding() == PG_UTF8) + { + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswupper((wint_t) c); + } +#endif + return (c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c)); } static int pg_wc_islower(pg_wchar c) { - return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c)); +#ifdef USE_WIDE_UPPER_LOWER + if (GetDatabaseEncoding() == PG_UTF8) + { + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswlower((wint_t) c); + } +#endif + return (c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c)); } static int pg_wc_isgraph(pg_wchar c) { - return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c)); +#ifdef USE_WIDE_UPPER_LOWER + if (GetDatabaseEncoding() == PG_UTF8) + { + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswgraph((wint_t) c); 
+ } +#endif + return (c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c)); } static int pg_wc_isprint(pg_wchar c) { - return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c)); +#ifdef USE_WIDE_UPPER_LOWER + if (GetDatabaseEncoding() == PG_UTF8) + { + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswprint((wint_t) c); + } +#endif + return (c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c)); } static int pg_wc_ispunct(pg_wchar c) { - return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c)); +#ifdef USE_WIDE_UPPER_LOWER + if (GetDatabaseEncoding() == PG_UTF8) + { + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswpunct((wint_t) c); + } +#endif + return (c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c)); } static int pg_wc_isspace(pg_wchar c) { - return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c)); +#ifdef USE_WIDE_UPPER_LOWER + if (GetDatabaseEncoding() == PG_UTF8) + { + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return iswspace((wint_t) c); + } +#endif + return (c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c)); } static pg_wchar pg_wc_toupper(pg_wchar c) { - if (c >= 0 && c <= UCHAR_MAX) +#ifdef USE_WIDE_UPPER_LOWER + if (GetDatabaseEncoding() == PG_UTF8) + { + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return towupper((wint_t) c); + } +#endif + if (c <= (pg_wchar) UCHAR_MAX) return toupper((unsigned char) c); return c; } @@ -417,7 +502,14 @@ pg_wc_toupper(pg_wchar c) static pg_wchar pg_wc_tolower(pg_wchar c) { - if (c >= 0 && c <= UCHAR_MAX) +#ifdef USE_WIDE_UPPER_LOWER + if (GetDatabaseEncoding() == PG_UTF8) + { + if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) + return towlower((wint_t) c); + } +#endif + if (c <= (pg_wchar) UCHAR_MAX) return tolower((unsigned char) c); return c; } diff --git a/src/include/regex/regcustom.h b/src/include/regex/regcustom.h index 269f926be8..d1a07dd00e 100644 --- a/src/include/regex/regcustom.h +++ 
b/src/include/regex/regcustom.h @@ -25,7 +25,7 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $PostgreSQL: pgsql/src/include/regex/regcustom.h,v 1.7 2008/02/14 17:33:37 tgl Exp $ + * $PostgreSQL: pgsql/src/include/regex/regcustom.h,v 1.8 2009/12/01 21:00:24 tgl Exp $ */ /* headers if any */ @@ -34,6 +34,17 @@ #include <ctype.h> #include <limits.h> +/* + * towlower() and friends should be in <wctype.h>, but some pre-C99 systems + * declare them in <wchar.h>. + */ +#ifdef HAVE_WCHAR_H +#include <wchar.h> +#endif +#ifdef HAVE_WCTYPE_H +#include <wctype.h> +#endif + #include "mb/pg_wchar.h"