From 08e0b34badad64ae33d7a1fef5c564989f0b8418 Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Fri, 3 Dec 2004 01:20:33 +0000 Subject: [PATCH] Back out fix for Unicode characters above 0x10000 --- doc/src/sgml/postgres.sgml | 3 +- src/backend/utils/mb/wchar.c | 116 +++++++++++++---------------------- src/include/mb/pg_wchar.h | 12 +--- 3 files changed, 47 insertions(+), 84 deletions(-) diff --git a/doc/src/sgml/postgres.sgml b/doc/src/sgml/postgres.sgml index ca0d55c3b4..075e356927 100644 --- a/doc/src/sgml/postgres.sgml +++ b/doc/src/sgml/postgres.sgml @@ -1,5 +1,5 @@ diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 97adc9eddb..5a52d34de2 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1,7 +1,7 @@ /* * conversion functions between pg_wchar and multibyte streams. * Tatsuo Ishii - * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.39 2004/12/02 22:37:13 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.40 2004/12/03 01:20:20 momjian Exp $ * * WIN1250 client encoding updated by Pavel Behal * @@ -343,31 +343,6 @@ pg_johab_dsplen(const unsigned char *s) return (pg_euc_dsplen(s)); } -bool isLegalUTF8(const UTF8 *source, int len) { - UTF8 a; - const UTF8 *srcptr = source+len; - if(!source || (pg_utf_mblen(source) != len)) return false; - switch (len) { - default: return false; - /* Everything else falls through when "true"... */ - case 6: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 5: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) > 0xBF) return false; - switch (*source) { - /* no fall-through in this inner switch */ - case 0xE0: if (a < 0xA0) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - if (*source > 0xFD) return false; - } - return true; -} - /* * convert UTF-8 string to pg_wchar (UCS-2) * caller should allocate enough space for "to" @@ -423,7 +398,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) * returns the byte length of a UTF-8 word pointed to by s */ int -pg_utf_mblen(const UTF8 *s) +pg_utf_mblen(const unsigned char *s) { int len = 1; @@ -431,19 +406,13 @@ pg_utf_mblen(const UTF8 *s) len = 1; else if ((*s & 0xe0) == 0xc0) len = 2; - else if ((*s & 0xf0) == 0xe0) - len = 3; - else if ((*s & 0xf8) == 0xf0) - len = 4; - else if ((*s & 0xfc) == 0xf8) - len = 5; - else if ((*s & 0xfe) == 0xfc) - len = 6; + else if ((*s & 0xe0) == 0xe0) + len = 3; return (len); } static int -pg_utf_dsplen(const UTF8 *s) +pg_utf_dsplen(const unsigned char *s) { return 1; /* XXX fix me! */ } @@ -752,8 +721,8 @@ pg_wchar_tbl pg_wchar_table[] = { {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ - {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 6}, /* 6; PG_UNICODE */ - {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ + {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UNICODE */ + {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */ @@ -775,11 +744,11 @@ pg_wchar_tbl pg_wchar_table[] = { {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */ - {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */ - {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */ - {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */ - {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */ - {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */ + {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */ + {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */ + {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */ + {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */ + {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */ }; /* returns the byte length of a word for mule internal code */ @@ -853,48 +822,51 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError) while (len > 0 && *mbstr) { + /* special UTF-8 check */ + if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0) + { + if (noError) + return false; + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("Unicode characters greater than or equal to 0x10000 are not supported"))); + } + l = pg_mblen(mbstr); - /* special UTF-8 check */ - if (encoding == PG_UTF8) { - if(!isLegalUTF8(mbstr,l)) { - if (noError) return false; - ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near character %c",*mbstr))); - } - } else { - for (i = 1; i < l; i++) + for (i = 1; i < l; i++) + { + /* + * we expect that every multibyte char consists of bytes + * having the 8th bit set + */ + if (i >= len || (mbstr[i] & 0x80) == 0) { - /* - * we expect that every multibyte char consists of bytes - * having the 8th bit set - */ - if (i >= len || (mbstr[i] & 0x80) == 0) - { - char buf[8 * 2 + 1]; - char *p = buf; - int j, + char buf[8 * 2 + 1]; + char *p = buf; + int j, jlimit; - if (noError) - return false; + if (noError) + return false; - jlimit = Min(l, len); - jlimit = Min(jlimit, 8); /* prevent buffer overrun */ + jlimit = Min(l, len); + jlimit = Min(jlimit, 8); /* prevent buffer overrun */ - for (j = 0; j < jlimit; j++) - p += sprintf(p, "%02x", mbstr[j]); + for (j = 0; j < jlimit; j++) + p += sprintf(p, "%02x", mbstr[j]); - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid byte sequence for encoding \"%s\": 0x%s", - GetDatabaseEncodingName(), buf))); - } + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid byte sequence for encoding \"%s\": 0x%s", + GetDatabaseEncodingName(), buf))); } - } + len -= l; mbstr += l; } + return true; } diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index aa2976650b..2ab3b91542 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -1,4 +1,4 @@ -/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.54 2004/12/02 22:37:14 momjian Exp $ */ +/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.55 2004/12/03 01:20:33 momjian Exp $ */ #ifndef PG_WCHAR_H #define PG_WCHAR_H @@ -17,14 +17,6 @@ */ typedef unsigned int pg_wchar; - -/* - * The UTF types - */ -typedef unsigned int UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ - /* * various definitions for EUC */ @@ -348,6 +340,4 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc); extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab); extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab); -extern bool isLegalUTF8(const UTF8 *source, int len); - #endif /* PG_WCHAR_H */