From e1de3e0833b35d0118165713e07abc9718c9d260 Mon Sep 17 00:00:00 2001 From: Tatsuo Ishii Date: Tue, 11 Sep 2001 04:50:36 +0000 Subject: [PATCH] Implement following item in TODO: * Reject character sequences those are not valid in their charset --- src/backend/utils/mb/conv.c | 72 ++++++++++++------------ src/backend/utils/mb/wchar.c | 103 ++++++++++++++++++++++++++++------- src/include/mb/pg_wchar.h | 6 +- 3 files changed, 125 insertions(+), 56 deletions(-) diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c index 6dbb9c6649..32b1725672 100644 --- a/src/backend/utils/mb/conv.c +++ b/src/backend/utils/mb/conv.c @@ -6,7 +6,7 @@ * WIN1250 client encoding support contributed by Pavel Behal * SJIS UDC (NEC selection IBM kanji) support contributed by Eiji Tokuya * - * $Id: conv.c,v 1.27 2001/09/06 04:57:29 ishii Exp $ + * $Id: conv.c,v 1.28 2001/09/11 04:50:36 ishii Exp $ * * */ @@ -1633,48 +1633,48 @@ big5_to_utf(unsigned char *euc, unsigned char *utf, int len) */ pg_enconv pg_enconv_tbl[] = { - { PG_SQL_ASCII, ascii2mic, mic2ascii, ascii2utf, utf2ascii }, - { PG_EUC_JP, euc_jp2mic, mic2euc_jp, euc_jp_to_utf, utf_to_euc_jp }, - { PG_EUC_CN, euc_cn2mic, mic2euc_cn, euc_cn_to_utf, utf_to_euc_cn }, - { PG_EUC_KR, euc_kr2mic, mic2euc_kr, euc_kr_to_utf, utf_to_euc_kr }, - { PG_EUC_TW, euc_tw2mic, mic2euc_tw, euc_tw_to_utf, utf_to_euc_tw }, - { PG_UTF8, 0, 0, 0, 0 }, - { PG_MULE_INTERNAL, 0, 0, 0, 0 }, - { PG_LATIN1, latin12mic, mic2latin1, latin1_to_utf, utf_to_latin1 }, - { PG_LATIN2, latin22mic, mic2latin2, latin2_to_utf, utf_to_latin2 }, - { PG_LATIN3, latin32mic, mic2latin3, latin3_to_utf, utf_to_latin3 }, - { PG_LATIN4, latin42mic, mic2latin4, latin4_to_utf, utf_to_latin4 }, - { PG_LATIN5, iso2mic, mic2iso, latin5_to_utf, utf_to_latin5 }, - { PG_KOI8R, koi8r2mic, mic2koi8r, KOI8R_to_utf, utf_to_KOI8R }, - { PG_WIN1251, win12512mic, mic2win1251, WIN1251_to_utf, utf_to_WIN1251 }, - { PG_ALT, alt2mic, mic2alt, ALT_to_utf, utf_to_ALT }, - { PG_SJIS, 
sjis2mic, mic2sjis, sjis_to_utf, utf_to_sjis }, + { PG_SQL_ASCII, ascii2mic, mic2ascii, ascii2utf, utf2ascii}, + { PG_EUC_JP, euc_jp2mic, mic2euc_jp, euc_jp_to_utf, utf_to_euc_jp}, + { PG_EUC_CN, euc_cn2mic, mic2euc_cn, euc_cn_to_utf, utf_to_euc_cn}, + { PG_EUC_KR, euc_kr2mic, mic2euc_kr, euc_kr_to_utf, utf_to_euc_kr}, + { PG_EUC_TW, euc_tw2mic, mic2euc_tw, euc_tw_to_utf, utf_to_euc_tw}, + { PG_UTF8, 0, 0, 0, 0}, + { PG_MULE_INTERNAL, 0, 0, 0, 0}, + { PG_LATIN1, latin12mic, mic2latin1, latin1_to_utf, utf_to_latin1}, + { PG_LATIN2, latin22mic, mic2latin2, latin2_to_utf, utf_to_latin2}, + { PG_LATIN3, latin32mic, mic2latin3, latin3_to_utf, utf_to_latin3}, + { PG_LATIN4, latin42mic, mic2latin4, latin4_to_utf, utf_to_latin4}, + { PG_LATIN5, iso2mic, mic2iso, latin5_to_utf, utf_to_latin5}, + { PG_KOI8R, koi8r2mic, mic2koi8r, KOI8R_to_utf, utf_to_KOI8R}, + { PG_WIN1251, win12512mic, mic2win1251, WIN1251_to_utf, utf_to_WIN1251}, + { PG_ALT, alt2mic, mic2alt, ALT_to_utf, utf_to_ALT}, + { PG_SJIS, sjis2mic, mic2sjis, sjis_to_utf, utf_to_sjis}, { PG_BIG5, big52mic, mic2big5, big5_to_utf, utf_to_big5}, - { PG_WIN1250, win12502mic, mic2win1250, 0, 0 }, + { PG_WIN1250, win12502mic, mic2win1250, 0, 0}, }; #else pg_enconv pg_enconv_tbl[] = { - { PG_SQL_ASCII, ascii2mic, mic2ascii, 0, 0 }, - { PG_EUC_JP, euc_jp2mic, mic2euc_jp, 0, 0 }, - { PG_EUC_CN, euc_cn2mic, mic2euc_cn, 0, 0 }, - { PG_EUC_KR, euc_kr2mic, mic2euc_kr, 0, 0 }, - { PG_EUC_TW, euc_tw2mic, mic2euc_tw, 0, 0 }, - { PG_UTF8, 0, 0, 0, 0 }, - { PG_MULE_INTERNAL, 0, 0, 0, 0 }, - { PG_LATIN1, latin12mic, mic2latin1, 0, 0 }, - { PG_LATIN2, latin22mic, mic2latin2, 0, 0 }, - { PG_LATIN3, latin32mic, mic2latin3, 0, 0 }, - { PG_LATIN4, latin42mic, mic2latin4, 0, 0 }, - { PG_LATIN5, iso2mic, mic2iso, 0, 0 }, - { PG_KOI8R, koi8r2mic, mic2koi8r, 0, 0 }, - { PG_WIN1251, win12512mic, mic2win1251, 0, 0 }, - { PG_ALT, alt2mic, mic2alt, 0, 0 }, - { PG_SJIS, sjis2mic, mic2sjis, 0, 0 }, - { PG_BIG5, big52mic, mic2big5, 0, 0 }, - { 
PG_WIN1250, win12502mic, mic2win1250, 0, 0 }, + { PG_SQL_ASCII, ascii2mic, mic2ascii, 0, 0}, + { PG_EUC_JP, euc_jp2mic, mic2euc_jp, 0, 0}, + { PG_EUC_CN, euc_cn2mic, mic2euc_cn, 0, 0}, + { PG_EUC_KR, euc_kr2mic, mic2euc_kr, 0, 0}, + { PG_EUC_TW, euc_tw2mic, mic2euc_tw, 0, 0}, + { PG_UTF8, 0, 0, 0, 0}, + { PG_MULE_INTERNAL, 0, 0, 0, 0}, + { PG_LATIN1, latin12mic, mic2latin1, 0, 0}, + { PG_LATIN2, latin22mic, mic2latin2, 0, 0}, + { PG_LATIN3, latin32mic, mic2latin3, 0, 0}, + { PG_LATIN4, latin42mic, mic2latin4, 0, 0}, + { PG_LATIN5, iso2mic, mic2iso, 0, 0}, + { PG_KOI8R, koi8r2mic, mic2koi8r, 0, 0}, + { PG_WIN1251, win12512mic, mic2win1251, 0, 0}, + { PG_ALT, alt2mic, mic2alt, 0, 0}, + { PG_SJIS, sjis2mic, mic2sjis, 0, 0}, + { PG_BIG5, big52mic, mic2big5, 0, 0}, + { PG_WIN1250, win12502mic, mic2win1250, 0, 0}, }; #endif /* UNICODE_CONVERSION */ diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index a7e97cc186..114b7f2623 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1,7 +1,7 @@ /* * conversion functions between pg_wchar and multi-byte streams. 
* Tatsuo Ishii - * $Id: wchar.c,v 1.19 2001/09/06 04:57:29 ishii Exp $ + * $Id: wchar.c,v 1.20 2001/09/11 04:50:36 ishii Exp $ * * WIN1250 client encoding updated by Pavel Behal * @@ -458,24 +458,24 @@ pg_big5_mblen(const unsigned char *s) } pg_wchar_tbl pg_wchar_table[] = { - {pg_ascii2wchar_with_len, pg_ascii_mblen}, /* 0; PG_SQL_ASCII */ - {pg_eucjp2wchar_with_len, pg_eucjp_mblen}, /* 1; PG_EUC_JP */ - {pg_euccn2wchar_with_len, pg_euccn_mblen}, /* 2; PG_EUC_CN */ - {pg_euckr2wchar_with_len, pg_euckr_mblen}, /* 3; PG_EUC_KR */ - {pg_euctw2wchar_with_len, pg_euctw_mblen}, /* 4; PG_EUC_TW */ - {pg_utf2wchar_with_len, pg_utf_mblen}, /* 5; PG_UNICODE */ - {pg_mule2wchar_with_len, pg_mule_mblen}, /* 6; PG_MULE_INTERNAL */ - {pg_latin12wchar_with_len, pg_latin1_mblen}, /* 7; PG_LATIN1 */ - {pg_latin12wchar_with_len, pg_latin1_mblen}, /* 8; PG_LATIN2 */ - {pg_latin12wchar_with_len, pg_latin1_mblen}, /* 9; PG_LATIN3 */ - {pg_latin12wchar_with_len, pg_latin1_mblen}, /* 10; PG_LATIN4 */ - {pg_latin12wchar_with_len, pg_latin1_mblen}, /* 11; PG_LATIN5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen}, /* 12; PG_KOI8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen}, /* 13; PG_WIN1251 */ - {pg_latin12wchar_with_len, pg_latin1_mblen}, /* 14; PG_ALT */ - {0, pg_sjis_mblen}, /* 15; PG_SJIS */ - {0, pg_big5_mblen}, /* 17; PG_BIG5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen} /* 18; PG_WIN1250 */ + {pg_ascii2wchar_with_len, pg_ascii_mblen, 1}, /* 0; PG_SQL_ASCII */ + {pg_eucjp2wchar_with_len, pg_eucjp_mblen, 3}, /* 1; PG_EUC_JP */ + {pg_euccn2wchar_with_len, pg_euccn_mblen, 3}, /* 2; PG_EUC_CN */ + {pg_euckr2wchar_with_len, pg_euckr_mblen, 3}, /* 3; PG_EUC_KR */ + {pg_euctw2wchar_with_len, pg_euctw_mblen, 3}, /* 4; PG_EUC_TW */ + {pg_utf2wchar_with_len, pg_utf_mblen, 3}, /* 5; PG_UNICODE */ + {pg_mule2wchar_with_len, pg_mule_mblen, 3}, /* 6; PG_MULE_INTERNAL */ + {pg_latin12wchar_with_len, pg_latin1_mblen, 1}, /* 7; PG_LATIN1 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, 1}, 
/* 8; PG_LATIN2 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, 1}, /* 9; PG_LATIN3 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, 1}, /* 10; PG_LATIN4 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, 1}, /* 11; PG_LATIN5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, 1}, /* 12; PG_KOI8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, 1}, /* 13; PG_WIN1251 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, 1}, /* 14; PG_ALT */ + {0, pg_sjis_mblen, 2}, /* 15; PG_SJIS */ + {0, pg_big5_mblen, 2}, /* 17; PG_BIG5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, 1} /* 18; PG_WIN1250 */ }; /* returns the byte length of a word for mule internal code */ @@ -498,3 +498,68 @@ pg_encoding_mblen(int encoding, const unsigned char *mbstr) ((*pg_wchar_table[encoding].mblen) (mbstr)) : ((*pg_wchar_table[PG_SQL_ASCII].mblen) (mbstr))); } + +#ifndef FRONTEND +/* + * Verify mbstr to make sure that it has a valid character sequence. + * mbstr is not necessarily NULL terminated. length of mbstr is + * specified by len. If an error was found, returns an error message. + * Note that the message is kept in a static buffer, the next invocation + * might break the message. + * If no error was found, this function returns NULL. + */ +char * +pg_verifymbstr(const unsigned char *mbstr, int len) +{ + int l; + int i, j; + static char buf[256]; + int slen = 0; + + /* we do not check single byte encodings */ + if (pg_wchar_table[GetDatabaseEncoding()].maxmblen <= 1) + return NULL; + + while (len > 0 && *mbstr) + { + l = pg_mblen(mbstr); + + /* multi-byte letter? */ + if (l > 1) + { + for (i=1;i<l;i++) + { + if (i > len || *(mbstr+i) == '\0' || + /* we assume that every multi-byte letter + * consists of bytes being the 8th bit set + */ + ((*(mbstr+i) & 0x80) == 0)) + { + int remains = sizeof(buf); + char *p = buf; + + slen = snprintf(p, remains, "Invalid %s character sequence found (0x", + GetDatabaseEncodingName()); + p += slen; + remains -= slen; + + i = ((*(mbstr+i) & 0x80) == 0)?l:i; + + for (j=0;j