From f2a01b0d5a784a5191faad7f2022383760064f8a Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Mon, 15 Jan 2007 15:16:28 +0000 Subject: [PATCH] Fix localization support for multibyte encoding and C locale. Slightly reworked patch from Tatsuo Ishii --- contrib/tsearch2/ts_locale.c | 47 +++++++--- contrib/tsearch2/ts_locale.h | 15 +-- contrib/tsearch2/wordparser/parser.c | 134 +++++++++++++++++++++------ 3 files changed, 144 insertions(+), 52 deletions(-) diff --git a/contrib/tsearch2/ts_locale.c b/contrib/tsearch2/ts_locale.c index cac5317a10..cb022d7e2a 100644 --- a/contrib/tsearch2/ts_locale.c +++ b/contrib/tsearch2/ts_locale.c @@ -12,13 +12,13 @@ size_t wchar2char(char *to, const wchar_t *from, size_t len) { + if (len == 0) + return 0; + if (GetDatabaseEncoding() == PG_UTF8) { int r; - if (len == 0) - return 0; - r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len, NULL, NULL); @@ -34,17 +34,19 @@ wchar2char(char *to, const wchar_t *from, size_t len) return wcstombs(to, from, len); } +#endif /* WIN32 */ size_t char2wchar(wchar_t *to, const char *from, size_t len) { + if (len == 0) + return 0; + +#ifdef WIN32 if (GetDatabaseEncoding() == PG_UTF8) { int r; - if (len == 0) - return 0; - r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len); if (!r) @@ -60,29 +62,44 @@ char2wchar(wchar_t *to, const char *from, size_t len) return r; } + else +#endif /* WIN32 */ + if ( lc_ctype_is_c() ) + { + /* + * pg_mb2wchar_with_len always adds trailing '\0', so + * 'to' should be allocated with sufficient space + */ + return pg_mb2wchar_with_len(from, (pg_wchar *)to, len); + } return mbstowcs(to, from, len); } -#endif /* WIN32 */ int _t_isalpha(const char *ptr) { - wchar_t character; + wchar_t character[2]; - char2wchar(&character, ptr, 1); + if (lc_ctype_is_c()) + return isalpha(TOUCHAR(ptr)); - return iswalpha((wint_t) character); + char2wchar(character, ptr, 1); + + return iswalpha((wint_t) *character); } int _t_isprint(const char *ptr) { - wchar_t character; + wchar_t character[2]; - char2wchar(&character, ptr, 1); + if (lc_ctype_is_c()) + return isprint(TOUCHAR(ptr)); - return iswprint((wint_t) character); + char2wchar(character, ptr, 1); + + return iswprint((wint_t) *character); } #endif /* TS_USE_WIDE */ @@ -126,7 +143,7 @@ lowerstr(char *str) if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("transalation failed from server encoding to wchar_t"))); + errmsg("translation failed from server encoding to wchar_t"))); Assert(wlen<=len); wstr[wlen] = 0; @@ -152,7 +169,7 @@ lowerstr(char *str) if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("transalation failed from wchar_t to server encoding %d", errno))); + errmsg("translation failed from wchar_t to server encoding %d", errno))); Assert(wlen<=len); out[wlen]='\0'; } diff --git a/contrib/tsearch2/ts_locale.h b/contrib/tsearch2/ts_locale.h index e2e2248137..81d1a16600 100644 --- a/contrib/tsearch2/ts_locale.h +++ b/contrib/tsearch2/ts_locale.h @@ -30,16 +30,17 @@ #define TOUCHAR(x) (*((unsigned char*)(x))) #ifdef TS_USE_WIDE +size_t char2wchar(wchar_t *to, const char *from, size_t len); #ifdef WIN32 size_t wchar2char(char *to, const wchar_t *from, size_t len); -size_t char2wchar(wchar_t *to, const char *from, size_t len); + #else /* WIN32 */ -/* correct mbstowcs */ -#define char2wchar mbstowcs +/* correct wcstombs */ #define wchar2char wcstombs + #endif /* WIN32 */ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) @@ -55,10 +56,10 @@ extern int _t_isprint(const char *ptr); */ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) -#define COPYCHAR(d,s) do { \ - int lll = pg_mblen( s ); \ - \ - while( lll-- ) \ +#define COPYCHAR(d,s) do { \ + int lll = pg_mblen( s ); \ + \ + while( lll-- ) \ TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \ } while(0) diff --git a/contrib/tsearch2/wordparser/parser.c b/contrib/tsearch2/wordparser/parser.c index fced41ec5e..3706a0efb7 100644 --- a/contrib/tsearch2/wordparser/parser.c +++ b/contrib/tsearch2/wordparser/parser.c @@ -1,4 +1,4 @@ -/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.11 2006/10/04 00:29:47 momjian Exp $ */ +/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.12 2007/01/15 15:16:28 teodor Exp $ */ #include "postgres.h" @@ -40,16 +40,13 @@ TParserInit(char *str, int len) #ifdef TS_USE_WIDE /* - * Use wide char code only when max encoding length > 1 and ctype != C. - * Some operating systems fail with multi-byte encodings and a C locale. - * Also, for a C locale there is no need to process as multibyte. From - * backend/utils/adt/oracle_compat.c Teodor + * Use wide char code only when max encoding length > 1. */ - if (prs->charmaxlen > 1 && !lc_ctype_is_c()) + if (prs->charmaxlen > 1) { prs->usewide = true; - prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); + prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1)); prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr); } else @@ -83,25 +80,99 @@ TParserClose(TParser * prs) /* * defining support function, equvalent is* macroses, but - * working with any possible encodings and locales + * working with any possible encodings and locales. Note, + * that with multibyte encoding and C-locale isw* function may fail + * or give wrong result. Note 2: multibyte encoding and C-locale + * often are used for Asian languages. */ #ifdef TS_USE_WIDE -#define p_iswhat(type) \ -static int \ -p_is##type(TParser *prs) { \ - Assert( prs->state ); \ - return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ - is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ -} \ - \ -static int \ -p_isnot##type(TParser *prs) { \ - return !p_is##type(prs); \ +#define p_iswhat(type) \ +static int \ +p_is##type(TParser *prs) { \ + Assert( prs->state ); \ + if ( prs->usewide ) \ + { \ + if ( lc_ctype_is_c() ) \ + return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \ + \ + return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) ); \ + } \ + \ + return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \ +} \ + \ +static int \ +p_isnot##type(TParser *prs) { \ + return !p_is##type(prs); \ } +static int +p_isalnum(TParser *prs) +{ + Assert( prs->state ); + if (prs->usewide) + { + if (lc_ctype_is_c()) + { + unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar); + + /* + * any non-ascii symbol with multibyte encoding + * with C-locale is an alpha character + */ + if ( c > 0x7f ) + return 1; + + return isalnum(0xff & c); + } + + return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar)); + } + + return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte )); +} + +static int +p_isnotalnum(TParser *prs) +{ + return !p_isalnum(prs); +} + +static int +p_isalpha(TParser *prs) +{ + Assert( prs->state ); + + if (prs->usewide) + { + if (lc_ctype_is_c()) + { + unsigned int c = *(prs->wstr + prs->state->poschar); + + /* + * any non-ascii symbol with multibyte encoding + * with C-locale is an alpha character + */ + if ( c > 0x7f ) + return 1; + + return isalpha(0xff & c); + } + + return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar)); + } + + return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte )); +} + +static int +p_isnotalpha(TParser *prs) +{ + return !p_isalpha(prs); +} /* p_iseq should be used only for ascii symbols */ @@ -111,18 +182,19 @@ p_iseq(TParser * prs, char c) Assert(prs->state); return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; } + #else /* TS_USE_WIDE */ -#define p_iswhat(type) \ -static int \ -p_is##type(TParser *prs) { \ - Assert( prs->state ); \ - return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \ -} \ - \ -static int \ -p_isnot##type(TParser *prs) { \ - return !p_is##type(prs); \ +#define p_iswhat(type) \ +static int \ +p_is##type(TParser *prs) { \ + Assert( prs->state ); \ + return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \ +} \ + \ +static int \ +p_isnot##type(TParser *prs) { \ + return !p_is##type(prs); \ } @@ -132,10 +204,12 @@ p_iseq(TParser * prs, char c) Assert(prs->state); return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0; } -#endif /* TS_USE_WIDE */ p_iswhat(alnum) p_iswhat(alpha) + +#endif /* TS_USE_WIDE */ + p_iswhat(digit) p_iswhat(lower) p_iswhat(print)