Fix char2wchar/wchar2char to support collations properly.

These functions should take a pg_locale_t, not a collation OID, and should
call mbstowcs_l/wcstombs_l where available.  Where those functions are not
available, temporarily select the correct locale with uselocale().

This change removes the bogus assumption that all locales selectable in
a given database have the same wide-character conversion method; in
particular, the collate.linux.utf8 regression test now passes with
LC_CTYPE=C, so long as the database encoding is UTF8.

I decided to move the char2wchar/wchar2char functions out of mbutils.c and
into pg_locale.c, because they work on libc's wchar_t, not on pg_wchar, and
thus don't really belong with the mbutils.c functions.  Keeping them where
they were would have required importing pg_locale_t into pg_wchar.h somehow,
which did not seem like a good plan.
This commit is contained in:
Tom Lane 2011-04-23 12:35:41 -04:00
parent bb85030630
commit 2ab0796d7a
12 changed files with 217 additions and 144 deletions

3
configure vendored
View File

@ -18985,7 +18985,8 @@ fi
for ac_func in cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs
for ac_func in cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l
do
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
{ $as_echo "$as_me:$LINENO: checking for $ac_func" >&5

View File

@ -1187,7 +1187,7 @@ PGAC_VAR_INT_TIMEZONE
AC_FUNC_ACCEPT_ARGTYPES
PGAC_FUNC_GETTIMEOFDAY_1ARG
AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs])
AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l])
AC_REPLACE_FUNCS(fseeko)
case $host_os in

View File

@ -29,11 +29,12 @@ t_isdigit(const char *ptr)
int clen = pg_mblen(ptr);
wchar_t character[2];
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
pg_locale_t mylocale = 0; /* TODO */
if (clen == 1 || lc_ctype_is_c(collation))
return isdigit(TOUCHAR(ptr));
char2wchar(character, 2, ptr, clen, collation);
char2wchar(character, 2, ptr, clen, mylocale);
return iswdigit((wint_t) character[0]);
}
@ -44,11 +45,12 @@ t_isspace(const char *ptr)
int clen = pg_mblen(ptr);
wchar_t character[2];
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
pg_locale_t mylocale = 0; /* TODO */
if (clen == 1 || lc_ctype_is_c(collation))
return isspace(TOUCHAR(ptr));
char2wchar(character, 2, ptr, clen, collation);
char2wchar(character, 2, ptr, clen, mylocale);
return iswspace((wint_t) character[0]);
}
@ -59,11 +61,12 @@ t_isalpha(const char *ptr)
int clen = pg_mblen(ptr);
wchar_t character[2];
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
pg_locale_t mylocale = 0; /* TODO */
if (clen == 1 || lc_ctype_is_c(collation))
return isalpha(TOUCHAR(ptr));
char2wchar(character, 2, ptr, clen, collation);
char2wchar(character, 2, ptr, clen, mylocale);
return iswalpha((wint_t) character[0]);
}
@ -74,11 +77,12 @@ t_isprint(const char *ptr)
int clen = pg_mblen(ptr);
wchar_t character[2];
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
pg_locale_t mylocale = 0; /* TODO */
if (clen == 1 || lc_ctype_is_c(collation))
return isprint(TOUCHAR(ptr));
char2wchar(character, 2, ptr, clen, collation);
char2wchar(character, 2, ptr, clen, mylocale);
return iswprint((wint_t) character[0]);
}
@ -246,6 +250,7 @@ lowerstr_with_len(const char *str, int len)
#ifdef USE_WIDE_UPPER_LOWER
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
pg_locale_t mylocale = 0; /* TODO */
#endif
if (len == 0)
@ -272,7 +277,7 @@ lowerstr_with_len(const char *str, int len)
*/
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
wlen = char2wchar(wstr, len + 1, str, len, collation);
wlen = char2wchar(wstr, len + 1, str, len, mylocale);
Assert(wlen <= len);
while (*wptr)
@ -287,7 +292,7 @@ lowerstr_with_len(const char *str, int len)
len = pg_database_encoding_max_length() * wlen + 1;
out = (char *) palloc(len);
wlen = wchar2char(out, wstr, len, collation);
wlen = wchar2char(out, wstr, len, mylocale);
pfree(wstr);

View File

@ -300,13 +300,14 @@ TParserInit(char *str, int len)
if (prs->charmaxlen > 1)
{
Oid collation = DEFAULT_COLLATION_OID; /* TODO */
pg_locale_t mylocale = 0; /* TODO */
prs->usewide = true;
if (lc_ctype_is_c(collation))
{
/*
* char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
* be not equal to sizeof(wchar_t)
* be different from sizeof(wchar_t)
*/
prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
@ -314,7 +315,8 @@ TParserInit(char *str, int len)
else
{
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr, collation);
char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
mylocale);
}
}
else

View File

@ -1454,6 +1454,10 @@ str_numth(char *dest, char *num, int type)
return dest;
}
/*****************************************************************************
* upper/lower/initcap functions
*****************************************************************************/
/*
* If the system provides the needed functions for wide-character manipulation
* (which are all standardized by C99), then we implement upper/lower/initcap
@ -1527,7 +1531,7 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
/* Output workspace cannot have more codes than input bytes */
workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
{
@ -1543,7 +1547,7 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
result_size = curr_char * pg_database_encoding_max_length() + 1;
result = palloc(result_size);
wchar2char(result, workspace, result_size, collid);
wchar2char(result, workspace, result_size, mylocale);
pfree(workspace);
}
#endif /* USE_WIDE_UPPER_LOWER */
@ -1648,7 +1652,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
/* Output workspace cannot have more codes than input bytes */
workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
{
@ -1664,7 +1668,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
result_size = curr_char * pg_database_encoding_max_length() + 1;
result = palloc(result_size);
wchar2char(result, workspace, result_size, collid);
wchar2char(result, workspace, result_size, mylocale);
pfree(workspace);
}
#endif /* USE_WIDE_UPPER_LOWER */
@ -1781,7 +1785,7 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
/* Output workspace cannot have more codes than input bytes */
workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
{
@ -1809,7 +1813,7 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
result_size = curr_char * pg_database_encoding_max_length() + 1;
result = palloc(result_size);
wchar2char(result, workspace, result_size, collid);
wchar2char(result, workspace, result_size, mylocale);
pfree(workspace);
}
#endif /* USE_WIDE_UPPER_LOWER */

View File

@ -1030,3 +1030,176 @@ pg_newlocale_from_collation(Oid collid)
return cache_entry->locale;
}
/*
* These functions convert from/to libc's wchar_t, *not* pg_wchar.
* Therefore we keep them here rather than with the mbutils code.
*/
#ifdef USE_WIDE_UPPER_LOWER
/*
* wchar2char --- convert wide characters to multibyte format
*
* This has the same API as the standard wcstombs_l() function; in particular,
* tolen is the maximum number of bytes to store at *to, and *from must be
* zero-terminated. The output will be zero-terminated iff there is room.
*
* locale selects the conversion locale. (pg_locale_t) 0 means the default
* locale, handled with plain wcstombs(). Any other value is handled with
* wcstombs_l() where available, or else with wcstombs() while the locale is
* temporarily installed via uselocale(). On WIN32 with a UTF8 database
* encoding the locale argument is not consulted at all.
*
* Returns 0 immediately if tolen is 0. Otherwise the conversion routine's
* result is returned as-is; the WIN32/UTF8 path reports failure as
* (size_t) -1. A failed conversion is not turned into an ereport() here.
*/
size_t
wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
{
size_t result;
/* Nothing can be stored, so don't call any conversion routine at all */
if (tolen == 0)
return 0;
#ifdef WIN32
/*
* On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
* for some reason mbstowcs and wcstombs won't do this for us, so we use
* WideCharToMultiByte() here (and MultiByteToWideChar() in char2wchar).
*/
if (GetDatabaseEncoding() == PG_UTF8)
{
/* Source length -1: the input is processed up to its zero terminator */
result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
NULL, NULL);
/*
* A zero return is failure.  (result is an unsigned size_t, so the "<="
* test below can only ever match zero.)
*/
if (result <= 0)
result = -1;
else
{
Assert(result <= tolen);
/* Microsoft counts the zero terminator in the result */
result--;
}
}
else
#endif /* WIN32 */
/* NB: on WIN32 this "if" is the else-branch of the UTF8 test above */
if (locale == (pg_locale_t) 0)
{
/* Use wcstombs directly for the default locale */
result = wcstombs(to, from, tolen);
}
else
{
#ifdef HAVE_LOCALE_T
#ifdef HAVE_WCSTOMBS_L
/* Use wcstombs_l for nondefault locales */
result = wcstombs_l(to, from, tolen, locale);
#else /* !HAVE_WCSTOMBS_L */
/* We have to temporarily set the locale as current ... ugh */
locale_t save_locale = uselocale(locale);
result = wcstombs(to, from, tolen);
/* Put back whatever locale was in effect before the conversion */
uselocale(save_locale);
#endif /* HAVE_WCSTOMBS_L */
#else /* !HAVE_LOCALE_T */
/* Can't have locale != 0 without HAVE_LOCALE_T */
elog(ERROR, "wcstombs_l is not available");
result = 0; /* keep compiler quiet */
#endif /* HAVE_LOCALE_T */
}
return result;
}
/*
* char2wchar --- convert multibyte characters to wide characters
*
* This has almost the API of mbstowcs_l(), except that *from need not be
* null-terminated; instead, the number of input bytes is specified as
* fromlen. Also, we ereport() rather than returning -1 for invalid
* input encoding. tolen is the maximum number of wchar_t's to store at *to.
* The output will be zero-terminated iff there is room.
*
* locale selects the conversion locale. (pg_locale_t) 0 means the default
* locale, handled with plain mbstowcs(). Any other value is handled with
* mbstowcs_l() where available, or else with mbstowcs() while the locale is
* temporarily installed via uselocale(). On WIN32 with a UTF8 database
* encoding the locale argument is not consulted at all.
*
* Returns 0 immediately if tolen is 0; otherwise returns the conversion
* routine's result. A result of -1 never reaches the caller, because an
* error is raised instead.
*/
size_t
char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
pg_locale_t locale)
{
size_t result;
/* Nothing can be stored, so don't call any conversion routine at all */
if (tolen == 0)
return 0;
#ifdef WIN32
/* See WIN32 "Unicode" comment above */
if (GetDatabaseEncoding() == PG_UTF8)
{
/* Win32 API does not work for zero-length input */
if (fromlen == 0)
result = 0;
else
{
/* Offer only tolen - 1 slots, reserving one for the null added below */
result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
/* A zero return is failure */
if (result == 0)
result = -1;
}
if (result != -1)
{
Assert(result < tolen);
/* Append trailing null wchar (MultiByteToWideChar() does not) */
to[result] = 0;
}
}
else
#endif /* WIN32 */
{
/* mbstowcs requires ending '\0' */
char *str = pnstrdup(from, fromlen);
if (locale == (pg_locale_t) 0)
{
/* Use mbstowcs directly for the default locale */
result = mbstowcs(to, str, tolen);
}
else
{
#ifdef HAVE_LOCALE_T
/*
* NOTE(review): mbstowcs_l is keyed off HAVE_WCSTOMBS_L; no separate
* symbol for mbstowcs_l is tested here. Presumably the two functions are
* always provided together --- confirm.
*/
#ifdef HAVE_WCSTOMBS_L
/* Use mbstowcs_l for nondefault locales */
result = mbstowcs_l(to, str, tolen, locale);
#else /* !HAVE_WCSTOMBS_L */
/* We have to temporarily set the locale as current ... ugh */
locale_t save_locale = uselocale(locale);
result = mbstowcs(to, str, tolen);
/* Put back whatever locale was in effect before the conversion */
uselocale(save_locale);
#endif /* HAVE_WCSTOMBS_L */
#else /* !HAVE_LOCALE_T */
/* Can't have locale != 0 without HAVE_LOCALE_T */
elog(ERROR, "mbstowcs_l is not available");
result = 0; /* keep compiler quiet */
#endif /* HAVE_LOCALE_T */
}
/* Done with the null-terminated working copy */
pfree(str);
}
/* result is a size_t, so this matches the all-ones failure value */
if (result == -1)
{
/*
* Invalid multibyte character encountered. We try to give a useful
* error message by letting pg_verifymbstr check the string. But it's
* possible that the string is OK to us, and not OK to mbstowcs ---
* this suggests that the LC_CTYPE locale is different from the
* database encoding. Give a generic error message if verifymbstr
* can't find anything wrong.
*/
pg_verifymbstr(from, fromlen, false); /* might not return */
/* but if it does ... */
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("invalid multibyte character for locale"),
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
}
return result;
}
#endif /* USE_WIDE_UPPER_LOWER */

View File

@ -13,7 +13,6 @@
#include "mb/pg_wchar.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/pg_locale.h"
#include "utils/syscache.h"
/*
@ -689,126 +688,6 @@ perform_default_encoding_conversion(const char *src, int len, bool is_client_to_
}
#ifdef USE_WIDE_UPPER_LOWER
/*
* wchar2char --- convert wide characters to multibyte format
*
* This has the same API as the standard wcstombs() function; in particular,
* tolen is the maximum number of bytes to store at *to, and *from must be
* zero-terminated. The output will be zero-terminated iff there is room.
*
* The collation argument is used only in the Assert(!lc_ctype_is_c(...))
* check below; the conversion itself always goes through plain wcstombs()
* (or the Win32 API on WIN32 with a UTF8 database encoding).
*
* Returns 0 immediately if tolen is 0. Otherwise the conversion routine's
* result is returned as-is; the WIN32/UTF8 path reports failure as
* (size_t) -1.
*/
size_t
wchar2char(char *to, const wchar_t *from, size_t tolen, Oid collation)
{
size_t result;
/* Nothing can be stored, so don't call any conversion routine at all */
if (tolen == 0)
return 0;
#ifdef WIN32
/*
* On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
* for some reason mbstowcs and wcstombs won't do this for us, so we use
* WideCharToMultiByte() here (and MultiByteToWideChar() in char2wchar).
*/
if (GetDatabaseEncoding() == PG_UTF8)
{
/* Source length -1: the input is processed up to its zero terminator */
result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
NULL, NULL);
/*
* A zero return is failure.  (result is an unsigned size_t, so the "<="
* test below can only ever match zero.)
*/
if (result <= 0)
result = -1;
else
{
Assert(result <= tolen);
/* Microsoft counts the zero terminator in the result */
result--;
}
}
else
#endif /* WIN32 */
{
/* The collation is checked here but does not affect the conversion */
Assert(!lc_ctype_is_c(collation));
result = wcstombs(to, from, tolen);
}
return result;
}
/*
* char2wchar --- convert multibyte characters to wide characters
*
* This has almost the API of mbstowcs(), except that *from need not be
* null-terminated; instead, the number of input bytes is specified as
* fromlen. Also, we ereport() rather than returning -1 for invalid
* input encoding. tolen is the maximum number of wchar_t's to store at *to.
* The output will be zero-terminated iff there is room.
*
* The collation argument is used only in the Assert(!lc_ctype_is_c(...))
* check below; the conversion itself always goes through plain mbstowcs()
* (or the Win32 API on WIN32 with a UTF8 database encoding).
*
* Returns 0 immediately if tolen is 0; otherwise returns the conversion
* routine's result. A result of -1 never reaches the caller, because an
* error is raised instead.
*/
size_t
char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, Oid collation)
{
size_t result;
/* Nothing can be stored, so don't call any conversion routine at all */
if (tolen == 0)
return 0;
#ifdef WIN32
/* See WIN32 "Unicode" comment above */
if (GetDatabaseEncoding() == PG_UTF8)
{
/* Win32 API does not work for zero-length input */
if (fromlen == 0)
result = 0;
else
{
/* Offer only tolen - 1 slots, reserving one for the null added below */
result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
/* A zero return is failure */
if (result == 0)
result = -1;
}
if (result != -1)
{
Assert(result < tolen);
/* Append trailing null wchar (MultiByteToWideChar() does not) */
to[result] = 0;
}
}
else
#endif /* WIN32 */
{
/* mbstowcs requires ending '\0' */
char *str = pnstrdup(from, fromlen);
/* The collation is checked here but does not affect the conversion */
Assert(!lc_ctype_is_c(collation));
result = mbstowcs(to, str, tolen);
/* Done with the null-terminated working copy */
pfree(str);
}
/* result is a size_t, so this matches the all-ones failure value */
if (result == -1)
{
/*
* Invalid multibyte character encountered. We try to give a useful
* error message by letting pg_verifymbstr check the string. But it's
* possible that the string is OK to us, and not OK to mbstowcs ---
* this suggests that the LC_CTYPE locale is different from the
* database encoding. Give a generic error message if verifymbstr
* can't find anything wrong.
*/
pg_verifymbstr(from, fromlen, false); /* might not return */
/* but if it does ... */
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("invalid multibyte character for locale"),
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
}
return result;
}
#endif
/* convert a multibyte string to a wchar */
int
pg_mb2wchar(const char *from, pg_wchar *to)

View File

@ -19,8 +19,6 @@
#ifndef PG_WCHAR_H
#define PG_WCHAR_H
#include <sys/types.h>
/*
* The pg_wchar type
*/
@ -392,11 +390,6 @@ extern int pg_mbcharcliplen(const char *mbstr, int len, int imit);
extern int pg_encoding_max_length(int encoding);
extern int pg_database_encoding_max_length(void);
#ifdef USE_WIDE_UPPER_LOWER
extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen, Oid collation);
extern size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, Oid collation);
#endif
extern int PrepareClientEncoding(int encoding);
extern int SetClientEncoding(int encoding);
extern void InitializeClientEncoding(void);

View File

@ -656,6 +656,9 @@
/* Define to 1 if you have the `wcstombs' function. */
#undef HAVE_WCSTOMBS
/* Define to 1 if you have the `wcstombs_l' function. */
#undef HAVE_WCSTOMBS_L
/* Define to 1 if you have the <wctype.h> header file. */
#undef HAVE_WCTYPE_H

View File

@ -538,6 +538,9 @@
/* Define to 1 if you have the `wcstombs' function. */
#define HAVE_WCSTOMBS 1
/* Define to 1 if you have the `wcstombs_l' function. */
#define HAVE_WCSTOMBS_L 1
/* Define to 1 if you have the <wctype.h> header file. */
#define HAVE_WCTYPE_H 1

View File

@ -304,6 +304,8 @@ typedef int pid_t;
#define iswspace_l _iswspace_l
#define strcoll_l _strcoll_l
#define wcscoll_l _wcscoll_l
#define wcstombs_l _wcstombs_l
#define mbstowcs_l _mbstowcs_l
/* In backend/port/win32/signal.c */

View File

@ -72,4 +72,12 @@ typedef int pg_locale_t;
extern pg_locale_t pg_newlocale_from_collation(Oid collid);
/* These functions convert from/to libc's wchar_t, *not* pg_wchar */
#ifdef USE_WIDE_UPPER_LOWER
extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
pg_locale_t locale);
extern size_t char2wchar(wchar_t *to, size_t tolen,
const char *from, size_t fromlen, pg_locale_t locale);
#endif
#endif /* _PG_LOCALE_ */