Fix char2wchar/wchar2char to support collations properly.

These functions should take a pg_locale_t, not a collation OID, and should call mbstowcs_l/wcstombs_l where available. Where those functions are not available, temporarily select the correct locale with uselocale(). This change removes the bogus assumption that all locales selectable in a given database have the same wide-character conversion method; in particular, the collate.linux.utf8 regression test now passes with LC_CTYPE=C, so long as the database encoding is UTF8. I decided to move the char2wchar/wchar2char functions out of mbutils.c and into pg_locale.c, because they work on wchar_t not pg_wchar_t and thus don't really belong with the mbutils.c functions. Keeping them where they were would have required importing pg_locale_t into pg_wchar.h somehow, which did not seem like a good plan.
2011-04-23 12:35:41 -04:00 · 2011-04-23 12:35:41 -04:00 · 2ab0796d7a
parent bb85030630
commit 2ab0796d7a
12 changed files with 217 additions and 144 deletions
--- a/3
+++ b/3
@ -18985,7 +18985,8 @@ fi



-for ac_func in cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs
+
+for ac_func in cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l
 do
 as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 { $as_echo "$as_me:$LINENO: checking for $ac_func" >&5
--- a/configure.in
+++ b/configure.in
@ -1187,7 +1187,7 @@ PGAC_VAR_INT_TIMEZONE
 AC_FUNC_ACCEPT_ARGTYPES
 PGAC_FUNC_GETTIMEOFDAY_1ARG

-AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs])
+AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getifaddrs getpeereid getpeerucred getrlimit memmove poll pstat readlink scandir setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l])

 AC_REPLACE_FUNCS(fseeko)
 case $host_os in
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@ -29,11 +29,12 @@ t_isdigit(const char *ptr)
 	int			clen = pg_mblen(ptr);
 	wchar_t		character[2];
 	Oid			collation = DEFAULT_COLLATION_OID;		/* TODO */
+	pg_locale_t	mylocale = 0;	/* TODO */

 	if (clen == 1 || lc_ctype_is_c(collation))
 		return isdigit(TOUCHAR(ptr));

-	char2wchar(character, 2, ptr, clen, collation);
+	char2wchar(character, 2, ptr, clen, mylocale);

 	return iswdigit((wint_t) character[0]);
 }
@ -44,11 +45,12 @@ t_isspace(const char *ptr)
 	int			clen = pg_mblen(ptr);
 	wchar_t		character[2];
 	Oid			collation = DEFAULT_COLLATION_OID;		/* TODO */
+	pg_locale_t	mylocale = 0;	/* TODO */

 	if (clen == 1 || lc_ctype_is_c(collation))
 		return isspace(TOUCHAR(ptr));

-	char2wchar(character, 2, ptr, clen, collation);
+	char2wchar(character, 2, ptr, clen, mylocale);

 	return iswspace((wint_t) character[0]);
 }
@ -59,11 +61,12 @@ t_isalpha(const char *ptr)
 	int			clen = pg_mblen(ptr);
 	wchar_t		character[2];
 	Oid			collation = DEFAULT_COLLATION_OID;		/* TODO */
+	pg_locale_t	mylocale = 0;	/* TODO */

 	if (clen == 1 || lc_ctype_is_c(collation))
 		return isalpha(TOUCHAR(ptr));

-	char2wchar(character, 2, ptr, clen, collation);
+	char2wchar(character, 2, ptr, clen, mylocale);

 	return iswalpha((wint_t) character[0]);
 }
@ -74,11 +77,12 @@ t_isprint(const char *ptr)
 	int			clen = pg_mblen(ptr);
 	wchar_t		character[2];
 	Oid			collation = DEFAULT_COLLATION_OID;		/* TODO */
+	pg_locale_t	mylocale = 0;	/* TODO */

 	if (clen == 1 || lc_ctype_is_c(collation))
 		return isprint(TOUCHAR(ptr));

-	char2wchar(character, 2, ptr, clen, collation);
+	char2wchar(character, 2, ptr, clen, mylocale);

 	return iswprint((wint_t) character[0]);
 }
@ -246,6 +250,7 @@ lowerstr_with_len(const char *str, int len)

 #ifdef USE_WIDE_UPPER_LOWER
 	Oid			collation = DEFAULT_COLLATION_OID;		/* TODO */
+	pg_locale_t	mylocale = 0;	/* TODO */
 #endif

 	if (len == 0)
@ -272,7 +277,7 @@ lowerstr_with_len(const char *str, int len)
 		 */
 		wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));

-		wlen = char2wchar(wstr, len + 1, str, len, collation);
+		wlen = char2wchar(wstr, len + 1, str, len, mylocale);
 		Assert(wlen <= len);

 		while (*wptr)
@ -287,7 +292,7 @@ lowerstr_with_len(const char *str, int len)
 		len = pg_database_encoding_max_length() * wlen + 1;
 		out = (char *) palloc(len);

-		wlen = wchar2char(out, wstr, len, collation);
+		wlen = wchar2char(out, wstr, len, mylocale);

 		pfree(wstr);

--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@ -300,13 +300,14 @@ TParserInit(char *str, int len)
 	if (prs->charmaxlen > 1)
 	{
 		Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
+		pg_locale_t	mylocale = 0;	/* TODO */

 		prs->usewide = true;
 		if (lc_ctype_is_c(collation))
 		{
 			/*
 			 * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
-			 * be not equal to sizeof(wchar_t)
+			 * be different from sizeof(wchar_t)
 			 */
 			prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
 			pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
@ -314,7 +315,8 @@ TParserInit(char *str, int len)
 		else
 		{
 			prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
-			char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr, collation);
+			char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
+					   mylocale);
 		}
 	}
 	else
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@ -1454,6 +1454,10 @@ str_numth(char *dest, char *num, int type)
 	return dest;
 }

+/*****************************************************************************
+ *			upper/lower/initcap functions
+ *****************************************************************************/
+
 /*
 * If the system provides the needed functions for wide-character manipulation
 * (which are all standardized by C99), then we implement upper/lower/initcap
@ -1527,7 +1531,7 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
 		/* Output workspace cannot have more codes than input bytes */
 		workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));

-		char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
+		char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);

 		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
 		{
@ -1543,7 +1547,7 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
 		result_size = curr_char * pg_database_encoding_max_length() + 1;
 		result = palloc(result_size);

-		wchar2char(result, workspace, result_size, collid);
+		wchar2char(result, workspace, result_size, mylocale);
 		pfree(workspace);
 	}
 #endif   /* USE_WIDE_UPPER_LOWER */
@ -1648,7 +1652,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 		/* Output workspace cannot have more codes than input bytes */
 		workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));

-		char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
+		char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);

 		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
 		{
@ -1664,7 +1668,7 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
 		result_size = curr_char * pg_database_encoding_max_length() + 1;
 		result = palloc(result_size);

-		wchar2char(result, workspace, result_size, collid);
+		wchar2char(result, workspace, result_size, mylocale);
 		pfree(workspace);
 	}
 #endif   /* USE_WIDE_UPPER_LOWER */
@ -1781,7 +1785,7 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 		/* Output workspace cannot have more codes than input bytes */
 		workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));

-		char2wchar(workspace, nbytes + 1, buff, nbytes, collid);
+		char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);

 		for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
 		{
@ -1809,7 +1813,7 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 		result_size = curr_char * pg_database_encoding_max_length() + 1;
 		result = palloc(result_size);

-		wchar2char(result, workspace, result_size, collid);
+		wchar2char(result, workspace, result_size, mylocale);
 		pfree(workspace);
 	}
 #endif   /* USE_WIDE_UPPER_LOWER */
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@ -1030,3 +1030,176 @@ pg_newlocale_from_collation(Oid collid)

 	return cache_entry->locale;
 }
+
+
+/*
+ * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
+ * Therefore we keep them here rather than with the mbutils code.
+ */
+
+#ifdef USE_WIDE_UPPER_LOWER
+
+/*
+ * wchar2char --- convert wide characters to multibyte format
+ *
+ * This has the same API as the standard wcstombs_l() function; in particular,
+ * tolen is the maximum number of bytes to store at *to, and *from must be
+ * zero-terminated.  The output will be zero-terminated iff there is room.
+ */
+size_t
+wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
+{
+	size_t		result;
+
+	if (tolen == 0)
+		return 0;
+
+#ifdef WIN32
+
+	/*
+	 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
+	 * for some reason mbstowcs and wcstombs won't do this for us, so we use
+	 * MultiByteToWideChar().
+	 */
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
+									 NULL, NULL);
+		/* A zero return is failure */
+		if (result <= 0)
+			result = -1;
+		else
+		{
+			Assert(result <= tolen);
+			/* Microsoft counts the zero terminator in the result */
+			result--;
+		}
+	}
+	else
+#endif   /* WIN32 */
+	if (locale == (pg_locale_t) 0)
+	{
+		/* Use wcstombs directly for the default locale */
+		result = wcstombs(to, from, tolen);
+	}
+	else
+	{
+#ifdef HAVE_LOCALE_T
+#ifdef HAVE_WCSTOMBS_L
+		/* Use wcstombs_l for nondefault locales */
+		result = wcstombs_l(to, from, tolen, locale);
+#else /* !HAVE_WCSTOMBS_L */
+		/* We have to temporarily set the locale as current ... ugh */
+		locale_t	save_locale = uselocale(locale);
+
+		result = wcstombs(to, from, tolen);
+
+		uselocale(save_locale);
+#endif /* HAVE_WCSTOMBS_L */
+#else /* !HAVE_LOCALE_T */
+		/* Can't have locale != 0 without HAVE_LOCALE_T */
+		elog(ERROR, "wcstombs_l is not available");
+		result = 0;				/* keep compiler quiet */
+#endif /* HAVE_LOCALE_T */
+	}
+
+	return result;
+}
+
+/*
+ * char2wchar --- convert multibyte characters to wide characters
+ *
+ * This has almost the API of mbstowcs_l(), except that *from need not be
+ * null-terminated; instead, the number of input bytes is specified as
+ * fromlen.  Also, we ereport() rather than returning -1 for invalid
+ * input encoding.	tolen is the maximum number of wchar_t's to store at *to.
+ * The output will be zero-terminated iff there is room.
+ */
+size_t
+char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
+		   pg_locale_t locale)
+{
+	size_t		result;
+
+	if (tolen == 0)
+		return 0;
+
+#ifdef WIN32
+	/* See WIN32 "Unicode" comment above */
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		/* Win32 API does not work for zero-length input */
+		if (fromlen == 0)
+			result = 0;
+		else
+		{
+			result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
+			/* A zero return is failure */
+			if (result == 0)
+				result = -1;
+		}
+
+		if (result != -1)
+		{
+			Assert(result < tolen);
+			/* Append trailing null wchar (MultiByteToWideChar() does not) */
+			to[result] = 0;
+		}
+	}
+	else
+#endif   /* WIN32 */
+	{
+		/* mbstowcs requires ending '\0' */
+		char	   *str = pnstrdup(from, fromlen);
+
+		if (locale == (pg_locale_t) 0)
+		{
+			/* Use mbstowcs directly for the default locale */
+			result = mbstowcs(to, str, tolen);
+		}
+		else
+		{
+#ifdef HAVE_LOCALE_T
+#ifdef HAVE_WCSTOMBS_L
+			/* Use mbstowcs_l for nondefault locales */
+			result = mbstowcs_l(to, str, tolen, locale);
+#else /* !HAVE_WCSTOMBS_L */
+			/* We have to temporarily set the locale as current ... ugh */
+			locale_t	save_locale = uselocale(locale);
+
+			result = mbstowcs(to, str, tolen);
+
+			uselocale(save_locale);
+#endif /* HAVE_WCSTOMBS_L */
+#else /* !HAVE_LOCALE_T */
+			/* Can't have locale != 0 without HAVE_LOCALE_T */
+			elog(ERROR, "mbstowcs_l is not available");
+			result = 0;				/* keep compiler quiet */
+#endif /* HAVE_LOCALE_T */
+		}
+
+		pfree(str);
+	}
+
+	if (result == -1)
+	{
+		/*
+		 * Invalid multibyte character encountered.  We try to give a useful
+		 * error message by letting pg_verifymbstr check the string.  But it's
+		 * possible that the string is OK to us, and not OK to mbstowcs ---
+		 * this suggests that the LC_CTYPE locale is different from the
+		 * database encoding.  Give a generic error message if verifymbstr
+		 * can't find anything wrong.
+		 */
+		pg_verifymbstr(from, fromlen, false);	/* might not return */
+		/* but if it does ... */
+		ereport(ERROR,
+				(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+				 errmsg("invalid multibyte character for locale"),
+				 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
+	}
+
+	return result;
+}
+
+#endif /* USE_WIDE_UPPER_LOWER */
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@ -13,7 +13,6 @@
 #include "mb/pg_wchar.h"
 #include "utils/builtins.h"
 #include "utils/memutils.h"
-#include "utils/pg_locale.h"
 #include "utils/syscache.h"

 /*
@ -689,126 +688,6 @@ perform_default_encoding_conversion(const char *src, int len, bool is_client_to_
 }


-
-#ifdef USE_WIDE_UPPER_LOWER
-
-/*
- * wchar2char --- convert wide characters to multibyte format
- *
- * This has the same API as the standard wcstombs() function; in particular,
- * tolen is the maximum number of bytes to store at *to, and *from must be
- * zero-terminated.  The output will be zero-terminated iff there is room.
- */
-size_t
-wchar2char(char *to, const wchar_t *from, size_t tolen, Oid collation)
-{
-	size_t		result;
-
-	if (tolen == 0)
-		return 0;
-
-#ifdef WIN32
-
-	/*
-	 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
-	 * for some reason mbstowcs and wcstombs won't do this for us, so we use
-	 * MultiByteToWideChar().
-	 */
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
-									 NULL, NULL);
-		/* A zero return is failure */
-		if (result <= 0)
-			result = -1;
-		else
-		{
-			Assert(result <= tolen);
-			/* Microsoft counts the zero terminator in the result */
-			result--;
-		}
-	}
-	else
-#endif   /* WIN32 */
-	{
-		Assert(!lc_ctype_is_c(collation));
-		result = wcstombs(to, from, tolen);
-	}
-	return result;
-}
-
-/*
- * char2wchar --- convert multibyte characters to wide characters
- *
- * This has almost the API of mbstowcs(), except that *from need not be
- * null-terminated; instead, the number of input bytes is specified as
- * fromlen.  Also, we ereport() rather than returning -1 for invalid
- * input encoding.	tolen is the maximum number of wchar_t's to store at *to.
- * The output will be zero-terminated iff there is room.
- */
-size_t
-char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, Oid collation)
-{
-	size_t		result;
-
-	if (tolen == 0)
-		return 0;
-
-#ifdef WIN32
-	/* See WIN32 "Unicode" comment above */
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		/* Win32 API does not work for zero-length input */
-		if (fromlen == 0)
-			result = 0;
-		else
-		{
-			result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
-			/* A zero return is failure */
-			if (result == 0)
-				result = -1;
-		}
-
-		if (result != -1)
-		{
-			Assert(result < tolen);
-			/* Append trailing null wchar (MultiByteToWideChar() does not) */
-			to[result] = 0;
-		}
-	}
-	else
-#endif   /* WIN32 */
-	{
-		/* mbstowcs requires ending '\0' */
-		char	   *str = pnstrdup(from, fromlen);
-
-		Assert(!lc_ctype_is_c(collation));
-		result = mbstowcs(to, str, tolen);
-		pfree(str);
-	}
-
-	if (result == -1)
-	{
-		/*
-		 * Invalid multibyte character encountered.  We try to give a useful
-		 * error message by letting pg_verifymbstr check the string.  But it's
-		 * possible that the string is OK to us, and not OK to mbstowcs ---
-		 * this suggests that the LC_CTYPE locale is different from the
-		 * database encoding.  Give a generic error message if verifymbstr
-		 * can't find anything wrong.
-		 */
-		pg_verifymbstr(from, fromlen, false);	/* might not return */
-		/* but if it does ... */
-		ereport(ERROR,
-				(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-				 errmsg("invalid multibyte character for locale"),
-				 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
-	}
-
-	return result;
-}
-#endif
-
 /* convert a multibyte string to a wchar */
 int
 pg_mb2wchar(const char *from, pg_wchar *to)
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@ -19,8 +19,6 @@
 #ifndef PG_WCHAR_H
 #define PG_WCHAR_H

-#include <sys/types.h>
-
 /*
 * The pg_wchar type
 */
@ -392,11 +390,6 @@ extern int	pg_mbcharcliplen(const char *mbstr, int len, int imit);
 extern int	pg_encoding_max_length(int encoding);
 extern int	pg_database_encoding_max_length(void);

-#ifdef USE_WIDE_UPPER_LOWER
-extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen, Oid collation);
-extern size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, Oid collation);
-#endif
-
 extern int	PrepareClientEncoding(int encoding);
 extern int	SetClientEncoding(int encoding);
 extern void InitializeClientEncoding(void);
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@ -656,6 +656,9 @@
 /* Define to 1 if you have the `wcstombs' function. */
 #undef HAVE_WCSTOMBS

+/* Define to 1 if you have the `wcstombs_l' function. */
+#undef HAVE_WCSTOMBS_L
+
 /* Define to 1 if you have the <wctype.h> header file. */
 #undef HAVE_WCTYPE_H

--- a/src/include/pg_config.h.win32
+++ b/src/include/pg_config.h.win32
@ -538,6 +538,9 @@
 /* Define to 1 if you have the `wcstombs' function. */
 #define HAVE_WCSTOMBS 1

+/* Define to 1 if you have the `wcstombs_l' function. */
+#define HAVE_WCSTOMBS_L 1
+
 /* Define to 1 if you have the <wctype.h> header file. */
 #define HAVE_WCTYPE_H 1

--- a/src/include/port/win32.h
+++ b/src/include/port/win32.h
@ -304,6 +304,8 @@ typedef int pid_t;
 #define iswspace_l _iswspace_l
 #define strcoll_l _strcoll_l
 #define wcscoll_l _wcscoll_l
+#define wcstombs_l _wcstombs_l
+#define mbstowcs_l _mbstowcs_l


 /* In backend/port/win32/signal.c */
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@ -72,4 +72,12 @@ typedef int pg_locale_t;

 extern pg_locale_t pg_newlocale_from_collation(Oid collid);

+/* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
+#ifdef USE_WIDE_UPPER_LOWER
+extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
+		   pg_locale_t locale);
+extern size_t char2wchar(wchar_t *to, size_t tolen,
+		   const char *from, size_t fromlen, pg_locale_t locale);
+#endif
+
 #endif   /* _PG_LOCALE_ */