postgresql/src/port/chklocale.c

/*-------------------------------------------------------------------------
 *
 * chklocale.c
 *		Functions for handling locale-related info
 *
 *
 * Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/port/chklocale.c,v 1.3 2007/09/29 00:01:43 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#include <locale.h>
#ifdef HAVE_LANGINFO_H
#include <langinfo.h>
#endif

#include "mb/pg_wchar.h"


#if defined(HAVE_LANGINFO_H) && defined(CODESET)

/*
 * This table needs to recognize all the CODESET spellings for supported
 * backend encodings, as well as frontend-only encodings where possible
 * (the latter case is currently only needed for initdb to recognize
 * error situations).
 *
 * Note that we search the table with pg_strcasecmp(), so variant
 * capitalizations don't need their own entries.
 */
struct encoding_match
{
	/* Postgres encoding ID that the name below maps to */
	enum pg_enc pg_enc_code;
	/* CODESET spelling, matched case-insensitively; NULL marks the table end */
	const char *system_enc_name;
};

/*
 * Lookup table mapping CODESET names to Postgres encoding IDs.  It is
 * scanned from the top with pg_strcasecmp() and the first match wins; the
 * scan stops at the entry whose system_enc_name is NULL.
 */
static const struct encoding_match encoding_match_list[] = {
	/* EUC encodings, one group per language, with vendor-specific spellings */
	{PG_EUC_JP, "EUC-JP"},
	{PG_EUC_JP, "eucJP"},
	{PG_EUC_JP, "IBM-eucJP"},
	{PG_EUC_JP, "sdeckanji"},

	{PG_EUC_CN, "EUC-CN"},
	{PG_EUC_CN, "eucCN"},
	{PG_EUC_CN, "IBM-eucCN"},
	{PG_EUC_CN, "GB2312"},
	{PG_EUC_CN, "dechanzi"},

	{PG_EUC_KR, "EUC-KR"},
	{PG_EUC_KR, "eucKR"},
	{PG_EUC_KR, "IBM-eucKR"},
	{PG_EUC_KR, "deckorean"},
	{PG_EUC_KR, "5601"},

	{PG_EUC_TW, "EUC-TW"},
	{PG_EUC_TW, "eucTW"},
	{PG_EUC_TW, "IBM-eucTW"},
	{PG_EUC_TW, "cns11643"},

	/* Unicode */
	{PG_UTF8, "UTF-8"},
	{PG_UTF8, "utf8"},

	/*
	 * ISO 8859 parts that Postgres names LATINn.  Note that the LATIN number
	 * is not the ISO part number from LATIN5 onward (LATIN5 is 8859-9,
	 * LATIN6 is 8859-10, and so on).  Each part is listed in three
	 * punctuation variants.
	 */
	{PG_LATIN1, "ISO-8859-1"},
	{PG_LATIN1, "ISO8859-1"},
	{PG_LATIN1, "iso88591"},

	{PG_LATIN2, "ISO-8859-2"},
	{PG_LATIN2, "ISO8859-2"},
	{PG_LATIN2, "iso88592"},

	{PG_LATIN3, "ISO-8859-3"},
	{PG_LATIN3, "ISO8859-3"},
	{PG_LATIN3, "iso88593"},

	{PG_LATIN4, "ISO-8859-4"},
	{PG_LATIN4, "ISO8859-4"},
	{PG_LATIN4, "iso88594"},

	{PG_LATIN5, "ISO-8859-9"},
	{PG_LATIN5, "ISO8859-9"},
	{PG_LATIN5, "iso88599"},

	{PG_LATIN6, "ISO-8859-10"},
	{PG_LATIN6, "ISO8859-10"},
	{PG_LATIN6, "iso885910"},

	{PG_LATIN7, "ISO-8859-13"},
	{PG_LATIN7, "ISO8859-13"},
	{PG_LATIN7, "iso885913"},

	{PG_LATIN8, "ISO-8859-14"},
	{PG_LATIN8, "ISO8859-14"},
	{PG_LATIN8, "iso885914"},

	{PG_LATIN9, "ISO-8859-15"},
	{PG_LATIN9, "ISO8859-15"},
	{PG_LATIN9, "iso885915"},

	{PG_LATIN10, "ISO-8859-16"},
	{PG_LATIN10, "ISO8859-16"},
	{PG_LATIN10, "iso885916"},

	{PG_KOI8R, "KOI8-R"},

	/* Windows/DOS code pages, spelled CPnnnn */
	{PG_WIN1252, "CP1252"},
	{PG_WIN1253, "CP1253"},
	{PG_WIN1254, "CP1254"},
	{PG_WIN1255, "CP1255"},
	{PG_WIN1256, "CP1256"},
	{PG_WIN1257, "CP1257"},
	{PG_WIN1258, "CP1258"},
	/*
	 * Compiled only if NOT_VERIFIED is defined.  NOTE(review): "???" looks
	 * like a placeholder for a codeset name that was not yet known --
	 * confirm before enabling.
	 */
#ifdef NOT_VERIFIED
	{PG_WIN874, "???"},
#endif
	{PG_WIN1251, "CP1251"},
	{PG_WIN866, "CP866"},

	/* ISO 8859 parts that Postgres names by part number rather than LATINn */
	{PG_ISO_8859_5, "ISO-8859-5"},
	{PG_ISO_8859_5, "ISO8859-5"},
	{PG_ISO_8859_5, "iso88595"},

	{PG_ISO_8859_6, "ISO-8859-6"},
	{PG_ISO_8859_6, "ISO8859-6"},
	{PG_ISO_8859_6, "iso88596"},

	{PG_ISO_8859_7, "ISO-8859-7"},
	{PG_ISO_8859_7, "ISO8859-7"},
	{PG_ISO_8859_7, "iso88597"},

	{PG_ISO_8859_8, "ISO-8859-8"},
	{PG_ISO_8859_8, "ISO8859-8"},
	{PG_ISO_8859_8, "iso88598"},

	/*
	 * NOTE(review): the entries from here to the end marker appear to be the
	 * frontend-only encodings that the comment above struct encoding_match
	 * says are listed so initdb can recognize error situations -- confirm
	 * against pg_wchar.h.
	 */
	{PG_SJIS, "SJIS"},
	{PG_SJIS, "PCK"},

	{PG_BIG5, "BIG5"},
	{PG_BIG5, "BIG5HKSCS"},

	{PG_GBK, "GBK"},

	{PG_UHC, "UHC"},

	{PG_JOHAB, "JOHAB"},

	{PG_GB18030, "GB18030"},

	{PG_SHIFT_JIS_2004, "SJIS_2004"},

	/* The NULL name terminates the scan; the encoding code here is not used */
	{PG_SQL_ASCII, NULL}		/* end marker */
};


/*
 * Given a setting for LC_CTYPE, return the Postgres ID of the associated
 * encoding, if we can determine it.
 *
 * ctype is the locale name to examine.  Pass in NULL to get the encoding
 * for the current locale setting.
 *
 * If the result is PG_SQL_ASCII, callers should treat it as being compatible
 * with any desired encoding.  We return this if the locale is C/POSIX, if
 * we can't determine the encoding, or if something fails along the way
 * (setlocale() failure, bogus ctype, out of memory).
 *
 * When a ctype is passed in, the process's LC_CTYPE is temporarily switched
 * to it so that nl_langinfo(CODESET) reports its codeset, and is switched
 * back before returning.  A request for C or POSIX is answered from the
 * name alone, without touching the process locale.
 *
 * If a codeset string is obtained but not recognized, a warning is emitted
 * (on stderr in FRONTEND builds, via ereport(WARNING) otherwise), since
 * that means the table needs another entry.
 */
int
pg_get_encoding_from_locale(const char *ctype)
{
	char	   *sys;
	int			i;

	/* Get the CODESET property, and also LC_CTYPE if not passed in */
	if (ctype)
	{
		char	   *save;

		/*
		 * If locale is C or POSIX, we can allow all encodings.  Decide this
		 * up front so we don't disturb the process-wide locale for nothing.
		 */
		if (pg_strcasecmp(ctype, "C") == 0 ||
			pg_strcasecmp(ctype, "POSIX") == 0)
			return PG_SQL_ASCII;

		save = setlocale(LC_CTYPE, NULL);
		if (!save)
			return PG_SQL_ASCII;		/* setlocale() broken? */
		/* must copy result, or it might change after setlocale */
		save = strdup(save);
		if (!save)
			return PG_SQL_ASCII;		/* out of memory; unlikely */

		if (!setlocale(LC_CTYPE, ctype))
		{
			free(save);
			return PG_SQL_ASCII;		/* bogus ctype passed in? */
		}

		/* copy the codeset name before restoring the previous locale */
		sys = nl_langinfo(CODESET);
		if (sys)
			sys = strdup(sys);

		setlocale(LC_CTYPE, save);
		free(save);
	}
	else
	{
		/* much easier... */
		ctype = setlocale(LC_CTYPE, NULL);
		if (!ctype)
			return PG_SQL_ASCII;		/* setlocale() broken? */

		/* If locale is C or POSIX, we can allow all encodings */
		if (pg_strcasecmp(ctype, "C") == 0 ||
			pg_strcasecmp(ctype, "POSIX") == 0)
			return PG_SQL_ASCII;

		sys = nl_langinfo(CODESET);
		if (sys)
			sys = strdup(sys);
	}

	if (!sys)
		return PG_SQL_ASCII;		/* out of memory; unlikely */

	/* Check the table; the entry with a NULL name ends the scan */
	for (i = 0; encoding_match_list[i].system_enc_name; i++)
	{
		if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
		{
			free(sys);
			return encoding_match_list[i].pg_enc_code;
		}
	}

	/* Special-case kluges for particular platforms go here */

#ifdef __darwin__
	/*
	 * Current OS X has many locales that report an empty string for CODESET,
	 * but they all seem to actually use UTF-8.
	 */
	if (sys[0] == '\0')
	{
		free(sys);
		return PG_UTF8;
	}
#endif

	/*
	 * We print a warning if we got a CODESET string but couldn't recognize
	 * it.  This means we need another entry in the table.
	 */
#ifdef FRONTEND
	fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""),
			ctype, sys);
	/* keep newline separate so there's only one translatable string */
	fputc('\n', stderr);
#else
	ereport(WARNING,
			(errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",
					ctype, sys),
			 errdetail("Please report this to <pgsql-bugs@postgresql.org>.")));
#endif

	free(sys);
	return PG_SQL_ASCII;
}

#else /* !(HAVE_LANGINFO_H && CODESET) */

/*
 * Fallback for platforms where <langinfo.h> or CODESET is unavailable.
 *
 * There is no way to ask the system which codeset a locale uses, so the
 * argument is ignored and PG_SQL_ASCII is reported for every locale.
 */
int
pg_get_encoding_from_locale(const char *ctype)
{
	(void) ctype;				/* nothing we can learn from it here */

	return PG_SQL_ASCII;
}

#endif /* HAVE_LANGINFO_H && CODESET */
Change initdb and CREATE DATABASE to actively reject attempts to create databases with encodings that are incompatible with the server's LC_CTYPE locale, when we can determine that (which we can on most modern platforms, I believe). C/POSIX locale is compatible with all encodings, of course, so there is still some usefulness to CREATE DATABASE's ENCODING option, but this will insulate us against all sorts of recurring complaints caused by mismatched settings. I moved initdb's existing LC_CTYPE-to-encoding mapping knowledge into a new src/port/ file so it could be shared by CREATE DATABASE. 2007-09-29 00:25:49 +02:00			`/*-------------------------------------------------------------------------`
			`*`
			`* chklocale.c`
			`* Functions for handling locale-related info`
			`*`
			`*`
			`* Copyright (c) 1996-2007, PostgreSQL Global Development Group`
			`*`
			`*`
			`* IDENTIFICATION`
Teach chklocale.c about a few names for frontend-only encodings, since this will allow initdb to reject attempts to initdb in a locale that uses such an encoding. We'll probably find out more such names during beta ... 2007-09-29 02:01:43 +02:00			`* $PostgreSQL: pgsql/src/port/chklocale.c,v 1.3 2007/09/29 00:01:43 tgl Exp $`
Change initdb and CREATE DATABASE to actively reject attempts to create databases with encodings that are incompatible with the server's LC_CTYPE locale, when we can determine that (which we can on most modern platforms, I believe). C/POSIX locale is compatible with all encodings, of course, so there is still some usefulness to CREATE DATABASE's ENCODING option, but this will insulate us against all sorts of recurring complaints caused by mismatched settings. I moved initdb's existing LC_CTYPE-to-encoding mapping knowledge into a new src/port/ file so it could be shared by CREATE DATABASE. 2007-09-29 00:25:49 +02:00			`*`
			`*-------------------------------------------------------------------------`
			`*/`

			`#ifndef FRONTEND`
			`#include "postgres.h"`
			`#else`
			`#include "postgres_fe.h"`
			`#endif`

			`#include <locale.h>`
			`#ifdef HAVE_LANGINFO_H`
			`#include <langinfo.h>`
			`#endif`

			`#include "mb/pg_wchar.h"`


			`#if defined(HAVE_LANGINFO_H) && defined(CODESET)`

			`/*`
			`* This table needs to recognize all the CODESET spellings for supported`
Teach chklocale.c about a few names for frontend-only encodings, since this will allow initdb to reject attempts to initdb in a locale that uses such an encoding. We'll probably find out more such names during beta ... 2007-09-29 02:01:43 +02:00			`* backend encodings, as well as frontend-only encodings where possible`
			`* (the latter case is currently only needed for initdb to recognize`
			`* error situations).`
			`*`
Change initdb and CREATE DATABASE to actively reject attempts to create databases with encodings that are incompatible with the server's LC_CTYPE locale, when we can determine that (which we can on most modern platforms, I believe). C/POSIX locale is compatible with all encodings, of course, so there is still some usefulness to CREATE DATABASE's ENCODING option, but this will insulate us against all sorts of recurring complaints caused by mismatched settings. I moved initdb's existing LC_CTYPE-to-encoding mapping knowledge into a new src/port/ file so it could be shared by CREATE DATABASE. 2007-09-29 00:25:49 +02:00			`* Note that we search the table with pg_strcasecmp(), so variant`
			`* capitalizations don't need their own entries.`
			`*/`
			`struct encoding_match`
			`{`
			`enum pg_enc pg_enc_code;`
			`const char *system_enc_name;`
			`};`

			`static const struct encoding_match encoding_match_list[] = {`
			`{PG_EUC_JP, "EUC-JP"},`
			`{PG_EUC_JP, "eucJP"},`
			`{PG_EUC_JP, "IBM-eucJP"},`
			`{PG_EUC_JP, "sdeckanji"},`

			`{PG_EUC_CN, "EUC-CN"},`
			`{PG_EUC_CN, "eucCN"},`
			`{PG_EUC_CN, "IBM-eucCN"},`
			`{PG_EUC_CN, "GB2312"},`
			`{PG_EUC_CN, "dechanzi"},`

			`{PG_EUC_KR, "EUC-KR"},`
			`{PG_EUC_KR, "eucKR"},`
			`{PG_EUC_KR, "IBM-eucKR"},`
			`{PG_EUC_KR, "deckorean"},`
			`{PG_EUC_KR, "5601"},`

			`{PG_EUC_TW, "EUC-TW"},`
			`{PG_EUC_TW, "eucTW"},`
			`{PG_EUC_TW, "IBM-eucTW"},`
			`{PG_EUC_TW, "cns11643"},`

			`{PG_UTF8, "UTF-8"},`
			`{PG_UTF8, "utf8"},`

			`{PG_LATIN1, "ISO-8859-1"},`
			`{PG_LATIN1, "ISO8859-1"},`
			`{PG_LATIN1, "iso88591"},`

			`{PG_LATIN2, "ISO-8859-2"},`
			`{PG_LATIN2, "ISO8859-2"},`
			`{PG_LATIN2, "iso88592"},`

			`{PG_LATIN3, "ISO-8859-3"},`
			`{PG_LATIN3, "ISO8859-3"},`
			`{PG_LATIN3, "iso88593"},`

			`{PG_LATIN4, "ISO-8859-4"},`
			`{PG_LATIN4, "ISO8859-4"},`
			`{PG_LATIN4, "iso88594"},`

			`{PG_LATIN5, "ISO-8859-9"},`
			`{PG_LATIN5, "ISO8859-9"},`
			`{PG_LATIN5, "iso88599"},`

			`{PG_LATIN6, "ISO-8859-10"},`
			`{PG_LATIN6, "ISO8859-10"},`
			`{PG_LATIN6, "iso885910"},`

			`{PG_LATIN7, "ISO-8859-13"},`
			`{PG_LATIN7, "ISO8859-13"},`
			`{PG_LATIN7, "iso885913"},`

			`{PG_LATIN8, "ISO-8859-14"},`
			`{PG_LATIN8, "ISO8859-14"},`
			`{PG_LATIN8, "iso885914"},`

			`{PG_LATIN9, "ISO-8859-15"},`
			`{PG_LATIN9, "ISO8859-15"},`
			`{PG_LATIN9, "iso885915"},`

			`{PG_LATIN10, "ISO-8859-16"},`
			`{PG_LATIN10, "ISO8859-16"},`
			`{PG_LATIN10, "iso885916"},`

			`{PG_KOI8R, "KOI8-R"},`

			`{PG_WIN1252, "CP1252"},`
			`{PG_WIN1253, "CP1253"},`
			`{PG_WIN1254, "CP1254"},`
			`{PG_WIN1255, "CP1255"},`
			`{PG_WIN1256, "CP1256"},`
			`{PG_WIN1257, "CP1257"},`
			`{PG_WIN1258, "CP1258"},`
			`#ifdef NOT_VERIFIED`
			`{PG_WIN874, "???"},`
			`#endif`
			`{PG_WIN1251, "CP1251"},`
			`{PG_WIN866, "CP866"},`

			`{PG_ISO_8859_5, "ISO-8859-5"},`
			`{PG_ISO_8859_5, "ISO8859-5"},`
			`{PG_ISO_8859_5, "iso88595"},`

			`{PG_ISO_8859_6, "ISO-8859-6"},`
			`{PG_ISO_8859_6, "ISO8859-6"},`
			`{PG_ISO_8859_6, "iso88596"},`

			`{PG_ISO_8859_7, "ISO-8859-7"},`
			`{PG_ISO_8859_7, "ISO8859-7"},`
			`{PG_ISO_8859_7, "iso88597"},`

			`{PG_ISO_8859_8, "ISO-8859-8"},`
			`{PG_ISO_8859_8, "ISO8859-8"},`
			`{PG_ISO_8859_8, "iso88598"},`

Teach chklocale.c about a few names for frontend-only encodings, since this will allow initdb to reject attempts to initdb in a locale that uses such an encoding. We'll probably find out more such names during beta ... 2007-09-29 02:01:43 +02:00			`{PG_SJIS, "SJIS"},`
			`{PG_SJIS, "PCK"},`

			`{PG_BIG5, "BIG5"},`
			`{PG_BIG5, "BIG5HKSCS"},`

			`{PG_GBK, "GBK"},`

			`{PG_UHC, "UHC"},`

			`{PG_JOHAB, "JOHAB"},`

			`{PG_GB18030, "GB18030"},`

			`{PG_SHIFT_JIS_2004, "SJIS_2004"},`

Change initdb and CREATE DATABASE to actively reject attempts to create databases with encodings that are incompatible with the server's LC_CTYPE locale, when we can determine that (which we can on most modern platforms, I believe). C/POSIX locale is compatible with all encodings, of course, so there is still some usefulness to CREATE DATABASE's ENCODING option, but this will insulate us against all sorts of recurring complaints caused by mismatched settings. I moved initdb's existing LC_CTYPE-to-encoding mapping knowledge into a new src/port/ file so it could be shared by CREATE DATABASE. 2007-09-29 00:25:49 +02:00			`{PG_SQL_ASCII, NULL} /* end marker */`
			`};`


			`/*`
			`* Given a setting for LC_CTYPE, return the Postgres ID of the associated`
			`* encoding, if we can determine it.`
			`*`
			`* Pass in NULL to get the encoding for the current locale setting.`
			`*`
			`* If the result is PG_SQL_ASCII, callers should treat it as being compatible`
			`* with any desired encoding. We return this if the locale is C/POSIX or we`
			`* can't determine the encoding.`
			`*/`
			`int`
			`pg_get_encoding_from_locale(const char *ctype)`
			`{`
			`char *sys;`
			`int i;`

On OS X, assume that an empty-string result for nl_langinfo(CODESET) means UTF-8. Per examination of /usr/share/locale in 10.4.10. 2007-09-29 01:36:06 +02:00			`/* Get the CODESET property, and also LC_CTYPE if not passed in */`
Change initdb and CREATE DATABASE to actively reject attempts to create databases with encodings that are incompatible with the server's LC_CTYPE locale, when we can determine that (which we can on most modern platforms, I believe). C/POSIX locale is compatible with all encodings, of course, so there is still some usefulness to CREATE DATABASE's ENCODING option, but this will insulate us against all sorts of recurring complaints caused by mismatched settings. I moved initdb's existing LC_CTYPE-to-encoding mapping knowledge into a new src/port/ file so it could be shared by CREATE DATABASE. 2007-09-29 00:25:49 +02:00			`if (ctype)`
			`{`
			`char *save;`

			`save = setlocale(LC_CTYPE, NULL);`
			`if (!save)`
			`return PG_SQL_ASCII; /* setlocale() broken? */`
			`/* must copy result, or it might change after setlocale */`
			`save = strdup(save);`
			`if (!save)`
			`return PG_SQL_ASCII; /* out of memory; unlikely */`

			`if (!setlocale(LC_CTYPE, ctype))`
			`{`
			`free(save);`
			`return PG_SQL_ASCII; /* bogus ctype passed in? */`
			`}`

			`sys = nl_langinfo(CODESET);`
			`if (sys)`
			`sys = strdup(sys);`

			`setlocale(LC_CTYPE, save);`
			`free(save);`
			`}`
			`else`
			`{`
			`/* much easier... */`
			`ctype = setlocale(LC_CTYPE, NULL);`
			`if (!ctype)`
			`return PG_SQL_ASCII; /* setlocale() broken? */`
			`sys = nl_langinfo(CODESET);`
			`if (sys)`
			`sys = strdup(sys);`
			`}`

			`if (!sys)`
			`return PG_SQL_ASCII; /* out of memory; unlikely */`

On OS X, assume that an empty-string result for nl_langinfo(CODESET) means UTF-8. Per examination of /usr/share/locale in 10.4.10. 2007-09-29 01:36:06 +02:00			`/* If locale is C or POSIX, we can allow all encodings */`
Change initdb and CREATE DATABASE to actively reject attempts to create databases with encodings that are incompatible with the server's LC_CTYPE locale, when we can determine that (which we can on most modern platforms, I believe). C/POSIX locale is compatible with all encodings, of course, so there is still some usefulness to CREATE DATABASE's ENCODING option, but this will insulate us against all sorts of recurring complaints caused by mismatched settings. I moved initdb's existing LC_CTYPE-to-encoding mapping knowledge into a new src/port/ file so it could be shared by CREATE DATABASE. 2007-09-29 00:25:49 +02:00			`if (pg_strcasecmp(ctype, "C") == 0 \|\| pg_strcasecmp(ctype, "POSIX") == 0)`
			`{`
			`free(sys);`
			`return PG_SQL_ASCII;`
			`}`

On OS X, assume that an empty-string result for nl_langinfo(CODESET) means UTF-8. Per examination of /usr/share/locale in 10.4.10. 2007-09-29 01:36:06 +02:00			`/* Check the table */`
Change initdb and CREATE DATABASE to actively reject attempts to create databases with encodings that are incompatible with the server's LC_CTYPE locale, when we can determine that (which we can on most modern platforms, I believe). C/POSIX locale is compatible with all encodings, of course, so there is still some usefulness to CREATE DATABASE's ENCODING option, but this will insulate us against all sorts of recurring complaints caused by mismatched settings. I moved initdb's existing LC_CTYPE-to-encoding mapping knowledge into a new src/port/ file so it could be shared by CREATE DATABASE. 2007-09-29 00:25:49 +02:00			`for (i = 0; encoding_match_list[i].system_enc_name; i++)`
			`{`
			`if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)`
			`{`
			`free(sys);`
			`return encoding_match_list[i].pg_enc_code;`
			`}`
			`}`

On OS X, assume that an empty-string result for nl_langinfo(CODESET) means UTF-8. Per examination of /usr/share/locale in 10.4.10. 2007-09-29 01:36:06 +02:00			`/* Special-case kluges for particular platforms go here */`

			`#ifdef __darwin__`
			`/*`
			`* Current OS X has many locales that report an empty string for CODESET,`
			`* but they all seem to actually use UTF-8.`
			`*/`
			`if (strlen(sys) == 0)`
			`{`
			`free(sys);`
			`return PG_UTF8;`
			`}`
			`#endif`

Change initdb and CREATE DATABASE to actively reject attempts to create databases with encodings that are incompatible with the server's LC_CTYPE locale, when we can determine that (which we can on most modern platforms, I believe). C/POSIX locale is compatible with all encodings, of course, so there is still some usefulness to CREATE DATABASE's ENCODING option, but this will insulate us against all sorts of recurring complaints caused by mismatched settings. I moved initdb's existing LC_CTYPE-to-encoding mapping knowledge into a new src/port/ file so it could be shared by CREATE DATABASE. 2007-09-29 00:25:49 +02:00			`/*`
			`* We print a warning if we got a CODESET string but couldn't recognize`
			`* it. This means we need another entry in the table.`
			`*/`
			`#ifdef FRONTEND`
			`fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""),`
			`ctype, sys);`
			`/* keep newline separate so there's only one translatable string */`
			`fputc('\n', stderr);`
			`#else`
			`ereport(WARNING,`
			`(errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",`
			`ctype, sys),`
			`errdetail("Please report this to <pgsql-bugs@postgresql.org>.")));`
			`#endif`

			`free(sys);`
			`return PG_SQL_ASCII;`
			`}`

			`#else /* !(HAVE_LANGINFO_H && CODESET) */`

			`/*`
			`* stub if no platform support`
			`*/`
			`int`
			`pg_get_encoding_from_locale(const char *ctype)`
			`{`
			`return PG_SQL_ASCII;`
			`}`

			`#endif /* HAVE_LANGINFO_H && CODESET */`