postgresql/src/common/encnames.c

/*-------------------------------------------------------------------------
 *
 * encnames.c
 *	  Encoding names and routines for working with them.
 *
 * Portions Copyright (c) 2001-2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/encnames.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"

#include <ctype.h>
#include <unistd.h>

#include "mb/pg_wchar.h"


/* ----------
 * All encoding names, sorted:		 *** A L P H A B E T I C ***
 *
 * All names must be without irrelevant chars, search routines use
 * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
 * are always converted to 'iso88591'. All must be lower case.
 *
 * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
 *
 * Karel Zak, Aug 2001
 * ----------
 */
/*
 * One entry of the name lookup table: a cleaned-up encoding name (or alias)
 * and the encoding ID that pg_char_to_encoding() returns for it.
 */
typedef struct pg_encname
{
	const char *name;			/* lookup key: lower case, alphanumerics only */
	pg_enc		encoding;		/* encoding this name resolves to */
} pg_encname;

/*
 * Lookup table mapping cleaned-up encoding names and aliases to encodings.
 *
 * pg_char_to_encoding() binary-searches this table, comparing names with
 * strcmp(), so the entries must be kept in strcmp() order.  Note that this
 * places e.g. "iso885910" before "iso88592" and "latin10" before "latin2".
 */
static const pg_encname pg_encname_tbl[] =
{
	{
		"abc", PG_WIN1258
	},							/* alias for WIN1258 */
	{
		"alt", PG_WIN866
	},							/* alias for WIN866 (IBM866) */
	{
		"big5", PG_BIG5
	},							/* Big5; Chinese for Taiwan multibyte set */
	{
		"euccn", PG_EUC_CN
	},							/* EUC-CN; Extended Unix Code for simplified
								 * Chinese */
	{
		"eucjis2004", PG_EUC_JIS_2004
	},							/* EUC-JIS-2004; Extended UNIX Code fixed
								 * Width for Japanese, standard JIS X 0213 */
	{
		"eucjp", PG_EUC_JP
	},							/* EUC-JP; Extended UNIX Code fixed Width for
								 * Japanese, standard OSF */
	{
		"euckr", PG_EUC_KR
	},							/* EUC-KR; Extended Unix Code for Korean, KS
								 * X 1001 standard */
	{
		"euctw", PG_EUC_TW
	},							/* EUC-TW; Extended Unix Code for traditional
								 * Chinese */
	{
		"gb18030", PG_GB18030
	},							/* GB18030; GB18030 */
	{
		"gbk", PG_GBK
	},							/* GBK; Chinese Windows CodePage 936
								 * simplified Chinese */
	{
		"iso88591", PG_LATIN1
	},							/* ISO-8859-1; RFC1345,KXS2 */
	{
		"iso885910", PG_LATIN6
	},							/* ISO-8859-10; RFC1345,KXS2 */
	{
		"iso885913", PG_LATIN7
	},							/* ISO-8859-13; RFC1345,KXS2 */
	{
		"iso885914", PG_LATIN8
	},							/* ISO-8859-14; RFC1345,KXS2 */
	{
		"iso885915", PG_LATIN9
	},							/* ISO-8859-15; RFC1345,KXS2 */
	{
		"iso885916", PG_LATIN10
	},							/* ISO-8859-16; RFC1345,KXS2 */
	{
		"iso88592", PG_LATIN2
	},							/* ISO-8859-2; RFC1345,KXS2 */
	{
		"iso88593", PG_LATIN3
	},							/* ISO-8859-3; RFC1345,KXS2 */
	{
		"iso88594", PG_LATIN4
	},							/* ISO-8859-4; RFC1345,KXS2 */
	{
		"iso88595", PG_ISO_8859_5
	},							/* ISO-8859-5; RFC1345,KXS2 */
	{
		"iso88596", PG_ISO_8859_6
	},							/* ISO-8859-6; RFC1345,KXS2 */
	{
		"iso88597", PG_ISO_8859_7
	},							/* ISO-8859-7; RFC1345,KXS2 */
	{
		"iso88598", PG_ISO_8859_8
	},							/* ISO-8859-8; RFC1345,KXS2 */
	{
		"iso88599", PG_LATIN5
	},							/* ISO-8859-9; RFC1345,KXS2 */
	{
		"johab", PG_JOHAB
	},							/* JOHAB; Korean (Hangul) character encoding */
	{
		"koi8", PG_KOI8R
	},							/* _dirty_ alias for KOI8-R (backward
								 * compatibility) */
	{
		"koi8r", PG_KOI8R
	},							/* KOI8-R; RFC1489 */
	{
		"koi8u", PG_KOI8U
	},							/* KOI8-U; RFC2319 */
	{
		"latin1", PG_LATIN1
	},							/* alias for ISO-8859-1 */
	{
		"latin10", PG_LATIN10
	},							/* alias for ISO-8859-16 */
	{
		"latin2", PG_LATIN2
	},							/* alias for ISO-8859-2 */
	{
		"latin3", PG_LATIN3
	},							/* alias for ISO-8859-3 */
	{
		"latin4", PG_LATIN4
	},							/* alias for ISO-8859-4 */
	{
		"latin5", PG_LATIN5
	},							/* alias for ISO-8859-9 */
	{
		"latin6", PG_LATIN6
	},							/* alias for ISO-8859-10 */
	{
		"latin7", PG_LATIN7
	},							/* alias for ISO-8859-13 */
	{
		"latin8", PG_LATIN8
	},							/* alias for ISO-8859-14 */
	{
		"latin9", PG_LATIN9
	},							/* alias for ISO-8859-15 */
	{
		"mskanji", PG_SJIS
	},							/* alias for Shift_JIS */
	{
		"muleinternal", PG_MULE_INTERNAL
	},							/* MULE_INTERNAL */
	{
		"shiftjis", PG_SJIS
	},							/* Shift_JIS; JIS X 0202-1991 */

	{
		"shiftjis2004", PG_SHIFT_JIS_2004
	},							/* SHIFT-JIS-2004; Shift JIS for Japanese,
								 * standard JIS X 0213 */
	{
		"sjis", PG_SJIS
	},							/* alias for Shift_JIS */
	{
		"sqlascii", PG_SQL_ASCII
	},							/* SQL_ASCII */
	{
		"tcvn", PG_WIN1258
	},							/* alias for WIN1258 */
	{
		"tcvn5712", PG_WIN1258
	},							/* alias for WIN1258 */
	{
		"uhc", PG_UHC
	},							/* UHC; Korean Windows CodePage 949 */
	{
		"unicode", PG_UTF8
	},							/* alias for UTF8 */
	{
		"utf8", PG_UTF8
	},							/* alias for UTF8 */
	{
		"vscii", PG_WIN1258
	},							/* alias for WIN1258 */
	{
		"win", PG_WIN1251
	},							/* _dirty_ alias for windows-1251 (backward
								 * compatibility) */
	{
		"win1250", PG_WIN1250
	},							/* alias for Windows-1250 */
	{
		"win1251", PG_WIN1251
	},							/* alias for Windows-1251 */
	{
		"win1252", PG_WIN1252
	},							/* alias for Windows-1252 */
	{
		"win1253", PG_WIN1253
	},							/* alias for Windows-1253 */
	{
		"win1254", PG_WIN1254
	},							/* alias for Windows-1254 */
	{
		"win1255", PG_WIN1255
	},							/* alias for Windows-1255 */
	{
		"win1256", PG_WIN1256
	},							/* alias for Windows-1256 */
	{
		"win1257", PG_WIN1257
	},							/* alias for Windows-1257 */
	{
		"win1258", PG_WIN1258
	},							/* alias for Windows-1258 */
	{
		"win866", PG_WIN866
	},							/* IBM866 */
	{
		"win874", PG_WIN874
	},							/* alias for Windows-874 */
	{
		"win932", PG_SJIS
	},							/* alias for Shift_JIS */
	{
		"win936", PG_GBK
	},							/* alias for GBK */
	{
		"win949", PG_UHC
	},							/* alias for UHC */
	{
		"win950", PG_BIG5
	},							/* alias for BIG5 */
	{
		"windows1250", PG_WIN1250
	},							/* Windows-1250; Microsoft */
	{
		"windows1251", PG_WIN1251
	},							/* Windows-1251; Microsoft */
	{
		"windows1252", PG_WIN1252
	},							/* Windows-1252; Microsoft */
	{
		"windows1253", PG_WIN1253
	},							/* Windows-1253; Microsoft */
	{
		"windows1254", PG_WIN1254
	},							/* Windows-1254; Microsoft */
	{
		"windows1255", PG_WIN1255
	},							/* Windows-1255; Microsoft */
	{
		"windows1256", PG_WIN1256
	},							/* Windows-1256; Microsoft */
	{
		"windows1257", PG_WIN1257
	},							/* Windows-1257; Microsoft */
	{
		"windows1258", PG_WIN1258
	},							/* Windows-1258; Microsoft */
	{
		"windows866", PG_WIN866
	},							/* IBM866 */
	{
		"windows874", PG_WIN874
	},							/* Windows-874; Microsoft */
	{
		"windows932", PG_SJIS
	},							/* alias for Shift_JIS */
	{
		"windows936", PG_GBK
	},							/* alias for GBK */
	{
		"windows949", PG_UHC
	},							/* alias for UHC */
	{
		"windows950", PG_BIG5
	}							/* alias for BIG5 */
};

/* ----------
 * These are "official" encoding names.
 *
 * DEF_ENC2NAME(name, codepage) expands to an initializer holding the
 * stringified name and the matching PG_<name> constant.  On WIN32 builds the
 * codepage argument is stored as a third member; elsewhere it is dropped.
 * NOTE(review): a codepage of 0 presumably means "no matching Windows code
 * page" -- confirm against the users of that field.
 *
 * Every entry uses a designated initializer, so it lands in the slot indexed
 * by its own PG_* value.  pg_encoding_to_char() indexes this table directly
 * by encoding ID and asserts that the stored encoding equals the index.
 * ----------
 */
#ifndef WIN32
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
#else
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
#endif

const pg_enc2name pg_enc2name_tbl[] =
{
	[PG_SQL_ASCII] = DEF_ENC2NAME(SQL_ASCII, 0),
	[PG_EUC_JP] = DEF_ENC2NAME(EUC_JP, 20932),
	[PG_EUC_CN] = DEF_ENC2NAME(EUC_CN, 20936),
	[PG_EUC_KR] = DEF_ENC2NAME(EUC_KR, 51949),
	[PG_EUC_TW] = DEF_ENC2NAME(EUC_TW, 0),
	[PG_EUC_JIS_2004] = DEF_ENC2NAME(EUC_JIS_2004, 20932),
	[PG_UTF8] = DEF_ENC2NAME(UTF8, 65001),
	[PG_MULE_INTERNAL] = DEF_ENC2NAME(MULE_INTERNAL, 0),
	[PG_LATIN1] = DEF_ENC2NAME(LATIN1, 28591),
	[PG_LATIN2] = DEF_ENC2NAME(LATIN2, 28592),
	[PG_LATIN3] = DEF_ENC2NAME(LATIN3, 28593),
	[PG_LATIN4] = DEF_ENC2NAME(LATIN4, 28594),
	[PG_LATIN5] = DEF_ENC2NAME(LATIN5, 28599),
	[PG_LATIN6] = DEF_ENC2NAME(LATIN6, 0),
	[PG_LATIN7] = DEF_ENC2NAME(LATIN7, 0),
	[PG_LATIN8] = DEF_ENC2NAME(LATIN8, 0),
	[PG_LATIN9] = DEF_ENC2NAME(LATIN9, 28605),
	[PG_LATIN10] = DEF_ENC2NAME(LATIN10, 0),
	[PG_WIN1256] = DEF_ENC2NAME(WIN1256, 1256),
	[PG_WIN1258] = DEF_ENC2NAME(WIN1258, 1258),
	[PG_WIN866] = DEF_ENC2NAME(WIN866, 866),
	[PG_WIN874] = DEF_ENC2NAME(WIN874, 874),
	[PG_KOI8R] = DEF_ENC2NAME(KOI8R, 20866),
	[PG_WIN1251] = DEF_ENC2NAME(WIN1251, 1251),
	[PG_WIN1252] = DEF_ENC2NAME(WIN1252, 1252),
	[PG_ISO_8859_5] = DEF_ENC2NAME(ISO_8859_5, 28595),
	[PG_ISO_8859_6] = DEF_ENC2NAME(ISO_8859_6, 28596),
	[PG_ISO_8859_7] = DEF_ENC2NAME(ISO_8859_7, 28597),
	[PG_ISO_8859_8] = DEF_ENC2NAME(ISO_8859_8, 28598),
	[PG_WIN1250] = DEF_ENC2NAME(WIN1250, 1250),
	[PG_WIN1253] = DEF_ENC2NAME(WIN1253, 1253),
	[PG_WIN1254] = DEF_ENC2NAME(WIN1254, 1254),
	[PG_WIN1255] = DEF_ENC2NAME(WIN1255, 1255),
	[PG_WIN1257] = DEF_ENC2NAME(WIN1257, 1257),
	[PG_KOI8U] = DEF_ENC2NAME(KOI8U, 21866),
	[PG_SJIS] = DEF_ENC2NAME(SJIS, 932),
	[PG_BIG5] = DEF_ENC2NAME(BIG5, 950),
	[PG_GBK] = DEF_ENC2NAME(GBK, 936),
	[PG_UHC] = DEF_ENC2NAME(UHC, 949),
	[PG_GB18030] = DEF_ENC2NAME(GB18030, 54936),
	[PG_JOHAB] = DEF_ENC2NAME(JOHAB, 0),
	[PG_SHIFT_JIS_2004] = DEF_ENC2NAME(SHIFT_JIS_2004, 932),
};

/* ----------
 * These are encoding names for gettext.
 *
 * This covers all encodings except MULE_INTERNAL, which is alien to gettext
 * and is therefore mapped to NULL.
 *
 * The table is indexed by encoding ID through designated initializers, so
 * the textual order of the entries below is not significant.
 *
 * NOTE(review): these strings are runtime data.  Spellings such as
 * "ISO_8859-6" and "LATIN-9" differ from their neighbours; treat them as
 * intentional unless verified otherwise against the consumer of the names.
 * ----------
 */
const char *pg_enc2gettext_tbl[] =
{
	[PG_SQL_ASCII] = "US-ASCII",
	[PG_UTF8] = "UTF-8",
	[PG_MULE_INTERNAL] = NULL,
	[PG_LATIN1] = "LATIN1",
	[PG_LATIN2] = "LATIN2",
	[PG_LATIN3] = "LATIN3",
	[PG_LATIN4] = "LATIN4",
	[PG_ISO_8859_5] = "ISO-8859-5",
	[PG_ISO_8859_6] = "ISO_8859-6",
	[PG_ISO_8859_7] = "ISO-8859-7",
	[PG_ISO_8859_8] = "ISO-8859-8",
	[PG_LATIN5] = "LATIN5",
	[PG_LATIN6] = "LATIN6",
	[PG_LATIN7] = "LATIN7",
	[PG_LATIN8] = "LATIN8",
	[PG_LATIN9] = "LATIN-9",
	[PG_LATIN10] = "LATIN10",
	[PG_KOI8R] = "KOI8-R",
	[PG_KOI8U] = "KOI8-U",
	[PG_WIN1250] = "CP1250",
	[PG_WIN1251] = "CP1251",
	[PG_WIN1252] = "CP1252",
	[PG_WIN1253] = "CP1253",
	[PG_WIN1254] = "CP1254",
	[PG_WIN1255] = "CP1255",
	[PG_WIN1256] = "CP1256",
	[PG_WIN1257] = "CP1257",
	[PG_WIN1258] = "CP1258",
	[PG_WIN866] = "CP866",
	[PG_WIN874] = "CP874",
	[PG_EUC_CN] = "EUC-CN",
	[PG_EUC_JP] = "EUC-JP",
	[PG_EUC_KR] = "EUC-KR",
	[PG_EUC_TW] = "EUC-TW",
	[PG_EUC_JIS_2004] = "EUC-JP",
	[PG_SJIS] = "SHIFT-JIS",
	[PG_BIG5] = "BIG5",
	[PG_GBK] = "GBK",
	[PG_UHC] = "UHC",
	[PG_GB18030] = "GB18030",
	[PG_JOHAB] = "JOHAB",
	[PG_SHIFT_JIS_2004] = "SHIFT_JISX0213",
};


/*
 * Table of encoding names for ICU (currently covers backend encodings only)
 *
 * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
 *
 * NULL entries are not supported by ICU, or their mapping is unclear.
 *
 * The table is indexed by encoding ID; is_encoding_supported_by_icu() and
 * get_encoding_name_for_icu() range-check the ID with PG_VALID_BE_ENCODING()
 * before indexing.
 */
static const char *const pg_enc2icu_tbl[] =
{
	[PG_SQL_ASCII] = NULL,
	[PG_EUC_JP] = "EUC-JP",
	[PG_EUC_CN] = "EUC-CN",
	[PG_EUC_KR] = "EUC-KR",
	[PG_EUC_TW] = "EUC-TW",
	[PG_EUC_JIS_2004] = NULL,
	[PG_UTF8] = "UTF-8",
	[PG_MULE_INTERNAL] = NULL,
	[PG_LATIN1] = "ISO-8859-1",
	[PG_LATIN2] = "ISO-8859-2",
	[PG_LATIN3] = "ISO-8859-3",
	[PG_LATIN4] = "ISO-8859-4",
	[PG_LATIN5] = "ISO-8859-9",
	[PG_LATIN6] = "ISO-8859-10",
	[PG_LATIN7] = "ISO-8859-13",
	[PG_LATIN8] = "ISO-8859-14",
	[PG_LATIN9] = "ISO-8859-15",
	[PG_LATIN10] = NULL,
	[PG_WIN1256] = "CP1256",
	[PG_WIN1258] = "CP1258",
	[PG_WIN866] = "CP866",
	[PG_WIN874] = NULL,
	[PG_KOI8R] = "KOI8-R",
	[PG_WIN1251] = "CP1251",
	[PG_WIN1252] = "CP1252",
	[PG_ISO_8859_5] = "ISO-8859-5",
	[PG_ISO_8859_6] = "ISO-8859-6",
	[PG_ISO_8859_7] = "ISO-8859-7",
	[PG_ISO_8859_8] = "ISO-8859-8",
	[PG_WIN1250] = "CP1250",
	[PG_WIN1253] = "CP1253",
	[PG_WIN1254] = "CP1254",
	[PG_WIN1255] = "CP1255",
	[PG_WIN1257] = "CP1257",
	[PG_KOI8U] = "KOI8-U",
};

/*
 * Compile-time check that the table length is exactly
 * PG_ENCODING_BE_LAST + 1, i.e. that its last slot is the last backend
 * encoding.
 */
StaticAssertDecl(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
				 "pg_enc2icu_tbl incomplete");


/*
 * Report whether ICU has a usable name for the given encoding.
 *
 * Returns false for IDs rejected by PG_VALID_BE_ENCODING() and for
 * encodings whose pg_enc2icu_tbl entry is NULL; true otherwise.
 */
bool
is_encoding_supported_by_icu(int encoding)
{
	/* the range check must come first: it guards the table index */
	return PG_VALID_BE_ENCODING(encoding) &&
		pg_enc2icu_tbl[encoding] != NULL;
}

/*
 * Look up the name ICU uses for an encoding.
 *
 * Returns the pg_enc2icu_tbl entry for the encoding, which is NULL when ICU
 * support is missing; also returns NULL for IDs rejected by
 * PG_VALID_BE_ENCODING().
 */
const char *
get_encoding_name_for_icu(int encoding)
{
	return PG_VALID_BE_ENCODING(encoding) ? pg_enc2icu_tbl[encoding] : NULL;
}


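/* ----------
 * Encoding validity checks.
 *
 * pg_valid_client_encoding() and pg_valid_server_encoding() take a name and
 * return the encoding ID, or -1 on error.  pg_valid_server_encoding_id()
 * takes an encoding ID and returns the result of PG_VALID_BE_ENCODING().
 * ----------
 */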
/*
 * Resolve an encoding name and check it with PG_VALID_FE_ENCODING().
 *
 * Returns the encoding ID, or -1 if pg_char_to_encoding() does not recognize
 * the name or the resulting ID fails the check.
 */
int
pg_valid_client_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc < 0 || !PG_VALID_FE_ENCODING(enc))
		return -1;

	return enc;
}

/*
 * Resolve an encoding name and check it with PG_VALID_BE_ENCODING().
 *
 * Returns the encoding ID, or -1 if pg_char_to_encoding() does not recognize
 * the name or the resulting ID fails the check.
 */
int
pg_valid_server_encoding(const char *name)
{
	int			enc = pg_char_to_encoding(name);

	if (enc < 0 || !PG_VALID_BE_ENCODING(enc))
		return -1;

	return enc;
}

/*
 * Check an encoding ID (rather than a name) with PG_VALID_BE_ENCODING().
 *
 * Returns the value of that macro as an int; unlike the name-based checks,
 * it does not return the encoding ID or -1.
 */
int
pg_valid_server_encoding_id(int encoding)
{
	int			is_valid = PG_VALID_BE_ENCODING(encoding);

	return is_valid;
}

/*
 * Normalize an encoding name into *newkey.
 *
 * Every character for which isalnum() is false is dropped, and ASCII upper
 * case letters are folded to lower case; everything else is copied as-is.
 * The result is NUL-terminated and newkey is returned.
 *
 * The output is never longer than the input, but no bounds are checked
 * here: the caller must supply a buffer of at least strlen(key) + 1 bytes.
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
	char	   *out = newkey;

	while (*key != '\0')
	{
		char		ch = *key++;

		/* the cast keeps isalnum() well-defined for high-bit characters */
		if (!isalnum((unsigned char) ch))
			continue;

		/* fold ASCII only; deliberately not locale-aware */
		if (ch >= 'A' && ch <= 'Z')
			ch += 'a' - 'A';

		*out++ = ch;
	}
	*out = '\0';

	return newkey;
}

/*
 * Search encoding by encoding name
 *
 * The name is normalized with clean_encoding_name() (non-alphanumeric
 * characters dropped, ASCII upper case folded to lower case) and then looked
 * up in pg_encname_tbl by binary search, so that table must be sorted in
 * strcmp() order.
 *
 * Returns encoding ID, or -1 if not recognized.  NULL, empty, and overlong
 * (NAMEDATALEN bytes or more) names are never recognized.
 */
int
pg_char_to_encoding(const char *name)
{
	size_t		lo = 0;
	size_t		hi = lengthof(pg_encname_tbl);
	char		buff[NAMEDATALEN];
	const char *key;

	if (name == NULL || *name == '\0')
		return -1;

	/* also guarantees that the cleaned name fits into buff */
	if (strlen(name) >= NAMEDATALEN)
		return -1;				/* it's certainly not in the table */

	key = clean_encoding_name(name, buff);

	/*
	 * Binary search over the half-open index range [lo, hi).  Indexes are
	 * used instead of pointers so that a key sorting before the first entry
	 * never requires forming a pointer in front of the array, which is
	 * undefined behavior in C.
	 */
	while (lo < hi)
	{
		size_t		mid = lo + (hi - lo) / 2;
		const pg_encname *entry = &pg_encname_tbl[mid];
		int			cmp = strcmp(key, entry->name);

		if (cmp == 0)
			return entry->encoding;

		if (cmp < 0)
			hi = mid;
		else
			lo = mid + 1;
	}

	return -1;
}

/*
 * Return the official name of an encoding.
 *
 * The name is taken from pg_enc2name_tbl, indexed by encoding ID.  An ID
 * rejected by PG_VALID_ENCODING() yields an empty string rather than NULL.
 */
const char *
pg_encoding_to_char(int encoding)
{
	const pg_enc2name *entry;

	if (!PG_VALID_ENCODING(encoding))
		return "";

	entry = &pg_enc2name_tbl[encoding];

	/* each table slot must hold the entry for its own index */
	Assert(encoding == entry->encoding);

	return entry->name;
}