diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 19285ae136..fbf6062d0a 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3551,6 +3551,52 @@ repeat('Pg', 4) PgPgPgPg
+
+
+
+ unistr
+
+ unistr ( text )
+ text
+
+
+ Evaluate escaped Unicode characters in the argument. Unicode characters
+ can be specified as
+ \XXXX (4 hexadecimal
+ digits), \+XXXXXX (6
+ hexadecimal digits),
+ \uXXXX (4 hexadecimal
+ digits), or \UXXXXXXXX
+ (8 hexadecimal digits). To specify a backslash, write two
+ backslashes. All other characters are taken literally.
+
+
+ If the server encoding is not UTF-8, the Unicode code point identified
+ by one of these escape sequences is converted to the actual server
+ encoding; an error is reported if that's not possible.
+
+
+ This function provides a (non-standard) alternative to string
+ constants with Unicode escapes (see ).
+
+
+ unistr('\0441\043B\043E\043D')
+ слон
+
+
+ unistr('d\0061t\+000061')
+ data
+
+
+ unistr('d\u0061t\U00000061')
+ data
+
+
+
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 640e3fd4c0..efc74e8f2d 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -6380,3 +6380,200 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(result);
 }
+
+/*
+ * Check if first n chars are hexadecimal digits
+ *
+ * The caller must guarantee that at least n bytes are readable at instr.
+ */
+static bool
+isxdigits_n(const char *instr, size_t n)
+{
+	for (size_t i = 0; i < n; i++)
+		if (!isxdigit((unsigned char) instr[i]))
+			return false;
+
+	return true;
+}
+
+/*
+ * Return the numeric value of one hexadecimal digit
+ *
+ * Raises an error if c is not a hexadecimal digit.
+ */
+static unsigned int
+hexval(unsigned char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 0xA;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 0xA;
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/*
+ * Translate string with hexadecimal digits to number
+ *
+ * The first n chars of instr are read most significant digit first.
+ * Callers pass n <= 8, so every shift count stays below 32.
+ */
+static unsigned int
+hexval_n(const char *instr, size_t n)
+{
+	unsigned int result = 0;
+
+	for (size_t i = 0; i < n; i++)
+		result += hexval(instr[i]) << (4 * (n - i - 1));
+
+	return result;
+}
+
+/*
+ * Process one code point decoded from a Unicode escape sequence
+ *
+ * An invalid code point raises an error.  The first half of a UTF-16
+ * surrogate pair is only remembered in *pair_first; the code point that
+ * follows must be the second half, and the two are then combined.  Any
+ * resulting non-surrogate code point is converted with
+ * pg_unicode_to_server() and appended to str.
+ *
+ * Returns false if the code point breaks surrogate pairing, so that the
+ * caller can report the error.
+ */
+static bool
+unistr_append_codepoint(StringInfo str, pg_wchar unicode, pg_wchar *pair_first)
+{
+	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+
+	if (!is_valid_unicode_codepoint(unicode))
+		ereport(ERROR,
+				errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				errmsg("invalid Unicode code point: %04X", unicode));
+
+	if (*pair_first)
+	{
+		if (!is_utf16_surrogate_second(unicode))
+			return false;
+		unicode = surrogate_pair_to_codepoint(*pair_first, unicode);
+		*pair_first = 0;
+	}
+	else if (is_utf16_surrogate_second(unicode))
+		return false;
+
+	if (is_utf16_surrogate_first(unicode))
+		*pair_first = unicode;
+	else
+	{
+		pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+		appendStringInfoString(str, cbuf);
+	}
+
+	return true;
+}
+
+/*
+ * Replaces Unicode escape sequences by Unicode characters
+ *
+ * Recognized escapes are \XXXX, \uXXXX, \+XXXXXX and \UXXXXXXXX (X being a
+ * hexadecimal digit); \\ yields a single backslash.  Any other use of a
+ * backslash is an error, and all other bytes are copied unchanged.
+ */
+Datum
+unistr(PG_FUNCTION_ARGS)
+{
+	text	   *input_text = PG_GETARG_TEXT_PP(0);
+	char	   *instr;
+	int			len;
+	StringInfoData str;
+	text	   *result;
+	pg_wchar	pair_first = 0;
+
+	instr = VARDATA_ANY(input_text);
+	len = VARSIZE_ANY_EXHDR(input_text);
+
+	initStringInfo(&str);
+
+	while (len > 0)
+	{
+		if (instr[0] == '\\')
+		{
+			if (len >= 2 &&
+				instr[1] == '\\')
+			{
+				if (pair_first)
+					goto invalid_pair;
+				appendStringInfoChar(&str, '\\');
+				instr += 2;
+				len -= 2;
+			}
+			else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
+					 (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
+			{
+				/* \XXXX or \uXXXX */
+				int			offset = instr[1] == 'u' ? 2 : 1;
+
+				if (!unistr_append_codepoint(&str,
+											 hexval_n(instr + offset, 4),
+											 &pair_first))
+					goto invalid_pair;
+
+				instr += 4 + offset;
+				len -= 4 + offset;
+			}
+			else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
+			{
+				/* \+XXXXXX */
+				if (!unistr_append_codepoint(&str,
+											 hexval_n(instr + 2, 6),
+											 &pair_first))
+					goto invalid_pair;
+
+				instr += 8;
+				len -= 8;
+			}
+			else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
+			{
+				/* \UXXXXXXXX */
+				if (!unistr_append_codepoint(&str,
+											 hexval_n(instr + 2, 8),
+											 &pair_first))
+					goto invalid_pair;
+
+				instr += 10;
+				len -= 10;
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
+		}
+		else
+		{
+			/* a literal byte cannot complete a pending surrogate pair */
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(&str, *instr++);
+			len--;
+		}
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair")));
+	PG_RETURN_NULL();			/* keep compiler quiet */
+}
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 4a39da3c9d..489f5be427 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	202103266
+#define CATALOG_VERSION_NO	202103291
 
 #endif
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index cc7d90d2b0..bfb89e0575 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11527,6 +11527,10 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+{ oid => '9822', descr => 'unescape Unicode characters',
+  proname => 'unistr', prorettype => 'text', proargtypes => 'text',
+  prosrc => 'unistr' },
+
 { oid => '4596', descr => 'I/O',
   proname => 'brin_bloom_summary_in', prorettype => 'pg_brin_bloom_summary',
   proargtypes => 'cstring', prosrc => 'brin_bloom_summary_in' },
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index afd84249c8..91aa819804 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2234,3 +2234,39 @@ SELECT bit_count('\x1234567890'::bytea);
         15
 (1 row)
 
+SELECT unistr('\0064at\+0000610');
+ unistr 
+--------
+ data0
+(1 row)
+
+SELECT unistr('d\u0061t\U000000610');
+ unistr 
+--------
+ data0
+(1 row)
+
+SELECT unistr('a\\b');
+ unistr 
+--------
+ a\b
+(1 row)
+
+-- errors:
+SELECT unistr('wrong: \db99');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \db99\0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \+00db99\+000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \+2FFFFF');
+ERROR:  invalid Unicode code point: 2FFFFF
+SELECT unistr('wrong: \udb99\u0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \U0000db99\U00000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \U002FFFFF');
+ERROR:  invalid Unicode code point: 2FFFFF
+SELECT unistr('wrong: \xyz');
+ERROR:  invalid Unicode escape
+HINT:  Unicode escapes must be \XXXX, \+XXXXXX, \uXXXX, or \UXXXXXXXX.
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 9aa1825f92..2c502534c2 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -746,3 +746,16 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8)
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
 
 SELECT bit_count('\x1234567890'::bytea);
+
+SELECT unistr('\0064at\+0000610');
+SELECT unistr('d\u0061t\U000000610');
+SELECT unistr('a\\b');
+-- errors:
+SELECT unistr('wrong: \db99');
+SELECT unistr('wrong: \db99\0061');
+SELECT unistr('wrong: \+00db99\+000061');
+SELECT unistr('wrong: \+2FFFFF');
+SELECT unistr('wrong: \udb99\u0061');
+SELECT unistr('wrong: \U0000db99\U00000061');
+SELECT unistr('wrong: \U002FFFFF');
+SELECT unistr('wrong: \xyz');