Add unistr function

This allows decoding a string with Unicode escape sequences.  It is
similar to Unicode escape strings, but offers some more flexibility.

Author: Pavel Stehule <pavel.stehule@gmail.com>
Reviewed-by: Asif Rehman <asifr.rehman@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRA5GnKT+gDVwbVRH2ep451H_myBt+NTz8RkYUARE9+qOQ@mail.gmail.com
This commit is contained in:
Peter Eisentraut 2021-03-28 08:16:15 +02:00
parent ebedd0c78f
commit f37fec837c
6 changed files with 310 additions and 1 deletions

View File

@ -3551,6 +3551,52 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
</para></entry>
</row>
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
<primary>unistr</primary>
</indexterm>
<function>unistr</function> ( <type>text</type> )
<returnvalue>text</returnvalue>
</para>
<para>
Evaluate escaped Unicode characters in the argument. Unicode characters
can be specified as
<literal>\<replaceable>XXXX</replaceable></literal> (4 hexadecimal
digits), <literal>\+<replaceable>XXXXXX</replaceable></literal> (6
hexadecimal digits),
<literal>\u<replaceable>XXXX</replaceable></literal> (4 hexadecimal
digits), or <literal>\U<replaceable>XXXXXXXX</replaceable></literal>
(8 hexadecimal digits). To specify a backslash, write two
backslashes. All other characters are taken literally.
</para>
<para>
If the server encoding is not UTF-8, the Unicode code point identified
by one of these escape sequences is converted to the actual server
encoding; an error is reported if that's not possible.
</para>
<para>
This function provides a (non-standard) alternative to string
constants with Unicode escapes (see <xref
linkend="sql-syntax-strings-uescape"/>).
</para>
<para>
<literal>unistr('\0441\043B\043E\043D')</literal>
<returnvalue>слон</returnvalue>
</para>
<para>
<literal>unistr('d\0061t\+000061')</literal>
<returnvalue>data</returnvalue>
</para>
<para>
<literal>unistr('d\u0061t\U00000061')</literal>
<returnvalue>data</returnvalue>
</para></entry>
</row>
</tbody>
</tgroup>
</table>

View File

@ -6380,3 +6380,213 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(result);
}
/*
 * Report whether each of the first n bytes at instr is a hexadecimal digit.
 *
 * Returns true when n is zero.  The caller must make sure that at least n
 * bytes are readable at instr; no terminator is looked for.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *stop = instr + n;

	while (instr < stop)
	{
		/* cast first: isxdigit() must not be handed a negative char value */
		if (!isxdigit((unsigned char) *instr))
			return false;
		instr++;
	}

	return true;
}
/*
 * Convert one hexadecimal digit character to its numeric value (0..15).
 *
 * Digits and both lower- and upper-case letters a-f are accepted; any other
 * byte is reported with elog(ERROR).
 */
static unsigned int
hexval(unsigned char c)
{
	unsigned int digit = 0;

	if ('0' <= c && c <= '9')
		digit = c - '0';
	else if ('a' <= c && c <= 'f')
		digit = 0xA + (c - 'a');
	else if ('A' <= c && c <= 'F')
		digit = 0xA + (c - 'A');
	else
		elog(ERROR, "invalid hexadecimal digit");

	/* after the error branch this is not reached; digit is then still 0 */
	return digit;
}
/*
 * Convert the first n hexadecimal digits at instr to a number.
 *
 * The digits are read most significant first.  Each byte is converted with
 * hexval(), which raises an error for a non-hexadecimal byte.  The callers
 * in this file pass 4, 6 or 8 digits, which fit in the unsigned int result.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	const char *stop = instr + n;

	/* make room for one more nibble, then drop the next digit into it */
	for (const char *p = instr; p < stop; p++)
		value = (value << 4) | hexval(*p);

	return value;
}
/*
 * unistr(text) returns text
 *
 * Replaces Unicode escape sequences by Unicode characters.
 *
 * The recognized escapes are \XXXX, \uXXXX (4 hexadecimal digits),
 * \+XXXXXX (6 hexadecimal digits) and \UXXXXXXXX (8 hexadecimal digits).
 * Two backslashes produce one backslash; every other byte is copied as is.
 * Any other use of a backslash is reported as a syntax error.
 *
 * A code point that is the first half of a UTF-16 surrogate pair is not
 * emitted by itself: it is remembered in pair_first and must be followed
 * immediately by an escape giving the second half, after which the combined
 * code point is emitted.  A second half without a first half, a first half
 * followed by anything else, and a first half at the end of the input are
 * all reported as an invalid surrogate pair.
 */
Datum
unistr(PG_FUNCTION_ARGS)
{
	text	   *input_text = PG_GETARG_TEXT_PP(0);
	char	   *instr;
	int			len;
	StringInfoData str;
	text	   *result;
	pg_wchar	pair_first = 0;
	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];

	instr = VARDATA_ANY(input_text);
	len = VARSIZE_ANY_EXHDR(input_text);

	initStringInfo(&str);

	while (len > 0)
	{
		pg_wchar	unicode = 0;
		int			esclen = 0;

		/* Anything but a backslash is taken literally. */
		if (instr[0] != '\\')
		{
			if (pair_first)
				goto invalid_pair;

			appendStringInfoChar(&str, *instr++);
			len--;
			continue;
		}

		/* A doubled backslash stands for a single backslash. */
		if (len >= 2 && instr[1] == '\\')
		{
			if (pair_first)
				goto invalid_pair;

			appendStringInfoChar(&str, '\\');
			instr += 2;
			len -= 2;
			continue;
		}

		/*
		 * Work out which escape form this is.  Each test checks the
		 * remaining length before looking at the bytes, so nothing past the
		 * end of the input is read.  esclen is the full length of the escape
		 * including the backslash and any type character.
		 */
		if (len >= 5 && isxdigits_n(instr + 1, 4))
		{
			unicode = hexval_n(instr + 1, 4);
			esclen = 5;
		}
		else if (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4))
		{
			unicode = hexval_n(instr + 2, 4);
			esclen = 6;
		}
		else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
		{
			unicode = hexval_n(instr + 2, 6);
			esclen = 8;
		}
		else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
		{
			unicode = hexval_n(instr + 2, 8);
			esclen = 10;
		}
		else
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("invalid Unicode escape"),
					 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));

		/* From here on, all escape forms are handled the same way. */
		if (!is_valid_unicode_codepoint(unicode))
			ereport(ERROR,
					errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					errmsg("invalid Unicode code point: %04X", unicode));

		if (pair_first)
		{
			/* a pending first half accepts nothing but a second half */
			if (!is_utf16_surrogate_second(unicode))
				goto invalid_pair;

			unicode = surrogate_pair_to_codepoint(pair_first, unicode);
			pair_first = 0;
		}
		else if (is_utf16_surrogate_second(unicode))
			goto invalid_pair;

		if (is_utf16_surrogate_first(unicode))
			pair_first = unicode;	/* hold back until the second half */
		else
		{
			/* cbuf is appended as a C string after the conversion */
			pg_unicode_to_server(unicode, (unsigned char *) cbuf);
			appendStringInfoString(&str, cbuf);
		}

		instr += esclen;
		len -= esclen;
	}

	/* unfinished surrogate pair? */
	if (pair_first)
		goto invalid_pair;

	result = cstring_to_text_with_len(str.data, str.len);
	pfree(str.data);

	PG_RETURN_TEXT_P(result);

invalid_pair:
	ereport(ERROR,
			(errcode(ERRCODE_SYNTAX_ERROR),
			 errmsg("invalid Unicode surrogate pair")));
	PG_RETURN_NULL();			/* not reached; keep compiler quiet */
}

View File

@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 202103266
#define CATALOG_VERSION_NO 202103291
#endif

View File

@ -11527,6 +11527,10 @@
proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
prosrc => 'unicode_is_normalized' },
{ oid => '9822', descr => 'unescape Unicode characters',
proname => 'unistr', prorettype => 'text', proargtypes => 'text',
prosrc => 'unistr' },
{ oid => '4596', descr => 'I/O',
proname => 'brin_bloom_summary_in', prorettype => 'pg_brin_bloom_summary',
proargtypes => 'cstring', prosrc => 'brin_bloom_summary_in' },

View File

@ -2234,3 +2234,39 @@ SELECT bit_count('\x1234567890'::bytea);
15
(1 row)
SELECT unistr('\0064at\+0000610');
unistr
--------
data0
(1 row)
SELECT unistr('d\u0061t\U000000610');
unistr
--------
data0
(1 row)
SELECT unistr('a\\b');
unistr
--------
a\b
(1 row)
-- errors:
SELECT unistr('wrong: \db99');
ERROR: invalid Unicode surrogate pair
SELECT unistr('wrong: \db99\0061');
ERROR: invalid Unicode surrogate pair
SELECT unistr('wrong: \+00db99\+000061');
ERROR: invalid Unicode surrogate pair
SELECT unistr('wrong: \+2FFFFF');
ERROR: invalid Unicode code point: 2FFFFF
SELECT unistr('wrong: \udb99\u0061');
ERROR: invalid Unicode surrogate pair
SELECT unistr('wrong: \U0000db99\U00000061');
ERROR: invalid Unicode surrogate pair
SELECT unistr('wrong: \U002FFFFF');
ERROR: invalid Unicode code point: 2FFFFF
SELECT unistr('wrong: \xyz');
ERROR: invalid Unicode escape
HINT: Unicode escapes must be \XXXX, \+XXXXXX, \uXXXX, or \UXXXXXXXX.

View File

@ -746,3 +746,16 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8)
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
SELECT bit_count('\x1234567890'::bytea);
SELECT unistr('\0064at\+0000610');
SELECT unistr('d\u0061t\U000000610');
SELECT unistr('a\\b');
-- errors:
SELECT unistr('wrong: \db99');
SELECT unistr('wrong: \db99\0061');
SELECT unistr('wrong: \+00db99\+000061');
SELECT unistr('wrong: \+2FFFFF');
SELECT unistr('wrong: \udb99\u0061');
SELECT unistr('wrong: \U0000db99\U00000061');
SELECT unistr('wrong: \U002FFFFF');
SELECT unistr('wrong: \xyz');