Unicode escapes in E'...' strings

Author: Marko Kreen <markokr@gmail.com>
This commit is contained in:
Peter Eisentraut 2009-09-22 23:52:53 +00:00
parent 9048b73184
commit c2bb0378cf
3 changed files with 98 additions and 9 deletions

View File

@ -1,4 +1,4 @@
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.136 2009/09/22 23:52:53 petere Exp $ -->
<chapter id="sql-syntax">
<title>SQL Syntax</title>
@ -398,6 +398,14 @@ SELECT 'foo' 'bar';
</entry>
<entry>hexadecimal byte value</entry>
</row>
<row>
<entry>
<literal>\u<replaceable>xxxx</replaceable></literal>,
<literal>\U<replaceable>xxxxxxxx</replaceable></literal>
(<replaceable>x</replaceable> = 0 - 9, A - F)
</entry>
<entry>16 or 32-bit hexadecimal Unicode character value</entry>
</row>
</tbody>
</tgroup>
</table>
@ -411,13 +419,25 @@ SELECT 'foo' 'bar';
</para>
<para>
It is your responsibility that the byte sequences you create are
It is your responsibility that the byte sequences you create,
especially when using the octal or hexadecimal escapes, compose
valid characters in the server character set encoding. When the
server encoding is UTF-8, then the alternative Unicode escape
syntax, explained in <xref linkend="sql-syntax-strings-uescape">,
should be used instead. (The alternative would be doing the
UTF-8 encoding by hand and writing out the bytes, which would be
very cumbersome.)
server encoding is UTF-8, then the Unicode escapes or the
alternative Unicode escape syntax, explained
in <xref linkend="sql-syntax-strings-uescape">, should be used
instead. (The alternative would be doing the UTF-8 encoding by
hand and writing out the bytes, which would be very cumbersome.)
</para>
<para>
The Unicode escape syntax works fully only when the server
encoding is UTF-8. When other server encodings are used, only
code points in the ASCII range (up to <literal>\u007F</>) can be
specified. Both the 4-digit and the 8-digit form can be used to
specify UTF-16 surrogate pairs to compose characters with code
points larger than U+FFFF (although the
availability of the 8-digit form technically makes this
unnecessary).
</para>
<caution>

View File

@ -24,7 +24,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.159 2009/09/22 23:52:53 petere Exp $
*
*-------------------------------------------------------------------------
*/
@ -80,6 +80,9 @@ static void addlitchar(unsigned char ychar, base_yyscan_t yyscanner);
static char *litbufdup(base_yyscan_t yyscanner);
static char *litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner);
static unsigned char unescape_single_char(unsigned char c, base_yyscan_t yyscanner);
/*
 * Helpers for UTF-16 surrogate handling in \u / \U escapes.  The scanner
 * rules use the two predicates to classify a decoded escape value as the
 * first or second half of a surrogate pair, and the third function to
 * combine an accepted pair into one code point.
 * NOTE(review): definitions are not visible in this hunk -- confirm the
 * exact ranges they test.
 */
static bool is_utf16_surrogate_first(pg_wchar c);
static bool is_utf16_surrogate_second(pg_wchar c);
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
#define yyerror(msg) scanner_yyerror(msg, yyscanner)
@ -97,6 +100,8 @@ static void check_escape_warning(base_yyscan_t yyscanner);
extern int base_yyget_column(yyscan_t yyscanner);
extern void base_yyset_column(int column_no, yyscan_t yyscanner);
/*
 * Append one Unicode code point to the literal buffer.
 * NOTE(review): the definition of addunicode() declares its second
 * parameter as base_yyscan_t, while this prototype says yyscan_t; the two
 * are presumably the same underlying type, but consider using one spelling.
 */
static void addunicode(pg_wchar c, yyscan_t yyscanner);
%}
%option reentrant
@ -134,6 +139,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
* <xdolq> $foo$ quoted strings
* <xui> quoted identifier with Unicode escapes
* <xus> quoted string with Unicode escapes
* <xeu> Unicode surrogate pair in extended quoted string
*/
%x xb
@ -145,6 +151,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
%x xdolq
%x xui
%x xus
%x xeu
/*
* In order to make the world safe for Windows and Mac clients as well as
@ -223,6 +230,8 @@ xeinside [^\\']+
xeescape [\\][^0-7]
xeoctesc [\\][0-7]{1,3}
xehexesc [\\]x[0-9A-Fa-f]{1,2}
xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
xeunicodebad [\\]([uU])
/* Extended quote
* xqdouble implements embedded quote, ''''
@ -535,6 +544,45 @@ other .
<xe>{xeinside} {
addlit(yytext, yyleng, yyscanner);
}
<xe>{xeunicode} {
/*
 * \uXXXX or \UXXXXXXXX inside an extended quoted string.
 * yytext+2 skips the backslash and the u/U so that only the
 * hexadecimal digits are converted.
 */
pg_wchar c = strtoul(yytext+2, NULL, 16);
check_escape_warning(yyscanner);
/*
 * The first half of a surrogate pair cannot be emitted by itself:
 * remember it and switch to <xeu>, where the second half is expected.
 */
if (is_utf16_surrogate_first(c))
{
yyextra->utf16_first_part = c;
BEGIN(xeu);
}
else if (is_utf16_surrogate_second(c))
/* a second half with no preceding first half is malformed */
yyerror("invalid Unicode surrogate pair");
else
/* ordinary code point: append it to the literal directly */
addunicode(c, yyscanner);
}
<xeu>{xeunicode} {
/*
 * State <xeu>: the previous escape was the first half of a
 * surrogate pair (saved in yyextra->utf16_first_part), so this
 * escape must be the second half.  Any other input in this state
 * (any character, a newline, or end of input) is rejected by the
 * rules that follow this one.
 */
pg_wchar c = strtoul(yytext+2, NULL, 16);
/*
 * NOTE(review): the code below relies on yyerror() not returning;
 * otherwise an invalid pair would still be combined -- confirm.
 */
if (!is_utf16_surrogate_second(c))
yyerror("invalid Unicode surrogate pair");
c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
addunicode(c, yyscanner);
/* pair consumed: resume normal extended-string scanning */
BEGIN(xe);
}
<xeu>. |
<xeu>\n |
<xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); }
<xe>{xeunicodebad} {
/*
 * Backslash followed by u or U without the required 4 or 8 hex
 * digits.  flex prefers the longest match, so {xeunicode} wins
 * whenever the digits are present.  This rule matches the same
 * two characters as {xeescape} would, so it must stay listed
 * ahead of the {xeescape} rule to take precedence.
 */
ereport(ERROR,
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
errmsg("invalid Unicode escape"),
errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
lexer_errposition()));
}
<xe>{xeescape} {
if (yytext[1] == '\'')
{
@ -1330,3 +1378,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner)
if (ptr)
pfree(ptr);
}
/*
 * addunicode
 *		Append the Unicode code point "c" to the literal being assembled.
 *
 * The value is rejected when it is zero or greater than 0x10FFFF.  Values
 * above 0x7F are additionally rejected unless the database encoding is
 * PG_UTF8; when accepted they set yyextra->saw_non_ascii.  The code point
 * is converted with unicode_to_utf8() and the resulting bytes are handed
 * to addlit().
 *
 * NOTE(review): this relies on yyerror() not returning to the caller --
 * confirm against scanner_yyerror().
 */
static void
addunicode(pg_wchar c, base_yyscan_t yyscanner)
{
	char		utf8[8];
	int			nbytes;

	/* valid escapes lie in 1 .. 0x10FFFF */
	if (!(c != 0 && c <= 0x10FFFF))
		yyerror("invalid Unicode escape value");

	if (c >= 0x80)
	{
		/* beyond ASCII, only a UTF8 database can store the result */
		if (GetDatabaseEncoding() != PG_UTF8)
			yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
		yyextra->saw_non_ascii = true;
	}

	unicode_to_utf8(c, (unsigned char *) utf8);

	/* ask pg_mblen() how many bytes the converted character occupies */
	nbytes = pg_mblen(utf8);
	addlit(utf8, nbytes, yyscanner);
}

View File

@ -11,7 +11,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.47 2009/07/14 20:24:10 tgl Exp $
* $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.48 2009/09/22 23:52:53 petere Exp $
*
*-------------------------------------------------------------------------
*/
@ -71,6 +71,9 @@ typedef struct base_yy_extra_type
int xcdepth; /* depth of nesting in slash-star comments */
char *dolqstart; /* current $foo$ quote start string */
/* first part of UTF16 surrogate pair for Unicode escapes */
int32 utf16_first_part;
/* state variables for literal-lexing warnings */
bool warn_on_first_escape;
bool saw_non_ascii;