From c2bb0378cfcba28d57e357d0daa5ec895a51d8a8 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 22 Sep 2009 23:52:53 +0000 Subject: [PATCH] Unicode escapes in E'...' strings Author: Marko Kreen --- doc/src/sgml/syntax.sgml | 34 +++++++++++++---- src/backend/parser/scan.l | 68 +++++++++++++++++++++++++++++++++- src/include/parser/gramparse.h | 5 ++- 3 files changed, 98 insertions(+), 9 deletions(-) diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml index c805e2e714..73db3235bd 100644 --- a/doc/src/sgml/syntax.sgml +++ b/doc/src/sgml/syntax.sgml @@ -1,4 +1,4 @@ - + SQL Syntax @@ -398,6 +398,14 @@ SELECT 'foo' 'bar'; hexadecimal byte value + + + \uxxxx, + \Uxxxxxxxx + (x = 0 - 9, A - F) + + 16 or 32-bit hexadecimal Unicode character value + @@ -411,13 +419,25 @@ SELECT 'foo' 'bar'; - It is your responsibility that the byte sequences you create are + It is your responsibility that the byte sequences you create, + especially when using the octal or hexadecimal escapes, compose valid characters in the server character set encoding. When the - server encoding is UTF-8, then the alternative Unicode escape - syntax, explained in , - should be used instead. (The alternative would be doing the - UTF-8 encoding by hand and writing out the bytes, which would be - very cumbersome.) + server encoding is UTF-8, then the Unicode escapes or the + alternative Unicode escape syntax, explained + in , should be used + instead. (The alternative would be doing the UTF-8 encoding by + hand and writing out the bytes, which would be very cumbersome.) + + + + The Unicode escape syntax works fully only when the server + encoding is UTF-8. When other server encodings are used, only + code points in the ASCII range (up to \u007F) can be + specified. 
Both the 4-digit and the 8-digit form can be used to + specify UTF-16 surrogate pairs to compose characters with code + points larger than \uFFFF (although the + availability of the 8-digit form technically makes this + unnecessary). diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index d40bd9dd97..fcfe2b3c40 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -24,7 +24,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $ + * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.159 2009/09/22 23:52:53 petere Exp $ * *------------------------------------------------------------------------- */ @@ -80,6 +80,9 @@ static void addlitchar(unsigned char ychar, base_yyscan_t yyscanner); static char *litbufdup(base_yyscan_t yyscanner); static char *litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner); static unsigned char unescape_single_char(unsigned char c, base_yyscan_t yyscanner); +static bool is_utf16_surrogate_first(pg_wchar c); +static bool is_utf16_surrogate_second(pg_wchar c); +static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second); #define yyerror(msg) scanner_yyerror(msg, yyscanner) @@ -97,6 +100,8 @@ static void check_escape_warning(base_yyscan_t yyscanner); extern int base_yyget_column(yyscan_t yyscanner); extern void base_yyset_column(int column_no, yyscan_t yyscanner); +static void addunicode(pg_wchar c, base_yyscan_t yyscanner); + %} %option reentrant @@ -134,6 +139,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner); * <xdolq> $foo$ quoted strings * <xui> quoted identifier with Unicode escapes * <xus> quoted string with Unicode escapes + * <xeu> Unicode surrogate pair in extended quoted string */ %x xb @@ -145,6 +151,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner); %x xdolq %x xui %x xus +%x xeu /* * In order to make the world safe for Windows and Mac 
clients as well as @@ -223,6 +230,8 @@ xeinside [^\\']+ xeescape [\\][^0-7] xeoctesc [\\][0-7]{1,3} xehexesc [\\]x[0-9A-Fa-f]{1,2} +xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}) +xeunicodebad [\\]([uU]) /* Extended quote * xqdouble implements embedded quote, '''' @@ -535,6 +544,45 @@ other . <xe>{xeinside} { addlit(yytext, yyleng, yyscanner); } +<xe>{xeunicode} { + pg_wchar c = strtoul(yytext+2, NULL, 16); /* hex digits start after the 2-character prefix */ + + check_escape_warning(yyscanner); + + if (is_utf16_surrogate_first(c)) + { + yyextra->utf16_first_part = c; /* saved until the second half is scanned in xeu */ + BEGIN(xeu); + } + else if (is_utf16_surrogate_second(c)) /* second half with no first half before it */ + yyerror("invalid Unicode surrogate pair"); + else + addunicode(c, yyscanner); + } +<xeu>{xeunicode} { /* only the second half of a surrogate pair is acceptable here */ + pg_wchar c = strtoul(yytext+2, NULL, 16); + + if (!is_utf16_surrogate_second(c)) + yyerror("invalid Unicode surrogate pair"); + + c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c); + + addunicode(c, yyscanner); + + BEGIN(xe); + } +<xeu>. | +<xeu>\n | +<xeu><<EOF>> { /* a first half not followed by another Unicode escape */ yyerror("invalid Unicode surrogate pair"); } + +<xe>{xeunicodebad} { + ereport(ERROR, + (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), + errmsg("invalid Unicode escape"), + errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."), + lexer_errposition())); + } + <xe>{xeescape} { if (yytext[1] == '\'') { @@ -1330,3 +1378,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner) if (ptr) pfree(ptr); } +/* addunicode: append code point c to the literal buffer as UTF-8; reports via yyerror a value of 0, a value above 0x10FFFF, or a value above 0x7F when the server encoding is not UTF8 */ +static void +addunicode(pg_wchar c, base_yyscan_t yyscanner) +{ + char buf[8]; /* receives the UTF-8 bytes written by unicode_to_utf8 */ + + if (c == 0 || c > 0x10FFFF) + yyerror("invalid Unicode escape value"); + if (c > 0x7F) + { + if (GetDatabaseEncoding() != PG_UTF8) + yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); + yyextra->saw_non_ascii = true; + } + unicode_to_utf8(c, (unsigned char *)buf); + addlit(buf, pg_mblen(buf), yyscanner); +} + diff --git a/src/include/parser/gramparse.h b/src/include/parser/gramparse.h index 4b061e0504..df384a11ca 100644 --- a/src/include/parser/gramparse.h +++ b/src/include/parser/gramparse.h @@ -11,7 +11,7 @@ * 
Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.47 2009/07/14 20:24:10 tgl Exp $ + * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.48 2009/09/22 23:52:53 petere Exp $ * *------------------------------------------------------------------------- */ @@ -71,6 +71,9 @@ typedef struct base_yy_extra_type int xcdepth; /* depth of nesting in slash-star comments */ char *dolqstart; /* current $foo$ quote start string */ + /* first part of UTF16 surrogate pair for Unicode escapes */ + int32 utf16_first_part; + /* state variables for literal-lexing warnings */ bool warn_on_first_escape; bool saw_non_ascii;