Change the way UESCAPE is lexed, to reduce the size of the flex tables.

The error rule used to avoid backtracking with the U&'...' UESCAPE 'x'
syntax bloated the flex tables, so refactor it. This patch shortens the error
rule by introducing a new exclusive flex state that is entered after
parsing U&'...'. This shrinks the postgres binary by about 220kB.
This commit is contained in:
Heikki Linnakangas 2013-03-14 19:00:09 +02:00
parent 59d0bf9dca
commit a5ff502fce
1 changed file with 62 additions and 19 deletions

View File

@ -97,6 +97,7 @@ static bool is_utf16_surrogate_first(pg_wchar c);
static bool is_utf16_surrogate_second(pg_wchar c); static bool is_utf16_surrogate_second(pg_wchar c);
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second); static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
static void addunicode(pg_wchar c, yyscan_t yyscanner); static void addunicode(pg_wchar c, yyscan_t yyscanner);
static bool check_uescapechar(unsigned char escape);
#define yyerror(msg) scanner_yyerror(msg, yyscanner) #define yyerror(msg) scanner_yyerror(msg, yyscanner)
@ -150,7 +151,9 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
* <xe> extended quoted strings (support backslash escape sequences) * <xe> extended quoted strings (support backslash escape sequences)
* <xdolq> $foo$ quoted strings * <xdolq> $foo$ quoted strings
* <xui> quoted identifier with Unicode escapes * <xui> quoted identifier with Unicode escapes
* <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
* <xus> quoted string with Unicode escapes * <xus> quoted string with Unicode escapes
* <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
* <xeu> Unicode surrogate pair in extended quoted string * <xeu> Unicode surrogate pair in extended quoted string
*/ */
@ -162,7 +165,9 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
%x xq %x xq
%x xdolq %x xdolq
%x xui %x xui
%x xuiend
%x xus %x xus
%x xusend
%x xeu %x xeu
/* /*
@ -279,17 +284,17 @@ xdinside [^"]+
/* Unicode escapes */ /* Unicode escapes */
uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
/* error rule to avoid backup */ /* error rule to avoid backup */
uescapefail ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]) uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
/* Quoted identifier with Unicode escapes */ /* Quoted identifier with Unicode escapes */
xuistart [uU]&{dquote} xuistart [uU]&{dquote}
xuistop1 {dquote}{whitespace}*{uescapefail}?
xuistop2 {dquote}{whitespace}*{uescape}
/* Quoted string with Unicode escapes */ /* Quoted string with Unicode escapes */
xusstart [uU]&{quote} xusstart [uU]&{quote}
xusstop1 {quote}{whitespace}*{uescapefail}?
xusstop2 {quote}{whitespace}*{uescape} /* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
xustop1 {uescapefail}?
xustop2 {uescape}
/* error rule to avoid backup */ /* error rule to avoid backup */
xufailed [uU]& xufailed [uU]&
@ -536,15 +541,31 @@ other .
yylval->str = litbufdup(yyscanner); yylval->str = litbufdup(yyscanner);
return SCONST; return SCONST;
} }
<xus>{xusstop1} { <xus>{quotestop} |
<xus>{quotefail} {
/* throw back all but the quote */ /* throw back all but the quote */
yyless(1); yyless(1);
/* handle possible UESCAPE in xusend mode */
BEGIN(xusend);
}
<xusend>{whitespace}
<xusend>{other} |
<xusend>{xustop1} {
/* no UESCAPE after the quote, throw back everything */
yyless(0);
BEGIN(INITIAL); BEGIN(INITIAL);
yylval->str = litbuf_udeescape('\\', yyscanner); yylval->str = litbuf_udeescape('\\', yyscanner);
return SCONST; return SCONST;
} }
<xus>{xusstop2} { <xusend>{xustop2} {
/* found UESCAPE after the end quote */
BEGIN(INITIAL); BEGIN(INITIAL);
if (!check_uescapechar(yytext[yyleng-2]))
{
SET_YYLLOC();
ADVANCE_YYLLOC(yyleng-2);
yyerror("invalid Unicode escape character");
}
yylval->str = litbuf_udeescape(yytext[yyleng-2], yyscanner); yylval->str = litbuf_udeescape(yytext[yyleng-2], yyscanner);
return SCONST; return SCONST;
} }
@ -702,9 +723,19 @@ other .
yylval->str = ident; yylval->str = ident;
return IDENT; return IDENT;
} }
<xui>{xuistop1} { <xui>{dquote} {
yyless(1);
/* handle possible UESCAPE in xuiend mode */
BEGIN(xuiend);
}
<xuiend>{whitespace} { }
<xuiend>{other} |
<xuiend>{xustop1} {
/* no UESCAPE after the quote, throw back everything */
char *ident; char *ident;
yyless(0);
BEGIN(INITIAL); BEGIN(INITIAL);
if (yyextra->literallen == 0) if (yyextra->literallen == 0)
yyerror("zero-length delimited identifier"); yyerror("zero-length delimited identifier");
@ -712,16 +743,21 @@ other .
if (yyextra->literallen >= NAMEDATALEN) if (yyextra->literallen >= NAMEDATALEN)
truncate_identifier(ident, yyextra->literallen, true); truncate_identifier(ident, yyextra->literallen, true);
yylval->str = ident; yylval->str = ident;
/* throw back all but the quote */
yyless(1);
return IDENT; return IDENT;
} }
<xui>{xuistop2} { <xuiend>{xustop2} {
/* found UESCAPE after the end quote */
char *ident; char *ident;
BEGIN(INITIAL); BEGIN(INITIAL);
if (yyextra->literallen == 0) if (yyextra->literallen == 0)
yyerror("zero-length delimited identifier"); yyerror("zero-length delimited identifier");
if (!check_uescapechar(yytext[yyleng-2]))
{
SET_YYLLOC();
ADVANCE_YYLLOC(yyleng-2);
yyerror("invalid Unicode escape character");
}
ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner); ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
if (yyextra->literallen >= NAMEDATALEN) if (yyextra->literallen >= NAMEDATALEN)
truncate_identifier(ident, yyextra->literallen, true); truncate_identifier(ident, yyextra->literallen, true);
@ -1203,22 +1239,29 @@ addunicode(pg_wchar c, core_yyscan_t yyscanner)
addlit(buf, pg_mblen(buf), yyscanner); addlit(buf, pg_mblen(buf), yyscanner);
} }
static char * /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner) static bool
check_uescapechar(unsigned char escape)
{ {
char *new;
char *litbuf, *in, *out;
pg_wchar pair_first = 0;
if (isxdigit(escape) if (isxdigit(escape)
|| escape == '+' || escape == '+'
|| escape == '\'' || escape == '\''
|| escape == '"' || escape == '"'
|| scanner_isspace(escape)) || scanner_isspace(escape))
{ {
ADVANCE_YYLLOC(yyextra->literallen + yyleng + 1); return false;
yyerror("invalid Unicode escape character");
} }
else
return true;
}
/* like litbufdup, but handle unicode escapes */
static char *
litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
{
char *new;
char *litbuf, *in, *out;
pg_wchar pair_first = 0;
/* Make literalbuf null-terminated to simplify the scanning loop */ /* Make literalbuf null-terminated to simplify the scanning loop */
litbuf = yyextra->literalbuf; litbuf = yyextra->literalbuf;