From ec937d0805b205e5b33ed4f3cb54f40230a826e3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 13 Nov 2018 12:57:52 -0500 Subject: [PATCH] Align ECPG lexer more closely with the core and psql lexers. Make a bunch of basically-cosmetic changes to reduce the diffs between the flex rules in scan.l, psqlscan.l, and pgc.l. Reorder some code, adjust a lot of whitespace, sync some comments, make use of flex start condition scopes to do that. There are a few non-cosmetic changes in the ECPG lexer: * Bring over the decimalfail rule (and support function process_integer_literal) so that ECPG will lex "1..10" into the same tokens as the backend would. I'm not sure this makes any visible difference to users, but I'm not sure it doesn't, either. * <xdc><<EOF>> gets its own rule so as to produce a more on-point error message. * Remove duplicate <SQL>{xdstart} rule. John Naylor, with a few additional changes by me Discussion: https://postgr.es/m/CAJVSVGWGqY9YBs2EwtRUkbNv=hXkN8yRPOoD1wxE6COgvvrz5g@mail.gmail.com --- src/backend/parser/scan.l | 23 +- src/fe_utils/psqlscan.l | 22 +- src/interfaces/ecpg/preproc/pgc.l | 1049 ++++++++++++++++------------- 3 files changed, 623 insertions(+), 471 deletions(-) diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index 950b8b8591..6c6a6e320f 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -6,7 +6,8 @@ * * NOTE NOTE NOTE: * - * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l! + * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l + * and src/interfaces/ecpg/preproc/pgc.l! 
* * The rules are designed so that the scanner never has to backtrack, * in the sense that there is always a rule that can match the input @@ -168,8 +169,8 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); %x xc %x xd %x xh -%x xe %x xq +%x xe %x xdolq %x xui %x xuiend @@ -192,7 +193,7 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner); * XXX perhaps \f (formfeed) should be treated as a newline as well? * * XXX if you change the set of whitespace characters, fix scanner_isspace() - * to agree, and see also the plpgsql lexer. + * to agree. */ space [ \t\n\r\f] @@ -417,32 +418,36 @@ other . yyless(2); } -{xcstart} { +{ +{xcstart} { (yyextra->xcdepth)++; /* Put back any characters past slash-star; see above */ yyless(2); } -{xcstop} { +{xcstop} { if (yyextra->xcdepth <= 0) BEGIN(INITIAL); else (yyextra->xcdepth)--; } -{xcinside} { +{xcinside} { /* ignore */ } -{op_chars} { +{op_chars} { /* ignore */ } -\*+ { +\*+ { /* ignore */ } -<> { yyerror("unterminated /* comment"); } +<> { + yyerror("unterminated /* comment"); + } +} /* */ {xbstart} { /* Binary bit type. diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l index fdf49875a7..ae5418e7da 100644 --- a/src/fe_utils/psqlscan.l +++ b/src/fe_utils/psqlscan.l @@ -23,6 +23,7 @@ * * See psqlscan_int.h for additional commentary. 
* + * * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -39,6 +40,9 @@ } %{ + +/* LCOV_EXCL_START */ + #include "fe_utils/psqlscan_int.h" /* @@ -71,8 +75,6 @@ typedef int YYSTYPE; extern int psql_yyget_column(yyscan_t yyscanner); extern void psql_yyset_column(int column_no, yyscan_t yyscanner); -/* LCOV_EXCL_START */ - %} %option reentrant @@ -128,8 +130,8 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner); %x xc %x xd %x xh -%x xe %x xq +%x xe %x xdolq %x xui %x xuiend @@ -151,7 +153,7 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner); * XXX perhaps \f (formfeed) should be treated as a newline as well? * * XXX if you change the set of whitespace characters, fix scanner_isspace() - * to agree, and see also the plpgsql lexer. + * to agree. */ space [ \t\n\r\f] @@ -402,14 +404,15 @@ other . ECHO; } -{xcstart} { +{ +{xcstart} { cur_state->xcdepth++; /* Put back any characters past slash-star; see above */ yyless(2); ECHO; } -{xcstop} { +{xcstop} { if (cur_state->xcdepth <= 0) BEGIN(INITIAL); else @@ -417,17 +420,18 @@ other . 
ECHO; } -{xcinside} { +{xcinside} { ECHO; } -{op_chars} { +{op_chars} { ECHO; } -\*+ { +\*+ { ECHO; } +} /* */ {xbstart} { BEGIN(xb); diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l index 0792118cfe..91ee44f091 100644 --- a/src/interfaces/ecpg/preproc/pgc.l +++ b/src/interfaces/ecpg/preproc/pgc.l @@ -10,7 +10,6 @@ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * * IDENTIFICATION * src/interfaces/ecpg/preproc/pgc.l * @@ -28,6 +27,9 @@ } %{ + +/* LCOV_EXCL_START */ + extern YYSTYPE base_yylval; static int xcdepth = 0; /* depth of nesting in slash-star comments */ @@ -53,8 +55,9 @@ static bool include_next; #define startlit() (literalbuf[0] = '\0', literallen = 0) static void addlit(char *ytext, int yleng); -static void addlitchar (unsigned char); -static void parse_include (void); +static void addlitchar(unsigned char); +static int process_integer_literal(const char *token, YYSTYPE *lval); +static void parse_include(void); static bool ecpg_isspace(char ch); static bool isdefine(void); static bool isinformixdefine(void); @@ -81,8 +84,6 @@ static struct _if_value short else_branch; } stacked_if_value[MAX_NESTED_IF]; -/* LCOV_EXCL_START */ - %} %option 8bit @@ -91,11 +92,8 @@ static struct _if_value %option noinput %option noyywrap %option warn -%option prefix="base_yy" - %option yylineno - -%x C SQL incl def def_ident undef +%option prefix="base_yy" /* * OK, here is a short description of lex/flex rules behavior. @@ -108,18 +106,24 @@ static struct _if_value * We use exclusive states for quoted strings, extended comments, * and to eliminate parsing troubles for numeric strings. 
* Exclusive states: - * <xb> bit string literal - * <xcc> extended C-style comments in C - * <xcsql> extended C-style comments in SQL - * <xd> delimited identifiers (double-quoted identifiers) - thomas 1997-10-27 - * <xh> hexadecimal numeric string - thomas 1997-11-16 - * <xq> standard quoted strings - thomas 1997-07-30 - * <xqc> standard quoted strings in C - michael - * <xe> extended quoted strings (support backslash escape sequences) - * <xn> national character quoted strings + * <xb> bit string literal + * <xcc> extended C-style comments in C + * <xcsql> extended C-style comments in SQL + * <xd> delimited identifiers (double-quoted identifiers) + * <xdc> double-quoted strings in C + * <xh> hexadecimal numeric string + * <xn> national character quoted strings + * <xq> standard quoted strings + * <xe> extended quoted strings (support backslash escape sequences) + * <xqc> single-quoted strings in C * <xdolq> $foo$ quoted strings * <xui> quoted identifier with Unicode escapes * <xus> quoted string with Unicode escapes + * <xcond> condition of an EXEC SQL IFDEF construct + * <xskip> skipping the inactive part of an EXEC SQL IFDEF construct + * + * Remember to add an <<EOF>> case whenever you add a new exclusive state! + * The default one is probably not the right thing. */ %x xb @@ -128,15 +132,60 @@ static struct _if_value %x xd %x xdc %x xh -%x xe %x xn %x xq +%x xe %x xqc %x xdolq -%x xcond -%x xskip %x xui %x xus +%x xcond +%x xskip + +/* Additional exclusive states that are specific to ECPG */ +%x C SQL incl def def_ident undef + +/* + * In order to make the world safe for Windows and Mac clients as well as + * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n + * sequence will be seen as two successive newlines, but that doesn't cause + * any problems. SQL-style comments, which start with -- and extend to the + * next newline, are treated as equivalent to a single whitespace character. + * + * NOTE a fine point: if there is no newline following --, we will absorb + * everything to the end of the input as a comment. This is correct. 
Older + * versions of Postgres failed to recognize -- as a comment if the input + * did not end with a newline. + * + * XXX perhaps \f (formfeed) should be treated as a newline as well? + * + * XXX if you change the set of whitespace characters, fix ecpg_isspace() + * to agree. + */ + +space [ \t\n\r\f] +horiz_space [ \t\f] +newline [\n\r] +non_newline [^\n\r] + +comment ("--"{non_newline}*) + +whitespace ({space}+|{comment}) + +/* + * SQL requires at least one newline in the whitespace separating + * string literals that are to be concatenated. Silly, but who are we + * to argue? Note that {whitespace_with_newline} should not have * after + * it, whereas {whitespace} should generally have a * after it... + */ + +horiz_whitespace ({horiz_space}|{comment}) +whitespace_with_newline ({horiz_whitespace}*{newline}{whitespace}*) + +quote ' +quotestop {quote}{whitespace}* +quotecontinue {quote}{whitespace_with_newline}{quote} +quotefail {quote}{whitespace}*"-" /* Bit string */ @@ -158,9 +207,6 @@ xeoctesc [\\][0-7]{1,3} xehexesc [\\]x[0-9A-Fa-f]{1,2} xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}) -/* C version of hex number */ -xch 0[xX][0-9A-Fa-f]* - /* Extended quote * xqdouble implements embedded quote, '''' */ @@ -194,7 +240,9 @@ xddouble {dquote}{dquote} xdinside [^"]+ /* Unicode escapes */ -/* (The ecpg scanner is not backup-free, so the fail rules in scan.l are not needed here, but could be added if desired.) */ +/* (The ecpg scanner is not backup-free, so the fail rules in scan.l are + * not needed here, but could be added if desired.) + */ uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} /* Quoted identifier with Unicode escapes */ @@ -211,22 +259,23 @@ xdcqdq \\\" xdcother [^"] xdcinside ({xdcqq}|{xdcqdq}|{xdcother}) + /* C-style comments * * The "extended comment" syntax closely resembles allowable operator syntax. 
* The tricky part here is to get lex to recognize a string starting with * slash-star as a comment, when interpreting it as an operator would produce - * a longer match --- remember lex will prefer a longer match! Also, if we + * a longer match --- remember lex will prefer a longer match! Also, if we * have something like plus-slash-star, lex will think this is a 3-character * operator whereas we want to see it as a + operator and a comment start. * The solution is two-fold: * 1. append {op_chars}* to xcstart so that it matches as much text as - * {operator} would. Then the tie-breaker (first matching rule of same - * length) ensures xcstart wins. We put back the extra stuff with yyless() - * in case it contains a star-slash that should terminate the comment. + * {operator} would. Then the tie-breaker (first matching rule of same + * length) ensures xcstart wins. We put back the extra stuff with yyless() + * in case it contains a star-slash that should terminate the comment. * 2. In the operator rule, check for slash-star within the operator, and - * if found throw it back with yyless(). This handles the plus-slash-star - * problem. + * if found throw it back with yyless(). This handles the plus-slash-star + * problem. * Dash-dash comments have similar interactions with the operator rule. */ xcstart \/\*{op_chars}* @@ -262,7 +311,7 @@ not_equals "!=" /* * "self" is the set of chars that should be returned as single-character - * tokens. "op_chars" is the set of chars that can make up "Op" tokens, + * tokens. "op_chars" is the set of chars that can make up "Op" tokens, * which can be one or more characters long (but if a single-char token * appears in the "self" set, it is not to be returned as an Op). Note * that the sets overlap, but each has some chars that are not in the other. @@ -278,68 +327,40 @@ operator {op_chars}+ * instead we pass it separately to parser. 
there it gets * coerced via doNegate() -- Leon aug 20 1999 * + * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10. + * * {realfail1} and {realfail2} are added to prevent the need for scanner * backup when the {real} rule fails to match completely. */ integer {digit}+ decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*)) +decimalfail {digit}+\.\. real ({integer}|{decimal})[Ee][-+]?{digit}+ realfail1 ({integer}|{decimal})[Ee] realfail2 ({integer}|{decimal})[Ee][-+] param \${integer} -/* - * In order to make the world safe for Windows and Mac clients as well as - * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n - * sequence will be seen as two successive newlines, but that doesn't cause - * any problems. SQL-style comments, which start with -- and extend to the - * next newline, are treated as equivalent to a single whitespace character. - * - * NOTE a fine point: if there is no newline following --, we will absorb - * everything to the end of the input as a comment. This is correct. Older - * versions of Postgres failed to recognize -- as a comment if the input - * did not end with a newline. - * - * XXX perhaps \f (formfeed) should be treated as a newline as well? - * - * XXX if you change the set of whitespace characters, fix ecpg_isspace() - * to agree. - */ - -ccomment "//".*\n - -space [ \t\n\r\f] -horiz_space [ \t\f] -newline [\n\r] -non_newline [^\n\r] - -comment ("--"{non_newline}*) - -whitespace ({space}+|{comment}) - -/* - * SQL requires at least one newline in the whitespace separating - * string literals that are to be concatenated. Silly, but who are we - * to argue? Note that {whitespace_with_newline} should not have * after - * it, whereas {whitespace} should generally have a * after it... 
- */ - -horiz_whitespace ({horiz_space}|{comment}) -whitespace_with_newline ({horiz_whitespace}*{newline}{whitespace}*) - -quote ' -quotestop {quote}{whitespace}* -quotecontinue {quote}{whitespace_with_newline}{quote} -quotefail {quote}{whitespace}*"-" - /* special characters for other dbms */ /* we have to react differently in compat mode */ informix_special [\$] other . +/* + * Dollar quoted strings are totally opaque, and no escaping is done on them. + * Other quoted strings must allow some special characters such as single-quote + * and newline. + * Embedded single-quotes are implemented both in the SQL standard + * style of two adjacent single quotes "''" and in the Postgres/Java style + * of escaped-quote "\'". + * Other embedded escaped characters are matched explicitly and the leading + * backslash is dropped from the string. + * Note that xcstart must appear before operator, as explained above! + * Also whitespace (comment) must appear before operator. + */ + /* some stuff needed for ecpg */ exec [eE][xX][eE][cC] sql [sS][qQ][lL] @@ -349,6 +370,11 @@ include_next [iI][nN][cC][lL][uU][dD][eE]_[nN][eE][xX][tT] import [iI][mM][pP][oO][rR][tT] undef [uU][nN][dD][eE][fF] +/* C version of hex number */ +xch 0[xX][0-9A-Fa-f]* + +ccomment "//".*\n + if [iI][fF] ifdef [iI][fF][dD][eE][fF] ifndef [iI][fF][nN][dD][eE][fF] @@ -366,24 +392,12 @@ ip {ipdigit}\.{ipdigit}\.{ipdigit}\.{ipdigit} cppinclude {space}*#{include}{space}* cppinclude_next {space}*#{include_next}{space}* -/* take care of cpp lines, they may also be continuated */ +/* take care of cpp lines, they may also be continued */ /* first a general line for all commands not starting with "i" */ /* and then the other commands starting with "i", we have to add these - * separately because the cppline production would match on "include" too */ -cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+\/)|.|\\{space}*{newline})*{newline} - -/* - * Dollar quoted strings are totally opaque, 
and no escaping is done on them. - * Other quoted strings must allow some special characters such as single-quote - * and newline. - * Embedded single-quotes are implemented both in the SQL standard - * style of two adjacent single quotes "''" and in the Postgres/Java style - * of escaped-quote "\'". - * Other embedded escaped characters are matched explicitly and the leading - * backslash is dropped from the string. - thomas 1997-09-24 - * Note that xcstart must appear before operator, as explained above! - * Also whitespace (comment) must appear before operator. + * separately because the cppline production would match on "include" too */ +cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+\/)|.|\\{space}*{newline})*{newline} %% @@ -392,18 +406,12 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ token_start = NULL; %} -{whitespace} { /* ignore */ } - -{xcstart} { - token_start = yytext; - state_before = YYSTATE; - xcdepth = 0; - BEGIN(xcc); - /* Put back any characters past slash-star; see above */ - yyless(2); - fputs("/*", yyout); +{ +{whitespace} { + /* ignore */ } -{xcstart} { + +{xcstart} { token_start = yytext; state_before = YYSTATE; xcdepth = 0; @@ -412,6 +420,17 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ yyless(2); fputs("/*", yyout); } +} /* */ + +{xcstart} { + token_start = yytext; + state_before = YYSTATE; + xcdepth = 0; + BEGIN(xcc); + /* Put back any characters past slash-star; see above */ + yyless(2); + fputs("/*", yyout); + } {xcstart} { ECHO; } {xcstart} { xcdepth++; @@ -437,20 +456,36 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ BEGIN(state_before); token_start = NULL; } -{xcinside} { ECHO; } -{op_chars} { ECHO; } -\*+ { ECHO; } -<> { mmfatal(PARSE_ERROR, "unterminated /* comment"); } +{ +{xcinside} { + ECHO; + } -{xbstart} { +{op_chars} { + ECHO; + } + +\*+ { + ECHO; + } + +<> { + mmfatal(PARSE_ERROR, 
"unterminated /* comment"); + } +} /* */ + +{ +{xbstart} { token_start = yytext; BEGIN(xb); startlit(); addlitchar('b'); } -{quotestop} | -{quotefail} { +} /* */ + +{quotestop} | +{quotefail} { yyless(1); BEGIN(SQL); if (literalbuf[strspn(literalbuf, "01") + 1] != '\0') @@ -458,11 +493,14 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ base_yylval.str = mm_strdup(literalbuf); return BCONST; } - {xhinside} | -{xbinside} { addlit(yytext, yyleng); } +{xbinside} { + addlit(yytext, yyleng); + } {quotecontinue} | -{quotecontinue} { /* ignore */ } +{quotecontinue} { + /* ignore */ + } <> { mmfatal(PARSE_ERROR, "unterminated bit string literal"); } {xhstart} { @@ -472,186 +510,251 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ addlitchar('x'); } {quotestop} | -{quotefail} { - yyless(1); - BEGIN(SQL); - base_yylval.str = mm_strdup(literalbuf); - return XCONST; - } - -<> { mmfatal(PARSE_ERROR, "unterminated hexadecimal string literal"); } -{xnstart} { - /* National character. - * Transfer it as-is to the backend. 
- */ - token_start = yytext; - state_before = YYSTATE; - BEGIN(xn); - startlit(); - } -{xqstart} { - token_start = yytext; - state_before = YYSTATE; - BEGIN(xqc); - startlit(); - } -{xqstart} { - token_start = yytext; - state_before = YYSTATE; - BEGIN(xq); - startlit(); - } -{xestart} { - token_start = yytext; - state_before = YYSTATE; - BEGIN(xe); - startlit(); - } -{xusstart} { - token_start = yytext; - state_before = YYSTATE; - BEGIN(xus); - startlit(); - addlit(yytext, yyleng); - } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return SCONST; - } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return ECONST; - } -{quotestop} | -{quotefail} { - yyless(1); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return NCONST; - } -{xusstop} { - addlit(yytext, yyleng); - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return UCONST; - } -{xqdouble} { addlitchar('\''); } -{xqcquote} { - addlitchar('\\'); - addlitchar('\''); - } -{xqinside} { addlit(yytext, yyleng); } -{xeinside} { addlit(yytext, yyleng); } -{xeunicode} { addlit(yytext, yyleng); } -{xeescape} { addlit(yytext, yyleng); } -{xeoctesc} { addlit(yytext, yyleng); } -{xehexesc} { addlit(yytext, yyleng); } -{quotecontinue} { /* ignore */ } -. 
{ - /* This is only needed for \ just before EOF */ - addlitchar(yytext[0]); - } -<> { mmfatal(PARSE_ERROR, "unterminated quoted string"); } -{dolqfailed} { - /* throw back all but the initial "$" */ - yyless(1); - /* and treat it as {other} */ - return yytext[0]; - } -{dolqdelim} { - token_start = yytext; - if (dolqstart) - free(dolqstart); - dolqstart = mm_strdup(yytext); - BEGIN(xdolq); - startlit(); - addlit(yytext, yyleng); - } -{dolqdelim} { - if (strcmp(yytext, dolqstart) == 0) - { - addlit(yytext, yyleng); - free(dolqstart); - dolqstart = NULL; +{quotefail} { + yyless(1); BEGIN(SQL); base_yylval.str = mm_strdup(literalbuf); - return DOLCONST; + return XCONST; } - else - { - /* - * When we fail to match $...$ to dolqstart, transfer - * the $... part to the output, but put back the final - * $ for rescanning. Consider $delim$...$junk$delim$ + +<> { mmfatal(PARSE_ERROR, "unterminated hexadecimal string literal"); } + +{xqstart} { + token_start = yytext; + state_before = YYSTATE; + BEGIN(xqc); + startlit(); + } + +{ +{xnstart} { + /* National character. + * Transfer it as-is to the backend. 
*/ - addlit(yytext, yyleng-1); - yyless(yyleng-1); + token_start = yytext; + state_before = YYSTATE; + BEGIN(xn); + startlit(); } - } -{dolqinside} { addlit(yytext, yyleng); } -{dolqfailed} { addlit(yytext, yyleng); } -{other} { - /* single quote or dollar sign */ - addlitchar(yytext[0]); - } -<> { base_yyerror("unterminated dollar-quoted string"); } -{xdstart} { - state_before = YYSTATE; - BEGIN(xd); - startlit(); - } -{xuistart} { - state_before = YYSTATE; - BEGIN(xui); - startlit(); + +{xqstart} { + token_start = yytext; + state_before = YYSTATE; + BEGIN(xq); + startlit(); + } +{xestart} { + token_start = yytext; + state_before = YYSTATE; + BEGIN(xe); + startlit(); + } +{xusstart} { + token_start = yytext; + state_before = YYSTATE; + BEGIN(xus); + startlit(); + addlit(yytext, yyleng); + } +} /* */ + +{quotestop} | +{quotefail} { + yyless(1); + BEGIN(state_before); + base_yylval.str = mm_strdup(literalbuf); + return SCONST; + } +{quotestop} | +{quotefail} { + yyless(1); + BEGIN(state_before); + base_yylval.str = mm_strdup(literalbuf); + return ECONST; + } +{quotestop} | +{quotefail} { + yyless(1); + BEGIN(state_before); + base_yylval.str = mm_strdup(literalbuf); + return NCONST; + } +{xusstop} { + addlit(yytext, yyleng); + BEGIN(state_before); + base_yylval.str = mm_strdup(literalbuf); + return UCONST; + } +{xqdouble} { addlitchar('\''); } +{xqcquote} { + addlitchar('\\'); + addlitchar('\''); + } +{xqinside} { addlit(yytext, yyleng); } +{xeinside} { + addlit(yytext, yyleng); + } +{xeunicode} { + addlit(yytext, yyleng); + } +{xeescape} { + addlit(yytext, yyleng); + } +{xeoctesc} { + addlit(yytext, yyleng); + } +{xehexesc} { + addlit(yytext, yyleng); + } +{quotecontinue} { + /* ignore */ + } +. 
{ + /* This is only needed for \ just before EOF */ + addlitchar(yytext[0]); + } +<> { mmfatal(PARSE_ERROR, "unterminated quoted string"); } + +{ +{dolqdelim} { + token_start = yytext; + if (dolqstart) + free(dolqstart); + dolqstart = mm_strdup(yytext); + BEGIN(xdolq); + startlit(); + addlit(yytext, yyleng); + } +{dolqfailed} { + /* throw back all but the initial "$" */ + yyless(1); + /* and treat it as {other} */ + return yytext[0]; + } +} /* */ + +{dolqdelim} { + if (strcmp(yytext, dolqstart) == 0) + { addlit(yytext, yyleng); - } -{xdstop} { - BEGIN(state_before); - if (literallen == 0) - mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier"); - /* The backend will truncate the identifier here. We do not as it does not change the result. */ + free(dolqstart); + dolqstart = NULL; + BEGIN(SQL); base_yylval.str = mm_strdup(literalbuf); - return CSTRING; + return DOLCONST; } -{xdstop} { - BEGIN(state_before); - base_yylval.str = mm_strdup(literalbuf); - return CSTRING; + else + { + /* + * When we fail to match $...$ to dolqstart, transfer + * the $... part to the output, but put back the final + * $ for rescanning. Consider $delim$...$junk$delim$ + */ + addlit(yytext, yyleng - 1); + yyless(yyleng - 1); } -{xuistop} { - BEGIN(state_before); - if (literallen == 2) /* "U&" */ - mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier"); - /* The backend will truncate the identifier here. We do not as it does not change the result. 
*/ - addlit(yytext, yyleng); - base_yylval.str = mm_strdup(literalbuf); - return UIDENT; - } -{xddouble} { addlitchar('"'); } -{xdinside} { addlit(yytext, yyleng); } -<> { mmfatal(PARSE_ERROR, "unterminated quoted identifier"); } -{xdstart} { - state_before = YYSTATE; - BEGIN(xdc); - startlit(); - } -{xdcinside} { addlit(yytext, yyleng); } -{typecast} { return TYPECAST; } -{dot_dot} { return DOT_DOT; } -{colon_equals} { return COLON_EQUALS; } -{equals_greater} { return EQUALS_GREATER; } -{less_equals} { return LESS_EQUALS; } -{greater_equals} { return GREATER_EQUALS; } -{less_greater} { return NOT_EQUALS; } -{not_equals} { return NOT_EQUALS; } -{informix_special} { + } +{dolqinside} { + addlit(yytext, yyleng); + } +{dolqfailed} { + addlit(yytext, yyleng); + } +. { + /* single quote or dollar sign */ + addlitchar(yytext[0]); + } +<> { mmfatal(PARSE_ERROR, "unterminated dollar-quoted string"); } + +{ +{xdstart} { + state_before = YYSTATE; + BEGIN(xd); + startlit(); + } +{xuistart} { + state_before = YYSTATE; + BEGIN(xui); + startlit(); + addlit(yytext, yyleng); + } +} /* */ + +{xdstop} { + BEGIN(state_before); + if (literallen == 0) + mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier"); + /* The backend will truncate the identifier here. We do not as it does not change the result. */ + base_yylval.str = mm_strdup(literalbuf); + return CSTRING; + } +{xdstop} { + BEGIN(state_before); + base_yylval.str = mm_strdup(literalbuf); + return CSTRING; + } +{xuistop} { + BEGIN(state_before); + if (literallen == 2) /* "U&" */ + mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier"); + /* The backend will truncate the identifier here. We do not as it does not change the result. 
*/ + addlit(yytext, yyleng); + base_yylval.str = mm_strdup(literalbuf); + return UIDENT; + } +{xddouble} { + addlitchar('"'); + } +{xdinside} { + addlit(yytext, yyleng); + } +<> { mmfatal(PARSE_ERROR, "unterminated quoted identifier"); } +{xdstart} { + state_before = YYSTATE; + BEGIN(xdc); + startlit(); + } +{xdcinside} { + addlit(yytext, yyleng); + } +<> { mmfatal(PARSE_ERROR, "unterminated quoted string"); } + +{ +{typecast} { + return TYPECAST; + } + +{dot_dot} { + return DOT_DOT; + } + +{colon_equals} { + return COLON_EQUALS; + } + +{equals_greater} { + return EQUALS_GREATER; + } + +{less_equals} { + return LESS_EQUALS; + } + +{greater_equals} { + return GREATER_EQUALS; + } + +{less_greater} { + /* We accept both "<>" and "!=" as meaning NOT_EQUALS */ + return NOT_EQUALS; + } + +{not_equals} { + /* We accept both "<>" and "!=" as meaning NOT_EQUALS */ + return NOT_EQUALS; + } + +{informix_special} { /* are we simulating Informix? */ if (INFORMIX_MODE) { @@ -660,184 +763,205 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ else return yytext[0]; } -{self} { /* - * We may find a ';' inside a structure - * definition in a TYPE or VAR statement. - * This is not an EOL marker. - */ - if (yytext[0] == ';' && struct_level == 0) - BEGIN(C); - return yytext[0]; - } -{operator} { - /* - * Check for embedded slash-star or dash-dash; those - * are comment starts, so operator must stop there. - * Note that slash-star or dash-dash at the first - * character will match a prior rule, not this one. 
- */ - int nchars = yyleng; - char *slashstar = strstr(yytext, "/*"); - char *dashdash = strstr(yytext, "--"); - if (slashstar && dashdash) - { - /* if both appear, take the first one */ - if (slashstar > dashdash) - slashstar = dashdash; - } - else if (!slashstar) - slashstar = dashdash; - if (slashstar) - nchars = slashstar - yytext; - - /* - * For SQL compatibility, '+' and '-' cannot be the - * last char of a multi-char operator unless the operator - * contains chars that are not in SQL operators. - * The idea is to lex '=-' as two operators, but not - * to forbid operator names like '?-' that could not be - * sequences of SQL operators. - */ - if (nchars > 1 && - (yytext[nchars - 1] == '+' || - yytext[nchars - 1] == '-')) - { - int ic; - - for (ic = nchars - 2; ic >= 0; ic--) - { - char c = yytext[ic]; - if (c == '~' || c == '!' || c == '@' || - c == '#' || c == '^' || c == '&' || - c == '|' || c == '`' || c == '?' || - c == '%') - break; - } - if (ic < 0) - { - /* - * didn't find a qualifying character, so remove - * all trailing [+-] - */ - do { - nchars--; - } while (nchars > 1 && - (yytext[nchars - 1] == '+' || - yytext[nchars - 1] == '-')); - } - } - - if (nchars < yyleng) - { - /* Strip the unwanted chars from the token */ - yyless(nchars); - /* - * If what we have left is only one char, and it's - * one of the characters matching "self", then - * return it as a character token the same way - * that the "self" rule would have. - */ - if (nchars == 1 && - strchr(",()[].;:+-*/%^<>=", yytext[0])) - return yytext[0]; - /* - * Likewise, if what we have left is two chars, and - * those match the tokens ">=", "<=", "=>", "<>" or - * "!=", then we must return the appropriate token - * rather than the generic Op. 
- */ - if (nchars == 2) - { - if (yytext[0] == '=' && yytext[1] == '>') - return EQUALS_GREATER; - if (yytext[0] == '>' && yytext[1] == '=') - return GREATER_EQUALS; - if (yytext[0] == '<' && yytext[1] == '=') - return LESS_EQUALS; - if (yytext[0] == '<' && yytext[1] == '>') - return NOT_EQUALS; - if (yytext[0] == '!' && yytext[1] == '=') - return NOT_EQUALS; - } - } - - base_yylval.str = mm_strdup(yytext); - return Op; - } -{param} { - base_yylval.ival = atol(yytext+1); - return PARAM; - } -{integer} { - int val; - char* endptr; - - errno = 0; - val = strtoint(yytext, &endptr, 10); - if (*endptr != '\0' || errno == ERANGE) - { - errno = 0; - base_yylval.str = mm_strdup(yytext); - return FCONST; - } - base_yylval.ival = val; - return ICONST; - } -{ip} { - base_yylval.str = mm_strdup(yytext); - return IP; +{self} { + /* + * We may find a ';' inside a structure + * definition in a TYPE or VAR statement. + * This is not an EOL marker. + */ + if (yytext[0] == ';' && struct_level == 0) + BEGIN(C); + return yytext[0]; } -{decimal} { - base_yylval.str = mm_strdup(yytext); - return FCONST; - } -{real} { - base_yylval.str = mm_strdup(yytext); - return FCONST; - } -{realfail1} { - yyless(yyleng-1); - base_yylval.str = mm_strdup(yytext); - return FCONST; - } -{realfail2} { - yyless(yyleng-2); - base_yylval.str = mm_strdup(yytext); - return FCONST; - } -:{identifier}((("->"|\.){identifier})|(\[{array}\]))* { - base_yylval.str = mm_strdup(yytext+1); - return CVARIABLE; - } -{identifier} { - const ScanKeyword *keyword; - if (!isdefine()) +{operator} { + /* + * Check for embedded slash-star or dash-dash; those + * are comment starts, so operator must stop there. + * Note that slash-star or dash-dash at the first + * character will match a prior rule, not this one. 
+ */ + int nchars = yyleng; + char *slashstar = strstr(yytext, "/*"); + char *dashdash = strstr(yytext, "--"); + + if (slashstar && dashdash) + { + /* if both appear, take the first one */ + if (slashstar > dashdash) + slashstar = dashdash; + } + else if (!slashstar) + slashstar = dashdash; + if (slashstar) + nchars = slashstar - yytext; + + /* + * For SQL compatibility, '+' and '-' cannot be the + * last char of a multi-char operator unless the operator + * contains chars that are not in SQL operators. + * The idea is to lex '=-' as two operators, but not + * to forbid operator names like '?-' that could not be + * sequences of SQL operators. + */ + if (nchars > 1 && + (yytext[nchars - 1] == '+' || + yytext[nchars - 1] == '-')) + { + int ic; + + for (ic = nchars - 2; ic >= 0; ic--) + { + char c = yytext[ic]; + if (c == '~' || c == '!' || c == '@' || + c == '#' || c == '^' || c == '&' || + c == '|' || c == '`' || c == '?' || + c == '%') + break; + } + if (ic < 0) { - /* Is it an SQL/ECPG keyword? */ - keyword = ScanECPGKeywordLookup(yytext); - if (keyword != NULL) - return keyword->value; - - /* Is it a C keyword? */ - keyword = ScanCKeywordLookup(yytext); - if (keyword != NULL) - return keyword->value; - /* - * None of the above. Return it as an identifier. - * - * The backend will attempt to truncate and case-fold - * the identifier, but I see no good reason for ecpg - * to do so; that's just another way that ecpg could get - * out of step with the backend. 
+ * didn't find a qualifying character, so remove + * all trailing [+-] */ - base_yylval.str = mm_strdup(yytext); - return IDENT; + do { + nchars--; + } while (nchars > 1 && + (yytext[nchars - 1] == '+' || + yytext[nchars - 1] == '-')); } } -{other} { return yytext[0]; } + + if (nchars < yyleng) + { + /* Strip the unwanted chars from the token */ + yyless(nchars); + /* + * If what we have left is only one char, and it's + * one of the characters matching "self", then + * return it as a character token the same way + * that the "self" rule would have. + */ + if (nchars == 1 && + strchr(",()[].;:+-*/%^<>=", yytext[0])) + return yytext[0]; + /* + * Likewise, if what we have left is two chars, and + * those match the tokens ">=", "<=", "=>", "<>" or + * "!=", then we must return the appropriate token + * rather than the generic Op. + */ + if (nchars == 2) + { + if (yytext[0] == '=' && yytext[1] == '>') + return EQUALS_GREATER; + if (yytext[0] == '>' && yytext[1] == '=') + return GREATER_EQUALS; + if (yytext[0] == '<' && yytext[1] == '=') + return LESS_EQUALS; + if (yytext[0] == '<' && yytext[1] == '>') + return NOT_EQUALS; + if (yytext[0] == '!' && yytext[1] == '=') + return NOT_EQUALS; + } + } + + base_yylval.str = mm_strdup(yytext); + return Op; + } + +{param} { + base_yylval.ival = atol(yytext+1); + return PARAM; + } + +{ip} { + base_yylval.str = mm_strdup(yytext); + return IP; + } +} /* */ + +{ +{integer} { + return process_integer_literal(yytext, &base_yylval); + } +{decimal} { + base_yylval.str = mm_strdup(yytext); + return FCONST; + } +{decimalfail} { + /* throw back the .., and treat as integer */ + yyless(yyleng - 2); + return process_integer_literal(yytext, &base_yylval); + } +{real} { + base_yylval.str = mm_strdup(yytext); + return FCONST; + } +{realfail1} { + /* + * throw back the [Ee], and treat as {decimal}. 
Note + * that it is possible the input is actually {integer}, + * but since this case will almost certainly lead to a + * syntax error anyway, we don't bother to distinguish. + */ + yyless(yyleng - 1); + base_yylval.str = mm_strdup(yytext); + return FCONST; + } +{realfail2} { + /* throw back the [Ee][+-], and proceed as above */ + yyless(yyleng - 2); + base_yylval.str = mm_strdup(yytext); + return FCONST; + } +} /* */ + +{ +:{identifier}((("->"|\.){identifier})|(\[{array}\]))* { + base_yylval.str = mm_strdup(yytext+1); + return CVARIABLE; + } + +{identifier} { + const ScanKeyword *keyword; + + if (!isdefine()) + { + /* Is it an SQL/ECPG keyword? */ + keyword = ScanECPGKeywordLookup(yytext); + if (keyword != NULL) + return keyword->value; + + /* Is it a C keyword? */ + keyword = ScanCKeywordLookup(yytext); + if (keyword != NULL) + return keyword->value; + + /* + * None of the above. Return it as an identifier. + * + * The backend will attempt to truncate and case-fold + * the identifier, but I see no good reason for ecpg + * to do so; that's just another way that ecpg could get + * out of step with the backend. + */ + base_yylval.str = mm_strdup(yytext); + return IDENT; + } + } + +{other} { + return yytext[0]; + } +} /* */ + + /* + * Begin ECPG-specific rules + */ + {exec_sql} { BEGIN(SQL); return SQL_START; } {informix_special} { /* are we simulating Informix? 
*/ @@ -1288,6 +1412,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+ } } + {other}|\n { mmfatal(PARSE_ERROR, "internal error: unreachable state; please report this to "); } %% @@ -1350,6 +1475,24 @@ addlitchar(unsigned char ychar) literalbuf[literallen] = '\0'; } +static int +process_integer_literal(const char *token, YYSTYPE *lval) /* Classify an integer-looking token: on a clean base-10 parse, set lval->ival and return ICONST; otherwise set lval->str to mm_strdup(token) and return FCONST. */ +{ + int val; + char *endptr; + + errno = 0; /* clear errno so the ERANGE test below reflects only this strtoint call */ + val = strtoint(token, &endptr, 10); + if (*endptr != '\0' || errno == ERANGE) /* parse stopped before the end of the token, or errno reports a range error */ + { + /* integer too large, treat it as a float */ + lval->str = mm_strdup(token); /* NOTE(review): presumably the parser owns this copy; confirm against mm_strdup */ + return FCONST; + } + lval->ival = val; /* whole token parsed without a range error */ + return ICONST; +} + static void parse_include(void) {