diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index a0635463bb..ef5c1a639f 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -4,13 +4,27 @@ * scan.l * lexical scanner for PostgreSQL * - * XXX The rules in this file must be kept in sync with psql's lexer!!! + * NOTE NOTE NOTE: + * + * The rules in this file must be kept in sync with psql's lexer!!! + * + * The rules are designed so that the scanner never has to backtrack, + * in the sense that there is always a rule that can match the input + * consumed so far (the rule action may internally throw back some input + * with yyless(), however). As explained in the flex manual, this makes + * for a useful speed increase --- about a third faster than a plain -CF + * lexer, in simple testing. The extra complexity is mostly in the rules + * for handling float numbers and continued string literals. If you change + * the lexical rules, verify that you haven't broken the no-backtrack + * property by running flex with the "-b" option and checking that the + * resulting "lex.backup" file says that no backing up is needed. + * * * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.121 2005/03/11 19:13:42 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.122 2005/05/26 01:24:29 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -138,6 +152,20 @@ special_whitespace ({space}+|{comment}{newline}) horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) +/* + * To ensure that {quotecontinue} can be scanned without having to back up + * if the full pattern isn't matched, we include trailing whitespace in + * {quotestop}. 
This matches all cases where {quotecontinue} fails to match, + * except for {quote} followed by whitespace and just one "-" (not two, + * which would start a {comment}). To cover that we have {quotefail}. + * The actions for {quotestop} and {quotefail} must throw back characters + * beyond the quote proper. + */ +quote ' +quotestop {quote}{whitespace}* +quotecontinue {quote}{whitespace_with_newline}{quote} +quotefail {quote}{whitespace}*"-" + /* Bit string * It is tempting to scan the string for only those characters * which are allowed. However, this leads to silently swallowed @@ -148,16 +176,12 @@ whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) * validate the contents. */ xbstart [bB]{quote} -xbstop {quote} xbinside [^']* -xbcat {quote}{whitespace_with_newline}{quote} /* Hexadecimal number */ xhstart [xX]{quote} -xhstop {quote} xhinside [^']* -xhcat {quote}{whitespace_with_newline}{quote} /* National character */ @@ -165,26 +189,26 @@ xnstart [nN]{quote} /* Extended quote * xqdouble implements embedded quote - * xqcat allows strings to cross input lines */ -quote ' xqstart {quote} -xqstop {quote} xqdouble {quote}{quote} xqinside [^\\']+ xqescape [\\][^0-7] xqoctesc [\\][0-7]{1,3} -xqcat {quote}{whitespace_with_newline}{quote} /* $foo$ style quotes ("dollar quoting") * The quoted string starts with $foo$ where "foo" is an optional string * in the form of an identifier, except that it may not contain "$", * and extends to the first occurrence of an identical string. * There is *no* processing of the quoted text. + * + * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim} + * fails to match its trailing "$". */ dolq_start [A-Za-z\200-\377_] dolq_cont [A-Za-z\200-\377_0-9] dolqdelim \$({dolq_start}{dolq_cont}*)?\$ +dolqfailed \${dolq_start}{dolq_cont}* dolqinside [^$]+ /* Double quote @@ -242,12 +266,17 @@ operator {op_chars}+ /* we no longer allow unary minus in numbers. * instead we pass it separately to parser. 
there it gets - * coerced via doNegate() -- Leon aug 20 1999 + * coerced via doNegate() -- Leon aug 20 1999 + * + * {realfail1} and {realfail2} are added to prevent the need for scanner + * backup when the {real} rule fails to match completely. */ integer {digit}+ decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*)) -real ((({digit}*\.{digit}+)|({digit}+\.{digit}*)|({digit}+))([Ee][-+]?{digit}+)) +real ({integer}|{decimal})[Ee][-+]?{digit}+ +realfail1 ({integer}|{decimal})[Ee] +realfail2 ({integer}|{decimal})[Ee][-+] param \${integer} @@ -310,6 +339,10 @@ other . /* ignore */ } +<xc>\*+ { + /* ignore */ + } + <xc><<EOF>> { yyerror("unterminated /* comment"); } {xbstart} { @@ -324,7 +357,9 @@ other . startlit(); addlitchar('b'); } -<xb>{xbstop} { +<xb>{quotestop} | +<xb>{quotefail} { + yyless(1); BEGIN(INITIAL); yylval.str = litbufdup(); return BCONST; @@ -333,8 +368,8 @@ other . <xb>{xbinside} { addlit(yytext, yyleng); } -<xh>{xhcat} | -<xb>{xbcat} { +<xh>{quotecontinue} | +<xb>{quotecontinue} { /* ignore */ } <xb><<EOF>> { yyerror("unterminated bit string literal"); } @@ -351,7 +386,9 @@ other . startlit(); addlitchar('x'); } -<xh>{xhstop} { +<xh>{quotestop} | +<xh>{quotefail} { + yyless(1); BEGIN(INITIAL); yylval.str = litbufdup(); return XCONST; @@ -365,13 +402,11 @@ other . */ const ScanKeyword *keyword; - /* This had better be a keyword! */ + yyless(1); /* eat only 'n' this time */ + /* nchar had better be a keyword! */ keyword = ScanKeywordLookup("nchar"); Assert(keyword != NULL); yylval.keyword = keyword->name; - token_start = yytext; - BEGIN(xq); - startlit(); return keyword->value; } @@ -380,7 +415,9 @@ other . BEGIN(xq); startlit(); } -<xq>{xqstop} { +<xq>{quotestop} | +<xq>{quotefail} { + yyless(1); BEGIN(INITIAL); yylval.str = litbufdup(); return SCONST; @@ -398,7 +435,7 @@ other . unsigned char c = strtoul(yytext+1, NULL, 8); addlitchar(c); } -<xq>{xqcat} { +<xq>{quotecontinue} { /* ignore */ } <xq>. { @@ -413,6 +450,12 @@ other . 
BEGIN(xdolq); startlit(); } +{dolqfailed} { + /* throw back all but the initial "$" */ + yyless(1); + /* and treat it as {other} */ + return yytext[0]; + } <xdolq>{dolqdelim} { if (strcmp(yytext, dolqstart) == 0) { @@ -435,6 +478,9 @@ other . <xdolq>{dolqinside} { addlit(yytext, yyleng); } +<xdolq>{dolqfailed} { + addlit(yytext, yyleng); + } <xdolq>. { /* This is only needed for $ inside the quoted text */ addlitchar(yytext[0]); @@ -576,6 +622,23 @@ other . yylval.str = pstrdup(yytext); return FCONST; } +{realfail1} { + /* + * throw back the [Ee], and treat as {decimal}. Note + * that it is possible the input is actually {integer}, + * but since this case will almost certainly lead to a + * syntax error anyway, we don't bother to distinguish. + */ + yyless(yyleng-1); + yylval.str = pstrdup(yytext); + return FCONST; + } +{realfail2} { + /* throw back the [Ee][+-], and proceed as above */ + yyless(yyleng-2); + yylval.str = pstrdup(yytext); + return FCONST; + } {identifier} { diff --git a/src/bin/psql/psqlscan.l b/src/bin/psql/psqlscan.l index 147d77872d..88763d504b 100644 --- a/src/bin/psql/psqlscan.l +++ b/src/bin/psql/psqlscan.l @@ -11,7 +11,9 @@ * are (except for a few) the same as the backend's, but their actions are * just ECHO whereas the backend's actions generally do other things. * - * XXX The rules in this file must be kept in sync with the main parser!!! + * XXX The rules in this file must be kept in sync with the backend lexer!!! + * + * XXX Avoid creating backtracking cases --- see the backend lexer for info. * * The most difficult aspect of this code is that we need to work in multibyte * encodings that are not ASCII-safe. 
A "safe" encoding is one in which each @@ -31,7 +33,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.9 2004/12/31 22:03:15 pgsql Exp $ + * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.10 2005/05/26 01:24:29 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -207,6 +209,20 @@ special_whitespace ({space}+|{comment}{newline}) horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) +/* + * To ensure that {quotecontinue} can be scanned without having to back up + * if the full pattern isn't matched, we include trailing whitespace in + * {quotestop}. This matches all cases where {quotecontinue} fails to match, + * except for {quote} followed by whitespace and just one "-" (not two, + * which would start a {comment}). To cover that we have {quotefail}. + * The actions for {quotestop} and {quotefail} must throw back characters + * beyond the quote proper. + */ +quote ' +quotestop {quote}{whitespace}* +quotecontinue {quote}{whitespace_with_newline}{quote} +quotefail {quote}{whitespace}*"-" + /* Bit string * It is tempting to scan the string for only those characters * which are allowed. However, this leads to silently swallowed @@ -217,16 +233,12 @@ whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) * validate the contents. 
*/ xbstart [bB]{quote} -xbstop {quote} xbinside [^']* -xbcat {quote}{whitespace_with_newline}{quote} /* Hexadecimal number */ xhstart [xX]{quote} -xhstop {quote} xhinside [^']* -xhcat {quote}{whitespace_with_newline}{quote} /* National character */ @@ -234,26 +246,26 @@ xnstart [nN]{quote} /* Extended quote * xqdouble implements embedded quote - * xqcat allows strings to cross input lines */ -quote ' xqstart {quote} -xqstop {quote} xqdouble {quote}{quote} xqinside [^\\']+ xqescape [\\][^0-7] xqoctesc [\\][0-7]{1,3} -xqcat {quote}{whitespace_with_newline}{quote} /* $foo$ style quotes ("dollar quoting") * The quoted string starts with $foo$ where "foo" is an optional string * in the form of an identifier, except that it may not contain "$", * and extends to the first occurrence of an identical string. * There is *no* processing of the quoted text. + * + * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim} + * fails to match its trailing "$". */ dolq_start [A-Za-z\200-\377_] dolq_cont [A-Za-z\200-\377_0-9] dolqdelim \$({dolq_start}{dolq_cont}*)?\$ +dolqfailed \${dolq_start}{dolq_cont}* dolqinside [^$]+ /* Double quote @@ -311,12 +323,17 @@ operator {op_chars}+ /* we no longer allow unary minus in numbers. * instead we pass it separately to parser. there it gets - * coerced via doNegate() -- Leon aug 20 1999 + * coerced via doNegate() -- Leon aug 20 1999 + * + * {realfail1} and {realfail2} are added to prevent the need for scanner + * backup when the {real} rule fails to match completely. */ integer {digit}+ decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*)) -real ((({digit}*\.{digit}+)|({digit}+\.{digit}*)|({digit}+))([Ee][-+]?{digit}+)) +real ({integer}|{decimal})[Ee][-+]?{digit}+ +realfail1 ({integer}|{decimal})[Ee] +realfail2 ({integer}|{decimal})[Ee][-+] param \${integer} @@ -383,11 +400,17 @@ other . 
ECHO; } +<xc>\*+ { + ECHO; + } + {xbstart} { BEGIN(xb); ECHO; } -<xb>{xbstop} { +<xb>{quotestop} | +<xb>{quotefail} { + yyless(1); BEGIN(INITIAL); ECHO; } @@ -395,8 +418,8 @@ other . <xb>{xbinside} { ECHO; } -<xh>{xhcat} | -<xb>{xbcat} { +<xh>{quotecontinue} | +<xb>{quotecontinue} { ECHO; } @@ -410,13 +433,15 @@ other . BEGIN(xh); ECHO; } -<xh>{xhstop} { +<xh>{quotestop} | +<xh>{quotefail} { + yyless(1); BEGIN(INITIAL); ECHO; } {xnstart} { - BEGIN(xq); + yyless(1); /* eat only 'n' this time */ ECHO; } @@ -424,7 +449,9 @@ other . BEGIN(xq); ECHO; } -<xq>{xqstop} { +<xq>{quotestop} | +<xq>{quotefail} { + yyless(1); BEGIN(INITIAL); ECHO; } @@ -440,7 +467,7 @@ other . <xq>{xqoctesc} { ECHO; } -<xq>{xqcat} { +<xq>{quotecontinue} { ECHO; } <xq>. { @@ -453,6 +480,11 @@ other . BEGIN(xdolq); ECHO; } +{dolqfailed} { + /* throw back all but the initial "$" */ + yyless(1); + ECHO; + } <xdolq>{dolqdelim} { if (strcmp(yytext, cur_state->dolqstart) == 0) { @@ -474,6 +506,9 @@ other . <xdolq>{dolqinside} { ECHO; } +<xdolq>{dolqfailed} { + ECHO; + } <xdolq>. { /* This is only needed for $ inside the quoted text */ ECHO; @@ -636,6 +671,21 @@ other . {real} { ECHO; } +{realfail1} { + /* + * throw back the [Ee], and treat as {decimal}. Note + * that it is possible the input is actually {integer}, + * but since this case will almost certainly lead to a + * syntax error anyway, we don't bother to distinguish. + */ + yyless(yyleng-1); + ECHO; + } +{realfail2} { + /* throw back the [Ee][+-], and proceed as above */ + yyless(yyleng-2); + ECHO; + } {identifier} { @@ -817,6 +867,13 @@ other . (char) strtol(yytext + 1, NULL, 0)); } +"\\"0[xX] { + /* failed hex case */ + yyless(2); + appendPQExpBufferChar(output_buf, + (char) strtol(yytext + 1, NULL, 0)); + } + "\\". { emit(yytext + 1, 1); } {other}|\n { ECHO; }