From d809fd0008a2e26de463f47b7aba0365264078f3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 24 Feb 2015 17:53:42 -0500 Subject: [PATCH] Improve parser's one-extra-token lookahead mechanism. There are a couple of places in our grammar that fail to be strict LALR(1), by requiring more than a single token of lookahead to decide what to do. Up to now we've dealt with that by using a filter between the lexer and parser that merges adjacent tokens into one in the places where two tokens of lookahead are necessary. But that creates a number of user-visible anomalies, for instance that you can't name a CTE "ordinality" because "WITH ordinality AS ..." triggers folding of WITH and ORDINALITY into one token. I realized that there's a better way. In this patch, we still do the lookahead basically as before, but we never merge the second token into the first; we replace just the first token by a special lookahead symbol when one of the lookahead pairs is seen. This requires a couple extra productions in the grammar, but it involves fewer special tokens, so that the grammar tables come out a bit smaller than before. The filter logic is no slower than before, perhaps a bit faster. I also fixed the filter logic so that when backing up after a lookahead, the current token's terminator is correctly restored; this eliminates some weird behavior in error message issuance, as is shown by the one change in existing regression test outputs. I believe that this patch entirely eliminates odd behaviors caused by lookahead for WITH. It doesn't really improve the situation for NULLS followed by FIRST/LAST unfortunately: those sequences still act like a reserved word, even though there are cases where they should be seen as two ordinary identifiers, eg "SELECT nulls first FROM ...". I experimented with additional grammar hacks but couldn't find any simple solution for that. Still, this is better than before, and it seems much more likely that we *could* somehow solve the NULLS case on the basis of this filter behavior than the previous one. --- src/backend/parser/gram.y | 35 +++++-- src/backend/parser/parser.c | 105 ++++++++++--------- src/include/parser/gramparse.h | 2 + src/interfaces/ecpg/preproc/parse.pl | 6 +- src/interfaces/ecpg/preproc/parser.c | 116 ++++++++++++--------- src/test/regress/expected/foreign_data.out | 2 +- src/test/regress/expected/with.out | 15 +++ src/test/regress/sql/with.sql | 5 + 8 files changed, 173 insertions(+), 113 deletions(-) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 6c21002875..581f7a1c1c 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -633,9 +633,9 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); /* * The grammar thinks these are keywords, but they are not in the kwlist.h * list and so can never be entered directly. The filter in parser.c - * creates these tokens when required. + * creates these tokens when required (based on looking one token ahead). */ -%token NULLS_FIRST NULLS_LAST WITH_ORDINALITY WITH_TIME +%token NULLS_LA WITH_LA /* Precedence: lowest to highest */ @@ -873,6 +873,7 @@ CreateRoleStmt: opt_with: WITH {} + | WITH_LA {} | /*EMPTY*/ {} ; @@ -6673,8 +6674,8 @@ opt_asc_desc: ASC { $$ = SORTBY_ASC; } | /*EMPTY*/ { $$ = SORTBY_DEFAULT; } ; -opt_nulls_order: NULLS_FIRST { $$ = SORTBY_NULLS_FIRST; } - | NULLS_LAST { $$ = SORTBY_NULLS_LAST; } +opt_nulls_order: NULLS_LA FIRST_P { $$ = SORTBY_NULLS_FIRST; } + | NULLS_LA LAST_P { $$ = SORTBY_NULLS_LAST; } | /*EMPTY*/ { $$ = SORTBY_NULLS_DEFAULT; } ; @@ -8923,7 +8924,7 @@ AlterTSDictionaryStmt: ; AlterTSConfigurationStmt: - ALTER TEXT_P SEARCH CONFIGURATION any_name ADD_P MAPPING FOR name_list WITH any_name_list + ALTER TEXT_P SEARCH CONFIGURATION any_name ADD_P MAPPING FOR name_list any_with any_name_list { AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt); n->cfgname = $5; @@ -8933,7 +8934,7 @@ AlterTSConfigurationStmt: n->replace = false; $$ = (Node*)n; } - | ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list WITH any_name_list + | ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list any_with any_name_list { AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt); n->cfgname = $5; @@ -8943,7 +8944,7 @@ AlterTSConfigurationStmt: n->replace = false; $$ = (Node*)n; } - | ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING REPLACE any_name WITH any_name + | ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING REPLACE any_name any_with any_name { AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt); n->cfgname = $5; @@ -8953,7 +8954,7 @@ AlterTSConfigurationStmt: n->replace = true; $$ = (Node*)n; } - | ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list REPLACE any_name WITH any_name + | ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list REPLACE any_name any_with any_name { AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt); n->cfgname = $5; @@ -8981,6 +8982,11 @@ AlterTSConfigurationStmt: } ; +/* Use this if TIME or ORDINALITY after WITH should be taken as an identifier */ +any_with: WITH {} + | WITH_LA {} + ; + /***************************************************************************** * @@ -9891,6 +9897,8 @@ simple_select: * AS (query) [ SEARCH or CYCLE clause ] * * We don't currently support the SEARCH or CYCLE clause. + * + * Recognizing WITH_LA here allows a CTE to be named TIME or ORDINALITY. */ with_clause: WITH cte_list @@ -9900,6 +9908,13 @@ with_clause: $$->recursive = false; $$->location = @1; } + | WITH_LA cte_list + { + $$ = makeNode(WithClause); + $$->ctes = $2; + $$->recursive = false; + $$->location = @1; + } | WITH RECURSIVE cte_list { $$ = makeNode(WithClause); @@ -10601,7 +10616,7 @@ opt_col_def_list: AS '(' TableFuncElementList ')' { $$ = $3; } | /*EMPTY*/ { $$ = NIL; } ; -opt_ordinality: WITH_ORDINALITY { $$ = true; } +opt_ordinality: WITH_LA ORDINALITY { $$ = true; } | /*EMPTY*/ { $$ = false; } ; @@ -11057,7 +11072,7 @@ ConstInterval: ; opt_timezone: - WITH_TIME ZONE { $$ = TRUE; } + WITH_LA TIME ZONE { $$ = TRUE; } | WITHOUT TIME ZONE { $$ = FALSE; } | /*EMPTY*/ { $$ = FALSE; } ; diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c index db49275e00..b17771d4cc 100644 --- a/src/backend/parser/parser.c +++ b/src/backend/parser/parser.c @@ -64,13 +64,13 @@ raw_parser(const char *str) /* * Intermediate filter between parser and core lexer (core_yylex in scan.l). * - * The filter is needed because in some cases the standard SQL grammar + * This filter is needed because in some cases the standard SQL grammar * requires more than one token lookahead. We reduce these cases to one-token - * lookahead by combining tokens here, in order to keep the grammar LALR(1). + * lookahead by replacing tokens here, in order to keep the grammar LALR(1). * * Using a filter is simpler than trying to recognize multiword tokens * directly in scan.l, because we'd have to allow for comments between the - * words. Furthermore it's not clear how to do it without re-introducing + * words. Furthermore it's not clear how to do that without re-introducing * scanner backtrack, which would cost more performance than this filter * layer does. * @@ -84,7 +84,7 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner); int cur_token; int next_token; - core_YYSTYPE cur_yylval; + int cur_token_length; YYLTYPE cur_yylloc; /* Get next token --- we might already have it */ @@ -93,74 +93,85 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) cur_token = yyextra->lookahead_token; lvalp->core_yystype = yyextra->lookahead_yylval; *llocp = yyextra->lookahead_yylloc; + *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; yyextra->have_lookahead = false; } else cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner); - /* Do we need to look ahead for a possible multiword token? */ + /* + * If this token isn't one that requires lookahead, just return it. If it + * does, determine the token length. (We could get that via strlen(), but + * since we have such a small set of possibilities, hardwiring seems + * feasible and more efficient.) + */ switch (cur_token) { case NULLS_P: + cur_token_length = 5; + break; + case WITH: + cur_token_length = 4; + break; + default: + return cur_token; + } - /* - * NULLS FIRST and NULLS LAST must be reduced to one token - */ - cur_yylval = lvalp->core_yystype; - cur_yylloc = *llocp; - next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner); + /* + * Identify end+1 of current token. core_yylex() has temporarily stored a + * '\0' here, and will undo that when we call it again. We need to redo + * it to fully revert the lookahead call for error reporting purposes. + */ + yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf + + *llocp + cur_token_length; + Assert(*(yyextra->lookahead_end) == '\0'); + + /* + * Save and restore *llocp around the call. It might look like we could + * avoid this by just passing &lookahead_yylloc to core_yylex(), but that + * does not work because flex actually holds onto the last-passed pointer + * internally, and will use that for error reporting. We need any error + * reports to point to the current token, not the next one. + */ + cur_yylloc = *llocp; + + /* Get next token, saving outputs into lookahead variables */ + next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner); + yyextra->lookahead_token = next_token; + yyextra->lookahead_yylloc = *llocp; + + *llocp = cur_yylloc; + + /* Now revert the un-truncation of the current token */ + yyextra->lookahead_hold_char = *(yyextra->lookahead_end); + *(yyextra->lookahead_end) = '\0'; + + yyextra->have_lookahead = true; + + /* Replace cur_token if needed, based on lookahead */ + switch (cur_token) + { + case NULLS_P: + /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ switch (next_token) { case FIRST_P: - cur_token = NULLS_FIRST; - break; case LAST_P: - cur_token = NULLS_LAST; - break; - default: - /* save the lookahead token for next time */ - yyextra->lookahead_token = next_token; - yyextra->lookahead_yylval = lvalp->core_yystype; - yyextra->lookahead_yylloc = *llocp; - yyextra->have_lookahead = true; - /* and back up the output info to cur_token */ - lvalp->core_yystype = cur_yylval; - *llocp = cur_yylloc; + cur_token = NULLS_LA; break; } break; case WITH: - - /* - * WITH TIME and WITH ORDINALITY must each be reduced to one token - */ - cur_yylval = lvalp->core_yystype; - cur_yylloc = *llocp; - next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner); + /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ switch (next_token) { case TIME: - cur_token = WITH_TIME; - break; case ORDINALITY: - cur_token = WITH_ORDINALITY; - break; - default: - /* save the lookahead token for next time */ - yyextra->lookahead_token = next_token; - yyextra->lookahead_yylval = lvalp->core_yystype; - yyextra->lookahead_yylloc = *llocp; - yyextra->have_lookahead = true; - /* and back up the output info to cur_token */ - lvalp->core_yystype = cur_yylval; - *llocp = cur_yylloc; + cur_token = WITH_LA; break; } break; - - default: - break; } return cur_token; diff --git a/src/include/parser/gramparse.h b/src/include/parser/gramparse.h index d9df30374b..100fdfb213 100644 --- a/src/include/parser/gramparse.h +++ b/src/include/parser/gramparse.h @@ -46,6 +46,8 @@ typedef struct base_yy_extra_type int lookahead_token; /* one-token lookahead */ core_YYSTYPE lookahead_yylval; /* yylval for lookahead token */ YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */ + char *lookahead_end; /* end of current token */ + char lookahead_hold_char; /* to be put back at *lookahead_end */ /* * State variables that belong to the grammar. diff --git a/src/interfaces/ecpg/preproc/parse.pl b/src/interfaces/ecpg/preproc/parse.pl index f998310ef4..36dce80386 100644 --- a/src/interfaces/ecpg/preproc/parse.pl +++ b/src/interfaces/ecpg/preproc/parse.pl @@ -42,10 +42,8 @@ my %replace_token = ( # or in the block my %replace_string = ( - 'WITH_TIME' => 'with time', - 'WITH_ORDINALITY' => 'with ordinality', - 'NULLS_FIRST' => 'nulls first', - 'NULLS_LAST' => 'nulls last', + 'NULLS_LA' => 'nulls', + 'WITH_LA' => 'with', 'TYPECAST' => '::', 'DOT_DOT' => '..', 'COLON_EQUALS' => ':=',); diff --git a/src/interfaces/ecpg/preproc/parser.c b/src/interfaces/ecpg/preproc/parser.c index f118826953..099a213d11 100644 --- a/src/interfaces/ecpg/preproc/parser.c +++ b/src/interfaces/ecpg/preproc/parser.c @@ -3,11 +3,8 @@ * parser.c * Main entry point/driver for PostgreSQL grammar * - * Note that the grammar is not allowed to perform any table access - * (since we need to be able to do basic parsing even while inside an - * aborted transaction). Therefore, the data structures returned by - * the grammar are "raw" parsetrees that still need to be analyzed by - * analyze.c and related files. + * This should match src/backend/parser/parser.c, except that we do not + * need to bother with re-entrant interfaces. * * * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group @@ -29,18 +26,21 @@ static bool have_lookahead; /* is lookahead info valid? */ static int lookahead_token; /* one-token lookahead */ static YYSTYPE lookahead_yylval; /* yylval for lookahead token */ static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */ +static char *lookahead_yytext; /* start current token */ +static char *lookahead_end; /* end of current token */ +static char lookahead_hold_char; /* to be put back at *lookahead_end */ /* * Intermediate filter between parser and base lexer (base_yylex in scan.l). * - * The filter is needed because in some cases the standard SQL grammar + * This filter is needed because in some cases the standard SQL grammar * requires more than one token lookahead. We reduce these cases to one-token - * lookahead by combining tokens here, in order to keep the grammar LALR(1). + * lookahead by replacing tokens here, in order to keep the grammar LALR(1). * * Using a filter is simpler than trying to recognize multiword tokens * directly in scan.l, because we'd have to allow for comments between the - * words. Furthermore it's not clear how to do it without re-introducing + * words. Furthermore it's not clear how to do that without re-introducing * scanner backtrack, which would cost more performance than this filter * layer does. */ @@ -49,8 +49,10 @@ filtered_base_yylex(void) { int cur_token; int next_token; + int cur_token_length; YYSTYPE cur_yylval; YYLTYPE cur_yylloc; + char *cur_yytext; /* Get next token --- we might already have it */ if (have_lookahead) @@ -58,74 +60,86 @@ filtered_base_yylex(void) cur_token = lookahead_token; base_yylval = lookahead_yylval; base_yylloc = lookahead_yylloc; + yytext = lookahead_yytext; + *lookahead_end = lookahead_hold_char; have_lookahead = false; } else cur_token = base_yylex(); - /* Do we need to look ahead for a possible multiword token? */ + /* + * If this token isn't one that requires lookahead, just return it. If it + * does, determine the token length. (We could get that via strlen(), but + * since we have such a small set of possibilities, hardwiring seems + * feasible and more efficient.) + */ switch (cur_token) { case NULLS_P: + cur_token_length = 5; + break; + case WITH: + cur_token_length = 4; + break; + default: + return cur_token; + } - /* - * NULLS FIRST and NULLS LAST must be reduced to one token - */ - cur_yylval = base_yylval; - cur_yylloc = base_yylloc; - next_token = base_yylex(); + /* + * Identify end+1 of current token. base_yylex() has temporarily stored a + * '\0' here, and will undo that when we call it again. We need to redo + * it to fully revert the lookahead call for error reporting purposes. + */ + lookahead_end = yytext + cur_token_length; + Assert(*lookahead_end == '\0'); + + /* Save and restore lexer output variables around the call */ + cur_yylval = base_yylval; + cur_yylloc = base_yylloc; + cur_yytext = yytext; + + /* Get next token, saving outputs into lookahead variables */ + next_token = base_yylex(); + + lookahead_token = next_token; + lookahead_yylval = base_yylval; + lookahead_yylloc = base_yylloc; + lookahead_yytext = yytext; + + base_yylval = cur_yylval; + base_yylloc = cur_yylloc; + yytext = cur_yytext; + + /* Now revert the un-truncation of the current token */ + lookahead_hold_char = *lookahead_end; + *lookahead_end = '\0'; + + have_lookahead = true; + + /* Replace cur_token if needed, based on lookahead */ + switch (cur_token) + { + case NULLS_P: + /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ switch (next_token) { case FIRST_P: - cur_token = NULLS_FIRST; - break; case LAST_P: - cur_token = NULLS_LAST; - break; - default: - /* save the lookahead token for next time */ - lookahead_token = next_token; - lookahead_yylval = base_yylval; - lookahead_yylloc = base_yylloc; - have_lookahead = true; - /* and back up the output info to cur_token */ - base_yylval = cur_yylval; - base_yylloc = cur_yylloc; + cur_token = NULLS_LA; break; } break; case WITH: - - /* - * WITH TIME must be reduced to one token - */ - cur_yylval = base_yylval; - cur_yylloc = base_yylloc; - next_token = base_yylex(); + /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ switch (next_token) { case TIME: - cur_token = WITH_TIME; - break; case ORDINALITY: - cur_token = WITH_ORDINALITY; - break; - default: - /* save the lookahead token for next time */ - lookahead_token = next_token; - lookahead_yylval = base_yylval; - lookahead_yylloc = base_yylloc; - have_lookahead = true; - /* and back up the output info to cur_token */ - base_yylval = cur_yylval; - base_yylloc = cur_yylloc; + cur_token = WITH_LA; break; } break; - - default: - break; } return cur_token; diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index 4795c83c9b..632b7e5bff 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -663,7 +663,7 @@ LINE 1: CREATE FOREIGN TABLE ft1 (); CREATE FOREIGN TABLE ft1 () SERVER no_server; -- ERROR ERROR: server "no_server" does not exist CREATE FOREIGN TABLE ft1 () SERVER s0 WITH OIDS; -- ERROR -ERROR: syntax error at or near "WITH OIDS" +ERROR: syntax error at or near "WITH" LINE 1: CREATE FOREIGN TABLE ft1 () SERVER s0 WITH OIDS; ^ CREATE FOREIGN TABLE ft1 ( diff --git a/src/test/regress/expected/with.out b/src/test/regress/expected/with.out index 06b372bf51..524e0ef2c6 100644 --- a/src/test/regress/expected/with.out +++ b/src/test/regress/expected/with.out @@ -2155,3 +2155,18 @@ WITH t AS ( VALUES(FALSE); ERROR: conditional DO INSTEAD rules are not supported for data-modifying statements in WITH DROP RULE y_rule ON y; +-- check that parser lookahead for WITH doesn't cause any odd behavior +create table foo (with baz); -- fail, WITH is a reserved word +ERROR: syntax error at or near "with" +LINE 1: create table foo (with baz); + ^ +create table foo (with ordinality); -- fail, WITH is a reserved word +ERROR: syntax error at or near "with" +LINE 1: create table foo (with ordinality); + ^ +with ordinality as (select 1 as x) select * from ordinality; + x +--- + 1 +(1 row) + diff --git a/src/test/regress/sql/with.sql b/src/test/regress/sql/with.sql index c716369957..1687c11983 100644 --- a/src/test/regress/sql/with.sql +++ b/src/test/regress/sql/with.sql @@ -956,3 +956,8 @@ WITH t AS ( ) VALUES(FALSE); DROP RULE y_rule ON y; + +-- check that parser lookahead for WITH doesn't cause any odd behavior +create table foo (with baz); -- fail, WITH is a reserved word +create table foo (with ordinality); -- fail, WITH is a reserved word +with ordinality as (select 1 as x) select * from ordinality;