/*------------------------------------------------------------------------- * * parser.c * Main entry point/driver for PostgreSQL grammar * * Note that the grammar is not allowed to perform any table access * (since we need to be able to do basic parsing even while inside an * aborted transaction). Therefore, the data structures returned by * the grammar are "raw" parsetrees that still need to be analyzed by * analyze.c and related files. * * * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * src/backend/parser/parser.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include "mb/pg_wchar.h" #include "parser/gramparse.h" #include "parser/parser.h" #include "parser/scansup.h" static bool check_uescapechar(unsigned char escape); static char *str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner); /* * raw_parser * Given a query in string form, do lexical and grammatical analysis. * * Returns a list of raw (un-analyzed) parse trees. The immediate elements * of the list are always RawStmt nodes. */ List * raw_parser(const char *str) { core_yyscan_t yyscanner; base_yy_extra_type yyextra; int yyresult; /* initialize the flex scanner */ yyscanner = scanner_init(str, &yyextra.core_yy_extra, &ScanKeywords, ScanKeywordTokens); /* base_yylex() only needs this much initialization */ yyextra.have_lookahead = false; /* initialize the bison parser */ parser_init(&yyextra); /* Parse! */ yyresult = base_yyparse(yyscanner); /* Clean up (release memory) */ scanner_finish(yyscanner); if (yyresult) /* error */ return NIL; return yyextra.parsetree; } /* * Intermediate filter between parser and core lexer (core_yylex in scan.l). * * This filter is needed because in some cases the standard SQL grammar * requires more than one token lookahead. We reduce these cases to one-token * lookahead by replacing tokens here, in order to keep the grammar LALR(1). * * Using a filter is simpler than trying to recognize multiword tokens * directly in scan.l, because we'd have to allow for comments between the * words. Furthermore it's not clear how to do that without re-introducing * scanner backtrack, which would cost more performance than this filter * layer does. * * We also use this filter to convert UIDENT and USCONST sequences into * plain IDENT and SCONST tokens. While that could be handled by additional * productions in the main grammar, it's more efficient to do it like this. * * The filter also provides a convenient place to translate between * the core_YYSTYPE and YYSTYPE representations (which are really the * same thing anyway, but notationally they're different). */ int base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) { base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner); int cur_token; int next_token; int cur_token_length; YYLTYPE cur_yylloc; /* Get next token --- we might already have it */ if (yyextra->have_lookahead) { cur_token = yyextra->lookahead_token; lvalp->core_yystype = yyextra->lookahead_yylval; *llocp = yyextra->lookahead_yylloc; *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; yyextra->have_lookahead = false; } else cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner); /* * If this token isn't one that requires lookahead, just return it. If it * does, determine the token length. (We could get that via strlen(), but * since we have such a small set of possibilities, hardwiring seems * feasible and more efficient --- at least for the fixed-length cases.) */ switch (cur_token) { case NOT: cur_token_length = 3; break; case NULLS_P: cur_token_length = 5; break; case WITH: cur_token_length = 4; break; case UIDENT: case USCONST: cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp); break; default: return cur_token; } /* * Identify end+1 of current token. core_yylex() has temporarily stored a * '\0' here, and will undo that when we call it again. We need to redo * it to fully revert the lookahead call for error reporting purposes. */ yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf + *llocp + cur_token_length; Assert(*(yyextra->lookahead_end) == '\0'); /* * Save and restore *llocp around the call. It might look like we could * avoid this by just passing &lookahead_yylloc to core_yylex(), but that * does not work because flex actually holds onto the last-passed pointer * internally, and will use that for error reporting. We need any error * reports to point to the current token, not the next one. */ cur_yylloc = *llocp; /* Get next token, saving outputs into lookahead variables */ next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner); yyextra->lookahead_token = next_token; yyextra->lookahead_yylloc = *llocp; *llocp = cur_yylloc; /* Now revert the un-truncation of the current token */ yyextra->lookahead_hold_char = *(yyextra->lookahead_end); *(yyextra->lookahead_end) = '\0'; yyextra->have_lookahead = true; /* Replace cur_token if needed, based on lookahead */ switch (cur_token) { case NOT: /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */ switch (next_token) { case BETWEEN: case IN_P: case LIKE: case ILIKE: case SIMILAR: cur_token = NOT_LA; break; } break; case NULLS_P: /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */ switch (next_token) { case FIRST_P: case LAST_P: cur_token = NULLS_LA; break; } break; case WITH: /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */ switch (next_token) { case TIME: case ORDINALITY: cur_token = WITH_LA; break; } break; case UIDENT: case USCONST: /* Look ahead for UESCAPE */ if (next_token == UESCAPE) { /* Yup, so get third token, which had better be SCONST */ const char *escstr; /* Again save and restore *llocp */ cur_yylloc = *llocp; /* Un-truncate current token so errors point to third token */ *(yyextra->lookahead_end) = yyextra->lookahead_hold_char; /* Get third token */ next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner); /* If we throw error here, it will point to third token */ if (next_token != SCONST) scanner_yyerror("UESCAPE must be followed by a simple string literal", yyscanner); escstr = yyextra->lookahead_yylval.str; if (strlen(escstr) != 1 || !check_uescapechar(escstr[0])) scanner_yyerror("invalid Unicode escape character", yyscanner); /* Now restore *llocp; errors will point to first token */ *llocp = cur_yylloc; /* Apply Unicode conversion */ lvalp->core_yystype.str = str_udeescape(lvalp->core_yystype.str, escstr[0], *llocp, yyscanner); /* * We don't need to revert the un-truncation of UESCAPE. What * we do want to do is clear have_lookahead, thereby consuming * all three tokens. */ yyextra->have_lookahead = false; } else { /* No UESCAPE, so convert using default escape character */ lvalp->core_yystype.str = str_udeescape(lvalp->core_yystype.str, '\\', *llocp, yyscanner); } if (cur_token == UIDENT) { /* It's an identifier, so truncate as appropriate */ truncate_identifier(lvalp->core_yystype.str, strlen(lvalp->core_yystype.str), true); cur_token = IDENT; } else if (cur_token == USCONST) { cur_token = SCONST; } break; } return cur_token; } /* convert hex digit (caller should have verified that) to value */ static unsigned int hexval(unsigned char c) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 0xA; if (c >= 'A' && c <= 'F') return c - 'A' + 0xA; elog(ERROR, "invalid hexadecimal digit"); return 0; /* not reached */ } /* is Unicode code point acceptable in database's encoding? */ static void check_unicode_value(pg_wchar c, int pos, core_yyscan_t yyscanner) { /* See also addunicode() in scan.l */ if (c == 0 || c > 0x10FFFF) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid Unicode escape value"), scanner_errposition(pos, yyscanner))); if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"), scanner_errposition(pos, yyscanner))); } /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */ static bool check_uescapechar(unsigned char escape) { if (isxdigit(escape) || escape == '+' || escape == '\'' || escape == '"' || scanner_isspace(escape)) return false; else return true; } /* * Process Unicode escapes in "str", producing a palloc'd plain string * * escape: the escape character to use * position: start position of U&'' or U&"" string token * yyscanner: context information needed for error reports */ static char * str_udeescape(const char *str, char escape, int position, core_yyscan_t yyscanner) { const char *in; char *new, *out; pg_wchar pair_first = 0; /* * This relies on the subtle assumption that a UTF-8 expansion cannot be * longer than its escaped representation. */ new = palloc(strlen(str) + 1); in = str; out = new; while (*in) { if (in[0] == escape) { if (in[1] == escape) { if (pair_first) goto invalid_pair; *out++ = escape; in += 2; } else if (isxdigit((unsigned char) in[1]) && isxdigit((unsigned char) in[2]) && isxdigit((unsigned char) in[3]) && isxdigit((unsigned char) in[4])) { pg_wchar unicode; unicode = (hexval(in[1]) << 12) + (hexval(in[2]) << 8) + (hexval(in[3]) << 4) + hexval(in[4]); check_unicode_value(unicode, in - str + position + 3, /* 3 for U&" */ yyscanner); if (pair_first) { if (is_utf16_surrogate_second(unicode)) { unicode = surrogate_pair_to_codepoint(pair_first, unicode); pair_first = 0; } else goto invalid_pair; } else if (is_utf16_surrogate_second(unicode)) goto invalid_pair; if (is_utf16_surrogate_first(unicode)) pair_first = unicode; else { unicode_to_utf8(unicode, (unsigned char *) out); out += pg_mblen(out); } in += 5; } else if (in[1] == '+' && isxdigit((unsigned char) in[2]) && isxdigit((unsigned char) in[3]) && isxdigit((unsigned char) in[4]) && isxdigit((unsigned char) in[5]) && isxdigit((unsigned char) in[6]) && isxdigit((unsigned char) in[7])) { pg_wchar unicode; unicode = (hexval(in[2]) << 20) + (hexval(in[3]) << 16) + (hexval(in[4]) << 12) + (hexval(in[5]) << 8) + (hexval(in[6]) << 4) + hexval(in[7]); check_unicode_value(unicode, in - str + position + 3, /* 3 for U&" */ yyscanner); if (pair_first) { if (is_utf16_surrogate_second(unicode)) { unicode = surrogate_pair_to_codepoint(pair_first, unicode); pair_first = 0; } else goto invalid_pair; } else if (is_utf16_surrogate_second(unicode)) goto invalid_pair; if (is_utf16_surrogate_first(unicode)) pair_first = unicode; else { unicode_to_utf8(unicode, (unsigned char *) out); out += pg_mblen(out); } in += 8; } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid Unicode escape value"), scanner_errposition(in - str + position + 3, /* 3 for U&" */ yyscanner))); } else { if (pair_first) goto invalid_pair; *out++ = *in++; } } /* unfinished surrogate pair? */ if (pair_first) goto invalid_pair; *out = '\0'; /* * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII * codes; but it's probably not worth the trouble, since this isn't likely * to be a performance-critical path. */ pg_verifymbstr(new, out - new, false); return new; invalid_pair: ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid Unicode surrogate pair"), scanner_errposition(in - str + position + 3, /* 3 for U&" */ yyscanner))); return NULL; /* keep compiler quiet */ }