%{ /*------------------------------------------------------------------------- * * psqlscan.l * lexical scanner for psql * * This code is mainly needed to determine where the end of a SQL statement * is: we are looking for semicolons that are not within quotes, comments, * or parentheses. The most reliable way to handle this is to borrow the * backend's flex lexer rules, lock, stock, and barrel. The rules below * are (except for a few) the same as the backend's, but their actions are * just ECHO whereas the backend's actions generally do other things. * * XXX The rules in this file must be kept in sync with the backend lexer!!! * * XXX Avoid creating backtracking cases --- see the backend lexer for info. * * The most difficult aspect of this code is that we need to work in multibyte * encodings that are not ASCII-safe. A "safe" encoding is one in which each * byte of a multibyte character has the high bit set (it's >= 0x80). Since * all our lexing rules treat all high-bit-set characters alike, we don't * really need to care whether such a byte is part of a sequence or not. * In an "unsafe" encoding, we still expect the first byte of a multibyte * sequence to be >= 0x80, but later bytes might not be. If we scan such * a sequence as-is, the lexing rules could easily be fooled into matching * such bytes to ordinary ASCII characters. Our solution for this is to * substitute 0xFF for each non-first byte within the data presented to flex. * The flex rules will then pass the FF's through unmolested. The emit() * subroutine is responsible for looking back to the original string and * replacing FF's with the corresponding original bytes. 
* * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.31 2010/01/02 16:57:59 momjian Exp $ * *------------------------------------------------------------------------- */ #include "postgres_fe.h" #include "psqlscan.h" #include #include "common.h" #include "settings.h" #include "variables.h" /* * We use a stack of flex buffers to handle substitution of psql variables. * Each stacked buffer contains the as-yet-unread text from one psql variable. * When we pop the stack all the way, we resume reading from the outer buffer * identified by scanbufhandle. */ typedef struct StackElem { YY_BUFFER_STATE buf; /* flex input control structure */ char *bufstring; /* data actually being scanned by flex */ char *origstring; /* copy of original data, if needed */ struct StackElem *next; } StackElem; /* * All working state of the lexer must be stored in PsqlScanStateData * between calls. This allows us to have multiple open lexer operations, * which is needed for nested include files. The lexer itself is not * recursive, but it must be re-entrant. */ typedef struct PsqlScanStateData { StackElem *buffer_stack; /* stack of variable expansion buffers */ /* * These variables always refer to the outer buffer, never to any * stacked variable-expansion buffer. */ YY_BUFFER_STATE scanbufhandle; char *scanbuf; /* start of outer-level input buffer */ const char *scanline; /* current input line at outer level */ /* safe_encoding, curline, refline are used by emit() to replace FFs */ int encoding; /* encoding being used now */ bool safe_encoding; /* is current encoding "safe"? */ const char *curline; /* actual flex input string for cur buf */ const char *refline; /* original data for cur buffer */ /* * All this state lives across successive input lines, until explicitly * reset by psql_scan_reset. 
*/ int start_state; /* saved YY_START */ int paren_depth; /* depth of nesting in parentheses */ int xcdepth; /* depth of nesting in slash-star comments */ char *dolqstart; /* current $foo$ quote start string */ } PsqlScanStateData; static PsqlScanState cur_state; /* current state while active */ static PQExpBuffer output_buf; /* current output buffer */ /* these variables do not need to be saved across calls */ static enum slash_option_type option_type; static char *option_quote; /* Return values from yylex() */ #define LEXRES_EOL 0 /* end of input */ #define LEXRES_SEMI 1 /* command-terminating semicolon found */ #define LEXRES_BACKSLASH 2 /* backslash command start */ #define LEXRES_OK 3 /* OK completion of backslash argument */ int yylex(void); static void push_new_buffer(const char *newstr); static YY_BUFFER_STATE prepare_buffer(const char *txt, int len, char **txtcopy); static void emit(const char *txt, int len); static bool is_utf16_surrogate_first(uint32 c); #define ECHO emit(yytext, yyleng) %} %option 8bit %option never-interactive %option nodefault %option noinput %option nounput %option noyywrap /* * All of the following definitions and rules should exactly match * src/backend/parser/scan.l so far as the flex patterns are concerned. * The rule bodies are just ECHO as opposed to what the backend does, * however. (But be sure to duplicate code that affects the lexing process, * such as BEGIN().) Also, psqlscan uses a single <> rule whereas * scan.l has a separate one for each exclusive state. */ /* * OK, here is a short description of lex/flex rules behavior. * The longest pattern which matches an input string is always chosen. * For equal-length patterns, the first occurring in the rules list is chosen. * INITIAL is the starting state, to which all non-conditional rules apply. * Exclusive states change parsing rules while the state is active. When in * an exclusive state, only those rules defined for that state apply. 
* * We use exclusive states for quoted strings, extended comments, * and to eliminate parsing troubles for numeric strings. * Exclusive states: * bit string literal * extended C-style comments * delimited identifiers (double-quoted identifiers) * hexadecimal numeric string * standard quoted strings * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings * quoted identifier with Unicode escapes * quoted string with Unicode escapes * Unicode surrogate pair in extended quoted string */ %x xb %x xc %x xd %x xh %x xe %x xq %x xdolq %x xui %x xus %x xeu /* Additional exclusive states for psql only: lex backslash commands */ %x xslashcmd %x xslasharg %x xslashquote %x xslashbackquote %x xslashdefaultarg %x xslashquotedarg %x xslashwholeline %x xslashend /* * In order to make the world safe for Windows and Mac clients as well as * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n * sequence will be seen as two successive newlines, but that doesn't cause * any problems. Comments that start with -- and extend to the next * newline are treated as equivalent to a single whitespace character. * * NOTE a fine point: if there is no newline following --, we will absorb * everything to the end of the input as a comment. This is correct. Older * versions of Postgres failed to recognize -- as a comment if the input * did not end with a newline. * * XXX perhaps \f (formfeed) should be treated as a newline as well? * * XXX if you change the set of whitespace characters, fix scanner_isspace() * to agree, and see also the plpgsql lexer. */ space [ \t\n\r\f] horiz_space [ \t\f] newline [\n\r] non_newline [^\n\r] comment ("--"{non_newline}*) whitespace ({space}+|{comment}) /* * SQL requires at least one newline in the whitespace separating * string literals that are to be concatenated. Silly, but who are we * to argue? Note that {whitespace_with_newline} should not have * after * it, whereas {whitespace} should generally have a * after it... 
*/ special_whitespace ({space}+|{comment}{newline}) horiz_whitespace ({horiz_space}|{comment}) whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*) /* * To ensure that {quotecontinue} can be scanned without having to back up * if the full pattern isn't matched, we include trailing whitespace in * {quotestop}. This matches all cases where {quotecontinue} fails to match, * except for {quote} followed by whitespace and just one "-" (not two, * which would start a {comment}). To cover that we have {quotefail}. * The actions for {quotestop} and {quotefail} must throw back characters * beyond the quote proper. */ quote ' quotestop {quote}{whitespace}* quotecontinue {quote}{whitespace_with_newline}{quote} quotefail {quote}{whitespace}*"-" /* Bit string * It is tempting to scan the string for only those characters * which are allowed. However, this leads to silently swallowed * characters if illegal characters are included in the string. * For example, if xbinside is [01] then B'ABCD' is interpreted * as a zero-length string, and the ABCD' is lost! * Better to pass the string forward and let the input routines * validate the contents. */ xbstart [bB]{quote} xbinside [^']* /* Hexadecimal number */ xhstart [xX]{quote} xhinside [^']* /* National character */ xnstart [nN]{quote} /* Quoted string that allows backslash escapes */ xestart [eE]{quote} xeinside [^\\']+ xeescape [\\][^0-7] xeoctesc [\\][0-7]{1,3} xehexesc [\\]x[0-9A-Fa-f]{1,2} xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}) xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7}) /* Extended quote * xqdouble implements embedded quote, '''' */ xqstart {quote} xqdouble {quote}{quote} xqinside [^']+ /* $foo$ style quotes ("dollar quoting") * The quoted string starts with $foo$ where "foo" is an optional string * in the form of an identifier, except that it may not contain "$", * and extends to the first occurrence of an identical string. * There is *no* processing of the quoted text. 
* * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim} * fails to match its trailing "$". */ dolq_start [A-Za-z\200-\377_] dolq_cont [A-Za-z\200-\377_0-9] dolqdelim \$({dolq_start}{dolq_cont}*)?\$ dolqfailed \${dolq_start}{dolq_cont}* dolqinside [^$]+ /* Double quote * Allows embedded spaces and other special characters into identifiers. */ dquote \" xdstart {dquote} xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ /* Unicode escapes */ uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} /* error rule to avoid backup */ uescapefail ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]) /* Quoted identifier with Unicode escapes */ xuistart [uU]&{dquote} xuistop1 {dquote}{whitespace}*{uescapefail}? xuistop2 {dquote}{whitespace}*{uescape} /* Quoted string with Unicode escapes */ xusstart [uU]&{quote} xusstop1 {quote}{whitespace}*{uescapefail}? xusstop2 {quote}{whitespace}*{uescape} /* error rule to avoid backup */ xufailed [uU]& /* C-style comments * * The "extended comment" syntax closely resembles allowable operator syntax. * The tricky part here is to get lex to recognize a string starting with * slash-star as a comment, when interpreting it as an operator would produce * a longer match --- remember lex will prefer a longer match! Also, if we * have something like plus-slash-star, lex will think this is a 3-character * operator whereas we want to see it as a + operator and a comment start. * The solution is two-fold: * 1. append {op_chars}* to xcstart so that it matches as much text as * {operator} would. Then the tie-breaker (first matching rule of same * length) ensures xcstart wins. We put back the extra stuff with yyless() * in case it contains a star-slash that should terminate the comment. 
* 2. In the operator rule, check for slash-star within the operator, and * if found throw it back with yyless(). This handles the plus-slash-star * problem. * Dash-dash comments have similar interactions with the operator rule. */ xcstart \/\*{op_chars}* xcstop \*+\/ xcinside [^*/]+ digit [0-9] ident_start [A-Za-z\200-\377_] ident_cont [A-Za-z\200-\377_0-9\$] identifier {ident_start}{ident_cont}* typecast "::" /* these two token types are used by PL/pgsql, though not in core SQL */ dot_dot \.\. colon_equals ":=" /* * "self" is the set of chars that should be returned as single-character * tokens. "op_chars" is the set of chars that can make up "Op" tokens, * which can be one or more characters long (but if a single-char token * appears in the "self" set, it is not to be returned as an Op). Note * that the sets overlap, but each has some chars that are not in the other. * * If you change either set, adjust the character lists appearing in the * rule for "operator"! */ self [,()\[\].;\:\+\-\*\/\%\^\<\>\=] op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=] operator {op_chars}+ /* we no longer allow unary minus in numbers. * instead we pass it separately to parser. there it gets * coerced via doNegate() -- Leon aug 20 1999 * * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10. * * {realfail1} and {realfail2} are added to prevent the need for scanner * backup when the {real} rule fails to match completely. */ integer {digit}+ decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*)) decimalfail {digit}+\.\. real ({integer}|{decimal})[Ee][-+]?{digit}+ realfail1 ({integer}|{decimal})[Ee] realfail2 ({integer}|{decimal})[Ee][-+] param \${integer} other . /* * Dollar quoted strings are totally opaque, and no escaping is done on them. * Other quoted strings must allow some special characters such as single-quote * and newline. 
* Embedded single-quotes are implemented both in the SQL standard * style of two adjacent single quotes "''" and in the Postgres/Java style * of escaped-quote "\'". * Other embedded escaped characters are matched explicitly and the leading * backslash is dropped from the string. * Note that xcstart must appear before operator, as explained above! * Also whitespace (comment) must appear before operator. */ %% {whitespace} { /* * Note that the whitespace rule includes both true * whitespace and single-line ("--" style) comments. * We suppress whitespace at the start of the query * buffer. We also suppress all single-line comments, * which is pretty dubious but is the historical * behavior. */ if (!(output_buf->len == 0 || yytext[0] == '-')) ECHO; } {xcstart} { cur_state->xcdepth = 0; BEGIN(xc); /* Put back any characters past slash-star; see above */ yyless(2); ECHO; } {xcstart} { cur_state->xcdepth++; /* Put back any characters past slash-star; see above */ yyless(2); ECHO; } {xcstop} { if (cur_state->xcdepth <= 0) { BEGIN(INITIAL); } else cur_state->xcdepth--; ECHO; } {xcinside} { ECHO; } {op_chars} { ECHO; } \*+ { ECHO; } {xbstart} { BEGIN(xb); ECHO; } {quotestop} | {quotefail} { yyless(1); BEGIN(INITIAL); ECHO; } {xhinside} | {xbinside} { ECHO; } {quotecontinue} | {quotecontinue} { ECHO; } {xhstart} { /* Hexadecimal bit type. * At some point we should simply pass the string * forward to the parser and label it there. * In the meantime, place a leading "x" on the string * to mark it for the input routine as a hex string. 
*/ BEGIN(xh); ECHO; } {quotestop} | {quotefail} { yyless(1); BEGIN(INITIAL); ECHO; } {xnstart} { yyless(1); /* eat only 'n' this time */ ECHO; } {xqstart} { if (standard_strings()) BEGIN(xq); else BEGIN(xe); ECHO; } {xestart} { BEGIN(xe); ECHO; } {xusstart} { BEGIN(xus); ECHO; } {quotestop} | {quotefail} { yyless(1); BEGIN(INITIAL); ECHO; } {xusstop1} { yyless(1); BEGIN(INITIAL); ECHO; } {xusstop2} { BEGIN(INITIAL); ECHO; } {xqdouble} { ECHO; } {xqinside} { ECHO; } {xeinside} { ECHO; } {xeunicode} { uint32 c = strtoul(yytext+2, NULL, 16); if (is_utf16_surrogate_first(c)) BEGIN(xeu); ECHO; } {xeunicode} { BEGIN(xe); ECHO; } . { ECHO; } \n { ECHO; } {xeunicodefail} { ECHO; } {xeescape} { ECHO; } {xeoctesc} { ECHO; } {xehexesc} { ECHO; } {quotecontinue} { ECHO; } . { /* This is only needed for \ just before EOF */ ECHO; } {dolqdelim} { cur_state->dolqstart = pg_strdup(yytext); BEGIN(xdolq); ECHO; } {dolqfailed} { /* throw back all but the initial "$" */ yyless(1); ECHO; } {dolqdelim} { if (strcmp(yytext, cur_state->dolqstart) == 0) { free(cur_state->dolqstart); cur_state->dolqstart = NULL; BEGIN(INITIAL); } else { /* * When we fail to match $...$ to dolqstart, transfer * the $... part to the output, but put back the final * $ for rescanning. Consider $delim$...$junk$delim$ */ yyless(yyleng-1); } ECHO; } {dolqinside} { ECHO; } {dolqfailed} { ECHO; } . { /* This is only needed for $ inside the quoted text */ ECHO; } {xdstart} { BEGIN(xd); ECHO; } {xuistart} { BEGIN(xui); ECHO; } {xdstop} { BEGIN(INITIAL); ECHO; } {xuistop1} { yyless(1); BEGIN(INITIAL); ECHO; } {xuistop2} { BEGIN(INITIAL); ECHO; } {xddouble} { ECHO; } {xdinside} { ECHO; } {xufailed} { /* throw back all but the initial u/U */ yyless(1); ECHO; } {typecast} { ECHO; } {dot_dot} { ECHO; } {colon_equals} { ECHO; } /* * These rules are specific to psql --- they implement parenthesis * counting and detection of command-ending semicolon. 
These must * appear before the {self} rule so that they take precedence over it. */ "(" { cur_state->paren_depth++; ECHO; } ")" { if (cur_state->paren_depth > 0) cur_state->paren_depth--; ECHO; } ";" { ECHO; if (cur_state->paren_depth == 0) { /* Terminate lexing temporarily */ return LEXRES_SEMI; } } /* * psql-specific rules to handle backslash commands and variable * substitution. We want these before {self}, also. */ "\\"[;:] { /* Force a semicolon or colon into the query buffer */ emit(yytext + 1, 1); } "\\" { /* Terminate lexing temporarily */ return LEXRES_BACKSLASH; } :[A-Za-z0-9_]+ { /* Possible psql variable substitution */ const char *value; value = GetVariable(pset.vars, yytext + 1); if (value) { /* It is a variable, perform substitution */ push_new_buffer(value); /* yy_scan_string already made buffer active */ } else { /* * if the variable doesn't exist we'll copy the * string as is */ ECHO; } } /* * Back to backend-compatible rules. */ {self} { ECHO; } {operator} { /* * Check for embedded slash-star or dash-dash; those * are comment starts, so operator must stop there. * Note that slash-star or dash-dash at the first * character will match a prior rule, not this one. */ int nchars = yyleng; char *slashstar = strstr(yytext, "/*"); char *dashdash = strstr(yytext, "--"); if (slashstar && dashdash) { /* if both appear, take the first one */ if (slashstar > dashdash) slashstar = dashdash; } else if (!slashstar) slashstar = dashdash; if (slashstar) nchars = slashstar - yytext; /* * For SQL compatibility, '+' and '-' cannot be the * last char of a multi-char operator unless the operator * contains chars that are not in SQL operators. * The idea is to lex '=-' as two operators, but not * to forbid operator names like '?-' that could not be * sequences of SQL operators. 
*/ while (nchars > 1 && (yytext[nchars-1] == '+' || yytext[nchars-1] == '-')) { int ic; for (ic = nchars-2; ic >= 0; ic--) { if (strchr("~!@#^&|`?%", yytext[ic])) break; } if (ic >= 0) break; /* found a char that makes it OK */ nchars--; /* else remove the +/-, and check again */ } if (nchars < yyleng) { /* Strip the unwanted chars from the token */ yyless(nchars); } ECHO; } {param} { ECHO; } {integer} { ECHO; } {decimal} { ECHO; } {decimalfail} { /* throw back the .., and treat as integer */ yyless(yyleng-2); ECHO; } {real} { ECHO; } {realfail1} { /* * throw back the [Ee], and treat as {decimal}. Note * that it is possible the input is actually {integer}, * but since this case will almost certainly lead to a * syntax error anyway, we don't bother to distinguish. */ yyless(yyleng-1); ECHO; } {realfail2} { /* throw back the [Ee][+-], and proceed as above */ yyless(yyleng-2); ECHO; } {identifier} { ECHO; } {other} { ECHO; } /* * Everything from here down is psql-specific. */ <> { StackElem *stackelem = cur_state->buffer_stack; if (stackelem == NULL) return LEXRES_EOL; /* end of input reached */ /* * We were expanding a variable, so pop the inclusion * stack and keep lexing */ cur_state->buffer_stack = stackelem->next; yy_delete_buffer(stackelem->buf); free(stackelem->bufstring); if (stackelem->origstring) free(stackelem->origstring); free(stackelem); stackelem = cur_state->buffer_stack; if (stackelem != NULL) { yy_switch_to_buffer(stackelem->buf); cur_state->curline = stackelem->bufstring; cur_state->refline = stackelem->origstring ? 
stackelem->origstring : stackelem->bufstring; } else { yy_switch_to_buffer(cur_state->scanbufhandle); cur_state->curline = cur_state->scanbuf; cur_state->refline = cur_state->scanline; } } /* * Exclusive lexer states to handle backslash command lexing */ { /* command name ends at whitespace or backslash; eat all else */ {space}|"\\" { yyless(0); return LEXRES_OK; } {other} { ECHO; } } { /* eat any whitespace, then decide what to do at first nonblank */ {space}+ { } "\\" { /* * backslash is end of command or next command, do not eat * * XXX this means we can't conveniently accept options * that start with a backslash; therefore, option * processing that encourages use of backslashes is rather * broken. */ yyless(0); return LEXRES_OK; } {quote} { *option_quote = '\''; BEGIN(xslashquote); } "`" { if (option_type == OT_VERBATIM) { /* in verbatim mode, backquote is not special */ ECHO; BEGIN(xslashdefaultarg); } else { *option_quote = '`'; BEGIN(xslashbackquote); } } :[A-Za-z0-9_]* { /* Possible psql variable substitution */ if (option_type == OT_VERBATIM) ECHO; else { const char *value; value = GetVariable(pset.vars, yytext + 1); /* * The variable value is just emitted without any * further examination. This is consistent with the * pre-8.0 code behavior, if not with the way that * variables are handled outside backslash commands. 
*/ if (value) appendPQExpBufferStr(output_buf, value); } *option_quote = ':'; return LEXRES_OK; } "|" { ECHO; if (option_type == OT_FILEPIPE) { /* treat like whole-string case */ BEGIN(xslashwholeline); } else { /* treat like default case */ BEGIN(xslashdefaultarg); } } {dquote} { *option_quote = '"'; ECHO; BEGIN(xslashquotedarg); } {other} { ECHO; BEGIN(xslashdefaultarg); } } { /* * single-quoted text: copy literally except for '' and backslash * sequences */ {quote} { return LEXRES_OK; } {xqdouble} { appendPQExpBufferChar(output_buf, '\''); } "\\n" { appendPQExpBufferChar(output_buf, '\n'); } "\\t" { appendPQExpBufferChar(output_buf, '\t'); } "\\b" { appendPQExpBufferChar(output_buf, '\b'); } "\\r" { appendPQExpBufferChar(output_buf, '\r'); } "\\f" { appendPQExpBufferChar(output_buf, '\f'); } {xeoctesc} { /* octal case */ appendPQExpBufferChar(output_buf, (char) strtol(yytext + 1, NULL, 8)); } {xehexesc} { /* hex case */ appendPQExpBufferChar(output_buf, (char) strtol(yytext + 2, NULL, 16)); } "\\". { emit(yytext + 1, 1); } {other}|\n { ECHO; } } { /* * backticked text: copy everything until next backquote or end of line. * Invocation of the command will happen in psql_scan_slash_option. */ "`" { return LEXRES_OK; } {other}|\n { ECHO; } } { /* * Copy everything until unquoted whitespace or end of line. Quotes * do not get stripped yet. 
*/ {space} { yyless(0); return LEXRES_OK; } "\\" { /* * unquoted backslash is end of command or next command, * do not eat * * (this was not the behavior pre-8.0, but it seems * consistent) */ yyless(0); return LEXRES_OK; } {dquote} { *option_quote = '"'; ECHO; BEGIN(xslashquotedarg); } {other} { ECHO; } } { /* double-quoted text within a default-type argument: copy */ {dquote} { ECHO; BEGIN(xslashdefaultarg); } {other}|\n { ECHO; } } { /* copy everything until end of input line */ /* but suppress leading whitespace */ {space}+ { if (output_buf->len > 0) ECHO; } {other} { ECHO; } } { /* at end of command, eat a double backslash, but not anything else */ "\\\\" { return LEXRES_OK; } {other}|\n { yyless(0); return LEXRES_OK; } } %% /* * Create a lexer working state struct. */ PsqlScanState psql_scan_create(void) { PsqlScanState state; state = (PsqlScanStateData *) pg_malloc_zero(sizeof(PsqlScanStateData)); psql_scan_reset(state); return state; } /* * Destroy a lexer working state struct, releasing all resources. */ void psql_scan_destroy(PsqlScanState state) { psql_scan_finish(state); psql_scan_reset(state); free(state); } /* * Set up to perform lexing of the given input line. * * The text at *line, extending for line_len bytes, will be scanned by * subsequent calls to the psql_scan routines. psql_scan_finish should * be called when scanning is complete. Note that the lexer retains * a pointer to the storage at *line --- this string must not be altered * or freed until after psql_scan_finish is called. */ void psql_scan_setup(PsqlScanState state, const char *line, int line_len) { /* Mustn't be scanning already */ psql_assert(state->scanbufhandle == NULL); psql_assert(state->buffer_stack == NULL); /* Do we need to hack the character set encoding? 
*/ state->encoding = pset.encoding; state->safe_encoding = pg_valid_server_encoding_id(state->encoding); /* needed for prepare_buffer */ cur_state = state; /* Set up flex input buffer with appropriate translation and padding */ state->scanbufhandle = prepare_buffer(line, line_len, &state->scanbuf); state->scanline = line; /* Set lookaside data in case we have to map unsafe encoding */ state->curline = state->scanbuf; state->refline = state->scanline; } /* * Do lexical analysis of SQL command text. * * The text previously passed to psql_scan_setup is scanned, and appended * (possibly with transformation) to query_buf. * * The return value indicates the condition that stopped scanning: * * PSCAN_SEMICOLON: found a command-ending semicolon. (The semicolon is * transferred to query_buf.) The command accumulated in query_buf should * be executed, then clear query_buf and call again to scan the remainder * of the line. * * PSCAN_BACKSLASH: found a backslash that starts a psql special command. * Any previous data on the line has been transferred to query_buf. * The caller will typically next call psql_scan_slash_command(), * perhaps psql_scan_slash_option(), and psql_scan_slash_command_end(). * * PSCAN_INCOMPLETE: the end of the line was reached, but we have an * incomplete SQL command. *prompt is set to the appropriate prompt type. * * PSCAN_EOL: the end of the line was reached, and there is no lexical * reason to consider the command incomplete. The caller may or may not * choose to send it. *prompt is set to the appropriate prompt type if * the caller chooses to collect more input. * * In the PSCAN_INCOMPLETE and PSCAN_EOL cases, psql_scan_finish() should * be called next, then the cycle may be repeated with a fresh input line. * * In all cases, *prompt is set to an appropriate prompt type code for the * next line-input operation. 
*/
PsqlScanResult
psql_scan(PsqlScanState state,
		  PQExpBuffer query_buf,
		  promptStatus_t *prompt)
{
	PsqlScanResult result;
	int			lexresult;

	/* Must be scanning already */
	psql_assert(state->scanbufhandle);

	/* Set up static variables that will be used by yylex */
	cur_state = state;
	output_buf = query_buf;

	/* Resume in the innermost pending variable expansion, if there is one */
	if (state->buffer_stack != NULL)
		yy_switch_to_buffer(state->buffer_stack->buf);
	else
		yy_switch_to_buffer(state->scanbufhandle);

	/* Restore the start condition saved by the previous call */
	BEGIN(state->start_state);

	/* And lex. */
	lexresult = yylex();

	/* Update static vars back to the state struct */
	state->start_state = YY_START;

	/*
	 * Check termination state and return appropriate result info.
	 */
	switch (lexresult)
	{
		case LEXRES_EOL:		/* end of input */
			switch (state->start_state)
			{
				case INITIAL:
					if (state->paren_depth > 0)
					{
						result = PSCAN_INCOMPLETE;
						*prompt = PROMPT_PAREN;
					}
					else if (query_buf->len > 0)
					{
						result = PSCAN_EOL;
						*prompt = PROMPT_CONTINUE;
					}
					else
					{
						/* never bother to send an empty buffer */
						result = PSCAN_INCOMPLETE;
						*prompt = PROMPT_READY;
					}
					break;

					/*
					 * Every state that is entered from a single-quote
					 * opener.  xus (U&'...') and xeu (awaiting the second
					 * half of a surrogate pair inside E'...') can both
					 * survive to end of line, so they must be listed here;
					 * otherwise an unterminated literal of either kind
					 * would fall into the "can't get here" branch below
					 * and terminate the program.
					 */
				case xb:
				case xh:
				case xq:
				case xe:
				case xus:
				case xeu:
					result = PSCAN_INCOMPLETE;
					*prompt = PROMPT_SINGLEQUOTE;
					break;

				case xc:
					result = PSCAN_INCOMPLETE;
					*prompt = PROMPT_COMMENT;
					break;

					/*
					 * Double-quoted identifiers, including the U&"..."
					 * form (xui), which can likewise be left open at end
					 * of line.
					 */
				case xd:
				case xui:
					result = PSCAN_INCOMPLETE;
					*prompt = PROMPT_DOUBLEQUOTE;
					break;

				case xdolq:
					result = PSCAN_INCOMPLETE;
					*prompt = PROMPT_DOLLARQUOTE;
					break;

				default:
					/* can't get here */
					fprintf(stderr, "invalid YY_START\n");
					exit(1);
			}
			break;
		case LEXRES_SEMI:		/* semicolon */
			result = PSCAN_SEMICOLON;
			*prompt = PROMPT_READY;
			break;
		case LEXRES_BACKSLASH:	/* backslash */
			result = PSCAN_BACKSLASH;
			*prompt = PROMPT_READY;
			break;
		default:
			/* can't get here */
			fprintf(stderr, "invalid yylex result\n");
			exit(1);
	}

	return result;
}

/*
 * Clean up after scanning a string.
This flushes any unread input and * releases resources (but not the PsqlScanState itself). Note however * that this does not reset the lexer scan state; that can be done by * psql_scan_reset(), which is an orthogonal operation. * * It is legal to call this when not scanning anything (makes it easier * to deal with error recovery). */ void psql_scan_finish(PsqlScanState state) { /* Drop any incomplete variable expansions. */ while (state->buffer_stack != NULL) { StackElem *stackelem = state->buffer_stack; state->buffer_stack = stackelem->next; yy_delete_buffer(stackelem->buf); free(stackelem->bufstring); if (stackelem->origstring) free(stackelem->origstring); free(stackelem); } /* Done with the outer scan buffer, too */ if (state->scanbufhandle) yy_delete_buffer(state->scanbufhandle); state->scanbufhandle = NULL; if (state->scanbuf) free(state->scanbuf); state->scanbuf = NULL; } /* * Reset lexer scanning state to start conditions. This is appropriate * for executing \r psql commands (or any other time that we discard the * prior contents of query_buf). It is not, however, necessary to do this * when we execute and clear the buffer after getting a PSCAN_SEMICOLON or * PSCAN_EOL scan result, because the scan state must be INITIAL when those * conditions are returned. * * Note that this is unrelated to flushing unread input; that task is * done by psql_scan_finish(). */ void psql_scan_reset(PsqlScanState state) { state->start_state = INITIAL; state->paren_depth = 0; state->xcdepth = 0; /* not really necessary */ if (state->dolqstart) free(state->dolqstart); state->dolqstart = NULL; } /* * Return true if lexer is currently in an "inside quotes" state. * * This is pretty grotty but is needed to preserve the old behavior * that mainloop.c drops blank lines not inside quotes without even * echoing them. */ bool psql_scan_in_quote(PsqlScanState state) { return state->start_state != INITIAL; } /* * Scan the command name of a psql backslash command. 
This should be called * after psql_scan() returns PSCAN_BACKSLASH. It is assumed that the input * has been consumed through the leading backslash. * * The return value is a malloc'd copy of the command name, as parsed off * from the input. */ char * psql_scan_slash_command(PsqlScanState state) { PQExpBufferData mybuf; int lexresult; /* Must be scanning already */ psql_assert(state->scanbufhandle); /* Build a local buffer that we'll return the data of */ initPQExpBuffer(&mybuf); /* Set up static variables that will be used by yylex */ cur_state = state; output_buf = &mybuf; if (state->buffer_stack != NULL) yy_switch_to_buffer(state->buffer_stack->buf); else yy_switch_to_buffer(state->scanbufhandle); BEGIN(xslashcmd); /* And lex. */ lexresult = yylex(); /* There are no possible errors in this lex state... */ return mybuf.data; } /* * Parse off the next argument for a backslash command, and return it as a * malloc'd string. If there are no more arguments, returns NULL. * * type tells what processing, if any, to perform on the option string; * for example, if it's a SQL identifier, we want to downcase any unquoted * letters. * * if quote is not NULL, *quote is set to 0 if no quoting was found, else * the quote symbol. * * if semicolon is true, unquoted trailing semicolon(s) that would otherwise * be taken as part of the option string will be stripped. * * NOTE: the only possible syntax errors for backslash options are unmatched * quotes, which are detected when we run out of input. Therefore, on a * syntax error we just throw away the string and return NULL; there is no * need to worry about flushing remaining input. 
*/ char * psql_scan_slash_option(PsqlScanState state, enum slash_option_type type, char *quote, bool semicolon) { PQExpBufferData mybuf; int lexresult; char local_quote; bool badarg; /* Must be scanning already */ psql_assert(state->scanbufhandle); if (quote == NULL) quote = &local_quote; *quote = 0; /* Build a local buffer that we'll return the data of */ initPQExpBuffer(&mybuf); /* Set up static variables that will be used by yylex */ cur_state = state; output_buf = &mybuf; option_type = type; option_quote = quote; if (state->buffer_stack != NULL) yy_switch_to_buffer(state->buffer_stack->buf); else yy_switch_to_buffer(state->scanbufhandle); if (type == OT_WHOLE_LINE) BEGIN(xslashwholeline); else BEGIN(xslasharg); /* And lex. */ lexresult = yylex(); /* * Check the lex result: we should have gotten back either LEXRES_OK * or LEXRES_EOL (the latter indicating end of string). If we were inside * a quoted string, as indicated by YY_START, EOL is an error. */ psql_assert(lexresult == LEXRES_EOL || lexresult == LEXRES_OK); badarg = false; switch (YY_START) { case xslasharg: /* empty arg, or possibly a psql variable substitution */ break; case xslashquote: if (lexresult != LEXRES_OK) badarg = true; /* hit EOL not ending quote */ break; case xslashbackquote: if (lexresult != LEXRES_OK) badarg = true; /* hit EOL not ending quote */ else { /* Perform evaluation of backticked command */ char *cmd = mybuf.data; FILE *fd; bool error = false; PQExpBufferData output; char buf[512]; size_t result; fd = popen(cmd, PG_BINARY_R); if (!fd) { psql_error("%s: %s\n", cmd, strerror(errno)); error = true; } initPQExpBuffer(&output); if (!error) { do { result = fread(buf, 1, sizeof(buf), fd); if (ferror(fd)) { psql_error("%s: %s\n", cmd, strerror(errno)); error = true; break; } appendBinaryPQExpBuffer(&output, buf, result); } while (!feof(fd)); } if (fd && pclose(fd) == -1) { psql_error("%s: %s\n", cmd, strerror(errno)); error = true; } if (PQExpBufferBroken(&output)) { psql_error("%s: out 
of memory\n", cmd); error = true; } /* Now done with cmd, transfer result to mybuf */ resetPQExpBuffer(&mybuf); if (!error) { /* strip any trailing newline */ if (output.len > 0 && output.data[output.len - 1] == '\n') output.len--; appendBinaryPQExpBuffer(&mybuf, output.data, output.len); } termPQExpBuffer(&output); } break; case xslashdefaultarg: /* Strip any trailing semi-colons if requested */ if (semicolon) { while (mybuf.len > 0 && mybuf.data[mybuf.len - 1] == ';') { mybuf.data[--mybuf.len] = '\0'; } } /* * If SQL identifier processing was requested, then we strip out * excess double quotes and downcase unquoted letters. * Doubled double-quotes become output double-quotes, per spec. * * Note that a string like FOO"BAR"BAZ will be converted to * fooBARbaz; this is somewhat inconsistent with the SQL spec, * which would have us parse it as several identifiers. But * for psql's purposes, we want a string like "foo"."bar" to * be treated as one option, so there's little choice. */ if (type == OT_SQLID || type == OT_SQLIDHACK) { bool inquotes = false; char *cp = mybuf.data; while (*cp) { if (*cp == '"') { if (inquotes && cp[1] == '"') { /* Keep the first quote, remove the second */ cp++; } inquotes = !inquotes; /* Collapse out quote at *cp */ memmove(cp, cp + 1, strlen(cp)); mybuf.len--; /* do not advance cp */ } else { if (!inquotes && type == OT_SQLID) *cp = pg_tolower((unsigned char) *cp); cp += PQmblen(cp, pset.encoding); } } } break; case xslashquotedarg: /* must have hit EOL inside double quotes */ badarg = true; break; case xslashwholeline: /* always okay */ break; default: /* can't get here */ fprintf(stderr, "invalid YY_START\n"); exit(1); } if (badarg) { psql_error("unterminated quoted string\n"); termPQExpBuffer(&mybuf); return NULL; } /* * An unquoted empty argument isn't possible unless we are at end of * command. Return NULL instead. */ if (mybuf.len == 0 && *quote == 0) { termPQExpBuffer(&mybuf); return NULL; } /* Else return the completed string. 
*/ return mybuf.data; } /* * Eat up any unused \\ to complete a backslash command. */ void psql_scan_slash_command_end(PsqlScanState state) { int lexresult; /* Must be scanning already */ psql_assert(state->scanbufhandle); /* Set up static variables that will be used by yylex */ cur_state = state; output_buf = NULL; if (state->buffer_stack != NULL) yy_switch_to_buffer(state->buffer_stack->buf); else yy_switch_to_buffer(state->scanbufhandle); BEGIN(xslashend); /* And lex. */ lexresult = yylex(); /* There are no possible errors in this lex state... */ } /* * Push the given string onto the stack of stuff to scan. * * cur_state must point to the active PsqlScanState. * * NOTE SIDE EFFECT: the new buffer is made the active flex input buffer. */ static void push_new_buffer(const char *newstr) { StackElem *stackelem; stackelem = (StackElem *) pg_malloc(sizeof(StackElem)); stackelem->buf = prepare_buffer(newstr, strlen(newstr), &stackelem->bufstring); cur_state->curline = stackelem->bufstring; if (cur_state->safe_encoding) { stackelem->origstring = NULL; cur_state->refline = stackelem->bufstring; } else { stackelem->origstring = pg_strdup(newstr); cur_state->refline = stackelem->origstring; } stackelem->next = cur_state->buffer_stack; cur_state->buffer_stack = stackelem; } /* * Set up a flex input buffer to scan the given data. We always make a * copy of the data. If working in an unsafe encoding, the copy has * multibyte sequences replaced by FFs to avoid fooling the lexer rules. * * cur_state must point to the active PsqlScanState. * * NOTE SIDE EFFECT: the new buffer is made the active flex input buffer. 
*/ static YY_BUFFER_STATE prepare_buffer(const char *txt, int len, char **txtcopy) { char *newtxt; /* Flex wants two \0 characters after the actual data */ newtxt = pg_malloc(len + 2); *txtcopy = newtxt; newtxt[len] = newtxt[len + 1] = YY_END_OF_BUFFER_CHAR; if (cur_state->safe_encoding) memcpy(newtxt, txt, len); else { /* Gotta do it the hard way */ int i = 0; while (i < len) { int thislen = PQmblen(txt + i, cur_state->encoding); /* first byte should always be okay... */ newtxt[i] = txt[i]; i++; while (--thislen > 0) newtxt[i++] = (char) 0xFF; } } return yy_scan_buffer(newtxt, len + 2); } /* * emit() --- body for ECHO macro * * NB: this must be used for ALL and ONLY the text copied from the flex * input data. If you pass it something that is not part of the yytext * string, you are making a mistake. Internally generated text can be * appended directly to output_buf. */ static void emit(const char *txt, int len) { if (cur_state->safe_encoding) appendBinaryPQExpBuffer(output_buf, txt, len); else { /* Gotta do it the hard way */ const char *reference = cur_state->refline; int i; reference += (txt - cur_state->curline); for (i = 0; i < len; i++) { char ch = txt[i]; if (ch == (char) 0xFF) ch = reference[i]; appendPQExpBufferChar(output_buf, ch); } } } static bool is_utf16_surrogate_first(uint32 c) { return (c >= 0xD800 && c <= 0xDBFF); }