From f945f46193690841315b79f5961d3721c73621d9 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 18 Mar 2000 18:03:12 +0000 Subject: [PATCH] Modify lexing of multi-char operators per pghackers discussion around 16-Mar-00: trailing + or - is not part of the operator unless the operator also contains characters not present in SQL92-defined operators. This solves the 'X=-Y' problem without unduly constraining users' choice of operator names --- in particular, no existing Postgres operator names become invalid. Also, remove processing of // comments, as agreed in the same thread. --- doc/src/sgml/ref/create_operator.sgml | 87 ++++++++++++++++-------- doc/src/sgml/syntax.sgml | 25 ++++--- src/backend/parser/scan.l | 80 +++++++++++++++++----- src/bin/psql/mainloop.c | 5 +- src/interfaces/ecpg/preproc/pgc.l | 95 ++++++++++++++++++++------- 5 files changed, 211 insertions(+), 81 deletions(-) diff --git a/doc/src/sgml/ref/create_operator.sgml b/doc/src/sgml/ref/create_operator.sgml index 7f06c4cece..36d791d2a7 100644 --- a/doc/src/sgml/ref/create_operator.sgml +++ b/doc/src/sgml/ref/create_operator.sgml @@ -1,5 +1,5 @@ @@ -60,8 +60,8 @@ CREATE OPERATOR name ( PROCEDURE = type1 - The type for the left-hand side of the operator, if any. This option would be - omitted for a right-unary operator. + The type of the left-hand argument of the operator, if any. + This option would be omitted for a left-unary operator. @@ -69,8 +69,8 @@ CREATE OPERATOR name ( PROCEDURE = type2 - The type for the right-hand side of the operator, if any. This option would be - omitted for a left-unary operator. + The type of the right-hand argument of the operator, if any. + This option would be omitted for a right-unary operator. @@ -78,7 +78,7 @@ CREATE OPERATOR name ( PROCEDURE = com_op - The commutator for this operator. + The commutator of this operator. @@ -110,7 +110,7 @@ CREATE OPERATOR name ( PROCEDURE = HASHES -Indicates this operator can support a hash-join algorithm. 
+ Indicates this operator can support a hash join. @@ -118,7 +118,8 @@ Indicates this operator can support a hash-join algorithm. left_sort_op - Operator that sorts the left-hand data type of this operator. + If this operator can support a merge join, the + operator that sorts the left-hand data type of this operator. @@ -126,7 +127,8 @@ Indicates this operator can support a hash-join algorithm. right_sort_op - Operator that sorts the right-hand data type of this operator. + If this operator can support a merge join, the + operator that sorts the right-hand data type of this operator. @@ -172,22 +174,56 @@ CREATE The operator name - is a sequence of up to thirty two (32) characters in any combination - from the following: + is a sequence of up to NAMEDATALEN-1 (31 by default) characters + from the following list: -+ - * / < > = ~ ! @ # % ^ & | ` ? $ : ++ - * / < > = ~ ! @ # % ^ & | ` ? $ : + + There are a few restrictions on your choice of name: + + + + "$" and ":" cannot be defined as single-character operators, + although they can be part of a multi-character operator name. + + + + + "--" and "/*" cannot appear anywhere in an operator name, + since they will be taken as the start of a comment. + + + + + A multi-character operator name cannot end in "+" or "-", + unless the name also contains at least one of these characters: + +~ ! @ # % ^ & | ` ? $ : + + For example, @- is an allowed operator name, + but *- is not. + This restriction allows Postgres to + parse SQL-compliant queries without requiring spaces between tokens. + + + + - No alphabetic characters are allowed in an operator name. - This enables Postgres to parse SQL input - into tokens without requiring spaces between each token. + When working with non-SQL-standard operator names, you will usually + need to separate adjacent operators with spaces to avoid ambiguity. 
+ For example, if you have defined a left-unary operator named "@", + you cannot write X*@Y; you must write + X* @Y to ensure that + Postgres reads it as two operator names + not one. - The operator "!=" is mapped to "<>" on input, so they are - therefore equivalent. + The operator "!=" is mapped to "<>" on input, so these two names + are always equivalent. At least one of LEFTARG and RIGHTARG must be defined. For @@ -196,11 +232,11 @@ CREATE unary operators only RIGHTARG should be defined. - Also, the + The func_name procedure must have been previously defined using CREATE FUNCTION and must be defined to accept the correct number of arguments - (either one or two). + (either one or two) of the indicated types. The commutator operator should be identified if one exists, @@ -247,8 +283,6 @@ MYBOXES.description !== "0,0,1,1"::box does not yet have a commutator itself, then the commutator's entry is updated to have the newly created operator as its commutator. This applies to the negator, as well. - - This is to allow the definition of two operators that are the commutators or the negators of each other. The first operator should be defined without a commutator or negator @@ -258,7 +292,7 @@ MYBOXES.description !== "0,0,1,1"::box it also works to just have both operators refer to each other.) - The next three specifications are present to support the + The HASHES, SORT1, and SORT2 options are present to support the query optimizer in performing joins. Postgres can always evaluate a join (i.e., processing a clause with two tuple @@ -294,9 +328,8 @@ MYBOXES.description !== "0,0,1,1"::box be worth the complexity involved. - The last two pieces of the specification are present so - the query optimizer can estimate result sizes. If a - clause of the form: + The RESTRICT and JOIN options assist the query optimizer in estimating + result sizes. 
If a clause of the form: MYBOXES.description <<< "0,0,1,1"::box @@ -310,7 +343,7 @@ MYBOXES.description <<< "0,0,1,1"::box data types and returns a floating point number. The query optimizer simply calls this function, passing the parameter "0,0,1,1" and multiplies the result by the relation - size to get the desired expected number of instances. + size to get the expected number of instances. Similarly, when the operands of the operator both contain @@ -318,7 +351,7 @@ MYBOXES.description <<< "0,0,1,1"::box size of the resulting join. The function join_proc will return another floating point number which will be multiplied by the cardinalities of the two classes involved to - compute the desired expected result size. + compute the expected result size. The difference between the function diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml index 332464429c..918d91a05c 100644 --- a/doc/src/sgml/syntax.sgml +++ b/doc/src/sgml/syntax.sgml @@ -315,12 +315,11 @@ UNCOMMITTED UNNAMED A comment - is an arbitrary sequence of characters following double dashes up to the end - of the line. We also support double-slashes as comments, e.g.: + is an arbitrary sequence of characters beginning with double dashes + and extending to the end of the line, e.g.: -- This is a standard SQL comment -// And this is another supported comment style, like C++ We also support C-style block comments, e.g.: @@ -331,6 +330,9 @@ We also support C-style block comments, e.g.: comment */ + +A comment beginning with "/*" extends to the first occurrence of "*/". + @@ -340,17 +342,22 @@ We also support C-style block comments, e.g.: Names in SQL are sequences of less than NAMEDATALEN alphanumeric characters, starting with an alphabetic character. By default, NAMEDATALEN is set - to 32, but at the time the system is built, NAMEDATALEN can be changed + to 32 (but at the time the system is built, NAMEDATALEN can be changed by changing the #define in - src/backend/include/postgres.h. 
+ src/backend/include/postgres.h). Underscore ("_") is considered an alphabetic character. - In some contexts, names may contain other characters if surrounded - by double quotes. For example, table or column names may contain otherwise - disallowed characters such as spaces, ampersands, etc. using this - technique. + Names containing other characters may be formed by surrounding them + with double quotes. For example, table or column names may contain + otherwise disallowed characters such as spaces, ampersands, etc. if + quoted. Quoting a name also makes it case-sensitive, + whereas unquoted names are always folded to lower case. For example, + the names FOO, foo + and "foo" are + considered the same by Postgres, but + "Foo" is a different name. diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index f972d6ead1..64a389b768 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/parser/scan.l,v 1.67 2000/03/13 01:52:06 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/parser/scan.l,v 1.68 2000/03/18 18:03:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -87,10 +87,10 @@ static void addlit(char *ytext, int yleng); * and to eliminate parsing troubles for numeric strings. * Exclusive states: * binary numeric string - thomas 1997-11-16 - * extended C-style comments - tgl 1997-07-12 - * delimited identifiers (double-quoted identifiers) - tgl 1997-10-27 + * extended C-style comments - thomas 1997-07-12 + * delimited identifiers (double-quoted identifiers) - thomas 1997-10-27 * hexadecimal numeric string - thomas 1997-11-16 - * quoted strings - tgl 1997-07-30 + * quoted strings - thomas 1997-07-30 */ %x xb @@ -144,7 +144,7 @@ xdinside [^"]+ * have something like plus-slash-star, lex will think this is a 3-character * operator whereas we want to see it as a + operator and a comment start. 
* The solution is two-fold: - * 1. append {op_and_self}* to xcstart so that it matches as much text as + * 1. append {op_chars}* to xcstart so that it matches as much text as * {operator} would. Then the tie-breaker (first matching rule of same * length) ensures xcstart wins. We put back the extra stuff with yyless() * in case it contains a star-slash that should terminate the comment. @@ -154,7 +154,7 @@ xdinside [^"]+ * SQL92-style comments, which start with dash-dash, have similar interactions * with the operator rule. */ -xcstart \/\*{op_and_self}* +xcstart \/\*{op_chars}* xcstop \*+\/ xcinside ([^*]+)|(\*+[^/]) @@ -166,10 +166,19 @@ identifier {letter}{letter_or_digit}* typecast "::" -/* NB: if you change "self", fix the copy in the operator rule too! */ +/* + * "self" is the set of chars that should be returned as single-character + * tokens. "op_chars" is the set of chars that can make up "Op" tokens, + * which can be one or more characters long (but if a single-char token + * appears in the "self" set, it is not to be returned as an Op). Note + * that the sets overlap, but each has some chars that are not in the other. + * + * If you change either set, adjust the character lists appearing in the + * rule for "operator"! + */ self [,()\[\].;$\:\+\-\*\/\%\^\<\>\=\|] -op_and_self [\~\!\@\#\^\&\|\`\?\$\:\+\-\*\/\%\<\>\=] -operator {op_and_self}+ +op_chars [\~\!\@\#\^\&\|\`\?\$\:\+\-\*\/\%\<\>\=] +operator {op_chars}+ /* we no longer allow unary minus in numbers. * instead we pass it separately to parser. there it gets @@ -202,7 +211,7 @@ horiz_space [ \t\f] newline [\n\r] non_newline [^\n\r] -comment (("--"|"//"){non_newline}*) +comment ("--"{non_newline}*) whitespace ({space}|{comment}) @@ -220,7 +229,7 @@ other . /* DO NOT PUT ANY COMMENTS IN THE FOLLOWING SECTION. * AT&T lex does not properly handle C-style comments in this second lex block. - * So, put comments here. tgl - 1997-09-08 + * So, put comments here. 
thomas - 1997-09-08 * * Quoted strings must allow some special characters such as single-quote * and newline. @@ -329,23 +338,57 @@ other . {self} { return yytext[0]; } {operator} { - /* Check for embedded slash-star or dash-dash */ - char *slashstar = strstr((char*)yytext, "/*"); - char *dashdash = strstr((char*)yytext, "--"); + /* + * Check for embedded slash-star or dash-dash; those + * are comment starts, so operator must stop there. + * Note that slash-star or dash-dash at the first + * character will match a prior rule, not this one. + */ + int nchars = yyleng; + char *slashstar = strstr((char*)yytext, "/*"); + char *dashdash = strstr((char*)yytext, "--"); if (slashstar && dashdash) { + /* if both appear, take the first one */ if (slashstar > dashdash) slashstar = dashdash; } else if (!slashstar) slashstar = dashdash; - if (slashstar) + nchars = slashstar - ((char*)yytext); + + /* + * For SQL92 compatibility, '+' and '-' cannot be the + * last char of a multi-char operator unless the operator + * contains chars that are not in SQL92 operators. + * The idea is to lex '=-' as two operators, but not + * to forbid operator names like '?-' that could not be + * sequences of SQL92 operators. + */ + while (nchars > 1 && + (yytext[nchars-1] == '+' || + yytext[nchars-1] == '-')) { - int nchars = slashstar - ((char*)yytext); + int ic; + + for (ic = nchars-2; ic >= 0; ic--) + { + if (strchr("~!@#&`?$:%^|", yytext[ic])) + break; + } + if (ic >= 0) + break; /* found a char that makes it OK */ + nchars--; /* else remove the +/-, and check again */ + } + + if (nchars < yyleng) + { + /* Strip the unwanted chars from the token */ yyless(nchars); - /* If what we have left is only one char, and it's + /* + * If what we have left is only one char, and it's * one of the characters matching "self", then * return it as a character token the same way * that the "self" rule would have. @@ -355,8 +398,9 @@ other . 
return yytext[0]; } + /* Convert "!=" operator to "<>" for compatibility */ if (strcmp((char*)yytext, "!=") == 0) - yylval.str = pstrdup("<>"); /* compatibility */ + yylval.str = pstrdup("<>"); else yylval.str = pstrdup((char*)yytext); return Op; diff --git a/src/bin/psql/mainloop.c b/src/bin/psql/mainloop.c index 4f71f3e410..eadd50e94a 100644 --- a/src/bin/psql/mainloop.c +++ b/src/bin/psql/mainloop.c @@ -3,7 +3,7 @@ * * Copyright 2000 by PostgreSQL Global Development Group * - * $Header: /cvsroot/pgsql/src/bin/psql/mainloop.c,v 1.25 2000/03/13 13:46:32 petere Exp $ + * $Header: /cvsroot/pgsql/src/bin/psql/mainloop.c,v 1.26 2000/03/18 18:03:11 tgl Exp $ */ #include "postgres.h" #include "mainloop.h" @@ -318,8 +318,7 @@ MainLoop(FILE *source) } /* single-line comment? truncate line */ - else if ((line[i] == '-' && line[i + thislen] == '-') || - (line[i] == '/' && line[i + thislen] == '/')) + else if (line[i] == '-' && line[i + thislen] == '-') { line[i] = '\0'; /* remove comment */ break; diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l index bcc8e6430e..992b293085 100644 --- a/src/interfaces/ecpg/preproc/pgc.l +++ b/src/interfaces/ecpg/preproc/pgc.l @@ -12,7 +12,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/interfaces/ecpg/preproc/pgc.l,v 1.55 2000/03/18 05:44:21 tgl Exp $ + * $Header: /cvsroot/pgsql/src/interfaces/ecpg/preproc/pgc.l,v 1.56 2000/03/18 18:03:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -86,10 +86,10 @@ static struct _if_value { * and to eliminate parsing troubles for numeric strings. 
* Exclusive states: * binary numeric string - thomas 1997-11-16 - * extended C-style comments - tgl 1997-07-12 - * delimited identifiers (double-quoted identifiers) - tgl 1997-10-27 + * extended C-style comments - thomas 1997-07-12 + * delimited identifiers (double-quoted identifiers) - thomas 1997-10-27 * hexadecimal numeric string - thomas 1997-11-16 - * quoted strings - tgl 1997-07-30 + * quoted strings - thomas 1997-07-30 */ %x xb @@ -146,14 +146,16 @@ xdcqdq \\\" xdcother [^"] xdcinside ({xdcqq}|{xdcqdq}|{xdcother}) -/* C-Style Comments +/* C-style comments + * * The "extended comment" syntax closely resembles allowable operator syntax. * The tricky part here is to get lex to recognize a string starting with * slash-star as a comment, when interpreting it as an operator would produce - * a longer match --- remember lex will prefer a longer match! Also, if we - * have tor whereas we want to see it as a + operator and a comment start. + * a longer match --- remember lex will prefer a longer match! Also, if we + * have something like plus-slash-star, lex will think this is a 3-character + * operator whereas we want to see it as a + operator and a comment start. * The solution is two-fold: - * 1. append {op_and_self}* to xcstart so that it matches as much text as + * 1. append {op_chars}* to xcstart so that it matches as much text as * {operator} would. Then the tie-breaker (first matching rule of same * length) ensures xcstart wins. We put back the extra stuff with yyless() * in case it contains a star-slash that should terminate the comment. @@ -163,22 +165,31 @@ xdcinside ({xdcqq}|{xdcqdq}|{xdcother}) * SQL92-style comments, which start with dash-dash, have similar interactions * with the operator rule. 
*/ -xcstart \/\*{op_and_self}* +xcstart \/\*{op_chars}* xcstop \*+\/ xcinside ([^*]+)|(\*+[^/]) digit [0-9] letter [\200-\377_A-Za-z] -letter_or_digit [\200-\377_A-Za-z0-9] +letter_or_digit [\200-\377_A-Za-z0-9] identifier {letter}{letter_or_digit}* typecast "::" -/* NB: if you change "self", fix the copy in the operator rule too! */ +/* + * "self" is the set of chars that should be returned as single-character + * tokens. "op_chars" is the set of chars that can make up "Op" tokens, + * which can be one or more characters long (but if a single-char token + * appears in the "self" set, it is not to be returned as an Op). Note + * that the sets overlap, but each has some chars that are not in the other. + * + * If you change either set, adjust the character lists appearing in the + * rule for "operator"! + */ self [,()\[\].;$\:\+\-\*\/\%\^\<\>\=\|] -op_and_self [\~\!\@\#\^\&\|\`\?\$\:\+\-\*\/\%\<\>\=] -operator {op_and_self}+ +op_chars [\~\!\@\#\^\&\|\`\?\$\:\+\-\*\/\%\<\>\=] +operator {op_chars}+ /* we no longer allow unary minus in numbers. * instead we pass it separately to parser. there it gets @@ -215,7 +226,7 @@ horiz_space [ \t\f] newline [\n\r] non_newline [^\n\r] -comment (("--"|"//"){non_newline}*) +comment ("--"{non_newline}*) whitespace ({space}|{comment}) @@ -250,7 +261,7 @@ cppline {space}*#(.*\\{line_end})*.* /* DO NOT PUT ANY COMMENTS IN THE FOLLOWING SECTION. * AT&T lex does not properly handle C-style comments in this second lex block. - * So, put comments here. tgl - 1997-09-08 + * So, put comments here. thomas - 1997-09-08 * * Quoted strings must allow some special characters such as single-quote * and newline. 
@@ -294,15 +305,16 @@ cppline {space}*#(.*\\{line_end})*.* mmerror(ET_ERROR, "Bad binary integer input!"); return ICONST; } -<xb><<EOF>> { mmerror(ET_ERROR, "Unterminated binary integer"); } {xhinside} | {xbinside} { addlit(yytext, yyleng); } {xhcat} | -{xbcat} { /* ignore */ +{xbcat} { + /* ignore */ } +<xb><<EOF>> { mmerror(ET_ERROR, "Unterminated binary integer"); } {xhstart} { BEGIN(xh); @@ -367,23 +379,57 @@ cppline {space}*#(.*\\{line_end})*.* return yytext[0]; } {operator} { - /* Check for embedded slash-star or dash-dash */ - char *slashstar = strstr((char*)yytext, "/*"); - char *dashdash = strstr((char*)yytext, "--"); + /* + * Check for embedded slash-star or dash-dash; those + * are comment starts, so operator must stop there. + * Note that slash-star or dash-dash at the first + * character will match a prior rule, not this one. + */ + int nchars = yyleng; + char *slashstar = strstr((char*)yytext, "/*"); + char *dashdash = strstr((char*)yytext, "--"); if (slashstar && dashdash) { + /* if both appear, take the first one */ if (slashstar > dashdash) slashstar = dashdash; } else if (!slashstar) slashstar = dashdash; - if (slashstar) + nchars = slashstar - ((char*)yytext); + + /* + * For SQL92 compatibility, '+' and '-' cannot be the + * last char of a multi-char operator unless the operator + * contains chars that are not in SQL92 operators. + * The idea is to lex '=-' as two operators, but not + * to forbid operator names like '?-' that could not be + * sequences of SQL92 operators. 
+ */ + while (nchars > 1 && + (yytext[nchars-1] == '+' || + yytext[nchars-1] == '-')) { - int nchars = slashstar - ((char*)yytext); + int ic; + + for (ic = nchars-2; ic >= 0; ic--) + { + if (strchr("~!@#&`?$:%^|", yytext[ic])) + break; + } + if (ic >= 0) + break; /* found a char that makes it OK */ + nchars--; /* else remove the +/-, and check again */ + } + + if (nchars < yyleng) + { + /* Strip the unwanted chars from the token */ yyless(nchars); - /* If what we have left is only one char, and it's + /* + * If what we have left is only one char, and it's * one of the characters matching "self", then * return it as a character token the same way * that the "self" rule would have. @@ -393,8 +439,9 @@ cppline {space}*#(.*\\{line_end})*.* return yytext[0]; } + /* Convert "!=" operator to "<>" for compatibility */ if (strcmp((char*)yytext, "!=") == 0) - yylval.str = mm_strdup("<>"); /* compatability */ + yylval.str = mm_strdup("<>"); else yylval.str = mm_strdup((char*)yytext); return Op;