postgresql/src/backend/parser/scansup.c

/*-------------------------------------------------------------------------
 *
 * scansup.c
 *	  support routines for the lex/flex scanner, used by both the normal
 * backend as well as the bootstrap backend
 *
 * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.39 2010/01/02 16:57:50 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <ctype.h>

#include "parser/scansup.h"
#include "mb/pg_wchar.h"


/* ----------------
 *		scanstr
 *
 * Produce a copy of the given string with its escape sequences replaced
 * by the characters they stand for:
 *
 *	- a single quote is dropped and the character following it is kept
 *	  (so a doubled quote collapses to one quote);
 *	- backslash followed by b, f, n, r or t yields the matching control
 *	  character;
 *	- backslash followed by one to three octal digits yields the byte
 *	  with that value;
 *	- backslash followed by anything else yields that character.
 *
 * A NULL or empty input gives back an empty string.
 *
 * The result is always freshly palloc'd; the caller should pfree it
 * when done.
 * ----------------
 */

char *
scanstr(const char *s)
{
	int			srclen;
	const char *in;
	const char *in_end;
	char	   *result;
	char	   *out;

	if (s == NULL || *s == '\0')
		return pstrdup("");

	srclen = strlen(s);

	/* Every escape consumes at least as many bytes as it produces. */
	result = palloc(srclen + 1);

	in = s;
	in_end = s + srclen;
	out = result;

	while (in < in_end)
	{
		char		c = *in++;

		if (c == '\'')
		{
			/*
			 * The scanner only lets unescaped quotes through in pairs, so
			 * a following character is expected; emit that one.
			 */
			c = *in++;
		}
		else if (c == '\\')
		{
			/* Look at the character that selects the escape. */
			c = *in++;

			switch (c)
			{
				case 'b':
					c = '\b';
					break;
				case 'f':
					c = '\f';
					break;
				case 'n':
					c = '\n';
					break;
				case 'r':
					c = '\r';
					break;
				case 't':
					c = '\t';
					break;
				default:
					if (c >= '0' && c <= '7')
					{
						/*
						 * Octal escape: the digit just read plus up to two
						 * more.  Reading *in is safe because the scan stops
						 * at the terminating NUL at the latest.
						 */
						long		octal = c - '0';
						int			ndigits = 1;

						while (ndigits < 3 && *in >= '0' && *in <= '7')
						{
							octal = (octal << 3) + (*in - '0');
							in++;
							ndigits++;
						}
						c = (char) octal;
					}
					/* any other escaped character stands for itself */
					break;
			}
		}

		*out++ = c;
	}

	*out = '\0';
	return result;
}


/*
 * downcase_truncate_identifier() --- do appropriate downcasing and
 * truncation of an unquoted identifier.  Optionally warn of truncation.
 *
 * ident: the identifier text; only the first len bytes are examined.
 * len:   number of bytes of ident to process.
 * warn:  passed through to truncate_identifier() when the result is
 *		  NAMEDATALEN bytes or longer.
 *
 * Returns a palloc'd, null-terminated string containing the adjusted
 * identifier.
 *
 * Note: in some usages the passed string is not null-terminated.
 *
 * Note: the API of this function is designed to allow for downcasing
 * transformations that increase the string length, but we don't yet
 * support that.  If you want to implement it, you'll need to fix
 * SplitIdentifierString() in utils/adt/varlena.c.
 */
char *
downcase_truncate_identifier(const char *ident, int len, bool warn)
{
	char	   *result;
	int			i;
	bool		enc_is_single_byte;

	result = palloc(len + 1);

	/*
	 * tolower() works on one byte at a time, so it is only meaningful when
	 * each byte is a whole character.  In a multibyte encoding a high-bit
	 * byte is a fragment of a character, and changing it on the strength
	 * of the platform's single-byte ctype tables could corrupt the
	 * identifier.
	 */
	enc_is_single_byte = pg_database_encoding_max_length() == 1;

	/*
	 * SQL99 specifies Unicode-aware case normalization, which we don't yet
	 * have the infrastructure for.  Instead we use tolower() to provide a
	 * locale-aware translation.  However, there are some locales where this
	 * is not right either (eg, Turkish may do strange things with 'i' and
	 * 'I').  Our current compromise is to use tolower() for characters with
	 * the high bit set, as long as they aren't part of a multi-byte
	 * character, and use an ASCII-only downcasing for 7-bit characters.
	 */
	for (i = 0; i < len; i++)
	{
		/* unsigned char keeps the <ctype.h> calls below well-defined */
		unsigned char ch = (unsigned char) ident[i];

		if (ch >= 'A' && ch <= 'Z')
			ch += 'a' - 'A';
		else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
			ch = tolower(ch);
		result[i] = (char) ch;
	}
	result[i] = '\0';

	if (i >= NAMEDATALEN)
		truncate_identifier(result, i, warn);

	return result;
}

/*
 * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
 *
 * ident: null-terminated identifier; shortened in place when too long.
 * len:   current length of ident in bytes.  The caller supplies it so
 *		  that common call sites, which already know it, need no strlen().
 * warn:  if true, a NOTICE showing the original and truncated forms is
 *		  issued whenever truncation happens.
 *
 * Identifiers shorter than NAMEDATALEN are left untouched.
 */
void
truncate_identifier(char *ident, int len, bool warn)
{
	int			cliplen;

	/* Already fits: nothing to do. */
	if (len < NAMEDATALEN)
		return;

	/*
	 * Ask the multibyte code how many bytes to keep (presumably so that
	 * the cut does not fall inside a multibyte character).
	 */
	cliplen = pg_mbcliplen(ident, len, NAMEDATALEN - 1);

	/*
	 * Report before clipping: the message needs the full original name as
	 * well as its truncated prefix.
	 */
	if (warn)
		ereport(NOTICE,
				(errcode(ERRCODE_NAME_TOO_LONG),
				 errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
						ident, cliplen, ident)));

	ident[cliplen] = '\0';
}

/*
 * scanner_isspace() --- return TRUE if flex scanner considers char whitespace
 *
 * Use this in place of isspace(), whose answer may vary with the locale,
 * wherever the result has to agree with what the lexer does.
 *
 * Only space, tab, newline, carriage return and form feed qualify; every
 * other character yields false.
 *
 * Similar helpers for isalnum and friends could be added on the same
 * pattern, but so far only the whitespace test has been needed.
 */
bool
scanner_isspace(char ch)
{
	/* This must match scan.l's list of {space} characters */
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
			return true;
		default:
			return false;
	}
}
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`/*-------------------------------------------------------------------------`
			`*`
Change my-function-name-- to my_function_name, and optimizer renames. 1999-02-14 00:22:53 +01:00			`* scansup.c`
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`* support routines for the lex/flex scanner, used by both the normal`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`* backend as well as the bootstrap backend`
			`*`
Update copyright for the year 2010. 2010-01-02 17:58:17 +01:00			`* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group`
Add: * Portions Copyright (c) 1996-2000, PostgreSQL, Inc to all files copyright Regents of Berkeley. Man, that's a lot of files. 2000-01-26 06:58:53 +01:00			`* Portions Copyright (c) 1994, Regents of the University of California`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`*`
			`*`
			`* IDENTIFICATION`
Update copyright for the year 2010. 2010-01-02 17:58:17 +01:00			`* $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.39 2010/01/02 16:57:50 momjian Exp $`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`*`
			`*-------------------------------------------------------------------------`
			`*/`
Be careful to include postgres.h before any system headers, to ensure that the right flavors of largefile-related definitions are seen. Most of these changes are probably unnecessary, but better safe than sorry. 2002-09-05 02:43:07 +02:00			`#include "postgres.h"`
added #include "config.h" for ESCAPE_PATCH define 1996-08-27 09:42:29 +02:00
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`#include <ctype.h>`
Cleanup up include files. 1997-11-26 02:14:33 +01:00
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`#include "parser/scansup.h"`
Implement a solution to the 'Turkish locale downcases I incorrectly' problem, per previous discussion. Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area. 2004-02-21 01:34:53 +01:00			`#include "mb/pg_wchar.h"`

Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00
			`/* ----------------`
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`* scanstr`
			`*`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`* if the string passed in has escaped codes, map the escape codes to actual`
			`* chars`
			`*`
Eliminate token length assumption in scanstr(). 1999-09-12 00:26:47 +02:00			`* the string returned is palloc'd and should eventually be pfree'd by the`
			`* caller!`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`* ----------------`
			`*/`

pgindent run before 6.3 release, with Thomas' requested changes. 1998-02-26 05:46:47 +01:00			`char *`
Implement a solution to the 'Turkish locale downcases I incorrectly' problem, per previous discussion. Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area. 2004-02-21 01:34:53 +01:00			`scanstr(const char *s)`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`{`
Eliminate token length assumption in scanstr(). 1999-09-12 00:26:47 +02:00			`char *newStr;`
Another PGINDENT run that changes variable indenting and case label indenting. Also static variable indenting. 1997-09-08 04:41:22 +02:00			`int len,`
			`i,`
			`j;`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`if (s == NULL \|\| s[0] == '\0')`
Eliminate token length assumption in scanstr(). 1999-09-12 00:26:47 +02:00			`return pstrdup("");`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`len = strlen(s);`

Ye-old pgindent run. Same 4-space tabs. 2000-04-12 19:17:23 +02:00			`newStr = palloc(len + 1); /* string cannot get longer */`
Eliminate token length assumption in scanstr(). 1999-09-12 00:26:47 +02:00
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`for (i = 0, j = 0; i < len; i++)`
			`{`
			`if (s[i] == '\'')`
			`{`
pgindent run over code. 1999-05-25 18:15:34 +02:00			`/*`
Standard pgindent run for 8.1. 2005-10-15 04:49:52 +02:00			`* Note: if scanner is working right, unescaped quotes can only`
			`* appear in pairs, so there should be another character.`
Simplify scanstr(), fix broken octal-escape code. 1999-02-08 00:59:59 +01:00			`*/`
			`i++;`
			`newStr[j] = s[i];`
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`}`
Simplify scanstr(), fix broken octal-escape code. 1999-02-08 00:59:59 +01:00			`else if (s[i] == '\\')`
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`{`
Simplify scanstr(), fix broken octal-escape code. 1999-02-08 00:59:59 +01:00			`i++;`
			`switch (s[i])`
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`{`
Simplify scanstr(), fix broken octal-escape code. 1999-02-08 00:59:59 +01:00			`case 'b':`
			`newStr[j] = '\b';`
			`break;`
			`case 'f':`
			`newStr[j] = '\f';`
			`break;`
			`case 'n':`
			`newStr[j] = '\n';`
			`break;`
			`case 'r':`
			`newStr[j] = '\r';`
			`break;`
			`case 't':`
			`newStr[j] = '\t';`
			`break;`
			`case '0':`
			`case '1':`
			`case '2':`
			`case '3':`
			`case '4':`
			`case '5':`
			`case '6':`
			`case '7':`
			`{`
			`int k;`
			`long octVal = 0;`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00
Simplify scanstr(), fix broken octal-escape code. 1999-02-08 00:59:59 +01:00			`for (k = 0;`
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`s[i + k] >= '0' && s[i + k] <= '7' && k < 3;`
Simplify scanstr(), fix broken octal-escape code. 1999-02-08 00:59:59 +01:00			`k++)`
			`octVal = (octVal << 3) + (s[i + k] - '0');`
			`i += k - 1;`
			`newStr[j] = ((char) octVal);`
			`}`
			`break;`
			`default:`
			`newStr[j] = s[i];`
			`break;`
			`} /* switch */`
			`} /* s[i] == '\\' */`
			`else`
			`newStr[j] = s[i];`
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`j++;`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`}`
Massive commit to run PGINDENT on all .c and .h files. 1997-09-07 07:04:48 +02:00			`newStr[j] = '\0';`
			`return newStr;`
Postgres95 1.01 Distribution - Virgin Sources 1996-07-09 08:22:35 +02:00			`}`
Implement a solution to the 'Turkish locale downcases I incorrectly' problem, per previous discussion. Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area. 2004-02-21 01:34:53 +01:00

			`/*`
			`* downcase_truncate_identifier() --- do appropriate downcasing and`
			`* truncation of an unquoted identifier. Optionally warn of truncation.`
			`*`
			`* Returns a palloc'd string containing the adjusted identifier.`
			`*`
			`* Note: in some usages the passed string is not null-terminated.`
			`*`
			`* Note: the API of this function is designed to allow for downcasing`
			`* transformations that increase the string length, but we don't yet`
			`* support that. If you want to implement it, you'll need to fix`
			`* SplitIdentifierString() in utils/adt/varlena.c.`
			`*/`
			`char *`
			`downcase_truncate_identifier(const char *ident, int len, bool warn)`
			`{`
			`char *result;`
			`int i;`

			`result = palloc(len + 1);`
Pgindent run for 8.0. 2004-08-29 07:07:03 +02:00
Implement a solution to the 'Turkish locale downcases I incorrectly' problem, per previous discussion. Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area. 2004-02-21 01:34:53 +01:00			`/*`
Standard pgindent run for 8.1. 2005-10-15 04:49:52 +02:00			`* SQL99 specifies Unicode-aware case normalization, which we don't yet`
			`* have the infrastructure for. Instead we use tolower() to provide a`
			`* locale-aware translation. However, there are some locales where this`
			`* is not right either (eg, Turkish may do strange things with 'i' and`
			`* 'I'). Our current compromise is to use tolower() for characters with`
			`* the high bit set, and use an ASCII-only downcasing for 7-bit`
			`* characters.`
Implement a solution to the 'Turkish locale downcases I incorrectly' problem, per previous discussion. Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area. 2004-02-21 01:34:53 +01:00			`*/`
			`for (i = 0; i < len; i++)`
			`{`
Pgindent run for 8.0. 2004-08-29 07:07:03 +02:00			`unsigned char ch = (unsigned char) ident[i];`
Implement a solution to the 'Turkish locale downcases I incorrectly' problem, per previous discussion. Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area. 2004-02-21 01:34:53 +01:00
			`if (ch >= 'A' && ch <= 'Z')`
			`ch += 'a' - 'A';`
I have added these macros to c.h: #define HIGHBIT (0x80) #define IS_HIGHBIT_SET(ch) ((unsigned char)(ch) & HIGHBIT) and removed CSIGNBIT and mapped it uses to HIGHBIT. I have also added uses for IS_HIGHBIT_SET where appropriate. This change is purely for code clarity. 2005-12-25 03:14:19 +01:00			`else if (IS_HIGHBIT_SET(ch) && isupper(ch))`
Implement a solution to the 'Turkish locale downcases I incorrectly' problem, per previous discussion. Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area. 2004-02-21 01:34:53 +01:00			`ch = tolower(ch);`
			`result[i] = (char) ch;`
			`}`
			`result[i] = '\0';`

			`if (i >= NAMEDATALEN)`
			`truncate_identifier(result, i, warn);`

			`return result;`
			`}`

			`/*`
			`* truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.`
			`*`
			`* The given string is modified in-place, if necessary. A warning is`
			`* issued if requested.`
			`*`
			`* We require the caller to pass in the string length since this saves a`
			`* strlen() call in some common usages.`
			`*/`
			`void`
			`truncate_identifier(char *ident, int len, bool warn)`
			`{`
			`if (len >= NAMEDATALEN)`
			`{`
Pgindent run for 8.0. 2004-08-29 07:07:03 +02:00			`len = pg_mbcliplen(ident, len, NAMEDATALEN - 1);`
Implement a solution to the 'Turkish locale downcases I incorrectly' problem, per previous discussion. Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area. 2004-02-21 01:34:53 +01:00			`if (warn)`
			`ereport(NOTICE,`
			`(errcode(ERRCODE_NAME_TOO_LONG),`
Standard pgindent run for 8.1. 2005-10-15 04:49:52 +02:00			`errmsg("identifier \"%s\" will be truncated to \"%.*s\"",`
			`ident, len, ident)));`
Implement a solution to the 'Turkish locale downcases I incorrectly' problem, per previous discussion. Make some additional changes to centralize the knowledge of just how identifier downcasing is done, in hopes of simplifying any future tweaking in this area. 2004-02-21 01:34:53 +01:00			`ident[len] = '\0';`
			`}`
			`}`
Fix bugs in plpgsql and ecpg caused by assuming that isspace() would only return true for exactly the characters treated as whitespace by their flex scanners. Per report from Victor Snezhko and subsequent investigation. Also fix a passel of unsafe usages of <ctype.h> functions, that is, ye olde char-vs-unsigned-char issue. I won't miss <ctype.h> when we are finally able to stop using it. 2006-09-22 23:39:58 +02:00
			`/*`
			`* scanner_isspace() --- return TRUE if flex scanner considers char whitespace`
			`*`
			`* This should be used instead of the potentially locale-dependent isspace()`
			`* function when it's important to match the lexer's behavior.`
			`*`
			`* In principle we might need similar functions for isalnum etc, but for the`
			`* moment only isspace seems needed.`
			`*/`
			`bool`
			`scanner_isspace(char ch)`
			`{`
			`/* This must match scan.l's list of {space} characters */`
			`if (ch == ' ' \|\|`
			`ch == '\t' \|\|`
			`ch == '\n' \|\|`
			`ch == '\r' \|\|`
			`ch == '\f')`
			`return true;`
			`return false;`
			`}`