/*-------------------------------------------------------------------------
 *
 * scansup.c
 *	  scanner support routines used by the core lexer
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/parser/scansup.c
 *
 *-------------------------------------------------------------------------
 */
|
2002-09-05 02:43:07 +02:00
|
|
|
#include "postgres.h"
|
1996-08-27 09:42:29 +02:00
|
|
|
|
1996-07-09 08:22:35 +02:00
|
|
|
#include <ctype.h>
|
1997-11-26 02:14:33 +01:00
|
|
|
|
2004-02-21 01:34:53 +01:00
|
|
|
#include "mb/pg_wchar.h"
|
2019-11-12 04:00:16 +01:00
|
|
|
#include "parser/scansup.h"
|
1996-07-09 08:22:35 +02:00
|
|
|
|
2004-02-21 01:34:53 +01:00
|
|
|
|
|
|
|
/*
 * downcase_truncate_identifier() --- do appropriate downcasing and
 * truncation of an unquoted identifier.  Optionally warn of truncation.
 *
 * ident: the identifier text.  In some usages the passed string is not
 *		null-terminated, so only "len" bytes are meaningful.
 * len: number of bytes of ident to process.
 * warn: passed through to downcase_identifier(); requests a notice when
 *		the identifier gets truncated.
 *
 * Returns a palloc'd string containing the adjusted identifier.
 *
 * Note: the API of this function is designed to allow for downcasing
 * transformations that increase the string length, but we don't yet
 * support that.  If you want to implement it, you'll need to fix
 * SplitIdentifierString() in utils/adt/varlena.c.
 */
char *
downcase_truncate_identifier(const char *ident, int len, bool warn)
{
	/* Identical to downcase_identifier() with truncation always enabled. */
	return downcase_identifier(ident, len, warn, true);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* a workhorse for downcase_truncate_identifier
|
|
|
|
*/
|
|
|
|
char *
|
|
|
|
downcase_identifier(const char *ident, int len, bool warn, bool truncate)
|
2004-02-21 01:34:53 +01:00
|
|
|
{
|
|
|
|
char *result;
|
|
|
|
int i;
|
2013-06-08 16:00:09 +02:00
|
|
|
bool enc_is_single_byte;
|
2004-02-21 01:34:53 +01:00
|
|
|
|
|
|
|
result = palloc(len + 1);
|
2013-06-08 16:00:09 +02:00
|
|
|
enc_is_single_byte = pg_database_encoding_max_length() == 1;
|
2004-08-29 07:07:03 +02:00
|
|
|
|
2004-02-21 01:34:53 +01:00
|
|
|
/*
|
|
|
|
* SQL99 specifies Unicode-aware case normalization, which we don't yet
|
|
|
|
* have the infrastructure for. Instead we use tolower() to provide a
|
|
|
|
* locale-aware translation. However, there are some locales where this
|
|
|
|
* is not right either (eg, Turkish may do strange things with 'i' and
|
|
|
|
* 'I'). Our current compromise is to use tolower() for characters with
|
2013-06-08 16:00:09 +02:00
|
|
|
* the high bit set, as long as they aren't part of a multi-byte
|
|
|
|
* character, and use an ASCII-only downcasing for 7-bit characters.
|
2004-02-21 01:34:53 +01:00
|
|
|
*/
|
|
|
|
for (i = 0; i < len; i++)
|
|
|
|
{
|
|
|
|
unsigned char ch = (unsigned char) ident[i];
|
|
|
|
|
|
|
|
if (ch >= 'A' && ch <= 'Z')
|
|
|
|
ch += 'a' - 'A';
|
2013-06-08 16:00:09 +02:00
|
|
|
else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
|
2004-02-21 01:34:53 +01:00
|
|
|
ch = tolower(ch);
|
|
|
|
result[i] = (char) ch;
|
|
|
|
}
|
|
|
|
result[i] = '\0';
|
|
|
|
|
2016-03-18 16:16:14 +01:00
|
|
|
if (i >= NAMEDATALEN && truncate)
|
2004-02-21 01:34:53 +01:00
|
|
|
truncate_identifier(result, i, warn);
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2016-03-18 16:16:14 +01:00
|
|
|
|
2004-02-21 01:34:53 +01:00
|
|
|
/*
|
|
|
|
* truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
|
|
|
|
*
|
|
|
|
* The given string is modified in-place, if necessary. A warning is
|
|
|
|
* issued if requested.
|
|
|
|
*
|
|
|
|
* We require the caller to pass in the string length since this saves a
|
|
|
|
* strlen() call in some common usages.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
truncate_identifier(char *ident, int len, bool warn)
|
|
|
|
{
|
|
|
|
if (len >= NAMEDATALEN)
|
|
|
|
{
|
|
|
|
len = pg_mbcliplen(ident, len, NAMEDATALEN - 1);
|
|
|
|
if (warn)
|
|
|
|
ereport(NOTICE,
|
|
|
|
(errcode(ERRCODE_NAME_TOO_LONG),
|
2020-06-29 23:12:38 +02:00
|
|
|
errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
|
|
|
|
ident, len, ident)));
|
2004-02-21 01:34:53 +01:00
|
|
|
ident[len] = '\0';
|
|
|
|
}
|
|
|
|
}
|
2006-09-22 23:39:58 +02:00
|
|
|
|
|
|
|
/*
 * scanner_isspace() --- return true if flex scanner considers char whitespace
 *
 * This should be used instead of the potentially locale-dependent isspace()
 * function when it's important to match the lexer's behavior.
 *
 * ch: the character to classify.
 *
 * Returns true for space, tab, newline, carriage return, vertical tab and
 * form feed; false for every other character.
 *
 * In principle we might need similar functions for isalnum etc, but for the
 * moment only isspace seems needed.
 */
bool
scanner_isspace(char ch)
{
	/*
	 * This must match scan.l's list of {space} characters.  Note that \v
	 * is included, so that it is handled as whitespace consistently by
	 * everything going through the parsers.
	 */
	if (ch == ' ' ||
		ch == '\t' ||
		ch == '\n' ||
		ch == '\r' ||
		ch == '\v' ||
		ch == '\f')
		return true;
	return false;
}
|